-rw-r--r--  Add-visibility-protected-attribute-for-global-variab.patch  77
-rw-r--r--  Android.bp  777
-rw-r--r--  Android.bp.in  9
-rw-r--r--  CleanSpec.mk  53
-rw-r--r--  README.android  8
-rw-r--r--  README.version  6
-rw-r--r--  config/arm-neon/vp8_rtcd.h  1
-rw-r--r--  config/arm-neon/vp9_rtcd.h  57
-rw-r--r--  config/arm-neon/vpx_config.asm  5
-rw-r--r--  config/arm-neon/vpx_config.c  2
-rw-r--r--  config/arm-neon/vpx_config.h  5
-rw-r--r--  config/arm-neon/vpx_dsp_rtcd.h  1136
-rw-r--r--  config/arm-neon/vpx_scale_rtcd.h  4
-rw-r--r--  config/arm-neon/vpx_version.h  9
-rw-r--r--  config/arm/vp8_rtcd.h  1
-rw-r--r--  config/arm/vp9_rtcd.h  42
-rw-r--r--  config/arm/vpx_config.asm  5
-rw-r--r--  config/arm/vpx_config.c  2
-rw-r--r--  config/arm/vpx_config.h  5
-rw-r--r--  config/arm/vpx_dsp_rtcd.h  882
-rw-r--r--  config/arm/vpx_scale_rtcd.h  4
-rw-r--r--  config/arm/vpx_version.h  9
-rw-r--r--  config/arm64/vp8_rtcd.h  1
-rw-r--r--  config/arm64/vp9_rtcd.h  57
-rw-r--r--  config/arm64/vpx_config.asm  5
-rw-r--r--  config/arm64/vpx_config.c  2
-rw-r--r--  config/arm64/vpx_config.h  5
-rw-r--r--  config/arm64/vpx_dsp_rtcd.h  1136
-rw-r--r--  config/arm64/vpx_scale_rtcd.h  4
-rw-r--r--  config/arm64/vpx_version.h  9
-rw-r--r--  config/generic/vp8_rtcd.h  1
-rw-r--r--  config/generic/vp9_rtcd.h  42
-rw-r--r--  config/generic/vpx_config.asm  5
-rw-r--r--  config/generic/vpx_config.c  2
-rw-r--r--  config/generic/vpx_config.h  5
-rw-r--r--  config/generic/vpx_dsp_rtcd.h  882
-rw-r--r--  config/generic/vpx_scale_rtcd.h  4
-rw-r--r--  config/generic/vpx_version.h  9
-rw-r--r--  config/mips32-dspr2/vp8_rtcd.h  1
-rw-r--r--  config/mips32-dspr2/vp9_rtcd.h  51
-rw-r--r--  config/mips32-dspr2/vpx_config.c  2
-rw-r--r--  config/mips32-dspr2/vpx_config.h  5
-rw-r--r--  config/mips32-dspr2/vpx_dsp_rtcd.h  937
-rw-r--r--  config/mips32-dspr2/vpx_scale_rtcd.h  4
-rw-r--r--  config/mips32-dspr2/vpx_version.h  9
-rw-r--r--  config/mips32-msa/vp8_rtcd.h  1
-rw-r--r--  config/mips32-msa/vp9_rtcd.h  66
-rw-r--r--  config/mips32-msa/vpx_config.c  2
-rw-r--r--  config/mips32-msa/vpx_config.h  5
-rw-r--r--  config/mips32-msa/vpx_dsp_rtcd.h  982
-rw-r--r--  config/mips32-msa/vpx_scale_rtcd.h  4
-rw-r--r--  config/mips32-msa/vpx_version.h  9
-rw-r--r--  config/mips32/vp8_rtcd.h  1
-rw-r--r--  config/mips32/vp9_rtcd.h  42
-rw-r--r--  config/mips32/vpx_config.c  2
-rw-r--r--  config/mips32/vpx_config.h  5
-rw-r--r--  config/mips32/vpx_dsp_rtcd.h  882
-rw-r--r--  config/mips32/vpx_scale_rtcd.h  4
-rw-r--r--  config/mips32/vpx_version.h  9
-rw-r--r--  config/mips64-msa/vp8_rtcd.h  1
-rw-r--r--  config/mips64-msa/vp9_rtcd.h  66
-rw-r--r--  config/mips64-msa/vpx_config.c  2
-rw-r--r--  config/mips64-msa/vpx_config.h  5
-rw-r--r--  config/mips64-msa/vpx_dsp_rtcd.h  982
-rw-r--r--  config/mips64-msa/vpx_scale_rtcd.h  4
-rw-r--r--  config/mips64-msa/vpx_version.h  9
-rw-r--r--  config/mips64/vp8_rtcd.h  1
-rw-r--r--  config/mips64/vp9_rtcd.h  42
-rw-r--r--  config/mips64/vpx_config.c  2
-rw-r--r--  config/mips64/vpx_config.h  5
-rw-r--r--  config/mips64/vpx_dsp_rtcd.h  882
-rw-r--r--  config/mips64/vpx_scale_rtcd.h  4
-rw-r--r--  config/mips64/vpx_version.h  9
-rw-r--r--  config/x86/vp8_rtcd.h  1
-rw-r--r--  config/x86/vp9_rtcd.h  47
-rw-r--r--  config/x86/vpx_config.asm  5
-rw-r--r--  config/x86/vpx_config.c  2
-rw-r--r--  config/x86/vpx_config.h  5
-rw-r--r--  config/x86/vpx_dsp_rtcd.h  1159
-rw-r--r--  config/x86/vpx_scale_rtcd.h  4
-rw-r--r--  config/x86/vpx_version.h  9
-rw-r--r--  config/x86_64/vp8_rtcd.h  1
-rw-r--r--  config/x86_64/vp9_rtcd.h  47
-rw-r--r--  config/x86_64/vpx_config.asm  5
-rw-r--r--  config/x86_64/vpx_config.c  2
-rw-r--r--  config/x86_64/vpx_config.h  5
-rw-r--r--  config/x86_64/vpx_dsp_rtcd.h  1164
-rw-r--r--  config/x86_64/vpx_scale_rtcd.h  4
-rw-r--r--  config/x86_64/vpx_version.h  9
-rwxr-xr-x  generate_config.sh  173
-rw-r--r--  libvpx/.clang-format  5
-rw-r--r--  libvpx/.mailmap  4
-rw-r--r--  libvpx/AUTHORS  17
-rw-r--r--  libvpx/CHANGELOG  25
-rw-r--r--  libvpx/README  4
-rw-r--r--  libvpx/build/make/Makefile  2
-rwxr-xr-x  libvpx/build/make/configure.sh  55
-rwxr-xr-x  libvpx/build/make/gen_msvs_sln.sh  4
-rwxr-xr-x  libvpx/build/make/rtcd.pl  18
-rwxr-xr-x  libvpx/build/make/version.sh  1
-rwxr-xr-x  libvpx/configure  18
-rw-r--r--  libvpx/examples/vp8_multi_resolution_encoder.c  37
-rw-r--r--  libvpx/examples/vp9_spatial_svc_encoder.c  23
-rw-r--r--  libvpx/examples/vpx_temporal_svc_encoder.c  94
-rw-r--r--  libvpx/libs.mk  33
-rw-r--r--  libvpx/test/acm_random.h  11
-rw-r--r--  libvpx/test/android/Android.mk  1
-rw-r--r--  libvpx/test/avg_test.cc  47
-rw-r--r--  libvpx/test/buffer.h  170
-rw-r--r--  libvpx/test/byte_alignment_test.cc  4
-rw-r--r--  libvpx/test/comp_avg_pred_test.cc  93
-rw-r--r--  libvpx/test/convolve_test.cc  272
-rw-r--r--  libvpx/test/datarate_test.cc  489
-rw-r--r--  libvpx/test/dct16x16_test.cc  96
-rw-r--r--  libvpx/test/dct32x32_test.cc  97
-rw-r--r--  libvpx/test/dct_partial_test.cc  169
-rw-r--r--  libvpx/test/dct_test.cc  737
-rw-r--r--  libvpx/test/decode_test_driver.cc  8
-rw-r--r--  libvpx/test/encode_api_test.cc  113
-rw-r--r--  libvpx/test/encode_test_driver.cc  2
-rw-r--r--  libvpx/test/encode_test_driver.h  9
-rw-r--r--  libvpx/test/external_frame_buffer_test.cc  48
-rw-r--r--  libvpx/test/fdct4x4_test.cc  511
-rw-r--r--  libvpx/test/fdct8x8_test.cc  13
-rw-r--r--  libvpx/test/hadamard_test.cc  8
-rw-r--r--  libvpx/test/idct_test.cc  14
-rw-r--r--  libvpx/test/invalid_file_test.cc  20
-rw-r--r--  libvpx/test/ivf_video_source.h  4
-rw-r--r--  libvpx/test/keyframe_test.cc  4
-rw-r--r--  libvpx/test/level_test.cc  12
-rw-r--r--  libvpx/test/lpf_test.cc  48
-rw-r--r--  libvpx/test/minmax_test.cc  8
-rw-r--r--  libvpx/test/partial_idct_test.cc  217
-rw-r--r--  libvpx/test/pp_filter_test.cc  28
-rw-r--r--  libvpx/test/predict_test.cc  9
-rw-r--r--  libvpx/test/quantize_test.cc  8
-rw-r--r--  libvpx/test/register_state_check.h  4
-rw-r--r--  libvpx/test/resize_test.cc  16
-rw-r--r--  libvpx/test/sad_test.cc  118
-rw-r--r--  libvpx/test/set_roi.cc  8
-rw-r--r--  libvpx/test/temporal_filter_test.cc  45
-rw-r--r--  libvpx/test/test-data.mk  6
-rw-r--r--  libvpx/test/test-data.sha1  6
-rw-r--r--  libvpx/test/test.mk  8
-rw-r--r--  libvpx/test/test_intra_pred_speed.cc  71
-rw-r--r--  libvpx/test/test_libvpx.cc  3
-rw-r--r--  libvpx/test/test_vector_test.cc  46
-rw-r--r--  libvpx/test/test_vectors.cc  1
-rwxr-xr-x  libvpx/test/twopass_encoder.sh  9
-rw-r--r--  libvpx/test/variance_test.cc  949
-rw-r--r--  libvpx/test/vp8_fdct4x4_test.cc  4
-rw-r--r--  libvpx/test/vp9_encoder_parms_get_to_decoder.cc  4
-rw-r--r--  libvpx/test/vp9_ethread_test.cc  14
-rw-r--r--  libvpx/test/vp9_frame_parallel_test.cc  217
-rw-r--r--  libvpx/test/vp9_intrapred_test.cc  254
-rw-r--r--  libvpx/test/vp9_quantize_test.cc  690
-rw-r--r--  libvpx/test/vp9_scale_test.cc  214
-rw-r--r--  libvpx/test/vp9_skip_loopfilter_test.cc  4
-rw-r--r--  libvpx/test/vp9_subtract_test.cc  5
-rw-r--r--  libvpx/test/vp9_thread_test.cc  4
-rw-r--r--  libvpx/test/vpx_scale_test.cc  155
-rw-r--r--  libvpx/test/vpx_scale_test.h  200
-rwxr-xr-x  libvpx/test/vpx_temporal_svc_encoder.sh  18
-rwxr-xr-x  libvpx/test/vpxenc.sh  31
-rw-r--r--  libvpx/test/webm_video_source.h  4
-rw-r--r--  libvpx/test/y4m_video_source.h  4
-rw-r--r--  libvpx/test/yuv_video_source.h  4
-rw-r--r--  libvpx/third_party/googletest/README.libvpx  2
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h  13
-rw-r--r--  libvpx/third_party/googletest/src/src/gtest.cc  1
-rw-r--r--  libvpx/third_party/libwebm/README.libvpx  2
-rw-r--r--  libvpx/third_party/libwebm/common/hdr_util.h  8
-rw-r--r--  libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc  46
-rw-r--r--  libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc  3
-rw-r--r--  libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc  2
-rw-r--r--  libvpx/third_party/libwebm/mkvparser/mkvparser.cc  62
-rw-r--r--  libvpx/third_party/libwebm/mkvparser/mkvreader.cc  2
-rw-r--r--  libvpx/tools.mk  2
-rwxr-xr-x  libvpx/tools/all_builds.py  72
-rwxr-xr-x  libvpx/tools/author_first_release.sh  15
-rwxr-xr-x  libvpx/tools/ftfy.sh  158
-rw-r--r--  libvpx/tools/tiny_ssim.c  577
-rw-r--r--  libvpx/vp8/common/blockd.h  5
-rw-r--r--  libvpx/vp8/common/loopfilter_filters.c  102
-rw-r--r--  libvpx/vp8/common/mfqe.c  15
-rw-r--r--  libvpx/vp8/common/mips/dspr2/filter_dspr2.c  14
-rw-r--r--  libvpx/vp8/common/mips/mmi/copymem_mmi.c  114
-rw-r--r--  libvpx/vp8/common/mips/mmi/dequantize_mmi.c  115
-rw-r--r--  libvpx/vp8/common/mips/mmi/idct_blk_mmi.c  71
-rw-r--r--  libvpx/vp8/common/mips/mmi/idctllm_mmi.c  328
-rw-r--r--  libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c  1337
-rw-r--r--  libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c  416
-rw-r--r--  libvpx/vp8/common/onyxd.h  2
-rw-r--r--  libvpx/vp8/common/reconintra.c  8
-rw-r--r--  libvpx/vp8/common/reconintra4x4.c  12
-rw-r--r--  libvpx/vp8/common/rtcd_defs.pl  68
-rw-r--r--  libvpx/vp8/common/threading.h  39
-rw-r--r--  libvpx/vp8/common/vp8_loopfilter.c  7
-rw-r--r--  libvpx/vp8/common/vp8_skin_detection.c  109
-rw-r--r--  libvpx/vp8/common/vp8_skin_detection.h  47
-rw-r--r--  libvpx/vp8/common/x86/copy_sse2.asm  1
-rw-r--r--  libvpx/vp8/common/x86/copy_sse3.asm  1
-rw-r--r--  libvpx/vp8/common/x86/dequantize_mmx.asm  1
-rw-r--r--  libvpx/vp8/common/x86/idctllm_mmx.asm  1
-rw-r--r--  libvpx/vp8/common/x86/idctllm_sse2.asm  2
-rw-r--r--  libvpx/vp8/common/x86/iwalsh_sse2.asm  2
-rw-r--r--  libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm  2
-rw-r--r--  libvpx/vp8/common/x86/loopfilter_sse2.asm  2
-rw-r--r--  libvpx/vp8/common/x86/mfqe_sse2.asm  2
-rw-r--r--  libvpx/vp8/common/x86/recon_mmx.asm  1
-rw-r--r--  libvpx/vp8/common/x86/recon_sse2.asm  2
-rw-r--r--  libvpx/vp8/common/x86/subpixel_mmx.asm  1
-rw-r--r--  libvpx/vp8/common/x86/subpixel_sse2.asm  1
-rw-r--r--  libvpx/vp8/common/x86/subpixel_ssse3.asm  1
-rw-r--r--  libvpx/vp8/decoder/decodeframe.c  15
-rw-r--r--  libvpx/vp8/decoder/decodemv.c  7
-rw-r--r--  libvpx/vp8/decoder/onyxd_if.c  1
-rw-r--r--  libvpx/vp8/decoder/onyxd_int.h  9
-rw-r--r--  libvpx/vp8/decoder/threading.c  84
-rw-r--r--  libvpx/vp8/encoder/bitstream.c  5
-rw-r--r--  libvpx/vp8/encoder/bitstream.h  8
-rw-r--r--  libvpx/vp8/encoder/encodeframe.c  37
-rw-r--r--  libvpx/vp8/encoder/encodeframe.h  25
-rw-r--r--  libvpx/vp8/encoder/encodemv.c  6
-rw-r--r--  libvpx/vp8/encoder/ethreading.c  40
-rw-r--r--  libvpx/vp8/encoder/ethreading.h  32
-rw-r--r--  libvpx/vp8/encoder/firstpass.c  5
-rw-r--r--  libvpx/vp8/encoder/mcomp.c  17
-rw-r--r--  libvpx/vp8/encoder/mips/mmi/dct_mmi.c  425
-rw-r--r--  libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c  262
-rw-r--r--  libvpx/vp8/encoder/onyx_if.c  219
-rw-r--r--  libvpx/vp8/encoder/onyx_int.h  16
-rw-r--r--  libvpx/vp8/encoder/pickinter.c  99
-rw-r--r--  libvpx/vp8/encoder/picklpf.c  1
-rw-r--r--  libvpx/vp8/encoder/picklpf.h  30
-rw-r--r--  libvpx/vp8/encoder/ratectrl.c  84
-rw-r--r--  libvpx/vp8/encoder/rdopt.c  23
-rw-r--r--  libvpx/vp8/encoder/rdopt.h  3
-rw-r--r--  libvpx/vp8/encoder/temporal_filter.c  1
-rw-r--r--  libvpx/vp8/encoder/temporal_filter.h  26
-rw-r--r--  libvpx/vp8/encoder/x86/dct_sse2.asm  2
-rw-r--r--  libvpx/vp8/encoder/x86/encodeopt.asm  2
-rw-r--r--  libvpx/vp8/encoder/x86/fwalsh_sse2.asm  2
-rw-r--r--  libvpx/vp8/encoder/x86/quantize_mmx.asm  286
-rw-r--r--  libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm  4
-rw-r--r--  libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c  34
-rw-r--r--  libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c (renamed from libvpx/vp8/encoder/x86/quantize_ssse3.c)  1
-rw-r--r--  libvpx/vp8/vp8_common.mk  8
-rw-r--r--  libvpx/vp8/vp8_cx_iface.c  1
-rw-r--r--  libvpx/vp8/vp8_dx_iface.c  11
-rw-r--r--  libvpx/vp8/vp8cx.mk  13
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c  9
-rw-r--r--  libvpx/vp9/common/vp9_alloccommon.c  34
-rw-r--r--  libvpx/vp9/common/vp9_entropymode.c  4
-rw-r--r--  libvpx/vp9/common/vp9_loopfilter.c  8
-rw-r--r--  libvpx/vp9/common/vp9_loopfilter.h  4
-rw-r--r--  libvpx/vp9/common/vp9_onyxc_int.h  40
-rw-r--r--  libvpx/vp9/common/vp9_postproc.c  6
-rw-r--r--  libvpx/vp9/common/vp9_reconinter.h  10
-rw-r--r--  libvpx/vp9/common/vp9_rtcd_defs.pl  147
-rw-r--r--  libvpx/vp9/common/vp9_thread_common.c  7
-rw-r--r--  libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c  114
-rw-r--r--  libvpx/vp9/common/x86/vp9_mfqe_sse2.asm  2
-rw-r--r--  libvpx/vp9/decoder/vp9_decodeframe.c  82
-rw-r--r--  libvpx/vp9/decoder/vp9_decodemv.c  31
-rw-r--r--  libvpx/vp9/decoder/vp9_decoder.c  63
-rw-r--r--  libvpx/vp9/decoder/vp9_decoder.h  7
-rw-r--r--  libvpx/vp9/decoder/vp9_dthread.c  190
-rw-r--r--  libvpx/vp9/decoder/vp9_dthread.h  74
-rw-r--r--  libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c  843
-rw-r--r--  libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c  273
-rw-r--r--  libvpx/vp9/encoder/vp9_alt_ref_aq.c  2
-rw-r--r--  libvpx/vp9/encoder/vp9_alt_ref_aq.h  2
-rw-r--r--  libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c  3
-rw-r--r--  libvpx/vp9/encoder/vp9_bitstream.c  6
-rw-r--r--  libvpx/vp9/encoder/vp9_block.h  20
-rw-r--r--  libvpx/vp9/encoder/vp9_context_tree.h  1
-rw-r--r--  libvpx/vp9/encoder/vp9_denoiser.c  261
-rw-r--r--  libvpx/vp9/encoder/vp9_denoiser.h  28
-rw-r--r--  libvpx/vp9/encoder/vp9_encodeframe.c  420
-rw-r--r--  libvpx/vp9/encoder/vp9_encodemb.c  465
-rw-r--r--  libvpx/vp9/encoder/vp9_encoder.c  684
-rw-r--r--  libvpx/vp9/encoder/vp9_encoder.h  48
-rw-r--r--  libvpx/vp9/encoder/vp9_ethread.c  18
-rw-r--r--  libvpx/vp9/encoder/vp9_firstpass.c  338
-rw-r--r--  libvpx/vp9/encoder/vp9_frame_scale.c  97
-rw-r--r--  libvpx/vp9/encoder/vp9_mcomp.c  206
-rw-r--r--  libvpx/vp9/encoder/vp9_noise_estimate.c  91
-rw-r--r--  libvpx/vp9/encoder/vp9_pickmode.c  313
-rw-r--r--  libvpx/vp9/encoder/vp9_quantize.c  156
-rw-r--r--  libvpx/vp9/encoder/vp9_ratectrl.c  173
-rw-r--r--  libvpx/vp9/encoder/vp9_ratectrl.h  7
-rw-r--r--  libvpx/vp9/encoder/vp9_rd.c  32
-rw-r--r--  libvpx/vp9/encoder/vp9_rd.h  9
-rw-r--r--  libvpx/vp9/encoder/vp9_rdopt.c  108
-rw-r--r--  libvpx/vp9/encoder/vp9_skin_detection.c  231
-rw-r--r--  libvpx/vp9/encoder/vp9_skin_detection.h  13
-rw-r--r--  libvpx/vp9/encoder/vp9_speed_features.c  121
-rw-r--r--  libvpx/vp9/encoder/vp9_speed_features.h  18
-rw-r--r--  libvpx/vp9/encoder/vp9_svc_layercontext.c  50
-rw-r--r--  libvpx/vp9/encoder/vp9_svc_layercontext.h  14
-rw-r--r--  libvpx/vp9/encoder/vp9_temporal_filter.c  21
-rw-r--r--  libvpx/vp9/encoder/x86/temporal_filter_sse4.c  1
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c  92
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_dct_ssse3.c  2
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c  1
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_error_avx2.c  56
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c  1025
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_quantize_sse2.c  277
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm  25
-rw-r--r--  libvpx/vp9/vp9_cx_iface.c  53
-rw-r--r--  libvpx/vp9/vp9_dx_iface.c  594
-rw-r--r--  libvpx/vp9/vp9_dx_iface.h  25
-rw-r--r--  libvpx/vp9/vp9cx.mk  7
-rw-r--r--  libvpx/vp9/vp9dx.mk  2
-rw-r--r--  libvpx/vpx/src/svc_encodeframe.c  10
-rw-r--r--  libvpx/vpx/src/vpx_encoder.c  52
-rw-r--r--  libvpx/vpx/vp8cx.h  17
-rw-r--r--  libvpx/vpx/vp8dx.h  2
-rw-r--r--  libvpx/vpx/vpx_codec.h  49
-rw-r--r--  libvpx/vpx/vpx_encoder.h  35
-rw-r--r--  libvpx/vpx_dsp/add_noise.c  1
-rw-r--r--  libvpx/vpx_dsp/arm/avg_neon.c  69
-rw-r--r--  libvpx/vpx_dsp/arm/avg_pred_neon.c  55
-rw-r--r--  libvpx/vpx_dsp/arm/fdct16x16_neon.c  387
-rw-r--r--  libvpx/vpx_dsp/arm/fdct32x32_neon.c  1507
-rw-r--r--  libvpx/vpx_dsp/arm/fdct_neon.c  14
-rw-r--r--  libvpx/vpx_dsp/arm/fdct_partial_neon.c  113
-rw-r--r--  libvpx/vpx_dsp/arm/fwd_txfm_neon.c  85
-rw-r--r--  libvpx/vpx_dsp/arm/hadamard_neon.c  4
-rw-r--r--  libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c  8
-rw-r--r--  libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c  8
-rw-r--r--  libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c  8
-rw-r--r--  libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c  8
-rw-r--r--  libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c  224
-rw-r--r--  libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c  13
-rw-r--r--  libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c  13
-rw-r--r--  libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c  27
-rw-r--r--  libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c  3
-rw-r--r--  libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c  3
-rw-r--r--  libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c  3
-rw-r--r--  libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c  3
-rw-r--r--  libvpx/vpx_dsp/arm/idct_neon.h  4
-rw-r--r--  libvpx/vpx_dsp/arm/mem_neon.h  26
-rw-r--r--  libvpx/vpx_dsp/arm/quantize_neon.c  296
-rw-r--r--  libvpx/vpx_dsp/arm/sad4d_neon.c  394
-rw-r--r--  libvpx/vpx_dsp/arm/sad_neon.c  467
-rw-r--r--  libvpx/vpx_dsp/arm/subpel_variance_neon.c  133
-rw-r--r--  libvpx/vpx_dsp/arm/sum_neon.h  47
-rw-r--r--  libvpx/vpx_dsp/arm/variance_neon.c  34
-rw-r--r--  libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm  31
-rw-r--r--  libvpx/vpx_dsp/arm/vpx_convolve8_neon.c  273
-rw-r--r--  libvpx/vpx_dsp/arm/vpx_convolve8_neon.h  133
-rw-r--r--  libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm  31
-rw-r--r--  libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c  14
-rw-r--r--  libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm  2
-rw-r--r--  libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c  13
-rw-r--r--  libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm  2
-rw-r--r--  libvpx/vpx_dsp/arm/vpx_convolve_neon.c  28
-rw-r--r--  libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c  324
-rw-r--r--  libvpx/vpx_dsp/avg.c  6
-rw-r--r--  libvpx/vpx_dsp/deblock.c  1
-rw-r--r--  libvpx/vpx_dsp/fwd_txfm.c  2
-rw-r--r--  libvpx/vpx_dsp/intrapred.c  37
-rw-r--r--  libvpx/vpx_dsp/inv_txfm.c  575
-rw-r--r--  libvpx/vpx_dsp/mips/avg_msa.c  6
-rw-r--r--  libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c  11
-rw-r--r--  libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c  9
-rw-r--r--  libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c  11
-rw-r--r--  libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c  11
-rw-r--r--  libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c  41
-rw-r--r--  libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c  14
-rw-r--r--  libvpx/vpx_dsp/mips/convolve8_dspr2.c  21
-rw-r--r--  libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c  13
-rw-r--r--  libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c  13
-rw-r--r--  libvpx/vpx_dsp/mips/convolve_common_dspr2.h  22
-rw-r--r--  libvpx/vpx_dsp/mips/fwd_txfm_msa.c  29
-rw-r--r--  libvpx/vpx_dsp/mips/itrans4_dspr2.c  1
-rw-r--r--  libvpx/vpx_dsp/mips/loopfilter_16_msa.c  64
-rw-r--r--  libvpx/vpx_dsp/mips/loopfilter_4_msa.c  1
-rw-r--r--  libvpx/vpx_dsp/mips/loopfilter_8_msa.c  1
-rw-r--r--  libvpx/vpx_dsp/mips/macros_msa.h  511
-rw-r--r--  libvpx/vpx_dsp/mips/sad_mmi.c  805
-rw-r--r--  libvpx/vpx_dsp/mips/sad_msa.c  304
-rw-r--r--  libvpx/vpx_dsp/mips/subtract_mmi.c  306
-rw-r--r--  libvpx/vpx_dsp/mips/variance_mmi.c  1280
-rw-r--r--  libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c  121
-rw-r--r--  libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c  97
-rw-r--r--  libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c  80
-rw-r--r--  libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c  13
-rw-r--r--  libvpx/vpx_dsp/mips/vpx_convolve8_msa.c  614
-rw-r--r--  libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c  13
-rw-r--r--  libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c  14
-rw-r--r--  libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c  14
-rw-r--r--  libvpx/vpx_dsp/mips/vpx_convolve_msa.h  17
-rw-r--r--  libvpx/vpx_dsp/ppc/hadamard_vsx.c  4
-rw-r--r--  libvpx/vpx_dsp/ppc/inv_txfm_vsx.c  1063
-rw-r--r--  libvpx/vpx_dsp/ppc/sad_vsx.c  152
-rw-r--r--  libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c  114
-rw-r--r--  libvpx/vpx_dsp/quantize.c  221
-rw-r--r--  libvpx/vpx_dsp/sad.c  33
-rw-r--r--  libvpx/vpx_dsp/skin_detection.c  79
-rw-r--r--  libvpx/vpx_dsp/skin_detection.h  24
-rw-r--r--  libvpx/vpx_dsp/txfm_common.h  70
-rw-r--r--  libvpx/vpx_dsp/variance.c  19
-rw-r--r--  libvpx/vpx_dsp/variance.h  2
-rw-r--r--  libvpx/vpx_dsp/vpx_convolve.c  329
-rw-r--r--  libvpx/vpx_dsp/vpx_convolve.h  8
-rw-r--r--  libvpx/vpx_dsp/vpx_dsp.mk  41
-rw-r--r--  libvpx/vpx_dsp/vpx_dsp_common.h  4
-rw-r--r--  libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl  453
-rw-r--r--  libvpx/vpx_dsp/vpx_filter.h  11
-rw-r--r--  libvpx/vpx_dsp/x86/add_noise_sse2.asm  6
-rw-r--r--  libvpx/vpx_dsp/x86/avg_intrin_avx2.c  197
-rw-r--r--  libvpx/vpx_dsp/x86/avg_intrin_sse2.c  4
-rw-r--r--  libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h  16
-rw-r--r--  libvpx/vpx_dsp/x86/convolve.h  239
-rw-r--r--  libvpx/vpx_dsp/x86/convolve_avx2.h  105
-rw-r--r--  libvpx/vpx_dsp/x86/convolve_ssse3.h  112
-rw-r--r--  libvpx/vpx_dsp/x86/deblock_sse2.asm  2
-rw-r--r--  libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h  2
-rw-r--r--  libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h  2
-rw-r--r--  libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h  4
-rw-r--r--  libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm  60
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_convolve_avx2.c  34
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c  498
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c  349
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c  762
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c  765
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c  234
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c  69
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c  366
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c  210
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c  533
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c  930
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h  390
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h  87
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c  1
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c  149
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm  2
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm  2
-rw-r--r--  libvpx/vpx_dsp/x86/intrapred_sse2.asm  2
-rw-r--r--  libvpx/vpx_dsp/x86/inv_txfm_sse2.c  2874
-rw-r--r--  libvpx/vpx_dsp/x86/inv_txfm_sse2.h  874
-rw-r--r--  libvpx/vpx_dsp/x86/inv_txfm_ssse3.c  1541
-rw-r--r--  libvpx/vpx_dsp/x86/inv_txfm_ssse3.h  110
-rw-r--r--  libvpx/vpx_dsp/x86/mem_sse2.h  124
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_avx.c  315
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm  544
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_sse2.c  293
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_ssse3.c  292
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm  345
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_x86.h  78
-rw-r--r--  libvpx/vpx_dsp/x86/sad4d_avx512.c  83
-rw-r--r--  libvpx/vpx_dsp/x86/sad_sse3.asm  2
-rw-r--r--  libvpx/vpx_dsp/x86/sad_sse4.asm  2
-rw-r--r--  libvpx/vpx_dsp/x86/sad_ssse3.asm  2
-rw-r--r--  libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm  3
-rw-r--r--  libvpx/vpx_dsp/x86/transpose_sse2.h  372
-rw-r--r--  libvpx/vpx_dsp/x86/txfm_common_sse2.h  3
-rw-r--r--  libvpx/vpx_dsp/x86/variance_avx2.c  618
-rw-r--r--  libvpx/vpx_dsp/x86/variance_impl_avx2.c  708
-rw-r--r--  libvpx/vpx_dsp/x86/variance_sse2.c  2
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_asm_stubs.c  52
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm  8
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm  2
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm  2
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c  674
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c  777
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm  2
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm  24
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm  2
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm  2
-rw-r--r--  libvpx/vpx_mem/vpx_mem.c  9
-rw-r--r--  libvpx/vpx_mem/vpx_mem.h  9
-rw-r--r--  libvpx/vpx_ports/asmdefs_mmi.h  81
-rw-r--r--  libvpx/vpx_ports/vpx_ports.mk  4
-rw-r--r--  libvpx/vpx_ports/x86.h  25
-rw-r--r--  libvpx/vpx_scale/generic/yv12config.c  9
-rw-r--r--  libvpx/vpx_scale/generic/yv12extend.c  67
-rw-r--r--  libvpx/vpx_scale/vpx_scale_rtcd.pl  12
-rw-r--r--  libvpx/vpx_util/vpx_atomics.h  109
-rw-r--r--  libvpx/vpx_util/vpx_util.mk  3
-rw-r--r--  libvpx/vpx_util/vpx_write_yuv_frame.c  46
-rw-r--r--  libvpx/vpx_util/vpx_write_yuv_frame.h  27
-rw-r--r--  libvpx/vpxdec.c  56
-rw-r--r--  libvpx/vpxenc.c  97
-rw-r--r--  libvpx/y4minput.c  154
-rw-r--r--  libwebm/Android.bp  1
489 files changed, 44853 insertions, 18771 deletions
diff --git a/Add-visibility-protected-attribute-for-global-variab.patch b/Add-visibility-protected-attribute-for-global-variab.patch
new file mode 100644
index 000000000..f15a73caf
--- /dev/null
+++ b/Add-visibility-protected-attribute-for-global-variab.patch
@@ -0,0 +1,77 @@
+From 0d88e15454b632d92404dd6a7181c58d9985e2a2 Mon Sep 17 00:00:00 2001
+From: Rahul Chaudhry <rahulchaudhry@google.com>
+Date: Tue, 9 May 2017 12:00:58 -0700
+Subject: [PATCH] Add visibility="protected" attribute for global variables
+ referenced in asm files.
+
+During aosp builds with binutils-2.27, we're seeing linker error
+messages of this form:
+libvpx.a(subpixel_mmx.o): relocation R_386_GOTOFF against preemptible
+symbol vp8_bilinear_filters_x86_8 cannot be used when making a shared
+object
+
+subpixel_mmx.o is assembled from "vp8/common/x86/subpixel_mmx.asm".
+Other messages refer to symbol references from deblock_sse2.o and
+subpixel_sse2.o, also assembled from asm files.
+
+This change marks such symbols as having "protected" visibility. This
+satisfies the linker as the symbols are not preemptible from outside
+the shared library now, which I think is the original intent anyway.
+
+Change-Id: I2817f7a5f43041533d65ebf41aefd63f8581a452
+---
+ vp8/common/x86/filter_x86.c | 3 ++-
+ vpx_dsp/deblock.c | 4 ++--
+ vpx_ports/mem.h | 6 ++++++
+ 3 files changed, 10 insertions(+), 3 deletions(-)
+
+diff --git a/vp8/common/x86/filter_x86.c b/vp8/common/x86/filter_x86.c
+index 2405342f0..73435a7dd 100644
+--- a/vp8/common/x86/filter_x86.c
++++ b/vp8/common/x86/filter_x86.c
+@@ -17,7 +17,8 @@ DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) = {
+ { 32, 32, 32, 32, 96, 96, 96, 96 }, { 16, 16, 16, 16, 112, 112, 112, 112 }
+ };
+
+-DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) = {
++DECLARE_PROTECTED(DECLARE_ALIGNED(16, const short,
++ vp8_bilinear_filters_x86_8[8][16])) = {
+ { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
+ { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
+diff --git a/vpx_dsp/deblock.c b/vpx_dsp/deblock.c
+index a0db1e40c..3734ac251 100644
+--- a/vpx_dsp/deblock.c
++++ b/vpx_dsp/deblock.c
+@@ -10,9 +10,9 @@
+ #include <assert.h>
+ #include <stdlib.h>
+ #include "./vpx_dsp_rtcd.h"
+-#include "vpx/vpx_integer.h"
++#include "vpx_ports/mem.h"
+
+-const int16_t vpx_rv[] = {
++DECLARE_PROTECTED(const int16_t vpx_rv[]) = {
+ 8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3, 14,
+ 4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0,
+ 3, 14, 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8,
+diff --git a/vpx_ports/mem.h b/vpx_ports/mem.h
+index bfef783b1..35751cef8 100644
+--- a/vpx_ports/mem.h
++++ b/vpx_ports/mem.h
+@@ -23,6 +23,12 @@
+ #define DECLARE_ALIGNED(n, typ, val) typ val
+ #endif
+
++#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(_WIN32)
++#define DECLARE_PROTECTED(decl) decl __attribute__((visibility("protected")))
++#else
++#define DECLARE_PROTECTED(decl) decl
++#endif
++
+ #if HAVE_NEON && defined(_MSC_VER)
+ #define __builtin_prefetch(x)
+ #endif
+--
+2.15.1
+
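For context on the patch above: ELF symbols with "protected" visibility stay exported from the shared library, but references from inside the library are guaranteed to bind to the in-library definition, which makes the direct R_386_GOTOFF reference emitted for the asm files legal again. Below is a minimal standalone sketch of the mechanism, assuming a GCC or Clang build on a non-Windows ELF target; the DECLARE_PROTECTED macro is taken verbatim from the patch, while example_bilinear_filter is a hypothetical table invented for illustration:

    /* Same macro the patch adds to vpx_ports/mem.h. */
    #if defined(__GNUC__) && __GNUC__ >= 4 && !defined(_WIN32)
    #define DECLARE_PROTECTED(decl) decl __attribute__((visibility("protected")))
    #else
    #define DECLARE_PROTECTED(decl) decl
    #endif

    /* Hypothetical global table of the kind referenced from hand-written
     * asm.  Without the attribute, -fPIC treats the symbol as preemptible
     * and binutils-2.27 rejects the R_386_GOTOFF reference from the object
     * assembled out of the .asm file; with "protected" visibility the
     * in-library definition always wins, so the direct reference is fine. */
    DECLARE_PROTECTED(const short example_bilinear_filter[2][4]) = {
      { 128, 0, 128, 0 }, { 112, 16, 112, 16 }
    };

Compiled with -fPIC, readelf -s on the resulting object reports the symbol's visibility as PROTECTED rather than DEFAULT, which is exactly the property the linker check quoted in the commit message is enforcing.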
diff --git a/Android.bp b/Android.bp
index ad834ebcb..432246f14 100644
--- a/Android.bp
+++ b/Android.bp
@@ -2,7 +2,6 @@
// Generated from Android.bp.in, run ./generate_config.sh to regenerate
libvpx_arm_neon_c_srcs = [
- "libvpx/vp8/common/alloccommon.c",
"libvpx/vp8/common/arm/loopfilter_arm.c",
"libvpx/vp8/common/arm/neon/bilinearpredict_neon.c",
"libvpx/vp8/common/arm/neon/copymem_neon.c",
@@ -19,142 +18,34 @@ libvpx_arm_neon_c_srcs = [
"libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c",
"libvpx/vp8/common/arm/neon/sixtappredict_neon.c",
"libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c",
- "libvpx/vp8/common/blockd.c",
- "libvpx/vp8/common/copy_c.c",
- "libvpx/vp8/common/dequantize.c",
- "libvpx/vp8/common/entropy.c",
- "libvpx/vp8/common/entropymode.c",
- "libvpx/vp8/common/entropymv.c",
- "libvpx/vp8/common/extend.c",
- "libvpx/vp8/common/filter.c",
- "libvpx/vp8/common/findnearmv.c",
- "libvpx/vp8/common/generic/systemdependent.c",
- "libvpx/vp8/common/idct_blk.c",
- "libvpx/vp8/common/idctllm.c",
- "libvpx/vp8/common/loopfilter_filters.c",
- "libvpx/vp8/common/mbpitch.c",
- "libvpx/vp8/common/modecont.c",
- "libvpx/vp8/common/quant_common.c",
- "libvpx/vp8/common/reconinter.c",
- "libvpx/vp8/common/reconintra.c",
- "libvpx/vp8/common/reconintra4x4.c",
- "libvpx/vp8/common/rtcd.c",
- "libvpx/vp8/common/setupintrarecon.c",
- "libvpx/vp8/common/swapyv12buffer.c",
- "libvpx/vp8/common/treecoder.c",
- "libvpx/vp8/common/vp8_loopfilter.c",
- "libvpx/vp8/decoder/dboolhuff.c",
- "libvpx/vp8/decoder/decodeframe.c",
- "libvpx/vp8/decoder/decodemv.c",
- "libvpx/vp8/decoder/detokenize.c",
- "libvpx/vp8/decoder/onyxd_if.c",
- "libvpx/vp8/decoder/threading.c",
"libvpx/vp8/encoder/arm/neon/denoising_neon.c",
"libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c",
"libvpx/vp8/encoder/arm/neon/shortfdct_neon.c",
"libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c",
- "libvpx/vp8/encoder/bitstream.c",
- "libvpx/vp8/encoder/boolhuff.c",
- "libvpx/vp8/encoder/dct.c",
- "libvpx/vp8/encoder/denoising.c",
- "libvpx/vp8/encoder/encodeframe.c",
- "libvpx/vp8/encoder/encodeintra.c",
- "libvpx/vp8/encoder/encodemb.c",
- "libvpx/vp8/encoder/encodemv.c",
- "libvpx/vp8/encoder/ethreading.c",
- "libvpx/vp8/encoder/lookahead.c",
- "libvpx/vp8/encoder/mcomp.c",
- "libvpx/vp8/encoder/modecosts.c",
- "libvpx/vp8/encoder/onyx_if.c",
- "libvpx/vp8/encoder/pickinter.c",
- "libvpx/vp8/encoder/picklpf.c",
- "libvpx/vp8/encoder/ratectrl.c",
- "libvpx/vp8/encoder/rdopt.c",
- "libvpx/vp8/encoder/segmentation.c",
- "libvpx/vp8/encoder/tokenize.c",
- "libvpx/vp8/encoder/treewriter.c",
- "libvpx/vp8/encoder/vp8_quantize.c",
- "libvpx/vp8/vp8_cx_iface.c",
- "libvpx/vp8/vp8_dx_iface.c",
- "libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c",
- "libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c",
- "libvpx/vp9/common/vp9_alloccommon.c",
- "libvpx/vp9/common/vp9_blockd.c",
- "libvpx/vp9/common/vp9_common_data.c",
- "libvpx/vp9/common/vp9_entropy.c",
- "libvpx/vp9/common/vp9_entropymode.c",
- "libvpx/vp9/common/vp9_entropymv.c",
- "libvpx/vp9/common/vp9_filter.c",
- "libvpx/vp9/common/vp9_frame_buffers.c",
- "libvpx/vp9/common/vp9_idct.c",
- "libvpx/vp9/common/vp9_loopfilter.c",
- "libvpx/vp9/common/vp9_mvref_common.c",
- "libvpx/vp9/common/vp9_pred_common.c",
- "libvpx/vp9/common/vp9_quant_common.c",
- "libvpx/vp9/common/vp9_reconinter.c",
- "libvpx/vp9/common/vp9_reconintra.c",
- "libvpx/vp9/common/vp9_rtcd.c",
- "libvpx/vp9/common/vp9_scale.c",
- "libvpx/vp9/common/vp9_scan.c",
- "libvpx/vp9/common/vp9_seg_common.c",
- "libvpx/vp9/common/vp9_thread_common.c",
- "libvpx/vp9/common/vp9_tile_common.c",
- "libvpx/vp9/decoder/vp9_decodeframe.c",
- "libvpx/vp9/decoder/vp9_decodemv.c",
- "libvpx/vp9/decoder/vp9_decoder.c",
- "libvpx/vp9/decoder/vp9_detokenize.c",
- "libvpx/vp9/decoder/vp9_dsubexp.c",
- "libvpx/vp9/decoder/vp9_dthread.c",
"libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c",
- "libvpx/vp9/encoder/arm/neon/vp9_error_neon.c",
+ "libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c",
"libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c",
- "libvpx/vp9/encoder/vp9_alt_ref_aq.c",
- "libvpx/vp9/encoder/vp9_aq_360.c",
- "libvpx/vp9/encoder/vp9_aq_complexity.c",
- "libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c",
- "libvpx/vp9/encoder/vp9_aq_variance.c",
- "libvpx/vp9/encoder/vp9_bitstream.c",
- "libvpx/vp9/encoder/vp9_context_tree.c",
- "libvpx/vp9/encoder/vp9_cost.c",
- "libvpx/vp9/encoder/vp9_dct.c",
- "libvpx/vp9/encoder/vp9_encodeframe.c",
- "libvpx/vp9/encoder/vp9_encodemb.c",
- "libvpx/vp9/encoder/vp9_encodemv.c",
- "libvpx/vp9/encoder/vp9_encoder.c",
- "libvpx/vp9/encoder/vp9_ethread.c",
- "libvpx/vp9/encoder/vp9_extend.c",
- "libvpx/vp9/encoder/vp9_firstpass.c",
- "libvpx/vp9/encoder/vp9_frame_scale.c",
- "libvpx/vp9/encoder/vp9_lookahead.c",
- "libvpx/vp9/encoder/vp9_mbgraph.c",
- "libvpx/vp9/encoder/vp9_mcomp.c",
- "libvpx/vp9/encoder/vp9_multi_thread.c",
- "libvpx/vp9/encoder/vp9_noise_estimate.c",
- "libvpx/vp9/encoder/vp9_picklpf.c",
- "libvpx/vp9/encoder/vp9_pickmode.c",
- "libvpx/vp9/encoder/vp9_quantize.c",
- "libvpx/vp9/encoder/vp9_ratectrl.c",
- "libvpx/vp9/encoder/vp9_rd.c",
- "libvpx/vp9/encoder/vp9_rdopt.c",
- "libvpx/vp9/encoder/vp9_resize.c",
- "libvpx/vp9/encoder/vp9_segmentation.c",
- "libvpx/vp9/encoder/vp9_skin_detection.c",
- "libvpx/vp9/encoder/vp9_speed_features.c",
- "libvpx/vp9/encoder/vp9_subexp.c",
- "libvpx/vp9/encoder/vp9_svc_layercontext.c",
- "libvpx/vp9/encoder/vp9_temporal_filter.c",
- "libvpx/vp9/encoder/vp9_tokenize.c",
- "libvpx/vp9/encoder/vp9_treewriter.c",
- "libvpx/vp9/vp9_cx_iface.c",
- "libvpx/vp9/vp9_dx_iface.c",
- "libvpx/vpx/src/vpx_codec.c",
- "libvpx/vpx/src/vpx_decoder.c",
- "libvpx/vpx/src/vpx_encoder.c",
- "libvpx/vpx/src/vpx_image.c",
"libvpx/vpx_dsp/arm/avg_neon.c",
+ "libvpx/vpx_dsp/arm/avg_pred_neon.c",
+ "libvpx/vpx_dsp/arm/fdct16x16_neon.c",
+ "libvpx/vpx_dsp/arm/fdct32x32_neon.c",
"libvpx/vpx_dsp/arm/fdct_neon.c",
+ "libvpx/vpx_dsp/arm/fdct_partial_neon.c",
"libvpx/vpx_dsp/arm/fwd_txfm_neon.c",
"libvpx/vpx_dsp/arm/hadamard_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_intrapred_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c",
"libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c",
"libvpx/vpx_dsp/arm/idct16x16_add_neon.c",
"libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c",
@@ -164,41 +55,21 @@ libvpx_arm_neon_c_srcs = [
"libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c",
"libvpx/vpx_dsp/arm/idct8x8_add_neon.c",
"libvpx/vpx_dsp/arm/intrapred_neon.c",
+ "libvpx/vpx_dsp/arm/quantize_neon.c",
"libvpx/vpx_dsp/arm/sad4d_neon.c",
"libvpx/vpx_dsp/arm/sad_neon.c",
"libvpx/vpx_dsp/arm/subpel_variance_neon.c",
"libvpx/vpx_dsp/arm/subtract_neon.c",
"libvpx/vpx_dsp/arm/variance_neon.c",
"libvpx/vpx_dsp/arm/vpx_convolve_neon.c",
- "libvpx/vpx_dsp/avg.c",
- "libvpx/vpx_dsp/bitreader.c",
- "libvpx/vpx_dsp/bitreader_buffer.c",
- "libvpx/vpx_dsp/bitwriter.c",
- "libvpx/vpx_dsp/bitwriter_buffer.c",
- "libvpx/vpx_dsp/fwd_txfm.c",
- "libvpx/vpx_dsp/intrapred.c",
- "libvpx/vpx_dsp/inv_txfm.c",
- "libvpx/vpx_dsp/loopfilter.c",
- "libvpx/vpx_dsp/prob.c",
- "libvpx/vpx_dsp/psnr.c",
- "libvpx/vpx_dsp/quantize.c",
- "libvpx/vpx_dsp/sad.c",
- "libvpx/vpx_dsp/subtract.c",
- "libvpx/vpx_dsp/sum_squares.c",
- "libvpx/vpx_dsp/variance.c",
- "libvpx/vpx_dsp/vpx_convolve.c",
- "libvpx/vpx_dsp/vpx_dsp_rtcd.c",
- "libvpx/vpx_mem/vpx_mem.c",
- "libvpx/vpx_ports/arm_cpudetect.c",
- "libvpx/vpx_scale/generic/gen_scalers.c",
- "libvpx/vpx_scale/generic/vpx_scale.c",
- "libvpx/vpx_scale/generic/yv12config.c",
- "libvpx/vpx_scale/generic/yv12extend.c",
- "libvpx/vpx_scale/vpx_scale_rtcd.c",
- "libvpx/vpx_util/vpx_thread.c",
+ "libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c",
"config/arm-neon/vpx_config.c",
]
+libvpx_arm_neon_exclude_c_srcs = [
+ "config/arm/vpx_config.c",
+]
+
libvpx_arm_neon_asm_srcs = [
"libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm",
"libvpx/vpx_dsp/arm/idct4x4_add_neon.asm",
@@ -240,6 +111,7 @@ libvpx_arm_c_srcs = [
"libvpx/vp8/common/swapyv12buffer.c",
"libvpx/vp8/common/treecoder.c",
"libvpx/vp8/common/vp8_loopfilter.c",
+ "libvpx/vp8/common/vp8_skin_detection.c",
"libvpx/vp8/decoder/dboolhuff.c",
"libvpx/vp8/decoder/decodeframe.c",
"libvpx/vp8/decoder/decodemv.c",
@@ -295,7 +167,6 @@ libvpx_arm_c_srcs = [
"libvpx/vp9/decoder/vp9_decoder.c",
"libvpx/vp9/decoder/vp9_detokenize.c",
"libvpx/vp9/decoder/vp9_dsubexp.c",
- "libvpx/vp9/decoder/vp9_dthread.c",
"libvpx/vp9/encoder/vp9_alt_ref_aq.c",
"libvpx/vp9/encoder/vp9_aq_360.c",
"libvpx/vp9/encoder/vp9_aq_complexity.c",
@@ -311,10 +182,8 @@ libvpx_arm_c_srcs = [
"libvpx/vp9/encoder/vp9_encoder.c",
"libvpx/vp9/encoder/vp9_ethread.c",
"libvpx/vp9/encoder/vp9_extend.c",
- "libvpx/vp9/encoder/vp9_firstpass.c",
"libvpx/vp9/encoder/vp9_frame_scale.c",
"libvpx/vp9/encoder/vp9_lookahead.c",
- "libvpx/vp9/encoder/vp9_mbgraph.c",
"libvpx/vp9/encoder/vp9_mcomp.c",
"libvpx/vp9/encoder/vp9_multi_thread.c",
"libvpx/vp9/encoder/vp9_noise_estimate.c",
@@ -330,7 +199,6 @@ libvpx_arm_c_srcs = [
"libvpx/vp9/encoder/vp9_speed_features.c",
"libvpx/vp9/encoder/vp9_subexp.c",
"libvpx/vp9/encoder/vp9_svc_layercontext.c",
- "libvpx/vp9/encoder/vp9_temporal_filter.c",
"libvpx/vp9/encoder/vp9_tokenize.c",
"libvpx/vp9/encoder/vp9_treewriter.c",
"libvpx/vp9/vp9_cx_iface.c",
@@ -352,6 +220,7 @@ libvpx_arm_c_srcs = [
"libvpx/vpx_dsp/psnr.c",
"libvpx/vpx_dsp/quantize.c",
"libvpx/vpx_dsp/sad.c",
+ "libvpx/vpx_dsp/skin_detection.c",
"libvpx/vpx_dsp/subtract.c",
"libvpx/vpx_dsp/sum_squares.c",
"libvpx/vpx_dsp/variance.c",
@@ -365,6 +234,7 @@ libvpx_arm_c_srcs = [
"libvpx/vpx_scale/generic/yv12extend.c",
"libvpx/vpx_scale/vpx_scale_rtcd.c",
"libvpx/vpx_util/vpx_thread.c",
+ "libvpx/vpx_util/vpx_write_yuv_frame.c",
"config/arm/vpx_config.c",
]
@@ -410,6 +280,7 @@ libvpx_arm64_c_srcs = [
"libvpx/vp8/common/swapyv12buffer.c",
"libvpx/vp8/common/treecoder.c",
"libvpx/vp8/common/vp8_loopfilter.c",
+ "libvpx/vp8/common/vp8_skin_detection.c",
"libvpx/vp8/decoder/dboolhuff.c",
"libvpx/vp8/decoder/decodeframe.c",
"libvpx/vp8/decoder/decodemv.c",
@@ -443,8 +314,6 @@ libvpx_arm64_c_srcs = [
"libvpx/vp8/encoder/vp8_quantize.c",
"libvpx/vp8/vp8_cx_iface.c",
"libvpx/vp8/vp8_dx_iface.c",
- "libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c",
- "libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c",
"libvpx/vp9/common/vp9_alloccommon.c",
"libvpx/vp9/common/vp9_blockd.c",
"libvpx/vp9/common/vp9_common_data.c",
@@ -471,9 +340,8 @@ libvpx_arm64_c_srcs = [
"libvpx/vp9/decoder/vp9_decoder.c",
"libvpx/vp9/decoder/vp9_detokenize.c",
"libvpx/vp9/decoder/vp9_dsubexp.c",
- "libvpx/vp9/decoder/vp9_dthread.c",
"libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c",
- "libvpx/vp9/encoder/arm/neon/vp9_error_neon.c",
+ "libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c",
"libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c",
"libvpx/vp9/encoder/vp9_alt_ref_aq.c",
"libvpx/vp9/encoder/vp9_aq_360.c",
@@ -490,10 +358,8 @@ libvpx_arm64_c_srcs = [
"libvpx/vp9/encoder/vp9_encoder.c",
"libvpx/vp9/encoder/vp9_ethread.c",
"libvpx/vp9/encoder/vp9_extend.c",
- "libvpx/vp9/encoder/vp9_firstpass.c",
"libvpx/vp9/encoder/vp9_frame_scale.c",
"libvpx/vp9/encoder/vp9_lookahead.c",
- "libvpx/vp9/encoder/vp9_mbgraph.c",
"libvpx/vp9/encoder/vp9_mcomp.c",
"libvpx/vp9/encoder/vp9_multi_thread.c",
"libvpx/vp9/encoder/vp9_noise_estimate.c",
@@ -509,7 +375,6 @@ libvpx_arm64_c_srcs = [
"libvpx/vp9/encoder/vp9_speed_features.c",
"libvpx/vp9/encoder/vp9_subexp.c",
"libvpx/vp9/encoder/vp9_svc_layercontext.c",
- "libvpx/vp9/encoder/vp9_temporal_filter.c",
"libvpx/vp9/encoder/vp9_tokenize.c",
"libvpx/vp9/encoder/vp9_treewriter.c",
"libvpx/vp9/vp9_cx_iface.c",
@@ -519,9 +384,26 @@ libvpx_arm64_c_srcs = [
"libvpx/vpx/src/vpx_encoder.c",
"libvpx/vpx/src/vpx_image.c",
"libvpx/vpx_dsp/arm/avg_neon.c",
+ "libvpx/vpx_dsp/arm/avg_pred_neon.c",
+ "libvpx/vpx_dsp/arm/fdct16x16_neon.c",
+ "libvpx/vpx_dsp/arm/fdct32x32_neon.c",
"libvpx/vpx_dsp/arm/fdct_neon.c",
+ "libvpx/vpx_dsp/arm/fdct_partial_neon.c",
"libvpx/vpx_dsp/arm/fwd_txfm_neon.c",
"libvpx/vpx_dsp/arm/hadamard_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_intrapred_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c",
"libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c",
"libvpx/vpx_dsp/arm/idct16x16_add_neon.c",
"libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c",
@@ -534,6 +416,7 @@ libvpx_arm64_c_srcs = [
"libvpx/vpx_dsp/arm/idct8x8_add_neon.c",
"libvpx/vpx_dsp/arm/intrapred_neon.c",
"libvpx/vpx_dsp/arm/loopfilter_neon.c",
+ "libvpx/vpx_dsp/arm/quantize_neon.c",
"libvpx/vpx_dsp/arm/sad4d_neon.c",
"libvpx/vpx_dsp/arm/sad_neon.c",
"libvpx/vpx_dsp/arm/subpel_variance_neon.c",
@@ -543,6 +426,7 @@ libvpx_arm64_c_srcs = [
"libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c",
"libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c",
"libvpx/vpx_dsp/arm/vpx_convolve_neon.c",
+ "libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c",
"libvpx/vpx_dsp/avg.c",
"libvpx/vpx_dsp/bitreader.c",
"libvpx/vpx_dsp/bitreader_buffer.c",
@@ -556,6 +440,7 @@ libvpx_arm64_c_srcs = [
"libvpx/vpx_dsp/psnr.c",
"libvpx/vpx_dsp/quantize.c",
"libvpx/vpx_dsp/sad.c",
+ "libvpx/vpx_dsp/skin_detection.c",
"libvpx/vpx_dsp/subtract.c",
"libvpx/vpx_dsp/sum_squares.c",
"libvpx/vpx_dsp/variance.c",
@@ -569,6 +454,7 @@ libvpx_arm64_c_srcs = [
"libvpx/vpx_scale/generic/yv12extend.c",
"libvpx/vpx_scale/vpx_scale_rtcd.c",
"libvpx/vpx_util/vpx_thread.c",
+ "libvpx/vpx_util/vpx_write_yuv_frame.c",
"config/arm64/vpx_config.c",
]
@@ -598,6 +484,7 @@ libvpx_generic_c_srcs = [
"libvpx/vp8/common/swapyv12buffer.c",
"libvpx/vp8/common/treecoder.c",
"libvpx/vp8/common/vp8_loopfilter.c",
+ "libvpx/vp8/common/vp8_skin_detection.c",
"libvpx/vp8/decoder/dboolhuff.c",
"libvpx/vp8/decoder/decodeframe.c",
"libvpx/vp8/decoder/decodemv.c",
@@ -653,7 +540,6 @@ libvpx_generic_c_srcs = [
"libvpx/vp9/decoder/vp9_decoder.c",
"libvpx/vp9/decoder/vp9_detokenize.c",
"libvpx/vp9/decoder/vp9_dsubexp.c",
- "libvpx/vp9/decoder/vp9_dthread.c",
"libvpx/vp9/encoder/vp9_alt_ref_aq.c",
"libvpx/vp9/encoder/vp9_aq_360.c",
"libvpx/vp9/encoder/vp9_aq_complexity.c",
@@ -669,10 +555,8 @@ libvpx_generic_c_srcs = [
"libvpx/vp9/encoder/vp9_encoder.c",
"libvpx/vp9/encoder/vp9_ethread.c",
"libvpx/vp9/encoder/vp9_extend.c",
- "libvpx/vp9/encoder/vp9_firstpass.c",
"libvpx/vp9/encoder/vp9_frame_scale.c",
"libvpx/vp9/encoder/vp9_lookahead.c",
- "libvpx/vp9/encoder/vp9_mbgraph.c",
"libvpx/vp9/encoder/vp9_mcomp.c",
"libvpx/vp9/encoder/vp9_multi_thread.c",
"libvpx/vp9/encoder/vp9_noise_estimate.c",
@@ -688,7 +572,6 @@ libvpx_generic_c_srcs = [
"libvpx/vp9/encoder/vp9_speed_features.c",
"libvpx/vp9/encoder/vp9_subexp.c",
"libvpx/vp9/encoder/vp9_svc_layercontext.c",
- "libvpx/vp9/encoder/vp9_temporal_filter.c",
"libvpx/vp9/encoder/vp9_tokenize.c",
"libvpx/vp9/encoder/vp9_treewriter.c",
"libvpx/vp9/vp9_cx_iface.c",
@@ -710,6 +593,7 @@ libvpx_generic_c_srcs = [
"libvpx/vpx_dsp/psnr.c",
"libvpx/vpx_dsp/quantize.c",
"libvpx/vpx_dsp/sad.c",
+ "libvpx/vpx_dsp/skin_detection.c",
"libvpx/vpx_dsp/subtract.c",
"libvpx/vpx_dsp/sum_squares.c",
"libvpx/vpx_dsp/variance.c",
@@ -722,152 +606,17 @@ libvpx_generic_c_srcs = [
"libvpx/vpx_scale/generic/yv12extend.c",
"libvpx/vpx_scale/vpx_scale_rtcd.c",
"libvpx/vpx_util/vpx_thread.c",
+ "libvpx/vpx_util/vpx_write_yuv_frame.c",
"config/generic/vpx_config.c",
]
libvpx_mips32_dspr2_c_srcs = [
- "libvpx/vp8/common/alloccommon.c",
- "libvpx/vp8/common/blockd.c",
- "libvpx/vp8/common/copy_c.c",
- "libvpx/vp8/common/dequantize.c",
- "libvpx/vp8/common/entropy.c",
- "libvpx/vp8/common/entropymode.c",
- "libvpx/vp8/common/entropymv.c",
- "libvpx/vp8/common/extend.c",
- "libvpx/vp8/common/filter.c",
- "libvpx/vp8/common/findnearmv.c",
- "libvpx/vp8/common/generic/systemdependent.c",
- "libvpx/vp8/common/idct_blk.c",
- "libvpx/vp8/common/idctllm.c",
- "libvpx/vp8/common/loopfilter_filters.c",
- "libvpx/vp8/common/mbpitch.c",
"libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c",
"libvpx/vp8/common/mips/dspr2/filter_dspr2.c",
"libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c",
"libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c",
"libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c",
"libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c",
- "libvpx/vp8/common/modecont.c",
- "libvpx/vp8/common/quant_common.c",
- "libvpx/vp8/common/reconinter.c",
- "libvpx/vp8/common/reconintra.c",
- "libvpx/vp8/common/reconintra4x4.c",
- "libvpx/vp8/common/rtcd.c",
- "libvpx/vp8/common/setupintrarecon.c",
- "libvpx/vp8/common/swapyv12buffer.c",
- "libvpx/vp8/common/treecoder.c",
- "libvpx/vp8/common/vp8_loopfilter.c",
- "libvpx/vp8/decoder/dboolhuff.c",
- "libvpx/vp8/decoder/decodeframe.c",
- "libvpx/vp8/decoder/decodemv.c",
- "libvpx/vp8/decoder/detokenize.c",
- "libvpx/vp8/decoder/onyxd_if.c",
- "libvpx/vp8/decoder/threading.c",
- "libvpx/vp8/encoder/bitstream.c",
- "libvpx/vp8/encoder/boolhuff.c",
- "libvpx/vp8/encoder/dct.c",
- "libvpx/vp8/encoder/denoising.c",
- "libvpx/vp8/encoder/encodeframe.c",
- "libvpx/vp8/encoder/encodeintra.c",
- "libvpx/vp8/encoder/encodemb.c",
- "libvpx/vp8/encoder/encodemv.c",
- "libvpx/vp8/encoder/ethreading.c",
- "libvpx/vp8/encoder/lookahead.c",
- "libvpx/vp8/encoder/mcomp.c",
- "libvpx/vp8/encoder/modecosts.c",
- "libvpx/vp8/encoder/onyx_if.c",
- "libvpx/vp8/encoder/pickinter.c",
- "libvpx/vp8/encoder/picklpf.c",
- "libvpx/vp8/encoder/ratectrl.c",
- "libvpx/vp8/encoder/rdopt.c",
- "libvpx/vp8/encoder/segmentation.c",
- "libvpx/vp8/encoder/tokenize.c",
- "libvpx/vp8/encoder/treewriter.c",
- "libvpx/vp8/encoder/vp8_quantize.c",
- "libvpx/vp8/vp8_cx_iface.c",
- "libvpx/vp8/vp8_dx_iface.c",
- "libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c",
- "libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c",
- "libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c",
- "libvpx/vp9/common/vp9_alloccommon.c",
- "libvpx/vp9/common/vp9_blockd.c",
- "libvpx/vp9/common/vp9_common_data.c",
- "libvpx/vp9/common/vp9_entropy.c",
- "libvpx/vp9/common/vp9_entropymode.c",
- "libvpx/vp9/common/vp9_entropymv.c",
- "libvpx/vp9/common/vp9_filter.c",
- "libvpx/vp9/common/vp9_frame_buffers.c",
- "libvpx/vp9/common/vp9_idct.c",
- "libvpx/vp9/common/vp9_loopfilter.c",
- "libvpx/vp9/common/vp9_mvref_common.c",
- "libvpx/vp9/common/vp9_pred_common.c",
- "libvpx/vp9/common/vp9_quant_common.c",
- "libvpx/vp9/common/vp9_reconinter.c",
- "libvpx/vp9/common/vp9_reconintra.c",
- "libvpx/vp9/common/vp9_rtcd.c",
- "libvpx/vp9/common/vp9_scale.c",
- "libvpx/vp9/common/vp9_scan.c",
- "libvpx/vp9/common/vp9_seg_common.c",
- "libvpx/vp9/common/vp9_thread_common.c",
- "libvpx/vp9/common/vp9_tile_common.c",
- "libvpx/vp9/decoder/vp9_decodeframe.c",
- "libvpx/vp9/decoder/vp9_decodemv.c",
- "libvpx/vp9/decoder/vp9_decoder.c",
- "libvpx/vp9/decoder/vp9_detokenize.c",
- "libvpx/vp9/decoder/vp9_dsubexp.c",
- "libvpx/vp9/decoder/vp9_dthread.c",
- "libvpx/vp9/encoder/vp9_alt_ref_aq.c",
- "libvpx/vp9/encoder/vp9_aq_360.c",
- "libvpx/vp9/encoder/vp9_aq_complexity.c",
- "libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c",
- "libvpx/vp9/encoder/vp9_aq_variance.c",
- "libvpx/vp9/encoder/vp9_bitstream.c",
- "libvpx/vp9/encoder/vp9_context_tree.c",
- "libvpx/vp9/encoder/vp9_cost.c",
- "libvpx/vp9/encoder/vp9_dct.c",
- "libvpx/vp9/encoder/vp9_encodeframe.c",
- "libvpx/vp9/encoder/vp9_encodemb.c",
- "libvpx/vp9/encoder/vp9_encodemv.c",
- "libvpx/vp9/encoder/vp9_encoder.c",
- "libvpx/vp9/encoder/vp9_ethread.c",
- "libvpx/vp9/encoder/vp9_extend.c",
- "libvpx/vp9/encoder/vp9_firstpass.c",
- "libvpx/vp9/encoder/vp9_frame_scale.c",
- "libvpx/vp9/encoder/vp9_lookahead.c",
- "libvpx/vp9/encoder/vp9_mbgraph.c",
- "libvpx/vp9/encoder/vp9_mcomp.c",
- "libvpx/vp9/encoder/vp9_multi_thread.c",
- "libvpx/vp9/encoder/vp9_noise_estimate.c",
- "libvpx/vp9/encoder/vp9_picklpf.c",
- "libvpx/vp9/encoder/vp9_pickmode.c",
- "libvpx/vp9/encoder/vp9_quantize.c",
- "libvpx/vp9/encoder/vp9_ratectrl.c",
- "libvpx/vp9/encoder/vp9_rd.c",
- "libvpx/vp9/encoder/vp9_rdopt.c",
- "libvpx/vp9/encoder/vp9_resize.c",
- "libvpx/vp9/encoder/vp9_segmentation.c",
- "libvpx/vp9/encoder/vp9_skin_detection.c",
- "libvpx/vp9/encoder/vp9_speed_features.c",
- "libvpx/vp9/encoder/vp9_subexp.c",
- "libvpx/vp9/encoder/vp9_svc_layercontext.c",
- "libvpx/vp9/encoder/vp9_temporal_filter.c",
- "libvpx/vp9/encoder/vp9_tokenize.c",
- "libvpx/vp9/encoder/vp9_treewriter.c",
- "libvpx/vp9/vp9_cx_iface.c",
- "libvpx/vp9/vp9_dx_iface.c",
- "libvpx/vpx/src/vpx_codec.c",
- "libvpx/vpx/src/vpx_decoder.c",
- "libvpx/vpx/src/vpx_encoder.c",
- "libvpx/vpx/src/vpx_image.c",
- "libvpx/vpx_dsp/avg.c",
- "libvpx/vpx_dsp/bitreader.c",
- "libvpx/vpx_dsp/bitreader_buffer.c",
- "libvpx/vpx_dsp/bitwriter.c",
- "libvpx/vpx_dsp/bitwriter_buffer.c",
- "libvpx/vpx_dsp/fwd_txfm.c",
- "libvpx/vpx_dsp/intrapred.c",
- "libvpx/vpx_dsp/inv_txfm.c",
- "libvpx/vpx_dsp/loopfilter.c",
"libvpx/vpx_dsp/mips/common_dspr2.c",
"libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c",
"libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c",
@@ -882,192 +631,38 @@ libvpx_mips32_dspr2_c_srcs = [
"libvpx/vpx_dsp/mips/intrapred16_dspr2.c",
"libvpx/vpx_dsp/mips/intrapred4_dspr2.c",
"libvpx/vpx_dsp/mips/intrapred8_dspr2.c",
- "libvpx/vpx_dsp/mips/itrans16_dspr2.c",
- "libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c",
- "libvpx/vpx_dsp/mips/itrans32_dspr2.c",
- "libvpx/vpx_dsp/mips/itrans4_dspr2.c",
- "libvpx/vpx_dsp/mips/itrans8_dspr2.c",
"libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.c",
"libvpx/vpx_dsp/mips/loopfilter_mb_dspr2.c",
"libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c",
"libvpx/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c",
- "libvpx/vpx_dsp/prob.c",
- "libvpx/vpx_dsp/psnr.c",
- "libvpx/vpx_dsp/quantize.c",
- "libvpx/vpx_dsp/sad.c",
- "libvpx/vpx_dsp/subtract.c",
- "libvpx/vpx_dsp/sum_squares.c",
- "libvpx/vpx_dsp/variance.c",
- "libvpx/vpx_dsp/vpx_convolve.c",
- "libvpx/vpx_dsp/vpx_dsp_rtcd.c",
- "libvpx/vpx_mem/vpx_mem.c",
- "libvpx/vpx_scale/generic/gen_scalers.c",
- "libvpx/vpx_scale/generic/vpx_scale.c",
- "libvpx/vpx_scale/generic/yv12config.c",
- "libvpx/vpx_scale/generic/yv12extend.c",
"libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c",
- "libvpx/vpx_scale/vpx_scale_rtcd.c",
- "libvpx/vpx_util/vpx_thread.c",
"config/mips32-dspr2/vpx_config.c",
]
+libvpx_mips32_dspr2_exclude_c_srcs = [
+ "config/mips32/vpx_config.c",
+]
+
libvpx_mips32_msa_c_srcs = [
- "libvpx/vp8/common/alloccommon.c",
- "libvpx/vp8/common/blockd.c",
- "libvpx/vp8/common/copy_c.c",
- "libvpx/vp8/common/dequantize.c",
- "libvpx/vp8/common/entropy.c",
- "libvpx/vp8/common/entropymode.c",
- "libvpx/vp8/common/entropymv.c",
- "libvpx/vp8/common/extend.c",
- "libvpx/vp8/common/filter.c",
- "libvpx/vp8/common/findnearmv.c",
- "libvpx/vp8/common/generic/systemdependent.c",
- "libvpx/vp8/common/idct_blk.c",
- "libvpx/vp8/common/idctllm.c",
- "libvpx/vp8/common/loopfilter_filters.c",
- "libvpx/vp8/common/mbpitch.c",
"libvpx/vp8/common/mips/msa/bilinear_filter_msa.c",
"libvpx/vp8/common/mips/msa/copymem_msa.c",
"libvpx/vp8/common/mips/msa/idct_msa.c",
"libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c",
"libvpx/vp8/common/mips/msa/sixtap_filter_msa.c",
- "libvpx/vp8/common/modecont.c",
- "libvpx/vp8/common/quant_common.c",
- "libvpx/vp8/common/reconinter.c",
- "libvpx/vp8/common/reconintra.c",
- "libvpx/vp8/common/reconintra4x4.c",
- "libvpx/vp8/common/rtcd.c",
- "libvpx/vp8/common/setupintrarecon.c",
- "libvpx/vp8/common/swapyv12buffer.c",
- "libvpx/vp8/common/treecoder.c",
- "libvpx/vp8/common/vp8_loopfilter.c",
- "libvpx/vp8/decoder/dboolhuff.c",
- "libvpx/vp8/decoder/decodeframe.c",
- "libvpx/vp8/decoder/decodemv.c",
- "libvpx/vp8/decoder/detokenize.c",
- "libvpx/vp8/decoder/onyxd_if.c",
- "libvpx/vp8/decoder/threading.c",
- "libvpx/vp8/encoder/bitstream.c",
- "libvpx/vp8/encoder/boolhuff.c",
- "libvpx/vp8/encoder/dct.c",
- "libvpx/vp8/encoder/denoising.c",
- "libvpx/vp8/encoder/encodeframe.c",
- "libvpx/vp8/encoder/encodeintra.c",
- "libvpx/vp8/encoder/encodemb.c",
- "libvpx/vp8/encoder/encodemv.c",
- "libvpx/vp8/encoder/ethreading.c",
- "libvpx/vp8/encoder/lookahead.c",
- "libvpx/vp8/encoder/mcomp.c",
"libvpx/vp8/encoder/mips/msa/dct_msa.c",
"libvpx/vp8/encoder/mips/msa/denoising_msa.c",
"libvpx/vp8/encoder/mips/msa/encodeopt_msa.c",
"libvpx/vp8/encoder/mips/msa/quantize_msa.c",
- "libvpx/vp8/encoder/modecosts.c",
- "libvpx/vp8/encoder/onyx_if.c",
- "libvpx/vp8/encoder/pickinter.c",
- "libvpx/vp8/encoder/picklpf.c",
- "libvpx/vp8/encoder/ratectrl.c",
- "libvpx/vp8/encoder/rdopt.c",
- "libvpx/vp8/encoder/segmentation.c",
- "libvpx/vp8/encoder/tokenize.c",
- "libvpx/vp8/encoder/treewriter.c",
- "libvpx/vp8/encoder/vp8_quantize.c",
- "libvpx/vp8/vp8_cx_iface.c",
- "libvpx/vp8/vp8_dx_iface.c",
"libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c",
"libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c",
"libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c",
- "libvpx/vp9/common/vp9_alloccommon.c",
- "libvpx/vp9/common/vp9_blockd.c",
- "libvpx/vp9/common/vp9_common_data.c",
- "libvpx/vp9/common/vp9_entropy.c",
- "libvpx/vp9/common/vp9_entropymode.c",
- "libvpx/vp9/common/vp9_entropymv.c",
- "libvpx/vp9/common/vp9_filter.c",
- "libvpx/vp9/common/vp9_frame_buffers.c",
- "libvpx/vp9/common/vp9_idct.c",
- "libvpx/vp9/common/vp9_loopfilter.c",
- "libvpx/vp9/common/vp9_mvref_common.c",
- "libvpx/vp9/common/vp9_pred_common.c",
- "libvpx/vp9/common/vp9_quant_common.c",
- "libvpx/vp9/common/vp9_reconinter.c",
- "libvpx/vp9/common/vp9_reconintra.c",
- "libvpx/vp9/common/vp9_rtcd.c",
- "libvpx/vp9/common/vp9_scale.c",
- "libvpx/vp9/common/vp9_scan.c",
- "libvpx/vp9/common/vp9_seg_common.c",
- "libvpx/vp9/common/vp9_thread_common.c",
- "libvpx/vp9/common/vp9_tile_common.c",
- "libvpx/vp9/decoder/vp9_decodeframe.c",
- "libvpx/vp9/decoder/vp9_decodemv.c",
- "libvpx/vp9/decoder/vp9_decoder.c",
- "libvpx/vp9/decoder/vp9_detokenize.c",
- "libvpx/vp9/decoder/vp9_dsubexp.c",
- "libvpx/vp9/decoder/vp9_dthread.c",
"libvpx/vp9/encoder/mips/msa/vp9_error_msa.c",
"libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c",
"libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c",
"libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c",
- "libvpx/vp9/encoder/vp9_alt_ref_aq.c",
- "libvpx/vp9/encoder/vp9_aq_360.c",
- "libvpx/vp9/encoder/vp9_aq_complexity.c",
- "libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c",
- "libvpx/vp9/encoder/vp9_aq_variance.c",
- "libvpx/vp9/encoder/vp9_bitstream.c",
- "libvpx/vp9/encoder/vp9_context_tree.c",
- "libvpx/vp9/encoder/vp9_cost.c",
- "libvpx/vp9/encoder/vp9_dct.c",
- "libvpx/vp9/encoder/vp9_encodeframe.c",
- "libvpx/vp9/encoder/vp9_encodemb.c",
- "libvpx/vp9/encoder/vp9_encodemv.c",
- "libvpx/vp9/encoder/vp9_encoder.c",
- "libvpx/vp9/encoder/vp9_ethread.c",
- "libvpx/vp9/encoder/vp9_extend.c",
- "libvpx/vp9/encoder/vp9_firstpass.c",
- "libvpx/vp9/encoder/vp9_frame_scale.c",
- "libvpx/vp9/encoder/vp9_lookahead.c",
- "libvpx/vp9/encoder/vp9_mbgraph.c",
- "libvpx/vp9/encoder/vp9_mcomp.c",
- "libvpx/vp9/encoder/vp9_multi_thread.c",
- "libvpx/vp9/encoder/vp9_noise_estimate.c",
- "libvpx/vp9/encoder/vp9_picklpf.c",
- "libvpx/vp9/encoder/vp9_pickmode.c",
- "libvpx/vp9/encoder/vp9_quantize.c",
- "libvpx/vp9/encoder/vp9_ratectrl.c",
- "libvpx/vp9/encoder/vp9_rd.c",
- "libvpx/vp9/encoder/vp9_rdopt.c",
- "libvpx/vp9/encoder/vp9_resize.c",
- "libvpx/vp9/encoder/vp9_segmentation.c",
- "libvpx/vp9/encoder/vp9_skin_detection.c",
- "libvpx/vp9/encoder/vp9_speed_features.c",
- "libvpx/vp9/encoder/vp9_subexp.c",
- "libvpx/vp9/encoder/vp9_svc_layercontext.c",
- "libvpx/vp9/encoder/vp9_temporal_filter.c",
- "libvpx/vp9/encoder/vp9_tokenize.c",
- "libvpx/vp9/encoder/vp9_treewriter.c",
- "libvpx/vp9/vp9_cx_iface.c",
- "libvpx/vp9/vp9_dx_iface.c",
- "libvpx/vpx/src/vpx_codec.c",
- "libvpx/vpx/src/vpx_decoder.c",
- "libvpx/vpx/src/vpx_encoder.c",
- "libvpx/vpx/src/vpx_image.c",
- "libvpx/vpx_dsp/avg.c",
- "libvpx/vpx_dsp/bitreader.c",
- "libvpx/vpx_dsp/bitreader_buffer.c",
- "libvpx/vpx_dsp/bitwriter.c",
- "libvpx/vpx_dsp/bitwriter_buffer.c",
- "libvpx/vpx_dsp/fwd_txfm.c",
- "libvpx/vpx_dsp/intrapred.c",
- "libvpx/vpx_dsp/inv_txfm.c",
- "libvpx/vpx_dsp/loopfilter.c",
"libvpx/vpx_dsp/mips/avg_msa.c",
"libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c",
"libvpx/vpx_dsp/mips/fwd_txfm_msa.c",
- "libvpx/vpx_dsp/mips/idct16x16_msa.c",
- "libvpx/vpx_dsp/mips/idct32x32_msa.c",
- "libvpx/vpx_dsp/mips/idct4x4_msa.c",
- "libvpx/vpx_dsp/mips/idct8x8_msa.c",
"libvpx/vpx_dsp/mips/intrapred_msa.c",
"libvpx/vpx_dsp/mips/loopfilter_16_msa.c",
"libvpx/vpx_dsp/mips/loopfilter_4_msa.c",
@@ -1085,25 +680,13 @@ libvpx_mips32_msa_c_srcs = [
"libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c",
"libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c",
"libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c",
- "libvpx/vpx_dsp/prob.c",
- "libvpx/vpx_dsp/psnr.c",
- "libvpx/vpx_dsp/quantize.c",
- "libvpx/vpx_dsp/sad.c",
- "libvpx/vpx_dsp/subtract.c",
- "libvpx/vpx_dsp/sum_squares.c",
- "libvpx/vpx_dsp/variance.c",
- "libvpx/vpx_dsp/vpx_convolve.c",
- "libvpx/vpx_dsp/vpx_dsp_rtcd.c",
- "libvpx/vpx_mem/vpx_mem.c",
- "libvpx/vpx_scale/generic/gen_scalers.c",
- "libvpx/vpx_scale/generic/vpx_scale.c",
- "libvpx/vpx_scale/generic/yv12config.c",
- "libvpx/vpx_scale/generic/yv12extend.c",
- "libvpx/vpx_scale/vpx_scale_rtcd.c",
- "libvpx/vpx_util/vpx_thread.c",
"config/mips32-msa/vpx_config.c",
]
+libvpx_mips32_msa_exclude_c_srcs = [
+ "config/mips32/vpx_config.c",
+]
+
libvpx_mips32_c_srcs = [
"libvpx/vp8/common/alloccommon.c",
"libvpx/vp8/common/blockd.c",
@@ -1130,6 +713,7 @@ libvpx_mips32_c_srcs = [
"libvpx/vp8/common/swapyv12buffer.c",
"libvpx/vp8/common/treecoder.c",
"libvpx/vp8/common/vp8_loopfilter.c",
+ "libvpx/vp8/common/vp8_skin_detection.c",
"libvpx/vp8/decoder/dboolhuff.c",
"libvpx/vp8/decoder/decodeframe.c",
"libvpx/vp8/decoder/decodemv.c",
@@ -1185,7 +769,6 @@ libvpx_mips32_c_srcs = [
"libvpx/vp9/decoder/vp9_decoder.c",
"libvpx/vp9/decoder/vp9_detokenize.c",
"libvpx/vp9/decoder/vp9_dsubexp.c",
- "libvpx/vp9/decoder/vp9_dthread.c",
"libvpx/vp9/encoder/vp9_alt_ref_aq.c",
"libvpx/vp9/encoder/vp9_aq_360.c",
"libvpx/vp9/encoder/vp9_aq_complexity.c",
@@ -1201,10 +784,8 @@ libvpx_mips32_c_srcs = [
"libvpx/vp9/encoder/vp9_encoder.c",
"libvpx/vp9/encoder/vp9_ethread.c",
"libvpx/vp9/encoder/vp9_extend.c",
- "libvpx/vp9/encoder/vp9_firstpass.c",
"libvpx/vp9/encoder/vp9_frame_scale.c",
"libvpx/vp9/encoder/vp9_lookahead.c",
- "libvpx/vp9/encoder/vp9_mbgraph.c",
"libvpx/vp9/encoder/vp9_mcomp.c",
"libvpx/vp9/encoder/vp9_multi_thread.c",
"libvpx/vp9/encoder/vp9_noise_estimate.c",
@@ -1220,7 +801,6 @@ libvpx_mips32_c_srcs = [
"libvpx/vp9/encoder/vp9_speed_features.c",
"libvpx/vp9/encoder/vp9_subexp.c",
"libvpx/vp9/encoder/vp9_svc_layercontext.c",
- "libvpx/vp9/encoder/vp9_temporal_filter.c",
"libvpx/vp9/encoder/vp9_tokenize.c",
"libvpx/vp9/encoder/vp9_treewriter.c",
"libvpx/vp9/vp9_cx_iface.c",
@@ -1242,6 +822,7 @@ libvpx_mips32_c_srcs = [
"libvpx/vpx_dsp/psnr.c",
"libvpx/vpx_dsp/quantize.c",
"libvpx/vpx_dsp/sad.c",
+ "libvpx/vpx_dsp/skin_detection.c",
"libvpx/vpx_dsp/subtract.c",
"libvpx/vpx_dsp/sum_squares.c",
"libvpx/vpx_dsp/variance.c",
@@ -1254,166 +835,30 @@ libvpx_mips32_c_srcs = [
"libvpx/vpx_scale/generic/yv12extend.c",
"libvpx/vpx_scale/vpx_scale_rtcd.c",
"libvpx/vpx_util/vpx_thread.c",
+ "libvpx/vpx_util/vpx_write_yuv_frame.c",
"config/mips32/vpx_config.c",
]
libvpx_mips64_msa_c_srcs = [
- "libvpx/vp8/common/alloccommon.c",
- "libvpx/vp8/common/blockd.c",
- "libvpx/vp8/common/copy_c.c",
- "libvpx/vp8/common/dequantize.c",
- "libvpx/vp8/common/entropy.c",
- "libvpx/vp8/common/entropymode.c",
- "libvpx/vp8/common/entropymv.c",
- "libvpx/vp8/common/extend.c",
- "libvpx/vp8/common/filter.c",
- "libvpx/vp8/common/findnearmv.c",
- "libvpx/vp8/common/generic/systemdependent.c",
- "libvpx/vp8/common/idct_blk.c",
- "libvpx/vp8/common/idctllm.c",
- "libvpx/vp8/common/loopfilter_filters.c",
- "libvpx/vp8/common/mbpitch.c",
"libvpx/vp8/common/mips/msa/bilinear_filter_msa.c",
"libvpx/vp8/common/mips/msa/copymem_msa.c",
"libvpx/vp8/common/mips/msa/idct_msa.c",
"libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c",
"libvpx/vp8/common/mips/msa/sixtap_filter_msa.c",
- "libvpx/vp8/common/modecont.c",
- "libvpx/vp8/common/quant_common.c",
- "libvpx/vp8/common/reconinter.c",
- "libvpx/vp8/common/reconintra.c",
- "libvpx/vp8/common/reconintra4x4.c",
- "libvpx/vp8/common/rtcd.c",
- "libvpx/vp8/common/setupintrarecon.c",
- "libvpx/vp8/common/swapyv12buffer.c",
- "libvpx/vp8/common/treecoder.c",
- "libvpx/vp8/common/vp8_loopfilter.c",
- "libvpx/vp8/decoder/dboolhuff.c",
- "libvpx/vp8/decoder/decodeframe.c",
- "libvpx/vp8/decoder/decodemv.c",
- "libvpx/vp8/decoder/detokenize.c",
- "libvpx/vp8/decoder/onyxd_if.c",
- "libvpx/vp8/decoder/threading.c",
- "libvpx/vp8/encoder/bitstream.c",
- "libvpx/vp8/encoder/boolhuff.c",
- "libvpx/vp8/encoder/dct.c",
- "libvpx/vp8/encoder/denoising.c",
- "libvpx/vp8/encoder/encodeframe.c",
- "libvpx/vp8/encoder/encodeintra.c",
- "libvpx/vp8/encoder/encodemb.c",
- "libvpx/vp8/encoder/encodemv.c",
- "libvpx/vp8/encoder/ethreading.c",
- "libvpx/vp8/encoder/lookahead.c",
- "libvpx/vp8/encoder/mcomp.c",
"libvpx/vp8/encoder/mips/msa/dct_msa.c",
"libvpx/vp8/encoder/mips/msa/denoising_msa.c",
"libvpx/vp8/encoder/mips/msa/encodeopt_msa.c",
"libvpx/vp8/encoder/mips/msa/quantize_msa.c",
- "libvpx/vp8/encoder/modecosts.c",
- "libvpx/vp8/encoder/onyx_if.c",
- "libvpx/vp8/encoder/pickinter.c",
- "libvpx/vp8/encoder/picklpf.c",
- "libvpx/vp8/encoder/ratectrl.c",
- "libvpx/vp8/encoder/rdopt.c",
- "libvpx/vp8/encoder/segmentation.c",
- "libvpx/vp8/encoder/tokenize.c",
- "libvpx/vp8/encoder/treewriter.c",
- "libvpx/vp8/encoder/vp8_quantize.c",
- "libvpx/vp8/vp8_cx_iface.c",
- "libvpx/vp8/vp8_dx_iface.c",
"libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c",
"libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c",
"libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c",
- "libvpx/vp9/common/vp9_alloccommon.c",
- "libvpx/vp9/common/vp9_blockd.c",
- "libvpx/vp9/common/vp9_common_data.c",
- "libvpx/vp9/common/vp9_entropy.c",
- "libvpx/vp9/common/vp9_entropymode.c",
- "libvpx/vp9/common/vp9_entropymv.c",
- "libvpx/vp9/common/vp9_filter.c",
- "libvpx/vp9/common/vp9_frame_buffers.c",
- "libvpx/vp9/common/vp9_idct.c",
- "libvpx/vp9/common/vp9_loopfilter.c",
- "libvpx/vp9/common/vp9_mvref_common.c",
- "libvpx/vp9/common/vp9_pred_common.c",
- "libvpx/vp9/common/vp9_quant_common.c",
- "libvpx/vp9/common/vp9_reconinter.c",
- "libvpx/vp9/common/vp9_reconintra.c",
- "libvpx/vp9/common/vp9_rtcd.c",
- "libvpx/vp9/common/vp9_scale.c",
- "libvpx/vp9/common/vp9_scan.c",
- "libvpx/vp9/common/vp9_seg_common.c",
- "libvpx/vp9/common/vp9_thread_common.c",
- "libvpx/vp9/common/vp9_tile_common.c",
- "libvpx/vp9/decoder/vp9_decodeframe.c",
- "libvpx/vp9/decoder/vp9_decodemv.c",
- "libvpx/vp9/decoder/vp9_decoder.c",
- "libvpx/vp9/decoder/vp9_detokenize.c",
- "libvpx/vp9/decoder/vp9_dsubexp.c",
- "libvpx/vp9/decoder/vp9_dthread.c",
"libvpx/vp9/encoder/mips/msa/vp9_error_msa.c",
"libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c",
"libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c",
"libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c",
- "libvpx/vp9/encoder/vp9_alt_ref_aq.c",
- "libvpx/vp9/encoder/vp9_aq_360.c",
- "libvpx/vp9/encoder/vp9_aq_complexity.c",
- "libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c",
- "libvpx/vp9/encoder/vp9_aq_variance.c",
- "libvpx/vp9/encoder/vp9_bitstream.c",
- "libvpx/vp9/encoder/vp9_context_tree.c",
- "libvpx/vp9/encoder/vp9_cost.c",
- "libvpx/vp9/encoder/vp9_dct.c",
- "libvpx/vp9/encoder/vp9_encodeframe.c",
- "libvpx/vp9/encoder/vp9_encodemb.c",
- "libvpx/vp9/encoder/vp9_encodemv.c",
- "libvpx/vp9/encoder/vp9_encoder.c",
- "libvpx/vp9/encoder/vp9_ethread.c",
- "libvpx/vp9/encoder/vp9_extend.c",
- "libvpx/vp9/encoder/vp9_firstpass.c",
- "libvpx/vp9/encoder/vp9_frame_scale.c",
- "libvpx/vp9/encoder/vp9_lookahead.c",
- "libvpx/vp9/encoder/vp9_mbgraph.c",
- "libvpx/vp9/encoder/vp9_mcomp.c",
- "libvpx/vp9/encoder/vp9_multi_thread.c",
- "libvpx/vp9/encoder/vp9_noise_estimate.c",
- "libvpx/vp9/encoder/vp9_picklpf.c",
- "libvpx/vp9/encoder/vp9_pickmode.c",
- "libvpx/vp9/encoder/vp9_quantize.c",
- "libvpx/vp9/encoder/vp9_ratectrl.c",
- "libvpx/vp9/encoder/vp9_rd.c",
- "libvpx/vp9/encoder/vp9_rdopt.c",
- "libvpx/vp9/encoder/vp9_resize.c",
- "libvpx/vp9/encoder/vp9_segmentation.c",
- "libvpx/vp9/encoder/vp9_skin_detection.c",
- "libvpx/vp9/encoder/vp9_speed_features.c",
- "libvpx/vp9/encoder/vp9_subexp.c",
- "libvpx/vp9/encoder/vp9_svc_layercontext.c",
- "libvpx/vp9/encoder/vp9_temporal_filter.c",
- "libvpx/vp9/encoder/vp9_tokenize.c",
- "libvpx/vp9/encoder/vp9_treewriter.c",
- "libvpx/vp9/vp9_cx_iface.c",
- "libvpx/vp9/vp9_dx_iface.c",
- "libvpx/vpx/src/vpx_codec.c",
- "libvpx/vpx/src/vpx_decoder.c",
- "libvpx/vpx/src/vpx_encoder.c",
- "libvpx/vpx/src/vpx_image.c",
- "libvpx/vpx_dsp/avg.c",
- "libvpx/vpx_dsp/bitreader.c",
- "libvpx/vpx_dsp/bitreader_buffer.c",
- "libvpx/vpx_dsp/bitwriter.c",
- "libvpx/vpx_dsp/bitwriter_buffer.c",
- "libvpx/vpx_dsp/fwd_txfm.c",
- "libvpx/vpx_dsp/intrapred.c",
- "libvpx/vpx_dsp/inv_txfm.c",
- "libvpx/vpx_dsp/loopfilter.c",
"libvpx/vpx_dsp/mips/avg_msa.c",
"libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c",
"libvpx/vpx_dsp/mips/fwd_txfm_msa.c",
- "libvpx/vpx_dsp/mips/idct16x16_msa.c",
- "libvpx/vpx_dsp/mips/idct32x32_msa.c",
- "libvpx/vpx_dsp/mips/idct4x4_msa.c",
- "libvpx/vpx_dsp/mips/idct8x8_msa.c",
"libvpx/vpx_dsp/mips/intrapred_msa.c",
"libvpx/vpx_dsp/mips/loopfilter_16_msa.c",
"libvpx/vpx_dsp/mips/loopfilter_4_msa.c",
@@ -1431,25 +876,13 @@ libvpx_mips64_msa_c_srcs = [
"libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c",
"libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c",
"libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c",
- "libvpx/vpx_dsp/prob.c",
- "libvpx/vpx_dsp/psnr.c",
- "libvpx/vpx_dsp/quantize.c",
- "libvpx/vpx_dsp/sad.c",
- "libvpx/vpx_dsp/subtract.c",
- "libvpx/vpx_dsp/sum_squares.c",
- "libvpx/vpx_dsp/variance.c",
- "libvpx/vpx_dsp/vpx_convolve.c",
- "libvpx/vpx_dsp/vpx_dsp_rtcd.c",
- "libvpx/vpx_mem/vpx_mem.c",
- "libvpx/vpx_scale/generic/gen_scalers.c",
- "libvpx/vpx_scale/generic/vpx_scale.c",
- "libvpx/vpx_scale/generic/yv12config.c",
- "libvpx/vpx_scale/generic/yv12extend.c",
- "libvpx/vpx_scale/vpx_scale_rtcd.c",
- "libvpx/vpx_util/vpx_thread.c",
"config/mips64-msa/vpx_config.c",
]
+libvpx_mips64_msa_exclude_c_srcs = [
+ "config/mips64/vpx_config.c",
+]
+
libvpx_mips64_c_srcs = [
"libvpx/vp8/common/alloccommon.c",
"libvpx/vp8/common/blockd.c",
@@ -1476,6 +909,7 @@ libvpx_mips64_c_srcs = [
"libvpx/vp8/common/swapyv12buffer.c",
"libvpx/vp8/common/treecoder.c",
"libvpx/vp8/common/vp8_loopfilter.c",
+ "libvpx/vp8/common/vp8_skin_detection.c",
"libvpx/vp8/decoder/dboolhuff.c",
"libvpx/vp8/decoder/decodeframe.c",
"libvpx/vp8/decoder/decodemv.c",
@@ -1531,7 +965,6 @@ libvpx_mips64_c_srcs = [
"libvpx/vp9/decoder/vp9_decoder.c",
"libvpx/vp9/decoder/vp9_detokenize.c",
"libvpx/vp9/decoder/vp9_dsubexp.c",
- "libvpx/vp9/decoder/vp9_dthread.c",
"libvpx/vp9/encoder/vp9_alt_ref_aq.c",
"libvpx/vp9/encoder/vp9_aq_360.c",
"libvpx/vp9/encoder/vp9_aq_complexity.c",
@@ -1547,10 +980,8 @@ libvpx_mips64_c_srcs = [
"libvpx/vp9/encoder/vp9_encoder.c",
"libvpx/vp9/encoder/vp9_ethread.c",
"libvpx/vp9/encoder/vp9_extend.c",
- "libvpx/vp9/encoder/vp9_firstpass.c",
"libvpx/vp9/encoder/vp9_frame_scale.c",
"libvpx/vp9/encoder/vp9_lookahead.c",
- "libvpx/vp9/encoder/vp9_mbgraph.c",
"libvpx/vp9/encoder/vp9_mcomp.c",
"libvpx/vp9/encoder/vp9_multi_thread.c",
"libvpx/vp9/encoder/vp9_noise_estimate.c",
@@ -1566,7 +997,6 @@ libvpx_mips64_c_srcs = [
"libvpx/vp9/encoder/vp9_speed_features.c",
"libvpx/vp9/encoder/vp9_subexp.c",
"libvpx/vp9/encoder/vp9_svc_layercontext.c",
- "libvpx/vp9/encoder/vp9_temporal_filter.c",
"libvpx/vp9/encoder/vp9_tokenize.c",
"libvpx/vp9/encoder/vp9_treewriter.c",
"libvpx/vp9/vp9_cx_iface.c",
@@ -1588,6 +1018,7 @@ libvpx_mips64_c_srcs = [
"libvpx/vpx_dsp/psnr.c",
"libvpx/vpx_dsp/quantize.c",
"libvpx/vpx_dsp/sad.c",
+ "libvpx/vpx_dsp/skin_detection.c",
"libvpx/vpx_dsp/subtract.c",
"libvpx/vpx_dsp/sum_squares.c",
"libvpx/vpx_dsp/variance.c",
@@ -1600,6 +1031,7 @@ libvpx_mips64_c_srcs = [
"libvpx/vpx_scale/generic/yv12extend.c",
"libvpx/vpx_scale/vpx_scale_rtcd.c",
"libvpx/vpx_util/vpx_thread.c",
+ "libvpx/vpx_util/vpx_write_yuv_frame.c",
"config/mips64/vpx_config.c",
]
@@ -1631,6 +1063,7 @@ libvpx_x86_c_srcs = [
"libvpx/vp8/common/swapyv12buffer.c",
"libvpx/vp8/common/treecoder.c",
"libvpx/vp8/common/vp8_loopfilter.c",
+ "libvpx/vp8/common/vp8_skin_detection.c",
"libvpx/vp8/common/x86/filter_x86.c",
"libvpx/vp8/common/x86/idct_blk_mmx.c",
"libvpx/vp8/common/x86/idct_blk_sse2.c",
@@ -1664,10 +1097,9 @@ libvpx_x86_c_srcs = [
"libvpx/vp8/encoder/treewriter.c",
"libvpx/vp8/encoder/vp8_quantize.c",
"libvpx/vp8/encoder/x86/denoising_sse2.c",
- "libvpx/vp8/encoder/x86/quantize_ssse3.c",
- "libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c",
"libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c",
"libvpx/vp8/encoder/x86/vp8_quantize_sse2.c",
+ "libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c",
"libvpx/vp8/vp8_cx_iface.c",
"libvpx/vp8/vp8_dx_iface.c",
"libvpx/vp9/common/vp9_alloccommon.c",
@@ -1697,7 +1129,6 @@ libvpx_x86_c_srcs = [
"libvpx/vp9/decoder/vp9_decoder.c",
"libvpx/vp9/decoder/vp9_detokenize.c",
"libvpx/vp9/decoder/vp9_dsubexp.c",
- "libvpx/vp9/decoder/vp9_dthread.c",
"libvpx/vp9/encoder/vp9_alt_ref_aq.c",
"libvpx/vp9/encoder/vp9_aq_360.c",
"libvpx/vp9/encoder/vp9_aq_complexity.c",
@@ -1713,10 +1144,8 @@ libvpx_x86_c_srcs = [
"libvpx/vp9/encoder/vp9_encoder.c",
"libvpx/vp9/encoder/vp9_ethread.c",
"libvpx/vp9/encoder/vp9_extend.c",
- "libvpx/vp9/encoder/vp9_firstpass.c",
"libvpx/vp9/encoder/vp9_frame_scale.c",
"libvpx/vp9/encoder/vp9_lookahead.c",
- "libvpx/vp9/encoder/vp9_mbgraph.c",
"libvpx/vp9/encoder/vp9_mcomp.c",
"libvpx/vp9/encoder/vp9_multi_thread.c",
"libvpx/vp9/encoder/vp9_noise_estimate.c",
@@ -1732,12 +1161,12 @@ libvpx_x86_c_srcs = [
"libvpx/vp9/encoder/vp9_speed_features.c",
"libvpx/vp9/encoder/vp9_subexp.c",
"libvpx/vp9/encoder/vp9_svc_layercontext.c",
- "libvpx/vp9/encoder/vp9_temporal_filter.c",
"libvpx/vp9/encoder/vp9_tokenize.c",
"libvpx/vp9/encoder/vp9_treewriter.c",
"libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c",
"libvpx/vp9/encoder/x86/vp9_dct_ssse3.c",
"libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c",
+ "libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c",
"libvpx/vp9/encoder/x86/vp9_quantize_sse2.c",
"libvpx/vp9/vp9_cx_iface.c",
"libvpx/vp9/vp9_dx_iface.c",
@@ -1760,6 +1189,7 @@ libvpx_x86_c_srcs = [
"libvpx/vpx_dsp/psnr.c",
"libvpx/vpx_dsp/quantize.c",
"libvpx/vpx_dsp/sad.c",
+ "libvpx/vpx_dsp/skin_detection.c",
"libvpx/vpx_dsp/subtract.c",
"libvpx/vpx_dsp/sum_squares.c",
"libvpx/vpx_dsp/variance.c",
@@ -1768,10 +1198,20 @@ libvpx_x86_c_srcs = [
"libvpx/vpx_dsp/x86/avg_intrin_sse2.c",
"libvpx/vpx_dsp/x86/avg_pred_sse2.c",
"libvpx/vpx_dsp/x86/fwd_txfm_sse2.c",
+ "libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c",
+ "libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c",
+ "libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c",
+ "libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c",
+ "libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c",
+ "libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c",
+ "libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c",
+ "libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c",
+ "libvpx/vpx_dsp/x86/highbd_variance_sse2.c",
"libvpx/vpx_dsp/x86/inv_txfm_sse2.c",
"libvpx/vpx_dsp/x86/inv_txfm_ssse3.c",
"libvpx/vpx_dsp/x86/loopfilter_sse2.c",
"libvpx/vpx_dsp/x86/quantize_sse2.c",
+ "libvpx/vpx_dsp/x86/quantize_ssse3.c",
"libvpx/vpx_dsp/x86/sum_squares_sse2.c",
"libvpx/vpx_dsp/x86/variance_sse2.c",
"libvpx/vpx_dsp/x86/vpx_asm_stubs.c",
@@ -1783,6 +1223,7 @@ libvpx_x86_c_srcs = [
"libvpx/vpx_scale/generic/yv12extend.c",
"libvpx/vpx_scale/vpx_scale_rtcd.c",
"libvpx/vpx_util/vpx_thread.c",
+ "libvpx/vpx_util/vpx_write_yuv_frame.c",
"config/x86/vpx_config.c",
]
@@ -1803,11 +1244,15 @@ libvpx_x86_asm_srcs = [
"libvpx/vp8/encoder/x86/dct_sse2.asm",
"libvpx/vp8/encoder/x86/encodeopt.asm",
"libvpx/vp8/encoder/x86/fwalsh_sse2.asm",
- "libvpx/vp8/encoder/x86/quantize_mmx.asm",
"libvpx/vp9/encoder/x86/vp9_dct_sse2.asm",
"libvpx/vp9/encoder/x86/vp9_error_sse2.asm",
"libvpx/vpx_dsp/x86/add_noise_sse2.asm",
"libvpx/vpx_dsp/x86/deblock_sse2.asm",
+ "libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm",
+ "libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm",
+ "libvpx/vpx_dsp/x86/highbd_sad_sse2.asm",
+ "libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm",
+ "libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm",
"libvpx/vpx_dsp/x86/intrapred_sse2.asm",
"libvpx/vpx_dsp/x86/intrapred_ssse3.asm",
"libvpx/vpx_dsp/x86/inv_wht_sse2.asm",
@@ -1818,6 +1263,8 @@ libvpx_x86_asm_srcs = [
"libvpx/vpx_dsp/x86/subpel_variance_sse2.asm",
"libvpx/vpx_dsp/x86/subtract_sse2.asm",
"libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm",
+ "libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm",
+ "libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm",
"libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm",
"libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm",
"libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm",
@@ -1854,6 +1301,7 @@ libvpx_x86_64_c_srcs = [
"libvpx/vp8/common/swapyv12buffer.c",
"libvpx/vp8/common/treecoder.c",
"libvpx/vp8/common/vp8_loopfilter.c",
+ "libvpx/vp8/common/vp8_skin_detection.c",
"libvpx/vp8/common/x86/filter_x86.c",
"libvpx/vp8/common/x86/idct_blk_mmx.c",
"libvpx/vp8/common/x86/idct_blk_sse2.c",
@@ -1887,10 +1335,9 @@ libvpx_x86_64_c_srcs = [
"libvpx/vp8/encoder/treewriter.c",
"libvpx/vp8/encoder/vp8_quantize.c",
"libvpx/vp8/encoder/x86/denoising_sse2.c",
- "libvpx/vp8/encoder/x86/quantize_ssse3.c",
- "libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c",
"libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c",
"libvpx/vp8/encoder/x86/vp8_quantize_sse2.c",
+ "libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c",
"libvpx/vp8/vp8_cx_iface.c",
"libvpx/vp8/vp8_dx_iface.c",
"libvpx/vp9/common/vp9_alloccommon.c",
@@ -1920,7 +1367,6 @@ libvpx_x86_64_c_srcs = [
"libvpx/vp9/decoder/vp9_decoder.c",
"libvpx/vp9/decoder/vp9_detokenize.c",
"libvpx/vp9/decoder/vp9_dsubexp.c",
- "libvpx/vp9/decoder/vp9_dthread.c",
"libvpx/vp9/encoder/vp9_alt_ref_aq.c",
"libvpx/vp9/encoder/vp9_aq_360.c",
"libvpx/vp9/encoder/vp9_aq_complexity.c",
@@ -1936,10 +1382,8 @@ libvpx_x86_64_c_srcs = [
"libvpx/vp9/encoder/vp9_encoder.c",
"libvpx/vp9/encoder/vp9_ethread.c",
"libvpx/vp9/encoder/vp9_extend.c",
- "libvpx/vp9/encoder/vp9_firstpass.c",
"libvpx/vp9/encoder/vp9_frame_scale.c",
"libvpx/vp9/encoder/vp9_lookahead.c",
- "libvpx/vp9/encoder/vp9_mbgraph.c",
"libvpx/vp9/encoder/vp9_mcomp.c",
"libvpx/vp9/encoder/vp9_multi_thread.c",
"libvpx/vp9/encoder/vp9_noise_estimate.c",
@@ -1955,12 +1399,12 @@ libvpx_x86_64_c_srcs = [
"libvpx/vp9/encoder/vp9_speed_features.c",
"libvpx/vp9/encoder/vp9_subexp.c",
"libvpx/vp9/encoder/vp9_svc_layercontext.c",
- "libvpx/vp9/encoder/vp9_temporal_filter.c",
"libvpx/vp9/encoder/vp9_tokenize.c",
"libvpx/vp9/encoder/vp9_treewriter.c",
"libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c",
"libvpx/vp9/encoder/x86/vp9_dct_ssse3.c",
"libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c",
+ "libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c",
"libvpx/vp9/encoder/x86/vp9_quantize_sse2.c",
"libvpx/vp9/vp9_cx_iface.c",
"libvpx/vp9/vp9_dx_iface.c",
@@ -1983,6 +1427,7 @@ libvpx_x86_64_c_srcs = [
"libvpx/vpx_dsp/psnr.c",
"libvpx/vpx_dsp/quantize.c",
"libvpx/vpx_dsp/sad.c",
+ "libvpx/vpx_dsp/skin_detection.c",
"libvpx/vpx_dsp/subtract.c",
"libvpx/vpx_dsp/sum_squares.c",
"libvpx/vpx_dsp/variance.c",
@@ -1991,10 +1436,20 @@ libvpx_x86_64_c_srcs = [
"libvpx/vpx_dsp/x86/avg_intrin_sse2.c",
"libvpx/vpx_dsp/x86/avg_pred_sse2.c",
"libvpx/vpx_dsp/x86/fwd_txfm_sse2.c",
+ "libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c",
+ "libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c",
+ "libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c",
+ "libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c",
+ "libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c",
+ "libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c",
+ "libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c",
+ "libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c",
+ "libvpx/vpx_dsp/x86/highbd_variance_sse2.c",
"libvpx/vpx_dsp/x86/inv_txfm_sse2.c",
"libvpx/vpx_dsp/x86/inv_txfm_ssse3.c",
"libvpx/vpx_dsp/x86/loopfilter_sse2.c",
"libvpx/vpx_dsp/x86/quantize_sse2.c",
+ "libvpx/vpx_dsp/x86/quantize_ssse3.c",
"libvpx/vpx_dsp/x86/sum_squares_sse2.c",
"libvpx/vpx_dsp/x86/variance_sse2.c",
"libvpx/vpx_dsp/x86/vpx_asm_stubs.c",
@@ -2006,6 +1461,7 @@ libvpx_x86_64_c_srcs = [
"libvpx/vpx_scale/generic/yv12extend.c",
"libvpx/vpx_scale/vpx_scale_rtcd.c",
"libvpx/vpx_util/vpx_thread.c",
+ "libvpx/vpx_util/vpx_write_yuv_frame.c",
"config/x86_64/vpx_config.c",
]
@@ -2027,7 +1483,6 @@ libvpx_x86_64_asm_srcs = [
"libvpx/vp8/encoder/x86/dct_sse2.asm",
"libvpx/vp8/encoder/x86/encodeopt.asm",
"libvpx/vp8/encoder/x86/fwalsh_sse2.asm",
- "libvpx/vp8/encoder/x86/quantize_mmx.asm",
"libvpx/vp9/encoder/x86/vp9_dct_sse2.asm",
"libvpx/vp9/encoder/x86/vp9_error_sse2.asm",
"libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm",
@@ -2035,10 +1490,14 @@ libvpx_x86_64_asm_srcs = [
"libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm",
"libvpx/vpx_dsp/x86/deblock_sse2.asm",
"libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm",
+ "libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm",
+ "libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm",
+ "libvpx/vpx_dsp/x86/highbd_sad_sse2.asm",
+ "libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm",
+ "libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm",
"libvpx/vpx_dsp/x86/intrapred_sse2.asm",
"libvpx/vpx_dsp/x86/intrapred_ssse3.asm",
"libvpx/vpx_dsp/x86/inv_wht_sse2.asm",
- "libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm",
"libvpx/vpx_dsp/x86/sad4d_sse2.asm",
"libvpx/vpx_dsp/x86/sad_sse2.asm",
"libvpx/vpx_dsp/x86/sad_sse3.asm",
@@ -2047,6 +1506,8 @@ libvpx_x86_64_asm_srcs = [
"libvpx/vpx_dsp/x86/subpel_variance_sse2.asm",
"libvpx/vpx_dsp/x86/subtract_sse2.asm",
"libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm",
+ "libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm",
+ "libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm",
"libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm",
"libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm",
"libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm",
@@ -2086,7 +1547,7 @@ cc_library_static {
local_include_dirs: ["config/arm"],
neon: {
- exclude_srcs: libvpx_arm_c_srcs,
+ exclude_srcs: libvpx_arm_neon_exclude_c_srcs,
srcs: libvpx_arm_neon_c_srcs,
generated_sources: ["libvpx_arm_neon_asm_srcs_converted"],
local_include_dirs: ["config/arm-neon"],
@@ -2103,13 +1564,13 @@ cc_library_static {
local_include_dirs: ["config/mips32"],
dspr2: {
- exclude_srcs: libvpx_mips32_c_srcs,
+ exclude_srcs: libvpx_mips32_dspr2_exclude_c_srcs,
srcs: libvpx_mips32_dspr2_c_srcs,
local_include_dirs: ["config/mips32-dspr2"],
},
msa: {
- exclude_srcs: libvpx_mips32_c_srcs,
+ exclude_srcs: libvpx_mips32_msa_exclude_c_srcs,
srcs: libvpx_mips32_msa_c_srcs,
local_include_dirs: ["config/mips32-msa"],
},
@@ -2120,7 +1581,7 @@ cc_library_static {
local_include_dirs: ["config/mips64"],
msa: {
- exclude_srcs: libvpx_mips64_c_srcs,
+ exclude_srcs: libvpx_mips64_msa_exclude_c_srcs,
srcs: libvpx_mips64_msa_c_srcs,
local_include_dirs: ["config/mips64-msa"],
},
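[Note on the hunks above: they change how the NEON/DSPr2/MSA variants compose their sources. Previously each variant excluded the entire base list (for example libvpx_mips32_c_srcs) and re-listed every shared file; the regenerated lists now keep the shared C sources in the base list, the variant lists carry only the SIMD-specific files, and the new *_exclude_c_srcs lists remove just the one colliding file, the base config/.../vpx_config.c that the variant replaces with its own. This relies on Soong's behavior of appending a feature block's srcs to the arch-level srcs and subtracting exclude_srcs from the result, which is why the tiny exclude lists are sufficient.]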
diff --git a/Android.bp.in b/Android.bp.in
index ac6a46435..0fb7c9581 100644
--- a/Android.bp.in
+++ b/Android.bp.in
@@ -15,6 +15,7 @@ gensrcs {
cc_library_static {
name: "libvpx",
+ vendor_available: true,
arch: {
arm: {
@@ -28,7 +29,7 @@ cc_library_static {
local_include_dirs: ["config/arm"],
neon: {
- exclude_srcs: libvpx_arm_c_srcs,
+ exclude_srcs: libvpx_arm_neon_exclude_c_srcs,
srcs: libvpx_arm_neon_c_srcs,
generated_sources: ["libvpx_arm_neon_asm_srcs_converted"],
local_include_dirs: ["config/arm-neon"],
@@ -45,13 +46,13 @@ cc_library_static {
local_include_dirs: ["config/mips32"],
dspr2: {
- exclude_srcs: libvpx_mips32_c_srcs,
+ exclude_srcs: libvpx_mips32_dspr2_exclude_c_srcs,
srcs: libvpx_mips32_dspr2_c_srcs,
local_include_dirs: ["config/mips32-dspr2"],
},
msa: {
- exclude_srcs: libvpx_mips32_c_srcs,
+ exclude_srcs: libvpx_mips32_msa_exclude_c_srcs,
srcs: libvpx_mips32_msa_c_srcs,
local_include_dirs: ["config/mips32-msa"],
},
@@ -62,7 +63,7 @@ cc_library_static {
local_include_dirs: ["config/mips64"],
msa: {
- exclude_srcs: libvpx_mips64_c_srcs,
+ exclude_srcs: libvpx_mips64_msa_exclude_c_srcs,
srcs: libvpx_mips64_msa_c_srcs,
local_include_dirs: ["config/mips64-msa"],
},
diff --git a/CleanSpec.mk b/CleanSpec.mk
new file mode 100644
index 000000000..cac3d3bc5
--- /dev/null
+++ b/CleanSpec.mk
@@ -0,0 +1,53 @@
+# Copyright (C) 2017 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# If you don't need to do a full clean build but would like to touch
+# a file or delete some intermediate files, add a clean step to the end
+# of the list. These steps will only be run once, if they haven't been
+# run before.
+#
+# E.g.:
+# $(call add-clean-step, touch -c external/sqlite/sqlite3.h)
+# $(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/STATIC_LIBRARIES/libz_intermediates)
+#
+# Always use "touch -c" and "rm -f" or "rm -rf" to gracefully deal with
+# files that are missing or have been moved.
+#
+# Use $(PRODUCT_OUT) to get to the "out/target/product/blah/" directory.
+# Use $(OUT_DIR) to refer to the "out" directory.
+#
+# If you need to re-do something that's already mentioned, just copy
+# the command and add it to the bottom of the list. E.g., if a change
+# that you made last week required touching a file and a change you
+# made today requires touching the same file, just copy the old
+# touch step and add it to the end of the list.
+#
+# ************************************************
+# NEWER CLEAN STEPS MUST BE AT THE END OF THE LIST
+# ************************************************
+
+# For example:
+#$(call add-clean-step, rm -rf $(OUT_DIR)/target/common/obj/APPS/AndroidTests_intermediates)
+#$(call add-clean-step, rm -rf $(OUT_DIR)/target/common/obj/JAVA_LIBRARIES/core_intermediates)
+#$(call add-clean-step, find $(OUT_DIR) -type f -name "IGTalkSession*" -print0 | xargs -0 rm -f)
+#$(call add-clean-step, rm -rf $(PRODUCT_OUT)/data/*)
+
+# ************************************************
+# NEWER CLEAN STEPS MUST BE AT THE END OF THE LIST
+# ************************************************
+
+# vpx_config.asm change
+$(call add-clean-step, rm -rf $(OUT_DIR)/soong/.intermediates/external/libvpx/libvpx)
+
diff --git a/README.android b/README.android
index 92a84980b..92739cd7b 100644
--- a/README.android
+++ b/README.android
@@ -1,12 +1,12 @@
Name: libvpx
URL: http://www.webmproject.org
-Version: v1.6.1-665-gbcfd9c975
+Version: v1.7.0
License: BSD
License File: libvpx/LICENSE
-Date: Tuesday May 23 2017
-Branch: origin/master
-Commit: bcfd9c97508531a81cc2f5d393edb9eb1b00ce79
+Date: Wednesday January 24 2018
+Branch: origin/mandarinduck
+Commit: f80be22a1099b2a431c2796f529bb261064ec6b4
Description:
Contains the sources used to compile libvpx.
diff --git a/README.version b/README.version
index 07913c812..c6c6a3724 100644
--- a/README.version
+++ b/README.version
@@ -1,4 +1,6 @@
-URL: https://chromium.googlesource.com/webm/libvpx.git/+archive/bcfd9c97508531a81cc2f5d393edb9eb1b00ce79.tar.gz
-Version: v1.6.1-665-gbcfd9c975
+URL: https://chromium.googlesource.com/webm/libvpx.git/+archive/v1.7.0.tar.gz
+Version: v1.7.0
BugComponent: 42195
Owners: johannkoenig
+Local Modifications:
+ Add visibility="protected" attribute for global variables referenced in asm files.
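[The local modification called out here is the Add-visibility-protected-attribute-for-global-variab.patch file added at the top of this change. A minimal sketch of what such an annotation looks like in C, using an invented table name; the actual symbols patched are the globals referenced from the hand-written .asm files:

    #include <stdint.h>

    /* Protected visibility keeps the symbol exported while forcing
     * references from inside the library to bind to this definition,
     * so the assembly can address it without being broken by symbol
     * interposition. Example name only, not one of the patched symbols. */
    __attribute__((visibility("protected")))
    const int16_t example_filter_coeffs[8] = { -1, 3, -10, 122, 18, -6, 2, 0 };
]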
diff --git a/config/arm-neon/vp8_rtcd.h b/config/arm-neon/vp8_rtcd.h
index 3f112f6f7..4eb59c663 100644
--- a/config/arm-neon/vp8_rtcd.h
+++ b/config/arm-neon/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP8_RTCD_H_
#define VP8_RTCD_H_
diff --git a/config/arm-neon/vp9_rtcd.h b/config/arm-neon/vp9_rtcd.h
index 1df16205a..0f4f04d1f 100644
--- a/config/arm-neon/vp9_rtcd.h
+++ b/config/arm-neon/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP9_RTCD_H_
#define VP9_RTCD_H_
@@ -33,9 +34,8 @@ extern "C" {
int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_c
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-#define vp9_block_error_fp vp9_block_error_fp_neon
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+#define vp9_block_error_fp vp9_block_error_fp_c
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
#define vp9_diamond_search_sad vp9_diamond_search_sad_c
@@ -53,35 +53,62 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
#define vp9_fht8x8 vp9_fht8x8_c
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_fwht4x4 vp9_fwht4x4_c
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_c
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
+
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp vp9_quantize_fp_neon
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_neon
void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
-#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
+void vp9_scale_and_extend_frame_neon(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_neon
void vp9_rtcd(void);
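[These rtcd headers are generated per target from the matching vpx_config; the convention, visible throughout the hunk above, is to declare every implementation the configuration enables and then pin the public name to one of them with a #define. A minimal sketch of the emitted shape, with an invented function name:

    #include <stdint.h>

    /* What the generator emits when NEON is enabled and runtime CPU
     * detection is disabled (as in this config): both prototypes, plus
     * a macro that resolves the dispatch at build time. With runtime
     * detection enabled, the name would instead be a function pointer
     * filled in by vp9_rtcd(). Invented name, for illustration only. */
    int64_t example_block_error_c(const int16_t *a, const int16_t *b, int n);
    int64_t example_block_error_neon(const int16_t *a, const int16_t *b, int n);
    #define example_block_error example_block_error_neon
]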
diff --git a/config/arm-neon/vpx_config.asm b/config/arm-neon/vpx_config.asm
index fdeb46a67..0a0b1d240 100644
--- a/config/arm-neon/vpx_config.asm
+++ b/config/arm-neon/vpx_config.asm
@@ -20,7 +20,9 @@
.equ HAVE_SSE4_1 , 0
.equ HAVE_AVX , 0
.equ HAVE_AVX2 , 0
+.equ HAVE_AVX512 , 0
.equ HAVE_VSX , 0
+.equ HAVE_MMI , 0
.equ HAVE_VPX_PORTS , 1
.equ HAVE_PTHREAD_H , 1
.equ HAVE_UNISTD_H , 1
@@ -74,10 +76,11 @@
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_VP9_TEMPORAL_DENOISING , 0
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
-.equ CONFIG_VP9_HIGHBITDEPTH , 0
+.equ CONFIG_VP9_HIGHBITDEPTH , 1
.equ CONFIG_BETTER_HW_COMPATIBILITY , 0
.equ CONFIG_EXPERIMENTAL , 0
.equ CONFIG_SIZE_LIMIT , 1
+.equ CONFIG_ALWAYS_ADJUST_BPM , 0
.equ CONFIG_SPATIAL_SVC , 0
.equ CONFIG_FP_MB_STATS , 0
.equ CONFIG_EMULATE_HARDWARE , 0
diff --git a/config/arm-neon/vpx_config.c b/config/arm-neon/vpx_config.c
index 0eb0a305c..95e12998c 100644
--- a/config/arm-neon/vpx_config.c
+++ b/config/arm-neon/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=armv7-linux-gcc --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=armv7-linux-gcc --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/config/arm-neon/vpx_config.h b/config/arm-neon/vpx_config.h
index d632a2191..e9d645653 100644
--- a/config/arm-neon/vpx_config.h
+++ b/config/arm-neon/vpx_config.h
@@ -29,7 +29,9 @@
#define HAVE_SSE4_1 0
#define HAVE_AVX 0
#define HAVE_AVX2 0
+#define HAVE_AVX512 0
#define HAVE_VSX 0
+#define HAVE_MMI 0
#define HAVE_VPX_PORTS 1
#define HAVE_PTHREAD_H 1
#define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
#define CONFIG_BETTER_HW_COMPATIBILITY 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
#define CONFIG_SPATIAL_SVC 0
#define CONFIG_FP_MB_STATS 0
#define CONFIG_EMULATE_HARDWARE 0
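[CONFIG_VP9_HIGHBITDEPTH flipping from 0 to 1 drives most of the growth in this change: the highbd_* files added to the C and asm source lists above, and the many vpx_highbd_* prototypes that appear in the rtcd headers below. In high-bit-depth builds, 10/12-bit planes store uint16_t samples but are passed through the same uint8_t * interfaces via the pointer-punning macros in vpx_dsp/vpx_dsp_common.h. A sketch of the convention with an invented function; the two macros are quoted from memory and should be checked against that header:

    #include <stdint.h>

    #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
    #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

    /* Callers wrap a real uint16_t buffer with CONVERT_TO_BYTEPTR before
     * handing it to a vpx_highbd_* function, which unwraps it again. */
    static unsigned int example_highbd_sum(const uint8_t *src8, int n) {
      const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
      unsigned int sum = 0;
      for (int i = 0; i < n; ++i) sum += src[i];
      return sum;
    }
]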
diff --git a/config/arm-neon/vpx_dsp_rtcd.h b/config/arm-neon/vpx_dsp_rtcd.h
index a915afabf..d911fd37f 100644
--- a/config/arm-neon/vpx_dsp_rtcd.h
+++ b/config/arm-neon/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_DSP_RTCD_H_
#define VPX_DSP_RTCD_H_
@@ -13,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#ifdef __cplusplus
@@ -28,38 +30,39 @@ unsigned int vpx_avg_8x8_neon(const uint8_t *, int p);
#define vpx_avg_8x8 vpx_avg_8x8_neon
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
-#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+void vpx_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_neon
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8 vpx_convolve8_neon
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg vpx_convolve8_avg_neon
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_neon
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_neon
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_horiz vpx_convolve8_horiz_neon
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_vert vpx_convolve8_vert_neon
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_avg vpx_convolve_avg_neon
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_copy vpx_convolve_copy_neon
void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
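[The convolve prototypes above change from separate filter_x/filter_y pointers to a single InterpKernel bank plus explicit x0_q4/y0_q4 starting phases, which is why this header now includes vpx_dsp/vpx_filter.h. A sketch of the type and how a phase indexes it; the constants mirror vpx_filter.h, the helper is invented:

    #include <stdint.h>

    #define SUBPEL_BITS 4
    #define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
    #define SUBPEL_SHIFTS (1 << SUBPEL_BITS) /* 16 sixteenth-pel phases */
    #define SUBPEL_TAPS 8

    typedef int16_t InterpKernel[SUBPEL_TAPS];

    /* Invented helper: pick the 8-tap kernel for the current phase; a
     * convolver advances the phase by x_step_q4 per output pixel. */
    static const int16_t *pick_kernel(const InterpKernel *filter, int phase_q4) {
      return filter[phase_q4 & SUBPEL_MASK];
    }
]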
@@ -213,26 +216,32 @@ void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8
#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_neon
void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct16x16 vpx_fdct16x16_c
+void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16 vpx_fdct16x16_neon
void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct16x16_1 vpx_fdct16x16_1_c
+void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_neon
void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct32x32 vpx_fdct32x32_c
+void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32 vpx_fdct32x32_neon
void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct32x32_1 vpx_fdct32x32_1_c
+void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_neon
void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
+void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_rd vpx_fdct32x32_rd_neon
void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);
#define vpx_fdct4x4 vpx_fdct4x4_neon
void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
+void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4_1 vpx_fdct4x4_1_neon
void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride);
@@ -273,17 +282,915 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_neon
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_16x16 vpx_hadamard_16x16_neon
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_8x8 vpx_hadamard_8x8_neon
void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+
+void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+
+unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c
+
+unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+
+unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+
+unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c
+
+unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c
+
+unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c
+
+unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c
+
+unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c
+
+unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c
+
+unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+
+unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+
+unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c
+
+unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c
+
+unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c
+
+unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+
+unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c
+
+void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+
+void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+
+unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c
+
+unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+
+unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+
+unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c
+
+unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c
+
+unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c
+
+unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c
+
+unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c
+
+unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c
+
+unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c
+
+unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c
+
+unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c
+
+unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c
+
+unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c
+
+unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c
+
+unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c
+
+void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c
+
+void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c
+
+unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c
+
+unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c
+
+unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c
+
+unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c
+
+unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c
+
+unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c
+
+unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c
+
+unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c
+
+unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c
+
+unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c
+
+unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c
+
+unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c
+
+unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c
+
+unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c
+
+unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c
+
+unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c
+
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p);
+#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c
+
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
+#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c
+
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
+#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
+
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8 vpx_highbd_convolve8_neon
+
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_neon
+
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_neon
+
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_neon
+
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_neon
+
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_neon
+
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_neon
+
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_neon
+
+void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
+
+void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c
+
+void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c
+
+void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c
+
+void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_neon
+
+void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_neon
+
+void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_neon
+
+void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_neon
+
+void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c
+
+void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c
+
+void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c
+
+void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c
+
+void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c
+
+void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c
+
+void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c
+
+void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c
+
+void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_neon
+
+void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_neon
+
+void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_neon
+
+void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_neon
+
+void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c
+
+void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c
+
+void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c
+
+void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c
+
+void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_neon
+
+void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_neon
+
+void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_neon
+
+void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_neon
+
+void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_neon
+
+void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_neon
+
+void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_neon
+
+void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_neon
+
+void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_neon
+
+void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_neon
+
+void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_neon
+
+void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_neon
+
+void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_neon
+
+void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_neon
+
+void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_neon
+
+void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_neon
+
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c
+
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8_1 vpx_fdct8x8_1_neon
+
+void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_neon
+
+void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_neon
+
+void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_neon
+
+void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_neon
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_neon
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_neon
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_neon
+
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_neon
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_neon
+
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_neon
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_neon
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_neon
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_neon
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_neon
+
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_neon
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_neon
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_neon
+
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_16_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_neon
+
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_16_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_neon
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_neon
+
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_horizontal_4_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_neon
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_neon
+
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_horizontal_8_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_neon
+
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_16_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_neon
+
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_16_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_neon
+
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_neon
+
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_vertical_4_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_neon
+
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_neon
+
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_vertical_8_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_neon
+
+void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
+
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
+
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
+
+unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c
+
+unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
+
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
+
+unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c
+
+unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
+
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
+
+unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c
+
+unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
+
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
+
+unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c
+
+unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
+
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
+
+unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c
+
+unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
+
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
+
+unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c
+
+unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
+
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
+
+unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c
+
+unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
+
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
+
+unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c
+
+unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
+
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
+
+unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c
+
+unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
+
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
+
+unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c
+
+unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
+
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
+
+unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c
+
+unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
+
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
+
+unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c
+
+unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
+
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
+
+unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c
+
+unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
+
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
+
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd);
+#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c
+
+void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_neon
+
+void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_neon
+
+void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_neon
+
+void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_neon
+
+void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_neon
+
+void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_neon
+
+void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_neon
+
+void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_neon
+
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon
@@ -416,17 +1323,20 @@ unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint
#define vpx_mse8x8 vpx_mse8x8_c
void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_quantize_b vpx_quantize_b_c
+void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b vpx_quantize_b_neon
void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_neon
unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad16x16 vpx_sad16x16_neon
unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad16x16_avg vpx_sad16x16_avg_c
+unsigned int vpx_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x16_avg vpx_sad16x16_avg_neon
void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad16x16x3 vpx_sad16x16x3_c
@@ -439,223 +1349,247 @@ void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref
#define vpx_sad16x16x8 vpx_sad16x16x8_c
unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad16x32 vpx_sad16x32_c
+unsigned int vpx_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x32 vpx_sad16x32_neon
unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad16x32_avg vpx_sad16x32_avg_c
+unsigned int vpx_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x32_avg vpx_sad16x32_avg_neon
void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x32x4d vpx_sad16x32x4d_c
+void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x32x4d vpx_sad16x32x4d_neon
unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad16x8 vpx_sad16x8_neon
unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad16x8_avg vpx_sad16x8_avg_c
+unsigned int vpx_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x8_avg vpx_sad16x8_avg_neon
void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad16x8x3 vpx_sad16x8x3_c
void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x8x4d vpx_sad16x8x4d_c
+void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x4d vpx_sad16x8x4d_neon
void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad16x8x8 vpx_sad16x8x8_c
unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad32x16 vpx_sad32x16_c
+unsigned int vpx_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x16 vpx_sad32x16_neon
unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad32x16_avg vpx_sad32x16_avg_c
+unsigned int vpx_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x16_avg vpx_sad32x16_avg_neon
void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x16x4d vpx_sad32x16x4d_c
+void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x16x4d vpx_sad32x16x4d_neon
unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad32x32 vpx_sad32x32_neon
unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad32x32_avg vpx_sad32x32_avg_c
-
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
+unsigned int vpx_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x32_avg vpx_sad32x32_avg_neon
void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad32x32x4d vpx_sad32x32x4d_neon
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad32x64 vpx_sad32x64_c
+unsigned int vpx_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x64 vpx_sad32x64_neon
unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad32x64_avg vpx_sad32x64_avg_c
+unsigned int vpx_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x64_avg vpx_sad32x64_avg_neon
void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x64x4d vpx_sad32x64x4d_c
+void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x64x4d vpx_sad32x64x4d_neon
unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad4x4 vpx_sad4x4_neon
unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad4x4_avg vpx_sad4x4_avg_c
+unsigned int vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_neon
void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad4x4x3 vpx_sad4x4x3_c
void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x4x4d vpx_sad4x4x4d_c
+void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_neon
void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad4x4x8 vpx_sad4x4x8_c
unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad4x8 vpx_sad4x8_c
+unsigned int vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_neon
unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad4x8_avg vpx_sad4x8_avg_c
+unsigned int vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_neon
void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x4d vpx_sad4x8x4d_c
-
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
+void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_neon
unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad64x32 vpx_sad64x32_c
+unsigned int vpx_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x32 vpx_sad64x32_neon
unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad64x32_avg vpx_sad64x32_avg_c
+unsigned int vpx_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x32_avg vpx_sad64x32_avg_neon
void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x32x4d vpx_sad64x32x4d_c
+void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x32x4d vpx_sad64x32x4d_neon
unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad64x64 vpx_sad64x64_neon
unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad64x64_avg vpx_sad64x64_avg_c
-
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
+unsigned int vpx_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x64_avg vpx_sad64x64_avg_neon
void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad64x64x4d vpx_sad64x64x4d_neon
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
-
unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x16 vpx_sad8x16_neon
unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad8x16_avg vpx_sad8x16_avg_c
+unsigned int vpx_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_neon
void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x16x3 vpx_sad8x16x3_c
void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x16x4d vpx_sad8x16x4d_c
+void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_neon
void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x16x8 vpx_sad8x16x8_c
unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad8x4 vpx_sad8x4_c
+unsigned int vpx_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x4 vpx_sad8x4_neon
unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad8x4_avg vpx_sad8x4_avg_c
+unsigned int vpx_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_neon
void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x4d vpx_sad8x4x4d_c
-
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
+void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_neon
unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x8 vpx_sad8x8_neon
unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad8x8_avg vpx_sad8x8_avg_c
+unsigned int vpx_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_neon
void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x8x3 vpx_sad8x8x3_c
void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x8x4d vpx_sad8x8x4d_c
+void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x4d vpx_sad8x8x4d_neon
void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x8x8 vpx_sad8x8x8_c
-int vpx_satd_c(const int16_t *coeff, int length);
-int vpx_satd_neon(const int16_t *coeff, int length);
+int vpx_satd_c(const tran_low_t *coeff, int length);
+int vpx_satd_neon(const tran_low_t *coeff, int length);
#define vpx_satd vpx_satd_neon
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vpx_scaled_2d vpx_scaled_2d_c
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_neon
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_horiz vpx_scaled_horiz_c
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_vert vpx_scaled_vert_c
uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c
+uint32_t vpx_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_neon
uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c
+uint32_t vpx_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_neon
uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c
+uint32_t vpx_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_neon
uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c
+uint32_t vpx_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_neon
uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c
+uint32_t vpx_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_neon
uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c
+uint32_t vpx_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_neon
uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c
+uint32_t vpx_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_neon
uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c
+uint32_t vpx_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_neon
uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c
+uint32_t vpx_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_neon
uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c
+uint32_t vpx_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_neon
uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c
+uint32_t vpx_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_neon
uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c
+uint32_t vpx_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_neon
uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c
+uint32_t vpx_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_neon
uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -682,10 +1616,12 @@ uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_str
#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_neon
uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
+uint32_t vpx_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_neon
uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
+uint32_t vpx_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_neon
uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
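
Two conventions in the vpx_dsp_rtcd.h hunks above are worth noting before the next file. First, every entry follows the RTCD (run-time code detection) idiom resolved at configure time: a portable _c declaration, an optional _neon declaration, and a #define that binds the plain name to the variant chosen for this target (with --disable-runtime-cpu-detect there is no function-pointer table, only the macro). Second, the convolve/scaled prototypes replace the old per-axis int16_t filter pointers with a single InterpKernel filter bank plus explicit subpel phases (x0_q4, y0_q4), which is why the header now includes vpx_dsp/vpx_filter.h. A minimal sketch of both, where my_sum8_* are hypothetical illustrations rather than libvpx symbols:

/* RTCD binding idiom, illustrative only (my_sum8_* are not libvpx symbols). */
#include <stdint.h>

uint32_t my_sum8_c(const uint8_t *p);     /* portable fallback */
uint32_t my_sum8_neon(const uint8_t *p);  /* NEON specialization */
#define my_sum8 my_sum8_neon              /* static dispatch for this target */

/* InterpKernel is roughly one 8-tap filter row; callers index a bank of
 * such rows by the q4 subpel phase (paraphrased from vpx_dsp/vpx_filter.h). */
typedef int16_t InterpKernel[8];
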
diff --git a/config/arm-neon/vpx_scale_rtcd.h b/config/arm-neon/vpx_scale_rtcd.h
index a1564b7ad..b37136827 100644
--- a/config/arm-neon/vpx_scale_rtcd.h
+++ b/config/arm-neon/vpx_scale_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_SCALE_RTCD_H_
#define VPX_SCALE_RTCD_H_
@@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
#define vpx_yv12_copy_y vpx_yv12_copy_y_c
diff --git a/config/arm-neon/vpx_version.h b/config/arm-neon/vpx_version.h
index 24da169b4..6078bae90 100644
--- a/config/arm-neon/vpx_version.h
+++ b/config/arm-neon/vpx_version.h
@@ -1,7 +1,8 @@
+// This file is generated. Do not edit.
#define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 1
+#define VERSION_MINOR 7
+#define VERSION_PATCH 0
#define VERSION_EXTRA ""
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING " v1.6.1"
+#define VERSION_STRING_NOSP "v1.7.0"
+#define VERSION_STRING " v1.7.0"
diff --git a/config/arm/vp8_rtcd.h b/config/arm/vp8_rtcd.h
index e089d058d..188b1d7a2 100644
--- a/config/arm/vp8_rtcd.h
+++ b/config/arm/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP8_RTCD_H_
#define VP8_RTCD_H_
diff --git a/config/arm/vp9_rtcd.h b/config/arm/vp9_rtcd.h
index 6d67ad8bc..8cb5870c0 100644
--- a/config/arm/vp9_rtcd.h
+++ b/config/arm/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP9_RTCD_H_
#define VP9_RTCD_H_
@@ -33,7 +34,7 @@ extern "C" {
int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_c
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
#define vp9_block_error_fp vp9_block_error_fp_c
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
@@ -51,12 +52,42 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
#define vp9_fht8x8 vp9_fht8x8_c
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_fwht4x4 vp9_fwht4x4_c
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_c
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
+
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
@@ -75,9 +106,6 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
void vp9_rtcd(void);
#include "vpx_config.h"
diff --git a/config/arm/vpx_config.asm b/config/arm/vpx_config.asm
index 022dfa9b8..53f2e8535 100644
--- a/config/arm/vpx_config.asm
+++ b/config/arm/vpx_config.asm
@@ -20,7 +20,9 @@
.equ HAVE_SSE4_1 , 0
.equ HAVE_AVX , 0
.equ HAVE_AVX2 , 0
+.equ HAVE_AVX512 , 0
.equ HAVE_VSX , 0
+.equ HAVE_MMI , 0
.equ HAVE_VPX_PORTS , 1
.equ HAVE_PTHREAD_H , 1
.equ HAVE_UNISTD_H , 1
@@ -74,10 +76,11 @@
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_VP9_TEMPORAL_DENOISING , 0
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
-.equ CONFIG_VP9_HIGHBITDEPTH , 0
+.equ CONFIG_VP9_HIGHBITDEPTH , 1
.equ CONFIG_BETTER_HW_COMPATIBILITY , 0
.equ CONFIG_EXPERIMENTAL , 0
.equ CONFIG_SIZE_LIMIT , 1
+.equ CONFIG_ALWAYS_ADJUST_BPM , 0
.equ CONFIG_SPATIAL_SVC , 0
.equ CONFIG_FP_MB_STATS , 0
.equ CONFIG_EMULATE_HARDWARE , 0
diff --git a/config/arm/vpx_config.c b/config/arm/vpx_config.c
index 7bc1805f6..1bc63e4f0 100644
--- a/config/arm/vpx_config.c
+++ b/config/arm/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=armv7-linux-gcc --disable-neon --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=armv7-linux-gcc --disable-neon --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/config/arm/vpx_config.h b/config/arm/vpx_config.h
index ddd914e4b..039717798 100644
--- a/config/arm/vpx_config.h
+++ b/config/arm/vpx_config.h
@@ -29,7 +29,9 @@
#define HAVE_SSE4_1 0
#define HAVE_AVX 0
#define HAVE_AVX2 0
+#define HAVE_AVX512 0
#define HAVE_VSX 0
+#define HAVE_MMI 0
#define HAVE_VPX_PORTS 1
#define HAVE_PTHREAD_H 1
#define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
#define CONFIG_BETTER_HW_COMPATIBILITY 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
#define CONFIG_SPATIAL_SVC 0
#define CONFIG_FP_MB_STATS 0
#define CONFIG_EMULATE_HARDWARE 0
diff --git a/config/arm/vpx_dsp_rtcd.h b/config/arm/vpx_dsp_rtcd.h
index 51b423f20..25ee2a9dd 100644
--- a/config/arm/vpx_dsp_rtcd.h
+++ b/config/arm/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_DSP_RTCD_H_
#define VPX_DSP_RTCD_H_
@@ -13,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#ifdef __cplusplus
@@ -28,28 +30,28 @@ unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
#define vpx_comp_avg_pred vpx_comp_avg_pred_c
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8 vpx_convolve8_c
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg vpx_convolve8_avg_c
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_horiz vpx_convolve8_horiz_c
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_vert vpx_convolve8_vert_c
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_avg vpx_convolve_avg_c
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_copy vpx_convolve_copy_c
void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -229,15 +231,843 @@ void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_16x16 vpx_hadamard_16x16_c
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_8x8 vpx_hadamard_8x8_c
void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+
+void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+
+unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c
+
+unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+
+unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+
+unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c
+
+unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c
+
+unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c
+
+unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c
+
+unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c
+
+unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c
+
+unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+
+unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+
+unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c
+
+unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c
+
+unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c
+
+unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+
+unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c
+
+void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+
+void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+
+unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c
+
+unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+
+unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+
+unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c
+
+unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c
+
+unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c
+
+unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c
+
+unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c
+
+unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c
+
+unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c
+
+unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c
+
+unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c
+
+unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c
+
+unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c
+
+unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c
+
+unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c
+
+void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c
+
+void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c
+
+unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c
+
+unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c
+
+unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c
+
+unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c
+
+unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c
+
+unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c
+
+unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c
+
+unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c
+
+unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c
+
+unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c
+
+unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c
+
+unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c
+
+unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c
+
+unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c
+
+unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c
+
+unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c
+
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p);
+#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c
+
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
+#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c
+
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
+#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
+
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8 vpx_highbd_convolve8_c
+
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c
+
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c
+
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c
+
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c
+
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c
+
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c
+
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c
+
+void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
+
+void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c
+
+void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c
+
+void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c
+
+void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_c
+
+void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_c
+
+void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_c
+
+void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_c
+
+void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c
+
+void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c
+
+void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c
+
+void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c
+
+void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c
+
+void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c
+
+void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c
+
+void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c
+
+void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_c
+
+void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_c
+
+void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_c
+
+void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_c
+
+void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c
+
+void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c
+
+void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c
+
+void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c
+
+void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_c
+
+void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_c
+
+void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_c
+
+void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_c
+
+void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_c
+
+void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_c
+
+void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_c
+
+void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_c
+
+void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_c
+
+void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_c
+
+void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_c
+
+void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_c
+
+void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_c
+
+void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_c
+
+void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_c
+
+void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_c
+
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c
+
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c
+
+void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_c
+
+void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_c
+
+void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_c
+
+void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c
+
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
+
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
+
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c
+
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_c
+
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_c
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_c
+
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_c
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_c
+
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_c
+
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_c
+
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_c
+
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_c
+
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_c
+
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_c
+
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_c
+
+void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
+
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
+
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
+
+unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c
+
+unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
+
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
+
+unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c
+
+unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
+
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
+
+unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c
+
+unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
+
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
+
+unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c
+
+unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
+
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
+
+unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c
+
+unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
+
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
+
+unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c
+
+unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
+
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
+
+unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c
+
+unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
+
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
+
+unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c
+
+unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
+
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
+
+unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c
+
+unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
+
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
+
+unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c
+
+unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
+
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
+
+unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c
+
+unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
+
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
+
+unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c
+
+unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
+
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
+
+unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c
+
+unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
+
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
+
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd);
+#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c
+
+void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_c
+
+void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_c
+
+void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_c
+
+void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_c
+
+void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_c
+
+void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_c
+
+void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_c
+
+void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c
+
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
@@ -400,15 +1230,9 @@ unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_
unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad32x32_avg vpx_sad32x32_avg_c
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
-
void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad32x32x4d vpx_sad32x32x4d_c
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad32x64 vpx_sad32x64_c
@@ -442,9 +1266,6 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad4x8x4d vpx_sad4x8x4d_c
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
-
unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad64x32 vpx_sad64x32_c
@@ -460,15 +1281,9 @@ unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_
unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad64x64_avg vpx_sad64x64_avg_c
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
-
void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad64x64x4d vpx_sad64x64x4d_c
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
-
unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x16 vpx_sad8x16_c
@@ -493,9 +1308,6 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad8x4x4d vpx_sad8x4x4d_c
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
-
unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x8 vpx_sad8x8_c
@@ -511,25 +1323,25 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x8x8 vpx_sad8x8x8_c
-int vpx_satd_c(const int16_t *coeff, int length);
+int vpx_satd_c(const tran_low_t *coeff, int length);
#define vpx_satd vpx_satd_c
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_2d vpx_scaled_2d_c
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_horiz vpx_scaled_horiz_c
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_vert vpx_scaled_vert_c
uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
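Of the multi-candidate SAD entry points, only the x4d variants survive this hunk; the x3/x8 variants are dropped for the VP9-only block sizes (32x32, 4x8, 64x64, 8x4), while vpx_sad8x8x8 above stays because VP8 still uses it. As a minimal sketch of the x4d contract (illustrative only, not the libvpx implementation): one source block is scored against four candidate reference blocks in a single call, which lets SIMD versions amortize the source loads across candidates.

#include <stdint.h>
#include <stdlib.h>

/* Sketch of what a vpx_sadNxMx4d-style function computes, for a block
 * of width w and height h; sad[k] is the SAD against ref[k]. */
static void sad_wxhx4d_sketch(int w, int h,
                              const uint8_t *src, int src_stride,
                              const uint8_t *const ref[4], int ref_stride,
                              uint32_t sad[4]) {
  for (int k = 0; k < 4; ++k) {
    uint32_t acc = 0;
    for (int r = 0; r < h; ++r)
      for (int c = 0; c < w; ++c)
        acc += abs(src[r * src_stride + c] - ref[k][r * ref_stride + c]);
    sad[k] = acc;
  }
}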
diff --git a/config/arm/vpx_scale_rtcd.h b/config/arm/vpx_scale_rtcd.h
index a1564b7ad..b37136827 100644
--- a/config/arm/vpx_scale_rtcd.h
+++ b/config/arm/vpx_scale_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_SCALE_RTCD_H_
#define VPX_SCALE_RTCD_H_
@@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
#define vpx_yv12_copy_y vpx_yv12_copy_y_c
diff --git a/config/arm/vpx_version.h b/config/arm/vpx_version.h
index 24da169b4..6078bae90 100644
--- a/config/arm/vpx_version.h
+++ b/config/arm/vpx_version.h
@@ -1,7 +1,8 @@
+// This file is generated. Do not edit.
#define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 1
+#define VERSION_MINOR 7
+#define VERSION_PATCH 0
#define VERSION_EXTRA ""
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING " v1.6.1"
+#define VERSION_STRING_NOSP "v1.7.0"
+#define VERSION_STRING " v1.7.0"
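The version bump itself is mechanical, but note how VERSION_PACKED composes: major in bits 16 and up, minor in bits 8-15, patch in the low byte, so this change moves the packed value from 0x10601 to 0x10700. A quick sanity check of the packing (the PACK macro below is illustrative, not from the tree):

#define PACK(maj, min, pat) (((maj) << 16) | ((min) << 8) | (pat))
_Static_assert(PACK(1, 6, 1) == 0x10601, "v1.6.1");
_Static_assert(PACK(1, 7, 0) == 0x10700, "v1.7.0");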
diff --git a/config/arm64/vp8_rtcd.h b/config/arm64/vp8_rtcd.h
index 3f112f6f7..4eb59c663 100644
--- a/config/arm64/vp8_rtcd.h
+++ b/config/arm64/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP8_RTCD_H_
#define VP8_RTCD_H_
diff --git a/config/arm64/vp9_rtcd.h b/config/arm64/vp9_rtcd.h
index 1df16205a..0f4f04d1f 100644
--- a/config/arm64/vp9_rtcd.h
+++ b/config/arm64/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP9_RTCD_H_
#define VP9_RTCD_H_
@@ -33,9 +34,8 @@ extern "C" {
int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_c
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-#define vp9_block_error_fp vp9_block_error_fp_neon
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+#define vp9_block_error_fp vp9_block_error_fp_c
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
#define vp9_diamond_search_sad vp9_diamond_search_sad_c
@@ -53,35 +53,62 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
#define vp9_fht8x8 vp9_fht8x8_c
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_fwht4x4 vp9_fwht4x4_c
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_c
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
+
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp vp9_quantize_fp_neon
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_neon
void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
-#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
+void vp9_scale_and_extend_frame_neon(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_neon
void vp9_rtcd(void);
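The shape of these generated headers follows from --disable-runtime-cpu-detect in the build flags: rtcd.pl binds each public symbol to exactly one implementation with a plain #define, so picking up a new NEON kernel (as vp9_quantize_fp_32x32 and vp9_scale_and_extend_frame do here) is a one-line retarget with no run-time cost. A schematic of the pattern, with placeholder names:

/* Placeholder names; the real headers are generated by rtcd.pl. */
void my_kernel_c(int n);
void my_kernel_neon(int n);
#define my_kernel my_kernel_neon /* static dispatch: no CPU check */
/* With runtime CPU detection enabled, the generated header would
 * instead declare a function pointer that is assigned to the best
 * available implementation when vp9_rtcd() runs. */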
diff --git a/config/arm64/vpx_config.asm b/config/arm64/vpx_config.asm
index 7c69642cc..e3e1529bc 100644
--- a/config/arm64/vpx_config.asm
+++ b/config/arm64/vpx_config.asm
@@ -20,7 +20,9 @@
.equ HAVE_SSE4_1 , 0
.equ HAVE_AVX , 0
.equ HAVE_AVX2 , 0
+.equ HAVE_AVX512 , 0
.equ HAVE_VSX , 0
+.equ HAVE_MMI , 0
.equ HAVE_VPX_PORTS , 1
.equ HAVE_PTHREAD_H , 1
.equ HAVE_UNISTD_H , 1
@@ -74,10 +76,11 @@
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_VP9_TEMPORAL_DENOISING , 0
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
-.equ CONFIG_VP9_HIGHBITDEPTH , 0
+.equ CONFIG_VP9_HIGHBITDEPTH , 1
.equ CONFIG_BETTER_HW_COMPATIBILITY , 0
.equ CONFIG_EXPERIMENTAL , 0
.equ CONFIG_SIZE_LIMIT , 1
+.equ CONFIG_ALWAYS_ADJUST_BPM , 0
.equ CONFIG_SPATIAL_SVC , 0
.equ CONFIG_FP_MB_STATS , 0
.equ CONFIG_EMULATE_HARDWARE , 0
diff --git a/config/arm64/vpx_config.c b/config/arm64/vpx_config.c
index ff9121723..13490c81c 100644
--- a/config/arm64/vpx_config.c
+++ b/config/arm64/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--force-target=armv8-linux-gcc --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--force-target=armv8-linux-gcc --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
const char *vpx_codec_build_config(void) {return cfg;}
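This string is not just build metadata; it is returned verbatim by vpx_codec_build_config(), declared in vpx/vpx_codec.h, so a consumer can confirm at run time that it linked against a build configured with --enable-vp9-highbitdepth. A minimal check:

#include <stdio.h>
#include <string.h>
#include "vpx/vpx_codec.h"

int main(void) {
  const char *cfg = vpx_codec_build_config();
  printf("libvpx configured with: %s\n", cfg);
  /* Exit 0 only if the high-bit-depth flag is baked in. */
  return strstr(cfg, "--enable-vp9-highbitdepth") ? 0 : 1;
}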
diff --git a/config/arm64/vpx_config.h b/config/arm64/vpx_config.h
index f1acc55cc..5304f8a57 100644
--- a/config/arm64/vpx_config.h
+++ b/config/arm64/vpx_config.h
@@ -29,7 +29,9 @@
#define HAVE_SSE4_1 0
#define HAVE_AVX 0
#define HAVE_AVX2 0
+#define HAVE_AVX512 0
#define HAVE_VSX 0
+#define HAVE_MMI 0
#define HAVE_VPX_PORTS 1
#define HAVE_PTHREAD_H 1
#define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
#define CONFIG_BETTER_HW_COMPATIBILITY 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
#define CONFIG_SPATIAL_SVC 0
#define CONFIG_FP_MB_STATS 0
#define CONFIG_EMULATE_HARDWARE 0
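Flipping CONFIG_VP9_HIGHBITDEPTH to 1 is the root cause of most of the prototype churn in this patch: tran_low_t, the coefficient type used throughout the DSP code, widens from 16 to 32 bits, so entry points such as vpx_satd, vpx_hadamard_*, and vp9_block_error_fp can no longer take raw int16_t coefficient pointers. Paraphrasing the definition from vpx_dsp/vpx_dsp_common.h:

#if CONFIG_VP9_HIGHBITDEPTH
typedef int32_t tran_low_t; /* headroom for 10/12-bit intermediates */
#else
typedef int16_t tran_low_t;
#endif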
diff --git a/config/arm64/vpx_dsp_rtcd.h b/config/arm64/vpx_dsp_rtcd.h
index a915afabf..d911fd37f 100644
--- a/config/arm64/vpx_dsp_rtcd.h
+++ b/config/arm64/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_DSP_RTCD_H_
#define VPX_DSP_RTCD_H_
@@ -13,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#ifdef __cplusplus
@@ -28,38 +30,39 @@ unsigned int vpx_avg_8x8_neon(const uint8_t *, int p);
#define vpx_avg_8x8 vpx_avg_8x8_neon
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
-#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+void vpx_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_neon
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8 vpx_convolve8_neon
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg vpx_convolve8_avg_neon
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_neon
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_neon
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_horiz vpx_convolve8_horiz_neon
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_vert vpx_convolve8_vert_neon
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_avg vpx_convolve_avg_neon
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_copy vpx_convolve_copy_neon
void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
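All of the convolve prototypes above change shape the same way: the pair of per-axis int16_t filter pointers becomes a single InterpKernel table plus explicit starting phases (x0_q4, y0_q4), which is why this header now includes vpx_dsp/vpx_filter.h. Paraphrasing that header, an InterpKernel is one 8-tap filter row, and the table holds one row per 1/16-pel phase:

#define SUBPEL_TAPS 8
typedef int16_t InterpKernel[SUBPEL_TAPS]; /* paraphrased sketch */
/* Table shape: const InterpKernel filter[16]; one kernel per phase. */

Under the old signature the caller pre-selected a single phase per axis (effectively filter_x = filter[x0_q4]); under the new one the function receives the whole table and the starting phases, so it can step through phases itself when x_step_q4 or y_step_q4 differs from 16, i.e. when scaling.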
@@ -213,26 +216,32 @@ void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8
#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_neon
void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct16x16 vpx_fdct16x16_c
+void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16 vpx_fdct16x16_neon
void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct16x16_1 vpx_fdct16x16_1_c
+void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_neon
void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct32x32 vpx_fdct32x32_c
+void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32 vpx_fdct32x32_neon
void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct32x32_1 vpx_fdct32x32_1_c
+void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_neon
void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
+void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_rd vpx_fdct32x32_rd_neon
void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);
#define vpx_fdct4x4 vpx_fdct4x4_neon
void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
+void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4_1 vpx_fdct4x4_1_neon
void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride);
@@ -273,17 +282,915 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_neon
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_16x16 vpx_hadamard_16x16_neon
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_8x8 vpx_hadamard_8x8_neon
void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+
+void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+
+unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c
+
+unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+
+unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+
+unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c
+
+unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c
+
+unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c
+
+unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c
+
+unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c
+
+unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c
+
+unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+
+unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+
+unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c
+
+unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c
+
+unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c
+
+unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+
+unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c
+
+void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+
+void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+
+unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c
+
+unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+
+unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+
+unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c
+
+unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c
+
+unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c
+
+unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c
+
+unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c
+
+unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c
+
+unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c
+
+unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c
+
+unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c
+
+unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c
+
+unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c
+
+unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c
+
+unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c
+
+void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c
+
+void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c
+
+unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c
+
+unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c
+
+unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c
+
+unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c
+
+unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c
+
+unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c
+
+unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c
+
+unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c
+
+unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c
+
+unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c
+
+unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c
+
+unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c
+
+unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c
+
+unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c
+
+unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c
+
+unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c
+
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p);
+#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c
+
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
+#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c
+
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
+#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
+
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8 vpx_highbd_convolve8_neon
+
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_neon
+
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_neon
+
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_neon
+
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_neon
+
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_neon
+
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_neon
+
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_neon
+
+void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
+
+void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c
+
+void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c
+
+void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c
+
+void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_neon
+
+void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_neon
+
+void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_neon
+
+void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_neon
+
+void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c
+
+void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c
+
+void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c
+
+void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c
+
+void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c
+
+void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c
+
+void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c
+
+void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c
+
+void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_neon
+
+void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_neon
+
+void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_neon
+
+void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_neon
+
+void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c
+
+void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c
+
+void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c
+
+void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c
+
+void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_neon
+
+void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_neon
+
+void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_neon
+
+void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_neon
+
+void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_neon
+
+void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_neon
+
+void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_neon
+
+void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_neon
+
+void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_neon
+
+void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_neon
+
+void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_neon
+
+void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_neon
+
+void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_neon
+
+void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_neon
+
+void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_neon
+
+void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_neon
+
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c
+
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8_1 vpx_fdct8x8_1_neon
+
+void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_neon
+
+void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_neon
+
+void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_neon
+
+void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_neon
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_neon
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_neon
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_neon
+
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_neon
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_neon
+
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_neon
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_neon
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_neon
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_neon
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_neon
+
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_neon
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_neon
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_neon
+
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_16_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_neon
+
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_16_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_neon
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_neon
+
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_horizontal_4_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_neon
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_neon
+
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_horizontal_8_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_neon
+
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_16_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_neon
+
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_16_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_neon
+
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_neon
+
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_vertical_4_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_neon
+
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_neon
+
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_vertical_8_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_neon
+
+void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
+
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
+
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
+
+unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c
+
+unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
+
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
+
+unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c
+
+unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
+
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
+
+unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c
+
+unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
+
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
+
+unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c
+
+unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
+
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
+
+unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c
+
+unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
+
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
+
+unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c
+
+unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
+
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
+
+unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c
+
+unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
+
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
+
+unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c
+
+unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
+
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
+
+unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c
+
+unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
+
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
+
+unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c
+
+unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
+
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
+
+unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c
+
+unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
+
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
+
+unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c
+
+unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
+
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
+
+unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c
+
+unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
+
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
+
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd);
+#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c
+
+void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_neon
+
+void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_neon
+
+void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_neon
+
+void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_neon
+
+void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_neon
+
+void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_neon
+
+void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_neon
+
+void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_neon
+
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon
@@ -416,17 +1323,20 @@ unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint
#define vpx_mse8x8 vpx_mse8x8_c
void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_quantize_b vpx_quantize_b_c
+void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b vpx_quantize_b_neon
void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_neon
unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad16x16 vpx_sad16x16_neon
unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad16x16_avg vpx_sad16x16_avg_c
+unsigned int vpx_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x16_avg vpx_sad16x16_avg_neon
void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad16x16x3 vpx_sad16x16x3_c
@@ -439,223 +1349,247 @@ void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref
#define vpx_sad16x16x8 vpx_sad16x16x8_c
unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad16x32 vpx_sad16x32_c
+unsigned int vpx_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x32 vpx_sad16x32_neon
unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad16x32_avg vpx_sad16x32_avg_c
+unsigned int vpx_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x32_avg vpx_sad16x32_avg_neon
void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x32x4d vpx_sad16x32x4d_c
+void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x32x4d vpx_sad16x32x4d_neon
unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad16x8 vpx_sad16x8_neon
unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad16x8_avg vpx_sad16x8_avg_c
+unsigned int vpx_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x8_avg vpx_sad16x8_avg_neon
void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad16x8x3 vpx_sad16x8x3_c
void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x8x4d vpx_sad16x8x4d_c
+void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x4d vpx_sad16x8x4d_neon
void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad16x8x8 vpx_sad16x8x8_c
unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad32x16 vpx_sad32x16_c
+unsigned int vpx_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x16 vpx_sad32x16_neon
unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad32x16_avg vpx_sad32x16_avg_c
+unsigned int vpx_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x16_avg vpx_sad32x16_avg_neon
void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x16x4d vpx_sad32x16x4d_c
+void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x16x4d vpx_sad32x16x4d_neon
unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad32x32 vpx_sad32x32_neon
unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad32x32_avg vpx_sad32x32_avg_c
-
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
+unsigned int vpx_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x32_avg vpx_sad32x32_avg_neon
void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad32x32x4d vpx_sad32x32x4d_neon
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad32x64 vpx_sad32x64_c
+unsigned int vpx_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x64 vpx_sad32x64_neon
unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad32x64_avg vpx_sad32x64_avg_c
+unsigned int vpx_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x64_avg vpx_sad32x64_avg_neon
void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x64x4d vpx_sad32x64x4d_c
+void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x64x4d vpx_sad32x64x4d_neon
unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad4x4 vpx_sad4x4_neon
unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad4x4_avg vpx_sad4x4_avg_c
+unsigned int vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_neon
void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad4x4x3 vpx_sad4x4x3_c
void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x4x4d vpx_sad4x4x4d_c
+void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_neon
void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad4x4x8 vpx_sad4x4x8_c
unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad4x8 vpx_sad4x8_c
+unsigned int vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_neon
unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad4x8_avg vpx_sad4x8_avg_c
+unsigned int vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_neon
void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x4d vpx_sad4x8x4d_c
-
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
+void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_neon
unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad64x32 vpx_sad64x32_c
+unsigned int vpx_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x32 vpx_sad64x32_neon
unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad64x32_avg vpx_sad64x32_avg_c
+unsigned int vpx_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x32_avg vpx_sad64x32_avg_neon
void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x32x4d vpx_sad64x32x4d_c
+void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x32x4d vpx_sad64x32x4d_neon
unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad64x64 vpx_sad64x64_neon
unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad64x64_avg vpx_sad64x64_avg_c
-
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
+unsigned int vpx_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x64_avg vpx_sad64x64_avg_neon
void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad64x64x4d vpx_sad64x64x4d_neon
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
-
unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x16 vpx_sad8x16_neon
unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad8x16_avg vpx_sad8x16_avg_c
+unsigned int vpx_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_neon
void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x16x3 vpx_sad8x16x3_c
void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x16x4d vpx_sad8x16x4d_c
+void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_neon
void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x16x8 vpx_sad8x16x8_c
unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad8x4 vpx_sad8x4_c
+unsigned int vpx_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x4 vpx_sad8x4_neon
unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad8x4_avg vpx_sad8x4_avg_c
+unsigned int vpx_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_neon
void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x4d vpx_sad8x4x4d_c
-
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
+void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_neon
unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x8 vpx_sad8x8_neon
unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad8x8_avg vpx_sad8x8_avg_c
+unsigned int vpx_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_neon
void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x8x3 vpx_sad8x8x3_c
void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x8x4d vpx_sad8x8x4d_c
+void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x4d vpx_sad8x8x4d_neon
void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x8x8 vpx_sad8x8x8_c
-int vpx_satd_c(const int16_t *coeff, int length);
-int vpx_satd_neon(const int16_t *coeff, int length);
+int vpx_satd_c(const tran_low_t *coeff, int length);
+int vpx_satd_neon(const tran_low_t *coeff, int length);
#define vpx_satd vpx_satd_neon
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vpx_scaled_2d vpx_scaled_2d_c
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_neon
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_horiz vpx_scaled_horiz_c
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_vert vpx_scaled_vert_c
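
The convolve and scale prototypes here swap the old filter_x/filter_y pointer pairs for a single InterpKernel table plus explicit start phases (x0_q4, y0_q4). InterpKernel is typedef int16_t InterpKernel[8] in vpx_dsp/vpx_filter.h: one 8-tap row per 1/16-pel phase. A sketch of how a horizontal pass selects taps and source position per output column under that convention (pick_taps is an illustrative helper, not a libvpx function):

#include <stdint.h>

typedef int16_t InterpKernel[8];

#define SUBPEL_BITS 4
#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)

/* For output column x: the integer part of the q4 position picks the
   source pixel, the fractional part picks which 8-tap kernel to apply. */
static const int16_t *pick_taps(const InterpKernel *filter, int x0_q4,
                                int x_step_q4, int x, int *src_x) {
  const int pos_q4 = x0_q4 + x * x_step_q4; /* position in 1/16-pel units */
  *src_x = pos_q4 >> SUBPEL_BITS;
  return filter[pos_q4 & SUBPEL_MASK];
}
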
uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c
+uint32_t vpx_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_neon
uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c
+uint32_t vpx_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_neon
uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c
+uint32_t vpx_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_neon
uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c
+uint32_t vpx_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_neon
uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c
+uint32_t vpx_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_neon
uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c
+uint32_t vpx_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_neon
uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c
+uint32_t vpx_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_neon
uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c
+uint32_t vpx_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_neon
uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c
+uint32_t vpx_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_neon
uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c
+uint32_t vpx_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_neon
uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c
+uint32_t vpx_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_neon
uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c
+uint32_t vpx_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_neon
uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c
+uint32_t vpx_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_neon
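
In the vpx_sub_pixel_avg_variance* family, xoffset and yoffset are 1/8-pel positions: the reference block is bilinearly filtered to that subpel point, second_pred is averaged into the filtered prediction, and the variance of the result against the source is returned. The bilinear weighting is equivalent to the following sketch (upstream scales the same weights to 1/128 units):

#include <stdint.h>

/* offset in 0..7 eighths: weights (8 - offset, offset), rounded. */
static uint8_t bilerp_sketch(uint8_t a, uint8_t b, int offset) {
  return (uint8_t)((a * (8 - offset) + b * offset + 4) >> 3);
}
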
uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -682,10 +1616,12 @@ uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_str
#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_neon
uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
+uint32_t vpx_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_neon
uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
+uint32_t vpx_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_neon
uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
diff --git a/config/arm64/vpx_scale_rtcd.h b/config/arm64/vpx_scale_rtcd.h
index a1564b7ad..b37136827 100644
--- a/config/arm64/vpx_scale_rtcd.h
+++ b/config/arm64/vpx_scale_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_SCALE_RTCD_H_
#define VPX_SCALE_RTCD_H_
@@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
#define vpx_yv12_copy_y vpx_yv12_copy_y_c
diff --git a/config/arm64/vpx_version.h b/config/arm64/vpx_version.h
index 24da169b4..6078bae90 100644
--- a/config/arm64/vpx_version.h
+++ b/config/arm64/vpx_version.h
@@ -1,7 +1,8 @@
+// This file is generated. Do not edit.
#define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 1
+#define VERSION_MINOR 7
+#define VERSION_PATCH 0
#define VERSION_EXTRA ""
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING " v1.6.1"
+#define VERSION_STRING_NOSP "v1.7.0"
+#define VERSION_STRING " v1.7.0"
diff --git a/config/generic/vp8_rtcd.h b/config/generic/vp8_rtcd.h
index 1e0ff8a7e..bc3ebe8a1 100644
--- a/config/generic/vp8_rtcd.h
+++ b/config/generic/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP8_RTCD_H_
#define VP8_RTCD_H_
diff --git a/config/generic/vp9_rtcd.h b/config/generic/vp9_rtcd.h
index 7d0a9e2ba..45a371c2f 100644
--- a/config/generic/vp9_rtcd.h
+++ b/config/generic/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP9_RTCD_H_
#define VP9_RTCD_H_
@@ -33,7 +34,7 @@ extern "C" {
int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_c
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
#define vp9_block_error_fp vp9_block_error_fp_c
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
@@ -51,12 +52,42 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
#define vp9_fht8x8 vp9_fht8x8_c
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_fwht4x4 vp9_fwht4x4_c
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_c
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
+
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
@@ -75,9 +106,6 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
void vp9_rtcd(void);
#include "vpx_config.h"
diff --git a/config/generic/vpx_config.asm b/config/generic/vpx_config.asm
index 5d3ae3dfc..6b173b661 100644
--- a/config/generic/vpx_config.asm
+++ b/config/generic/vpx_config.asm
@@ -20,7 +20,9 @@
.equ HAVE_SSE4_1 , 0
.equ HAVE_AVX , 0
.equ HAVE_AVX2 , 0
+.equ HAVE_AVX512 , 0
.equ HAVE_VSX , 0
+.equ HAVE_MMI , 0
.equ HAVE_VPX_PORTS , 1
.equ HAVE_PTHREAD_H , 1
.equ HAVE_UNISTD_H , 1
@@ -74,10 +76,11 @@
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_VP9_TEMPORAL_DENOISING , 0
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
-.equ CONFIG_VP9_HIGHBITDEPTH , 0
+.equ CONFIG_VP9_HIGHBITDEPTH , 1
.equ CONFIG_BETTER_HW_COMPATIBILITY , 0
.equ CONFIG_EXPERIMENTAL , 0
.equ CONFIG_SIZE_LIMIT , 1
+.equ CONFIG_ALWAYS_ADJUST_BPM , 0
.equ CONFIG_SPATIAL_SVC , 0
.equ CONFIG_FP_MB_STATS , 0
.equ CONFIG_EMULATE_HARDWARE , 0
diff --git a/config/generic/vpx_config.c b/config/generic/vpx_config.c
index c6d3e14c5..70fcdf7e3 100644
--- a/config/generic/vpx_config.c
+++ b/config/generic/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=generic-gnu --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=generic-gnu --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
const char *vpx_codec_build_config(void) {return cfg;}
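
vpx_codec_build_config() returns this string verbatim, which gives application code an easy runtime check that it linked a build configured with --enable-vp9-highbitdepth. A small sketch, assuming the program is compiled against the libvpx headers:

#include <stdio.h>
#include <string.h>
#include "vpx/vpx_codec.h"

int main(void) {
  const char *cfg = vpx_codec_build_config();
  printf("libvpx build config: %s\n", cfg);
  return strstr(cfg, "--enable-vp9-highbitdepth") != NULL ? 0 : 1;
}
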
diff --git a/config/generic/vpx_config.h b/config/generic/vpx_config.h
index 63c75e4f9..dc96f8743 100644
--- a/config/generic/vpx_config.h
+++ b/config/generic/vpx_config.h
@@ -29,7 +29,9 @@
#define HAVE_SSE4_1 0
#define HAVE_AVX 0
#define HAVE_AVX2 0
+#define HAVE_AVX512 0
#define HAVE_VSX 0
+#define HAVE_MMI 0
#define HAVE_VPX_PORTS 1
#define HAVE_PTHREAD_H 1
#define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
#define CONFIG_BETTER_HW_COMPATIBILITY 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
#define CONFIG_SPATIAL_SVC 0
#define CONFIG_FP_MB_STATS 0
#define CONFIG_EMULATE_HARDWARE 0
diff --git a/config/generic/vpx_dsp_rtcd.h b/config/generic/vpx_dsp_rtcd.h
index ae0cea137..be38303bf 100644
--- a/config/generic/vpx_dsp_rtcd.h
+++ b/config/generic/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_DSP_RTCD_H_
#define VPX_DSP_RTCD_H_
@@ -13,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#ifdef __cplusplus
@@ -28,28 +30,28 @@ unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
#define vpx_comp_avg_pred vpx_comp_avg_pred_c
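
vpx_comp_avg_pred_c builds the compound prediction as the rounded per-pixel mean of pred and ref; note that comp_pred and pred are packed at width while only ref carries a frame stride. Behaviorally it amounts to this sketch (not the upstream source):

#include <stdint.h>

static void comp_avg_pred_sketch(uint8_t *comp_pred, const uint8_t *pred,
                                 int width, int height, const uint8_t *ref,
                                 int ref_stride) {
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j)
      comp_pred[j] = (uint8_t)((pred[j] + ref[j] + 1) >> 1); /* round half up */
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
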
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8 vpx_convolve8_c
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg vpx_convolve8_avg_c
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_horiz vpx_convolve8_horiz_c
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_vert vpx_convolve8_vert_c
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_avg vpx_convolve_avg_c
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_copy vpx_convolve_copy_c
void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -229,15 +231,843 @@ void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_16x16 vpx_hadamard_16x16_c
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_8x8 vpx_hadamard_8x8_c
void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
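
Note the uint8_t * parameters throughout the highbd entry points that follow: high-bit-depth planes actually hold uint16_t samples, but the pointers are disguised as uint8_t * so one rtcd signature serves both depths. The round trip uses conversion macros following this pattern from vpx_dsp/vpx_dsp_common.h:

/* uint16_t buffer <-> disguised uint8_t pointer (the shift encodes the cast). */
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
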
+
+void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+
+unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c
+
+unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+
+unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+
+unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c
+
+unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c
+
+unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c
+
+unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c
+
+unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c
+
+unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c
+
+unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+
+unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+
+unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c
+
+unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c
+
+unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c
+
+unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+
+unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c
+
+void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+
+void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+
+unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c
+
+unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+
+unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+
+unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c
+
+unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c
+
+unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c
+
+unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c
+
+unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c
+
+unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c
+
+unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c
+
+unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c
+
+unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c
+
+unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c
+
+unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c
+
+unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c
+
+unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c
+
+void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c
+
+void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c
+
+unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c
+
+unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c
+
+unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c
+
+unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c
+
+unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c
+
+unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c
+
+unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c
+
+unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c
+
+unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c
+
+unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c
+
+unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c
+
+unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c
+
+unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c
+
+unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c
+
+unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c
+
+unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c
+
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p);
+#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c
+
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
+#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c
+
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
+#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
+
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8 vpx_highbd_convolve8_c
+
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c
+
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c
+
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c
+
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c
+
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c
+
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c
+
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c
+
+void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
+
+void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c
+
+void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c
+
+void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c
+
+void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_c
+
+void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_c
+
+void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_c
+
+void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_c
+
+void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c
+
+void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c
+
+void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c
+
+void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c
+
+void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c
+
+void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c
+
+void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c
+
+void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c
+
+void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_c
+
+void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_c
+
+void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_c
+
+void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_c
+
+void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c
+
+void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c
+
+void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c
+
+void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c
+
+void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_c
+
+void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_c
+
+void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_c
+
+void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_c
+
+void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_c
+
+void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_c
+
+void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_c
+
+void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_c
+
+void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_c
+
+void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_c
+
+void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_c
+
+void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_c
+
+void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_c
+
+void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_c
+
+void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_c
+
+void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_c
+
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c
+
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c
+
+void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_c
+
+void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_c
+
+void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_c
+
+void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c
+
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
+
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
+
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c
+
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_c
+
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_c
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_c
+
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_c
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_c
+
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_c
+
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_c
+
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_c
+
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_c
+
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_c
+
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_c
+
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_c
+
+void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
+
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
+
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
+
+unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c
+
+unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
+
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
+
+unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c
+
+unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
+
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
+
+unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c
+
+unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
+
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
+
+unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c
+
+unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
+
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
+
+unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c
+
+unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
+
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
+
+unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c
+
+unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
+
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
+
+unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c
+
+unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
+
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
+
+unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c
+
+unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
+
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
+
+unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c
+
+unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
+
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
+
+unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c
+
+unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
+
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
+
+unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c
+
+unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
+
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
+
+unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c
+
+unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
+
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
+
+unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c
+
+unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
+
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
+
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd);
+#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c
+
+void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_c
+
+void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_c
+
+void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_c
+
+void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_c
+
+void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_c
+
+void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_c
+
+void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_c
+
+void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c
+
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
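
All of the vpx_highbd_* additions above follow the same generated-header pattern: declare the C reference implementation with a _c suffix, then bind the public name to it with a #define. Since these Android targets are built with --disable-runtime-cpu-detect, the macro is the entire dispatch mechanism; on a target with an optimized variant, the same macro would point at the NEON/MSA/SSE symbol instead. A minimal standalone sketch of the pattern (hypothetical demo_sad4x4 names, not part of libvpx):

    #include <stdint.h>
    #include <stdio.h>

    /* Reference implementation, analogous to the _c declarations above. */
    static unsigned int demo_sad4x4_c(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride) {
      unsigned int sad = 0;
      for (int r = 0; r < 4; ++r)
        for (int c = 0; c < 4; ++c) {
          int d = src[r * src_stride + c] - ref[r * ref_stride + c];
          sad += d < 0 ? -d : d;
        }
      return sad;
    }

    /* With runtime CPU detection disabled, the public name is just a macro. */
    #define demo_sad4x4 demo_sad4x4_c

    int main(void) {
      const uint8_t src[16] = {0}, ref[16] = {4, 4};
      printf("%u\n", demo_sad4x4(src, 4, ref, 4)); /* prints 8 */
      return 0;
    }
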
@@ -400,15 +1230,9 @@ unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_
unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad32x32_avg vpx_sad32x32_avg_c
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
-
void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad32x32x4d vpx_sad32x32x4d_c
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad32x64 vpx_sad32x64_c
@@ -442,9 +1266,6 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad4x8x4d vpx_sad4x8x4d_c
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
-
unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad64x32 vpx_sad64x32_c
@@ -460,15 +1281,9 @@ unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_
unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad64x64_avg vpx_sad64x64_avg_c
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
-
void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad64x64x4d vpx_sad64x64x4d_c
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
-
unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x16 vpx_sad8x16_c
@@ -493,9 +1308,6 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad8x4x4d vpx_sad8x4x4d_c
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
-
unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x8 vpx_sad8x8_c
@@ -511,25 +1323,25 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x8x8 vpx_sad8x8x8_c
-int vpx_satd_c(const int16_t *coeff, int length);
+int vpx_satd_c(const tran_low_t *coeff, int length);
#define vpx_satd vpx_satd_c
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_2d vpx_scaled_2d_c
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_horiz vpx_scaled_horiz_c
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_vert vpx_scaled_vert_c
uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
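
The signature churn in the vpx_scaled_* and vpx_convolve* prototypes above replaces the old per-axis const int16_t *filter_x / *filter_y pointers with a single const InterpKernel *filter table plus explicit starting phases x0_q4 / y0_q4. Assuming InterpKernel is the 8-tap int16_t row type from vpx_dsp/vpx_filter.h, a sketch of how a horizontal pass might walk the table (illustrative only, not the library's implementation):

    #include <stdint.h>

    typedef int16_t InterpKernel[8];  /* 8-tap subpel filter row (assumption) */

    /* One output row of an 8-tap horizontal filter. x_q4 carries the subpel
       position in 1/16-pel units: the high bits select the source pixel and
       the low 4 bits select one of the 16 kernel rows. x_step_q4 == 16 means
       no scaling; other values resample. */
    static void demo_convolve_horiz_row(const uint8_t *src, uint8_t *dst,
                                        const InterpKernel *filter,
                                        int x0_q4, int x_step_q4, int w) {
      int x_q4 = x0_q4;
      for (int x = 0; x < w; ++x) {
        const uint8_t *s = &src[x_q4 >> 4];     /* integer pixel position */
        const int16_t *k = filter[x_q4 & 0xf];  /* subpel kernel row */
        int sum = 0;
        for (int t = 0; t < 8; ++t) sum += s[t] * k[t];
        sum = (sum + 64) >> 7;                  /* round at 7-bit filter scale */
        dst[x] = (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
        x_q4 += x_step_q4;
      }
    }
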
diff --git a/config/generic/vpx_scale_rtcd.h b/config/generic/vpx_scale_rtcd.h
index f419cc7a5..d12f52764 100644
--- a/config/generic/vpx_scale_rtcd.h
+++ b/config/generic/vpx_scale_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_SCALE_RTCD_H_
#define VPX_SCALE_RTCD_H_
@@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
#define vpx_yv12_copy_y vpx_yv12_copy_y_c
diff --git a/config/generic/vpx_version.h b/config/generic/vpx_version.h
index 24da169b4..6078bae90 100644
--- a/config/generic/vpx_version.h
+++ b/config/generic/vpx_version.h
@@ -1,7 +1,8 @@
+// This file is generated. Do not edit.
#define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 1
+#define VERSION_MINOR 7
+#define VERSION_PATCH 0
#define VERSION_EXTRA ""
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING " v1.6.1"
+#define VERSION_STRING_NOSP "v1.7.0"
+#define VERSION_STRING " v1.7.0"
diff --git a/config/mips32-dspr2/vp8_rtcd.h b/config/mips32-dspr2/vp8_rtcd.h
index a940d3594..e24387399 100644
--- a/config/mips32-dspr2/vp8_rtcd.h
+++ b/config/mips32-dspr2/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP8_RTCD_H_
#define VP8_RTCD_H_
diff --git a/config/mips32-dspr2/vp9_rtcd.h b/config/mips32-dspr2/vp9_rtcd.h
index 2e161c181..91d3a1aab 100644
--- a/config/mips32-dspr2/vp9_rtcd.h
+++ b/config/mips32-dspr2/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP9_RTCD_H_
#define VP9_RTCD_H_
@@ -33,7 +34,7 @@ extern "C" {
int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_c
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
#define vp9_block_error_fp vp9_block_error_fp_c
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
@@ -51,23 +52,50 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
#define vp9_fht8x8 vp9_fht8x8_c
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_fwht4x4 vp9_fwht4x4_c
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_c
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
+
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_dspr2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-#define vp9_iht16x16_256_add vp9_iht16x16_256_add_dspr2
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht4x4_16_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht4x4_16_add vp9_iht4x4_16_add_dspr2
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht8x8_64_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht8x8_64_add vp9_iht8x8_64_add_dspr2
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp vp9_quantize_fp_c
@@ -78,9 +106,6 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
void vp9_rtcd(void);
#include "vpx_config.h"
diff --git a/config/mips32-dspr2/vpx_config.c b/config/mips32-dspr2/vpx_config.c
index 1aa002457..0471682f6 100644
--- a/config/mips32-dspr2/vpx_config.c
+++ b/config/mips32-dspr2/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=mips32-linux-gcc --enable-dspr2 --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=mips32-linux-gcc --enable-dspr2 --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/config/mips32-dspr2/vpx_config.h b/config/mips32-dspr2/vpx_config.h
index 6df484f60..9bdc19616 100644
--- a/config/mips32-dspr2/vpx_config.h
+++ b/config/mips32-dspr2/vpx_config.h
@@ -29,7 +29,9 @@
#define HAVE_SSE4_1 0
#define HAVE_AVX 0
#define HAVE_AVX2 0
+#define HAVE_AVX512 0
#define HAVE_VSX 0
+#define HAVE_MMI 0
#define HAVE_VPX_PORTS 1
#define HAVE_PTHREAD_H 1
#define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
#define CONFIG_BETTER_HW_COMPATIBILITY 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
#define CONFIG_SPATIAL_SVC 0
#define CONFIG_FP_MB_STATS 0
#define CONFIG_EMULATE_HARDWARE 0
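
Flipping CONFIG_VP9_HIGHBITDEPTH to 1 (driven by the new --enable-vp9-highbitdepth configure flag above) is what pulls the vpx_highbd_* prototypes into the rtcd headers, and it also explains the int16_t to tran_low_t changes in signatures such as vpx_satd and vpx_hadamard_*: transform coefficients overflow 16 bits once 10/12-bit input is possible. A sketch of the conditional typedef, assuming it mirrors vpx_dsp/vpx_dsp_common.h:

    #include <stdint.h>
    #include "vpx_config.h"

    /* Coefficients fit in 16 bits for 8-bit video, but 10/12-bit pixels push
       intermediate transform values past INT16_MAX, so high-bitdepth builds
       widen the coefficient type (assumed layout, per vpx_dsp_common.h). */
    #if CONFIG_VP9_HIGHBITDEPTH
    typedef int32_t tran_low_t;
    #else
    typedef int16_t tran_low_t;
    #endif
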
diff --git a/config/mips32-dspr2/vpx_dsp_rtcd.h b/config/mips32-dspr2/vpx_dsp_rtcd.h
index cdb0cfc6e..bd4acd0ff 100644
--- a/config/mips32-dspr2/vpx_dsp_rtcd.h
+++ b/config/mips32-dspr2/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_DSP_RTCD_H_
#define VPX_DSP_RTCD_H_
@@ -13,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#ifdef __cplusplus
@@ -28,36 +30,36 @@ unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
#define vpx_comp_avg_pred vpx_comp_avg_pred_c
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8 vpx_convolve8_dspr2
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg vpx_convolve8_avg_dspr2
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_dspr2
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_dspr2
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_horiz vpx_convolve8_horiz_dspr2
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_vert vpx_convolve8_vert_dspr2
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_avg vpx_convolve_avg_dspr2
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_copy vpx_convolve_copy_dspr2
void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -243,66 +245,881 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_dspr2
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_16x16 vpx_hadamard_16x16_c
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_8x8 vpx_hadamard_8x8_c
void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+
+void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+
+unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c
+
+unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+
+unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+
+unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c
+
+unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c
+
+unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c
+
+unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c
+
+unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c
+
+unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c
+
+unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+
+unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+
+unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c
+
+unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c
+
+unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c
+
+unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+
+unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c
+
+void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+
+void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+
+unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c
+
+unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+
+unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+
+unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c
+
+unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c
+
+unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c
+
+unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c
+
+unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c
+
+unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c
+
+unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c
+
+unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c
+
+unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c
+
+unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c
+
+unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c
+
+unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c
+
+unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c
+
+void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c
+
+void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c
+
+unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c
+
+unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c
+
+unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c
+
+unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c
+
+unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c
+
+unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c
+
+unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c
+
+unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c
+
+unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c
+
+unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c
+
+unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c
+
+unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c
+
+unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c
+
+unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c
+
+unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c
+
+unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c
+
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p);
+#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c
+
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
+#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c
+
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
+#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
+
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8 vpx_highbd_convolve8_c
+
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c
+
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c
+
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c
+
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c
+
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c
+
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c
+
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c
+
+void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
+
+void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c
+
+void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c
+
+void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c
+
+void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_c
+
+void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_c
+
+void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_c
+
+void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_c
+
+void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c
+
+void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c
+
+void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c
+
+void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c
+
+void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c
+
+void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c
+
+void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c
+
+void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c
+
+void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_c
+
+void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_c
+
+void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_c
+
+void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_c
+
+void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c
+
+void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c
+
+void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c
+
+void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c
+
+void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_c
+
+void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_c
+
+void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_c
+
+void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_c
+
+void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_c
+
+void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_c
+
+void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_c
+
+void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_c
+
+void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_c
+
+void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_c
+
+void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_c
+
+void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_c
+
+void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_c
+
+void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_c
+
+void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_c
+
+void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_c
+
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c
+
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c
+
+void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_c
+
+void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_c
+
+void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_c
+
+void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c
+
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
+
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
+
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c
+
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_c
+
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_c
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_c
+
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_c
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_c
+
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_c
+
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_c
+
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_c
+
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_c
+
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_c
+
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_c
+
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_c
+
+void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
+
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
+
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
+
+unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c
+
+unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
+
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
+
+unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c
+
+unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
+
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
+
+unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c
+
+unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
+
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
+
+unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c
+
+unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
+
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
+
+unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c
+
+unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
+
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
+
+unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c
+
+unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
+
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
+
+unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c
+
+unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
+
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
+
+unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c
+
+unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
+
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
+
+unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c
+
+unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
+
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
+
+unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c
+
+unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
+
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
+
+unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c
+
+unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
+
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
+
+unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c
+
+unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
+
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
+
+unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c
+
+unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
+
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
+
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd);
+#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c
+
+void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_c
+
+void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_c
+
+void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_c
+
+void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_c
+
+void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_c
+
+void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_c
+
+void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_c
+
+void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c
+
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_10_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_10_add vpx_idct16x16_10_add_dspr2
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_1_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_1_add vpx_idct16x16_1_add_dspr2
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c
void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_256_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_256_add vpx_idct16x16_256_add_dspr2
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c
void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_256_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_38_add vpx_idct16x16_256_add_dspr2
+#define vpx_idct16x16_38_add vpx_idct16x16_38_add_c
void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_dspr2
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c
void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_dspr2
+#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c
void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_1_add vpx_idct32x32_1_add_dspr2
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c
void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_34_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_34_add vpx_idct32x32_34_add_dspr2
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c
void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct4x4_16_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct4x4_16_add vpx_idct4x4_16_add_dspr2
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c
void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct4x4_1_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct4x4_1_add vpx_idct4x4_1_add_dspr2
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c
void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_12_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct8x8_12_add vpx_idct8x8_12_add_dspr2
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c
void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_1_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct8x8_1_add vpx_idct8x8_1_add_dspr2
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c
void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_64_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct8x8_64_add vpx_idct8x8_64_add_dspr2
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
#define vpx_int_pro_col vpx_int_pro_col_c
@@ -439,15 +1256,9 @@ unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_
unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad32x32_avg vpx_sad32x32_avg_c
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
-
void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad32x32x4d vpx_sad32x32x4d_c
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad32x64 vpx_sad32x64_c
@@ -481,9 +1292,6 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad4x8x4d vpx_sad4x8x4d_c
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
-
unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad64x32 vpx_sad64x32_c
@@ -499,15 +1307,9 @@ unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_
unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad64x64_avg vpx_sad64x64_avg_c
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
-
void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad64x64x4d vpx_sad64x64x4d_c
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
-
unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x16 vpx_sad8x16_c
@@ -532,9 +1334,6 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad8x4x4d vpx_sad8x4x4d_c
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
-
unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x8 vpx_sad8x8_c
@@ -550,25 +1349,25 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x8x8 vpx_sad8x8x8_c
-int vpx_satd_c(const int16_t *coeff, int length);
+int vpx_satd_c(const tran_low_t *coeff, int length);
#define vpx_satd vpx_satd_c
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_2d vpx_scaled_2d_c
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_horiz vpx_scaled_horiz_c
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_vert vpx_scaled_vert_c
uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
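The pattern above repeats for every entry in these generated headers: each operation gets a _c prototype, optional per-ISA prototypes, and a #define that binds the generic name to one implementation at configure time (runtime CPU detection is disabled for these targets, per --disable-runtime-cpu-detect). This release also migrates the convolve/scaled signatures from separate int16_t filter_x/filter_y pointers to a single InterpKernel table with explicit x0_q4/y0_q4 phase offsets, and widens coefficient arguments from int16_t to tran_low_t. A minimal sketch of the binding pattern, using a hypothetical function name not taken from the diff:

void vpx_example_op_c(const uint8_t *src, int stride);     /* generic C fallback */
void vpx_example_op_dspr2(const uint8_t *src, int stride); /* MIPS DSPr2 variant */
#define vpx_example_op vpx_example_op_dspr2                /* bound at configure time */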
diff --git a/config/mips32-dspr2/vpx_scale_rtcd.h b/config/mips32-dspr2/vpx_scale_rtcd.h
index 15b1b5a6f..487bc29b8 100644
--- a/config/mips32-dspr2/vpx_scale_rtcd.h
+++ b/config/mips32-dspr2/vpx_scale_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_SCALE_RTCD_H_
#define VPX_SCALE_RTCD_H_
@@ -48,6 +49,9 @@ void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
void vpx_extend_frame_inner_borders_dspr2(struct yv12_buffer_config *ybf);
#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_dspr2
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
#define vpx_yv12_copy_y vpx_yv12_copy_y_c
diff --git a/config/mips32-dspr2/vpx_version.h b/config/mips32-dspr2/vpx_version.h
index 24da169b4..6078bae90 100644
--- a/config/mips32-dspr2/vpx_version.h
+++ b/config/mips32-dspr2/vpx_version.h
@@ -1,7 +1,8 @@
+// This file is generated. Do not edit.
#define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 1
+#define VERSION_MINOR 7
+#define VERSION_PATCH 0
#define VERSION_EXTRA ""
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING " v1.6.1"
+#define VERSION_STRING_NOSP "v1.7.0"
+#define VERSION_STRING " v1.7.0"
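The packed form above encodes the three version fields in one integer: major in bits 16 and up, minor in bits 8-15, patch in bits 0-7, so v1.7.0 packs to (1<<16)|(7<<8)|0 = 0x10700. A self-contained check, repeating the macros exactly as defined above:

#include <stdio.h>

#define VERSION_MAJOR 1
#define VERSION_MINOR 7
#define VERSION_PATCH 0
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))

int main(void) {
  printf("0x%06x\n", VERSION_PACKED); /* prints 0x010700 for v1.7.0 */
  return 0;
}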
diff --git a/config/mips32-msa/vp8_rtcd.h b/config/mips32-msa/vp8_rtcd.h
index a851d7f13..00469b064 100644
--- a/config/mips32-msa/vp8_rtcd.h
+++ b/config/mips32-msa/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP8_RTCD_H_
#define VP8_RTCD_H_
diff --git a/config/mips32-msa/vp9_rtcd.h b/config/mips32-msa/vp9_rtcd.h
index d0adf351e..91d3a1aab 100644
--- a/config/mips32-msa/vp9_rtcd.h
+++ b/config/mips32-msa/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP9_RTCD_H_
#define VP9_RTCD_H_
@@ -31,10 +32,9 @@ extern "C" {
#endif
int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-int64_t vp9_block_error_msa(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-#define vp9_block_error vp9_block_error_msa
+#define vp9_block_error vp9_block_error_c
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
#define vp9_block_error_fp vp9_block_error_fp_c
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
@@ -44,35 +44,58 @@ void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr
#define vp9_fdct8x8_quant vp9_fdct8x8_quant_c
void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-void vp9_fht16x16_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_fht16x16 vp9_fht16x16_msa
+#define vp9_fht16x16 vp9_fht16x16_c
void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-void vp9_fht4x4_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_fht4x4 vp9_fht4x4_msa
+#define vp9_fht4x4 vp9_fht4x4_c
void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-void vp9_fht8x8_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_fht8x8 vp9_fht8x8_msa
-
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
+#define vp9_fht8x8 vp9_fht8x8_c
void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
-void vp9_fwht4x4_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vp9_fwht4x4 vp9_fwht4x4_msa
+#define vp9_fwht4x4 vp9_fwht4x4_c
+
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_c
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_msa(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-#define vp9_iht16x16_256_add vp9_iht16x16_256_add_msa
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht4x4_16_add vp9_iht4x4_16_add_msa
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht8x8_64_add_msa(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht8x8_64_add vp9_iht8x8_64_add_msa
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp vp9_quantize_fp_c
@@ -83,9 +106,6 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
void vp9_rtcd(void);
#include "vpx_config.h"
diff --git a/config/mips32-msa/vpx_config.c b/config/mips32-msa/vpx_config.c
index 21f7c3b82..737e5530b 100644
--- a/config/mips32-msa/vpx_config.c
+++ b/config/mips32-msa/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=mips32-linux-gcc --enable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=mips32-linux-gcc --enable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
const char *vpx_codec_build_config(void) {return cfg;}
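The only change to this build fingerprint is the appended --enable-vp9-highbitdepth, which is what flips CONFIG_VP9_HIGHBITDEPTH to 1 in the vpx_config.h hunk below and pulls the vpx_highbd_* prototypes into the regenerated rtcd headers. A sketch of how dependent code would typically gate on the flag (illustrative only, not taken from the diff):

#include "vpx_config.h"

#if CONFIG_VP9_HIGHBITDEPTH
/* 10-/12-bit paths compiled in: high-bitdepth samples are uint16_t,
 * threaded through the uint8_t* interfaces seen in the prototypes above. */
#endif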
diff --git a/config/mips32-msa/vpx_config.h b/config/mips32-msa/vpx_config.h
index 53831030d..9ca17a018 100644
--- a/config/mips32-msa/vpx_config.h
+++ b/config/mips32-msa/vpx_config.h
@@ -29,7 +29,9 @@
#define HAVE_SSE4_1 0
#define HAVE_AVX 0
#define HAVE_AVX2 0
+#define HAVE_AVX512 0
#define HAVE_VSX 0
+#define HAVE_MMI 0
#define HAVE_VPX_PORTS 1
#define HAVE_PTHREAD_H 1
#define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
#define CONFIG_BETTER_HW_COMPATIBILITY 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
#define CONFIG_SPATIAL_SVC 0
#define CONFIG_FP_MB_STATS 0
#define CONFIG_EMULATE_HARDWARE 0
diff --git a/config/mips32-msa/vpx_dsp_rtcd.h b/config/mips32-msa/vpx_dsp_rtcd.h
index 22c63bfbc..4558d6960 100644
--- a/config/mips32-msa/vpx_dsp_rtcd.h
+++ b/config/mips32-msa/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_DSP_RTCD_H_
#define VPX_DSP_RTCD_H_
@@ -13,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#ifdef __cplusplus
@@ -30,36 +32,36 @@ unsigned int vpx_avg_8x8_msa(const uint8_t *, int p);
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
#define vpx_comp_avg_pred vpx_comp_avg_pred_c
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8 vpx_convolve8_msa
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg vpx_convolve8_avg_msa
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_msa
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_msa
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_horiz vpx_convolve8_horiz_msa
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_vert vpx_convolve8_vert_msa
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_avg vpx_convolve_avg_msa
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_copy vpx_convolve_copy_msa
void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -205,35 +207,28 @@ void vpx_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, const uint8_
#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_msa
void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct16x16_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct16x16 vpx_fdct16x16_msa
+#define vpx_fdct16x16 vpx_fdct16x16_c
void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct16x16_1_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct16x16_1 vpx_fdct16x16_1_msa
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_c
void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct32x32_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct32x32 vpx_fdct32x32_msa
+#define vpx_fdct32x32 vpx_fdct32x32_c
void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct32x32_1_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct32x32_1 vpx_fdct32x32_1_msa
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_c
void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct32x32_rd_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct32x32_rd vpx_fdct32x32_rd_msa
+#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct4x4_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct4x4 vpx_fdct4x4_msa
+#define vpx_fdct4x4 vpx_fdct4x4_c
void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct8x8_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct8x8 vpx_fdct8x8_msa
+#define vpx_fdct8x8 vpx_fdct8x8_c
void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
void vpx_fdct8x8_1_msa(const int16_t *input, tran_low_t *output, int stride);
@@ -271,68 +266,881 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
void vpx_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_msa
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_16x16_msa(const int16_t *src_diff, int src_stride, int16_t *coeff);
-#define vpx_hadamard_16x16 vpx_hadamard_16x16_msa
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+#define vpx_hadamard_16x16 vpx_hadamard_16x16_c
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_msa(const int16_t *src_diff, int src_stride, int16_t *coeff);
-#define vpx_hadamard_8x8 vpx_hadamard_8x8_msa
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+#define vpx_hadamard_8x8 vpx_hadamard_8x8_c
void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+
+void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+
+unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c
+
+unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+
+unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+
+unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c
+
+unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c
+
+unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c
+
+unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c
+
+unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c
+
+unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c
+
+unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+
+unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+
+unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c
+
+unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c
+
+unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c
+
+unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+
+unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c
+
+void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+
+void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+
+unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c
+
+unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+
+unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+
+unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c
+
+unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c
+
+unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c
+
+unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c
+
+unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c
+
+unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c
+
+unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c
+
+unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c
+
+unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c
+
+unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c
+
+unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c
+
+unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c
+
+unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c
+
+void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c
+
+void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c
+
+unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c
+
+unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c
+
+unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c
+
+unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c
+
+unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c
+
+unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c
+
+unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c
+
+unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c
+
+unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c
+
+unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c
+
+unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c
+
+unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c
+
+unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c
+
+unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c
+
+unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c
+
+unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c
+
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p);
+#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c
+
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
+#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c
+
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
+#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
+
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8 vpx_highbd_convolve8_c
+
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c
+
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c
+
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c
+
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c
+
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c
+
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c
+
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c
+
+void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
+
+void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c
+
+void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c
+
+void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c
+
+void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_c
+
+void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_c
+
+void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_c
+
+void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_c
+
+void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c
+
+void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c
+
+void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c
+
+void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c
+
+void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c
+
+void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c
+
+void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c
+
+void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c
+
+void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_c
+
+void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_c
+
+void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_c
+
+void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_c
+
+void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c
+
+void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c
+
+void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c
+
+void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c
+
+void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_c
+
+void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_c
+
+void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_c
+
+void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_c
+
+void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_c
+
+void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_c
+
+void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_c
+
+void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_c
+
+void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_c
+
+void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_c
+
+void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_c
+
+void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_c
+
+void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_c
+
+void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_c
+
+void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_c
+
+void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_c
+
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c
+
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c
+
+void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_c
+
+void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_c
+
+void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_c
+
+void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c
+
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
+
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
+
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c
+
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_c
+
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_c
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_c
+
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_c
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_c
+
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_c
+
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_c
+
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_c
+
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_c
+
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_c
+
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_c
+
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_c
+
+void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
+
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
+
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
+
+unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c
+
+unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
+
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
+
+unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c
+
+unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
+
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
+
+unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c
+
+unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
+
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
+
+unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c
+
+unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
+
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
+
+unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c
+
+unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
+
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
+
+unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c
+
+unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
+
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
+
+unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c
+
+unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
+
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
+
+unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c
+
+unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
+
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
+
+unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c
+
+unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
+
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
+
+unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c
+
+unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
+
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
+
+unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c
+
+unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
+
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
+
+unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c
+
+unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
+
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
+
+unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c
+
+unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
+
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
+
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd);
+#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c
+
+void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_c
+
+void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_c
+
+void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_c
+
+void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_c
+
+void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_c
+
+void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_c
+
+void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_c
+
+void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c
+
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_10_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_10_add vpx_idct16x16_10_add_msa
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_1_add vpx_idct16x16_1_add_msa
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c
void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_256_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_256_add vpx_idct16x16_256_add_msa
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c
void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_256_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_38_add vpx_idct16x16_256_add_msa
+#define vpx_idct16x16_38_add vpx_idct16x16_38_add_c
void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_msa
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c
void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_msa
+#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c
void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_1_add vpx_idct32x32_1_add_msa
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c
void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_34_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_34_add vpx_idct32x32_34_add_msa
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c
void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct4x4_16_add vpx_idct4x4_16_add_msa
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c
void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct4x4_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct4x4_1_add vpx_idct4x4_1_add_msa
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c
void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_12_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct8x8_12_add vpx_idct8x8_12_add_msa
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c
void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct8x8_1_add vpx_idct8x8_1_add_msa
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c
void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_64_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct8x8_64_add vpx_idct8x8_64_add_msa
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width);
@@ -343,12 +1151,10 @@ void vpx_int_pro_row_msa(int16_t *hbuf, const uint8_t *ref, const int ref_stride
#define vpx_int_pro_row vpx_int_pro_row_msa
void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_iwht4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_msa
+#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_iwht4x4_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_msa
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
void vpx_lpf_horizontal_16_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
@@ -496,18 +1302,10 @@ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int vpx_sad32x32_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad32x32_avg vpx_sad32x32_avg_msa
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x3_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_msa
-
void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void vpx_sad32x32x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad32x32x4d vpx_sad32x32x4d_msa
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_msa
-
unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad32x64_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad32x64 vpx_sad32x64_msa
@@ -552,10 +1350,6 @@ void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
void vpx_sad4x8x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad4x8x4d vpx_sad4x8x4d_msa
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad4x8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_msa
-
unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad64x32_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad64x32 vpx_sad64x32_msa
@@ -576,18 +1370,10 @@ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int vpx_sad64x64_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad64x64_avg vpx_sad64x64_avg_msa
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x3_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_msa
-
void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void vpx_sad64x64x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad64x64x4d vpx_sad64x64x4d_msa
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_msa
-
unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad8x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x16 vpx_sad8x16_msa
@@ -620,10 +1406,6 @@ void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
void vpx_sad8x4x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad8x4x4d vpx_sad8x4x4d_msa
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x4x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_msa
-
unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x8 vpx_sad8x8_msa
@@ -644,26 +1426,26 @@ void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p
void vpx_sad8x8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x8x8 vpx_sad8x8x8_msa
-int vpx_satd_c(const int16_t *coeff, int length);
-int vpx_satd_msa(const int16_t *coeff, int length);
-#define vpx_satd vpx_satd_msa
+int vpx_satd_c(const tran_low_t *coeff, int length);
+#define vpx_satd vpx_satd_c
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vpx_scaled_2d vpx_scaled_2d_c
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_msa
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_horiz vpx_scaled_horiz_c
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_vert vpx_scaled_vert_c
uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
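Aside on the vpx_satd change above: with the MSA variant dropped, the macro now resolves to the portable C routine, and its coeff argument widens from int16_t to tran_low_t. A minimal sketch of the usual sum-of-absolute-coefficients definition (an assumption for illustration, not text from this patch):

    #include <stdlib.h>                   /* abs() */
    #include "vpx/vpx_integer.h"
    #include "vpx_dsp/vpx_dsp_common.h"   /* tran_low_t */

    /* tran_low_t is 32 bits in high-bit-depth builds, hence the new type. */
    static int satd_sketch(const tran_low_t *coeff, int length) {
      int i, satd = 0;
      for (i = 0; i < length; ++i) satd += abs(coeff[i]);
      return satd;
    }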
diff --git a/config/mips32-msa/vpx_scale_rtcd.h b/config/mips32-msa/vpx_scale_rtcd.h
index ea70efc9d..eb6c009e1 100644
--- a/config/mips32-msa/vpx_scale_rtcd.h
+++ b/config/mips32-msa/vpx_scale_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_SCALE_RTCD_H_
#define VPX_SCALE_RTCD_H_
@@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
#define vpx_yv12_copy_y vpx_yv12_copy_y_c
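The newly exposed vpx_yv12_copy_frame above copies a whole YV12 frame, borders included. A hedged usage sketch (clone_frame is a hypothetical helper; both buffers are assumed to have been allocated with matching dimensions via vpx_alloc_frame_buffer):

    #include "vpx_scale/yv12config.h"
    #include "vpx_scale_rtcd.h"

    /* Duplicate a decoded frame into a scratch buffer of the same size. */
    static void clone_frame(const struct yv12_buffer_config *src,
                            struct yv12_buffer_config *dst) {
      vpx_yv12_copy_frame(src, dst);
    }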
diff --git a/config/mips32-msa/vpx_version.h b/config/mips32-msa/vpx_version.h
index 24da169b4..6078bae90 100644
--- a/config/mips32-msa/vpx_version.h
+++ b/config/mips32-msa/vpx_version.h
@@ -1,7 +1,8 @@
+// This file is generated. Do not edit.
#define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 1
+#define VERSION_MINOR 7
+#define VERSION_PATCH 0
#define VERSION_EXTRA ""
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING " v1.6.1"
+#define VERSION_STRING_NOSP "v1.7.0"
+#define VERSION_STRING " v1.7.0"
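A quick sanity check of the packed-version arithmetic (illustrative only): v1.7.0 packs to (1<<16)|(7<<8)|0 = 0x10700 and the old v1.6.1 to (1<<16)|(6<<8)|1 = 0x10601, so numeric comparison of VERSION_PACKED still orders the releases:

    #include <assert.h>

    int main(void) {
      /* 0x10700 > 0x10601: the v1.7.0 bump is visible to version checks. */
      assert(((1 << 16) | (7 << 8) | 0) > ((1 << 16) | (6 << 8) | 1));
      return 0;
    }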
diff --git a/config/mips32/vp8_rtcd.h b/config/mips32/vp8_rtcd.h
index 21dfa5a25..fbd444b8a 100644
--- a/config/mips32/vp8_rtcd.h
+++ b/config/mips32/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP8_RTCD_H_
#define VP8_RTCD_H_
diff --git a/config/mips32/vp9_rtcd.h b/config/mips32/vp9_rtcd.h
index c17a21721..91d3a1aab 100644
--- a/config/mips32/vp9_rtcd.h
+++ b/config/mips32/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP9_RTCD_H_
#define VP9_RTCD_H_
@@ -33,7 +34,7 @@ extern "C" {
int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_c
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
#define vp9_block_error_fp vp9_block_error_fp_c
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
@@ -51,12 +52,42 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
#define vp9_fht8x8 vp9_fht8x8_c
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_fwht4x4 vp9_fwht4x4_c
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_c
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
+
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
@@ -75,9 +106,6 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
void vp9_rtcd(void);
#include "vpx_config.h"
diff --git a/config/mips32/vpx_config.c b/config/mips32/vpx_config.c
index e2703b374..f66993f87 100644
--- a/config/mips32/vpx_config.c
+++ b/config/mips32/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=mips32-linux-gcc --disable-dspr2 --disable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=mips32-linux-gcc --disable-dspr2 --disable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
const char *vpx_codec_build_config(void) {return cfg;}
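
The only change to this translation unit is the recorded configure line, which now carries --enable-vp9-highbitdepth. That string is exposed at run time through the public vpx_codec_build_config() entry point, so one quick way to confirm a binary was built from these configs is, for example:

    #include <stdio.h>
    #include "vpx/vpx_codec.h"

    int main(void) {
      /* Prints the configure flags baked into the library at build time. */
      printf("%s\n", vpx_codec_build_config());
      return 0;
    }
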
diff --git a/config/mips32/vpx_config.h b/config/mips32/vpx_config.h
index beaa2f86c..d3ecfe8e7 100644
--- a/config/mips32/vpx_config.h
+++ b/config/mips32/vpx_config.h
@@ -29,7 +29,9 @@
#define HAVE_SSE4_1 0
#define HAVE_AVX 0
#define HAVE_AVX2 0
+#define HAVE_AVX512 0
#define HAVE_VSX 0
+#define HAVE_MMI 0
#define HAVE_VPX_PORTS 1
#define HAVE_PTHREAD_H 1
#define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
#define CONFIG_BETTER_HW_COMPATIBILITY 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
#define CONFIG_SPATIAL_SVC 0
#define CONFIG_FP_MB_STATS 0
#define CONFIG_EMULATE_HARDWARE 0
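
Flipping CONFIG_VP9_HIGHBITDEPTH to 1 is what drives the tran_low_t changes in the rtcd headers below: with the flag set, final transform coefficients are carried in 32 bits instead of 16, and intermediate transform math gets 64 bits. In outline, the relevant typedefs (a sketch of vpx_dsp/vpx_dsp_common.h in this tree) are:

    #if CONFIG_VP9_HIGHBITDEPTH
    typedef int64_t tran_high_t; /* intermediate transform precision */
    typedef int32_t tran_low_t;  /* final coefficient storage */
    #else
    typedef int32_t tran_high_t;
    typedef int16_t tran_low_t;
    #endif
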
diff --git a/config/mips32/vpx_dsp_rtcd.h b/config/mips32/vpx_dsp_rtcd.h
index 1b15aadba..fbb38953d 100644
--- a/config/mips32/vpx_dsp_rtcd.h
+++ b/config/mips32/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_DSP_RTCD_H_
#define VPX_DSP_RTCD_H_
@@ -13,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#ifdef __cplusplus
@@ -28,28 +30,28 @@ unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
#define vpx_comp_avg_pred vpx_comp_avg_pred_c
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8 vpx_convolve8_c
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg vpx_convolve8_avg_c
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_horiz vpx_convolve8_horiz_c
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_vert vpx_convolve8_vert_c
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_avg vpx_convolve_avg_c
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_copy vpx_convolve_copy_c
void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
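
The convolve prototypes above replace the separate filter_x/filter_y pointers with a single InterpKernel table plus explicit x0_q4/y0_q4 starting phases. InterpKernel comes from the newly included vpx_dsp/vpx_filter.h; in outline (a sketch of that header's constants):

    #define SUBPEL_BITS 4
    #define SUBPEL_SHIFTS (1 << SUBPEL_BITS) /* 16 sub-pel phases */
    #define SUBPEL_MASK (SUBPEL_SHIFTS - 1)
    #define SUBPEL_TAPS 8

    typedef int16_t InterpKernel[SUBPEL_TAPS]; /* one 8-tap kernel per phase */

    /* A caller now passes the whole 16-entry table; the convolver selects a
       kernel per output pixel, e.g. filter[(x0_q4 + x * x_step_q4) & SUBPEL_MASK]. */
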
@@ -229,15 +231,843 @@ void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_16x16 vpx_hadamard_16x16_c
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_8x8 vpx_hadamard_8x8_c
void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+
+void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+
+unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c
+
+unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+
+unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+
+unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c
+
+unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c
+
+unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c
+
+unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c
+
+unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c
+
+unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c
+
+unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+
+unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+
+unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c
+
+unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c
+
+unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c
+
+unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+
+unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c
+
+void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+
+void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+
+unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c
+
+unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+
+unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+
+unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c
+
+unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c
+
+unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c
+
+unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c
+
+unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c
+
+unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c
+
+unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c
+
+unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c
+
+unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c
+
+unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c
+
+unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c
+
+unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c
+
+unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c
+
+void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c
+
+void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c
+
+unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c
+
+unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c
+
+unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c
+
+unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c
+
+unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c
+
+unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c
+
+unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c
+
+unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c
+
+unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c
+
+unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c
+
+unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c
+
+unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c
+
+unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c
+
+unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c
+
+unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c
+
+unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c
+
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p);
+#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c
+
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
+#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c
+
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
+#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
+
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8 vpx_highbd_convolve8_c
+
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c
+
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c
+
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c
+
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c
+
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c
+
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c
+
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c
+
+void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
+
+void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c
+
+void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c
+
+void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c
+
+void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_c
+
+void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_c
+
+void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_c
+
+void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_c
+
+void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c
+
+void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c
+
+void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c
+
+void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c
+
+void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c
+
+void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c
+
+void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c
+
+void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c
+
+void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_c
+
+void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_c
+
+void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_c
+
+void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_c
+
+void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c
+
+void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c
+
+void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c
+
+void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c
+
+void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_c
+
+void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_c
+
+void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_c
+
+void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_c
+
+void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_c
+
+void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_c
+
+void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_c
+
+void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_c
+
+void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_c
+
+void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_c
+
+void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_c
+
+void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_c
+
+void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_c
+
+void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_c
+
+void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_c
+
+void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_c
+
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c
+
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c
+
+void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_c
+
+void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_c
+
+void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_c
+
+void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c
+
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
+
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
+
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c
+
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_c
+
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_c
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_c
+
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_c
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_c
+
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_c
+
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_c
+
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_c
+
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_c
+
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_c
+
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_c
+
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_c
+
+void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
+
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
+
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
+
+unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c
+
+unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
+
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
+
+unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c
+
+unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
+
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
+
+unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c
+
+unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
+
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
+
+unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c
+
+unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
+
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
+
+unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c
+
+unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
+
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
+
+unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c
+
+unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
+
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
+
+unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c
+
+unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
+
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
+
+unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c
+
+unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
+
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
+
+unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c
+
+unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
+
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
+
+unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c
+
+unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
+
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
+
+unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c
+
+unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
+
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
+
+unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c
+
+unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
+
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
+
+unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c
+
+unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
+
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
+
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd);
+#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c
+
+void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_c
+
+void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_c
+
+void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_c
+
+void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_c
+
+void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_c
+
+void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_c
+
+void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_c
+
+void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c
+
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
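
The vpx_highbd_* additions above are the C-only high-bitdepth kernels pulled in by CONFIG_VP9_HIGHBITDEPTH. Note that the SAD and variance entries still take const uint8_t * sources even though the samples are 16-bit: libvpx passes the uint16_t buffers through pointer-conversion macros. A minimal caller sketch, assuming the usual CONVERT_TO_BYTEPTR macro from vpx_dsp/vpx_dsp_common.h (sad_hbd_16x16 is a hypothetical helper, not part of this patch):

    #include "vpx_dsp/vpx_dsp_common.h"  /* CONVERT_TO_BYTEPTR */
    #include "./vpx_dsp_rtcd.h"

    /* Hypothetical helper: high-bitdepth SAD over 16-bit samples. The
       kernel expects uint8_t* views of the uint16_t buffers, so the
       pointers are converted rather than the data. */
    static unsigned int sad_hbd_16x16(const uint16_t *src, int src_stride,
                                      const uint16_t *ref, int ref_stride) {
      return vpx_highbd_sad16x16(CONVERT_TO_BYTEPTR(src), src_stride,
                                 CONVERT_TO_BYTEPTR(ref), ref_stride);
    }
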
@@ -400,15 +1230,9 @@ unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_
unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad32x32_avg vpx_sad32x32_avg_c
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
-
void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad32x32x4d vpx_sad32x32x4d_c
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad32x64 vpx_sad32x64_c
@@ -442,9 +1266,6 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad4x8x4d vpx_sad4x8x4d_c
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
-
unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad64x32 vpx_sad64x32_c
@@ -460,15 +1281,9 @@ unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_
unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad64x64_avg vpx_sad64x64_avg_c
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
-
void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad64x64x4d vpx_sad64x64x4d_c
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
-
unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x16 vpx_sad8x16_c
@@ -493,9 +1308,6 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad8x4x4d vpx_sad8x4x4d_c
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
-
unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x8 vpx_sad8x8_c
@@ -511,25 +1323,25 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x8x8 vpx_sad8x8x8_c
-int vpx_satd_c(const int16_t *coeff, int length);
+int vpx_satd_c(const tran_low_t *coeff, int length);
#define vpx_satd vpx_satd_c
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_2d vpx_scaled_2d_c
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_horiz vpx_scaled_horiz_c
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_vert vpx_scaled_vert_c
uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
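
The hunk above also shows the 1.7.0 convolution-signature migration: the scaled_* entry points (and, further down, the convolve8_* ones) drop the separate filter_x/filter_y row pointers in favor of a whole InterpKernel table (int16_t[8] rows, declared in vpx_dsp/vpx_filter.h) plus explicit initial subpel phases x0_q4/y0_q4. A sketch of the new calling convention, using a hypothetical wrapper and a step of 16 q4 units per axis (i.e. one full pixel per output sample, no rescaling):

    #include "vpx_dsp/vpx_filter.h"  /* InterpKernel */
    #include "./vpx_dsp_rtcd.h"

    /* Hypothetical wrapper: the caller now hands over the full kernel
       table and the starting subpel phases instead of two pre-selected
       8-tap filter rows. */
    static void scale_region(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *kernels,
                             int x0_q4, int y0_q4, int w, int h) {
      vpx_scaled_2d(src, src_stride, dst, dst_stride, kernels,
                    x0_q4, 16, y0_q4, 16, w, h);
    }
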
diff --git a/config/mips32/vpx_scale_rtcd.h b/config/mips32/vpx_scale_rtcd.h
index ea70efc9d..eb6c009e1 100644
--- a/config/mips32/vpx_scale_rtcd.h
+++ b/config/mips32/vpx_scale_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_SCALE_RTCD_H_
#define VPX_SCALE_RTCD_H_
@@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
#define vpx_yv12_copy_y vpx_yv12_copy_y_c
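
vpx_yv12_copy_frame is now routed through the RTCD table as well; with --disable-runtime-cpu-detect (as in these configs) the macro expands straight to the C implementation. A trivial sketch:

    #include "vpx_scale/yv12config.h"
    #include "./vpx_scale_rtcd.h"

    static void copy_frame(const struct yv12_buffer_config *src,
                           struct yv12_buffer_config *dst) {
      vpx_yv12_copy_frame(src, dst);  /* == vpx_yv12_copy_frame_c here */
    }
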
diff --git a/config/mips32/vpx_version.h b/config/mips32/vpx_version.h
index 24da169b4..6078bae90 100644
--- a/config/mips32/vpx_version.h
+++ b/config/mips32/vpx_version.h
@@ -1,7 +1,8 @@
+// This file is generated. Do not edit.
#define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 1
+#define VERSION_MINOR 7
+#define VERSION_PATCH 0
#define VERSION_EXTRA ""
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING " v1.6.1"
+#define VERSION_STRING_NOSP "v1.7.0"
+#define VERSION_STRING " v1.7.0"
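
The bump also changes VERSION_PACKED. Worked out from the macros above: (1 << 16) | (7 << 8) | 0 = 0x10700 = 67328, up from 0x10601 for v1.6.1. A small sanity check of the packing arithmetic:

    #include <assert.h>
    #include "vpx_version.h"

    int main(void) {
      /* v1.7.0: (1 << 16) | (7 << 8) | 0 */
      assert(VERSION_PACKED == 0x10700);
      return 0;
    }
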
diff --git a/config/mips64-msa/vp8_rtcd.h b/config/mips64-msa/vp8_rtcd.h
index a851d7f13..00469b064 100644
--- a/config/mips64-msa/vp8_rtcd.h
+++ b/config/mips64-msa/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP8_RTCD_H_
#define VP8_RTCD_H_
diff --git a/config/mips64-msa/vp9_rtcd.h b/config/mips64-msa/vp9_rtcd.h
index d0adf351e..91d3a1aab 100644
--- a/config/mips64-msa/vp9_rtcd.h
+++ b/config/mips64-msa/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP9_RTCD_H_
#define VP9_RTCD_H_
@@ -31,10 +32,9 @@ extern "C" {
#endif
int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-int64_t vp9_block_error_msa(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-#define vp9_block_error vp9_block_error_msa
+#define vp9_block_error vp9_block_error_c
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
#define vp9_block_error_fp vp9_block_error_fp_c
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
@@ -44,35 +44,58 @@ void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr
#define vp9_fdct8x8_quant vp9_fdct8x8_quant_c
void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-void vp9_fht16x16_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_fht16x16 vp9_fht16x16_msa
+#define vp9_fht16x16 vp9_fht16x16_c
void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-void vp9_fht4x4_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_fht4x4 vp9_fht4x4_msa
+#define vp9_fht4x4 vp9_fht4x4_c
void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-void vp9_fht8x8_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_fht8x8 vp9_fht8x8_msa
-
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
+#define vp9_fht8x8 vp9_fht8x8_c
void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
-void vp9_fwht4x4_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vp9_fwht4x4 vp9_fwht4x4_msa
+#define vp9_fwht4x4 vp9_fwht4x4_c
+
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_c
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_msa(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-#define vp9_iht16x16_256_add vp9_iht16x16_256_add_msa
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht4x4_16_add vp9_iht4x4_16_add_msa
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht8x8_64_add_msa(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht8x8_64_add vp9_iht8x8_64_add_msa
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp vp9_quantize_fp_c
@@ -83,9 +106,6 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
void vp9_rtcd(void);
#include "vpx_config.h"
diff --git a/config/mips64-msa/vpx_config.c b/config/mips64-msa/vpx_config.c
index c1ab4fb90..9665244e3 100644
--- a/config/mips64-msa/vpx_config.c
+++ b/config/mips64-msa/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=mips64-linux-gcc --enable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=mips64-linux-gcc --enable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
const char *vpx_codec_build_config(void) {return cfg;}
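
The configure line gains --enable-vp9-highbitdepth, which is what drives the CONFIG_VP9_HIGHBITDEPTH flip and the new vpx_highbd_* entries elsewhere in this change. Since the string is exported verbatim, it can be inspected at runtime; a small sketch using the vpx_codec_build_config() accessor shown above:

    #include <string.h>
    #include "vpx/vpx_codec.h"

    /* Returns nonzero if this build advertises the new configure flag. */
    static int built_with_highbitdepth(void) {
      return strstr(vpx_codec_build_config(),
                    "--enable-vp9-highbitdepth") != NULL;
    }
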
diff --git a/config/mips64-msa/vpx_config.h b/config/mips64-msa/vpx_config.h
index ea5e8bd43..df9ed4455 100644
--- a/config/mips64-msa/vpx_config.h
+++ b/config/mips64-msa/vpx_config.h
@@ -29,7 +29,9 @@
#define HAVE_SSE4_1 0
#define HAVE_AVX 0
#define HAVE_AVX2 0
+#define HAVE_AVX512 0
#define HAVE_VSX 0
+#define HAVE_MMI 0
#define HAVE_VPX_PORTS 1
#define HAVE_PTHREAD_H 1
#define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
#define CONFIG_BETTER_HW_COMPATIBILITY 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
#define CONFIG_SPATIAL_SVC 0
#define CONFIG_FP_MB_STATS 0
#define CONFIG_EMULATE_HARDWARE 0
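
CONFIG_VP9_HIGHBITDEPTH flipping to 1 is the compile-time switch the rest of this patch keys off. A minimal sketch of how dependent code typically branches on it (pixel_t is a hypothetical alias for illustration, not a libvpx type):

    #include <stdint.h>
    #include "vpx_config.h"

    #if CONFIG_VP9_HIGHBITDEPTH
    typedef uint16_t pixel_t;  /* 10/12-bit samples held in 16 bits */
    #else
    typedef uint8_t pixel_t;
    #endif
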
diff --git a/config/mips64-msa/vpx_dsp_rtcd.h b/config/mips64-msa/vpx_dsp_rtcd.h
index 22c63bfbc..4558d6960 100644
--- a/config/mips64-msa/vpx_dsp_rtcd.h
+++ b/config/mips64-msa/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_DSP_RTCD_H_
#define VPX_DSP_RTCD_H_
@@ -13,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#ifdef __cplusplus
@@ -30,36 +32,36 @@ unsigned int vpx_avg_8x8_msa(const uint8_t *, int p);
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
#define vpx_comp_avg_pred vpx_comp_avg_pred_c
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8 vpx_convolve8_msa
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg vpx_convolve8_avg_msa
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_msa
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_msa
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_horiz vpx_convolve8_horiz_msa
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_vert vpx_convolve8_vert_msa
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_avg vpx_convolve_avg_msa
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_copy vpx_convolve_copy_msa
void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -205,35 +207,28 @@ void vpx_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, const uint8_
#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_msa
void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct16x16_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct16x16 vpx_fdct16x16_msa
+#define vpx_fdct16x16 vpx_fdct16x16_c
void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct16x16_1_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct16x16_1 vpx_fdct16x16_1_msa
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_c
void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct32x32_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct32x32 vpx_fdct32x32_msa
+#define vpx_fdct32x32 vpx_fdct32x32_c
void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct32x32_1_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct32x32_1 vpx_fdct32x32_1_msa
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_c
void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct32x32_rd_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct32x32_rd vpx_fdct32x32_rd_msa
+#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct4x4_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct4x4 vpx_fdct4x4_msa
+#define vpx_fdct4x4 vpx_fdct4x4_c
void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct8x8_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct8x8 vpx_fdct8x8_msa
+#define vpx_fdct8x8 vpx_fdct8x8_c
void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
void vpx_fdct8x8_1_msa(const int16_t *input, tran_low_t *output, int stride);
@@ -271,68 +266,881 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
void vpx_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_msa
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_16x16_msa(const int16_t *src_diff, int src_stride, int16_t *coeff);
-#define vpx_hadamard_16x16 vpx_hadamard_16x16_msa
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+#define vpx_hadamard_16x16 vpx_hadamard_16x16_c
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_msa(const int16_t *src_diff, int src_stride, int16_t *coeff);
-#define vpx_hadamard_8x8 vpx_hadamard_8x8_msa
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+#define vpx_hadamard_8x8 vpx_hadamard_8x8_c
void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
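
The Hadamard prototypes above change in two ways: src_stride widens from int to ptrdiff_t, and the output coefficients become tran_low_t instead of int16_t. A sketch of a call site updated for the new signature (hadamard_residual is a hypothetical helper; tran_low_t comes from vpx_dsp/vpx_dsp_common.h):

    #include "vpx_dsp/vpx_dsp_common.h"  /* tran_low_t */
    #include "./vpx_dsp_rtcd.h"

    static void hadamard_residual(const int16_t *diff) {
      tran_low_t coeff[64];  /* 8x8 output now uses the wider type */
      vpx_hadamard_8x8(diff, (ptrdiff_t)8, coeff);
    }
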
+void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+
+void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+
+unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c
+
+unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+
+unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+
+unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c
+
+unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c
+
+unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c
+
+unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c
+
+unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c
+
+unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c
+
+unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+
+unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+
+unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c
+
+unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c
+
+unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c
+
+unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+
+unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c
+
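
From here the same declaration pattern repeats for 12-bit: the vpx_highbd_10_* block above has a vpx_highbd_12_* counterpart below, with the numeric infix naming the bit depth the kernel normalizes for. A hypothetical selector over the two variants:

    #include "./vpx_dsp_rtcd.h"

    /* Hypothetical helper: pick the per-bit-depth MSE kernel. Sources are
       CONVERT_TO_BYTEPTR'd 16-bit buffers, as elsewhere in this header. */
    static unsigned int mse16x16_for_bd(int bd, const uint8_t *src, int ss,
                                        const uint8_t *ref, int rs,
                                        unsigned int *sse) {
      return bd == 12 ? vpx_highbd_12_mse16x16(src, ss, ref, rs, sse)
                      : vpx_highbd_10_mse16x16(src, ss, ref, rs, sse);
    }
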
+void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+
+void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+
+unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c
+
+unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+
+unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+
+unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c
+
+unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c
+
+unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c
+
+unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c
+
+unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c
+
+unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c
+
+unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c
+
+unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c
+
+unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c
+
+unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c
+
+unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c
+
+unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c
+
+unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c
+
+void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c
+
+void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c
+
+unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c
+
+unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c
+
+unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c
+
+unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c
+
+unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c
+
+unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c
+
+unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c
+
+unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c
+
+unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c
+
+unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c
+
+unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c
+
+unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c
+
+unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c
+
+unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c
+
+unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c
+
+unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c
+
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p);
+#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c
+
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
+#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c
+
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
+#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
+
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8 vpx_highbd_convolve8_c
+
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c
+
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c
+
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c
+
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c
+
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c
+
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c
+
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c
+
+void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
+
+void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c
+
+void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c
+
+void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c
+
+void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_c
+
+void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_c
+
+void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_c
+
+void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_c
+
+void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c
+
+void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c
+
+void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c
+
+void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c
+
+void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c
+
+void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c
+
+void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c
+
+void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c
+
+void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_c
+
+void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_c
+
+void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_c
+
+void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_c
+
+void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c
+
+void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c
+
+void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c
+
+void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c
+
+void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_c
+
+void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_c
+
+void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_c
+
+void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_c
+
+void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_c
+
+void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_c
+
+void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_c
+
+void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_c
+
+void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_c
+
+void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_c
+
+void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_c
+
+void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_c
+
+void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_c
+
+void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_c
+
+void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_c
+
+void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_c
+
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c
+
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c
+
+void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_c
+
+void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_c
+
+void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_c
+
+void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c
+
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
+
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
+
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c
+
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_c
+
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_c
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_c
+
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_c
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_c
+
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_c
+
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_c
+
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_c
+
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_c
+
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_c
+
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_c
+
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_c
+
+void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
+
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
+
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
+
+unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c
+
+unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
+
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
+
+unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c
+
+unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
+
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
+
+unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c
+
+unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
+
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
+
+unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c
+
+unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
+
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
+
+unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c
+
+unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
+
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
+
+unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c
+
+unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
+
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
+
+unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c
+
+unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
+
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
+
+unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c
+
+unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
+
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
+
+unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c
+
+unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
+
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
+
+unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c
+
+unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
+
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
+
+unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c
+
+unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
+
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
+
+unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c
+
+unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
+
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
+
+unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c
+
+unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
+
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
+
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd);
+#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c
+
+void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_c
+
+void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_c
+
+void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_c
+
+void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_c
+
+void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_c
+
+void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_c
+
+void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_c
+
+void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c
+
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_10_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_10_add vpx_idct16x16_10_add_msa
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_1_add vpx_idct16x16_1_add_msa
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c
void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_256_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_256_add vpx_idct16x16_256_add_msa
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c
void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_256_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_38_add vpx_idct16x16_256_add_msa
+#define vpx_idct16x16_38_add vpx_idct16x16_38_add_c
void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_msa
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c
void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_msa
+#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c
void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_1_add vpx_idct32x32_1_add_msa
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c
void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_34_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_34_add vpx_idct32x32_34_add_msa
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c
void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct4x4_16_add vpx_idct4x4_16_add_msa
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c
void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct4x4_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct4x4_1_add vpx_idct4x4_1_add_msa
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c
void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_12_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct8x8_12_add vpx_idct8x8_12_add_msa
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c
void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct8x8_1_add vpx_idct8x8_1_add_msa
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c
void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_64_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct8x8_64_add vpx_idct8x8_64_add_msa
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width);
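
Note: each pair above follows the generated-header pattern — a plain C reference implementation with a _c suffix, an optional SIMD variant (here MSA), and a macro binding the public name to whichever variant this target config selects. As a rough sketch of what vpx_int_pro_col_c computes (an integral projection over a row of pixels, used by the VP9 motion search), assuming it matches the upstream vpx_dsp/avg.c reference:

    #include <stdint.h>

    /* Hypothetical stand-in for vpx_int_pro_col_c: sum `width` 8-bit
     * pixels into a 16-bit projection value. The accumulator width and
     * absence of scaling are assumptions, not taken from this diff. */
    static int16_t int_pro_col_sketch(const uint8_t *ref, const int width) {
      int16_t sum = 0;
      for (int idx = 0; idx < width; ++idx) sum += (int16_t)ref[idx];
      return sum;
    }
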
@@ -343,12 +1151,10 @@ void vpx_int_pro_row_msa(int16_t *hbuf, const uint8_t *ref, const int ref_stride
#define vpx_int_pro_row vpx_int_pro_row_msa
void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_iwht4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_msa
+#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_iwht4x4_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_msa
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
void vpx_lpf_horizontal_16_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
@@ -496,18 +1302,10 @@ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int vpx_sad32x32_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad32x32_avg vpx_sad32x32_avg_msa
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x3_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_msa
-
void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void vpx_sad32x32x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad32x32x4d vpx_sad32x32x4d_msa
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_msa
-
unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad32x64_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad32x64 vpx_sad32x64_msa
@@ -552,10 +1350,6 @@ void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
void vpx_sad4x8x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad4x8x4d vpx_sad4x8x4d_msa
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad4x8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_msa
-
unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad64x32_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad64x32 vpx_sad64x32_msa
@@ -576,18 +1370,10 @@ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int vpx_sad64x64_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad64x64_avg vpx_sad64x64_avg_msa
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x3_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_msa
-
void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void vpx_sad64x64x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad64x64x4d vpx_sad64x64x4d_msa
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_msa
-
unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad8x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x16 vpx_sad8x16_msa
@@ -620,10 +1406,6 @@ void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
void vpx_sad8x4x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad8x4x4d vpx_sad8x4x4d_msa
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x4x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_msa
-
unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x8 vpx_sad8x8_msa
@@ -644,26 +1426,26 @@ void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p
void vpx_sad8x8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x8x8 vpx_sad8x8x8_msa
-int vpx_satd_c(const int16_t *coeff, int length);
-int vpx_satd_msa(const int16_t *coeff, int length);
-#define vpx_satd vpx_satd_msa
+int vpx_satd_c(const tran_low_t *coeff, int length);
+#define vpx_satd vpx_satd_c
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vpx_scaled_2d vpx_scaled_2d_c
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_msa
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_horiz vpx_scaled_horiz_c
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_vert vpx_scaled_vert_c
uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
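The hunks above migrate vpx_scaled_* (and, further down, the vpx_convolve* family) from per-axis int16_t filter pointers to a single InterpKernel table plus explicit initial phases (x0_q4, y0_q4). A minimal caller sketch, assuming v1.7.0's vpx_dsp/vpx_filter.h definition typedef int16_t InterpKernel[8] and linking against libvpx; half_scale is a hypothetical helper:

#include <stddef.h>
#include <stdint.h>

typedef int16_t InterpKernel[8];  /* assumed: matches vpx_dsp/vpx_filter.h */

/* Prototype exactly as generated above. */
void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride,
                     uint8_t *dst, ptrdiff_t dst_stride,
                     const InterpKernel *filter, int x0_q4, int x_step_q4,
                     int y0_q4, int y_step_q4, int w, int h);

static void half_scale(const uint8_t *src, ptrdiff_t sstride,
                       uint8_t *dst, ptrdiff_t dstride,
                       const InterpKernel *filters, int w, int h) {
  /* Steps are in 1/16-pel units: 32/16 = 2 source pixels per output
   * pixel in each axis, starting at phase 0. The old API passed two
   * filter pointers instead of one table plus these phase offsets. */
  vpx_scaled_2d_c(src, sstride, dst, dstride, filters,
                  /*x0_q4=*/0, /*x_step_q4=*/32,
                  /*y0_q4=*/0, /*y_step_q4=*/32, w, h);
}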
diff --git a/config/mips64-msa/vpx_scale_rtcd.h b/config/mips64-msa/vpx_scale_rtcd.h
index ea70efc9d..eb6c009e1 100644
--- a/config/mips64-msa/vpx_scale_rtcd.h
+++ b/config/mips64-msa/vpx_scale_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_SCALE_RTCD_H_
#define VPX_SCALE_RTCD_H_
@@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
#define vpx_yv12_copy_y vpx_yv12_copy_y_c
diff --git a/config/mips64-msa/vpx_version.h b/config/mips64-msa/vpx_version.h
index 24da169b4..6078bae90 100644
--- a/config/mips64-msa/vpx_version.h
+++ b/config/mips64-msa/vpx_version.h
@@ -1,7 +1,8 @@
+// This file is generated. Do not edit.
#define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 1
+#define VERSION_MINOR 7
+#define VERSION_PATCH 0
#define VERSION_EXTRA ""
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING " v1.6.1"
+#define VERSION_STRING_NOSP "v1.7.0"
+#define VERSION_STRING " v1.7.0"
diff --git a/config/mips64/vp8_rtcd.h b/config/mips64/vp8_rtcd.h
index 21dfa5a25..fbd444b8a 100644
--- a/config/mips64/vp8_rtcd.h
+++ b/config/mips64/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP8_RTCD_H_
#define VP8_RTCD_H_
diff --git a/config/mips64/vp9_rtcd.h b/config/mips64/vp9_rtcd.h
index c17a21721..91d3a1aab 100644
--- a/config/mips64/vp9_rtcd.h
+++ b/config/mips64/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP9_RTCD_H_
#define VP9_RTCD_H_
@@ -33,7 +34,7 @@ extern "C" {
int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_c
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
#define vp9_block_error_fp vp9_block_error_fp_c
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
@@ -51,12 +52,42 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
#define vp9_fht8x8 vp9_fht8x8_c
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_fwht4x4 vp9_fwht4x4_c
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_c
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
+
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
@@ -75,9 +106,6 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
void vp9_rtcd(void);
#include "vpx_config.h"
diff --git a/config/mips64/vpx_config.c b/config/mips64/vpx_config.c
index f7ec4e6d8..5eb7dff03 100644
--- a/config/mips64/vpx_config.c
+++ b/config/mips64/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=mips64-linux-gcc --disable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=mips64-linux-gcc --disable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
const char *vpx_codec_build_config(void) {return cfg;}
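The configure string embedded here is queryable at run time through the public API; a minimal sketch (vpx_codec_build_config() is declared in vpx/vpx_codec.h):

#include <stdio.h>
#include "vpx/vpx_codec.h"

int main(void) {
  /* Prints the cfg string above, including the newly added
   * --enable-vp9-highbitdepth flag. */
  printf("%s\n", vpx_codec_build_config());
  return 0;
}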
diff --git a/config/mips64/vpx_config.h b/config/mips64/vpx_config.h
index 9efd808ed..8e67ca3a7 100644
--- a/config/mips64/vpx_config.h
+++ b/config/mips64/vpx_config.h
@@ -29,7 +29,9 @@
#define HAVE_SSE4_1 0
#define HAVE_AVX 0
#define HAVE_AVX2 0
+#define HAVE_AVX512 0
#define HAVE_VSX 0
+#define HAVE_MMI 0
#define HAVE_VPX_PORTS 1
#define HAVE_PTHREAD_H 1
#define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
#define CONFIG_BETTER_HW_COMPATIBILITY 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
#define CONFIG_SPATIAL_SVC 0
#define CONFIG_FP_MB_STATS 0
#define CONFIG_EMULATE_HARDWARE 0
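Flipping CONFIG_VP9_HIGHBITDEPTH to 1 is what pulls the vpx_highbd_* prototypes into the rtcd headers below; library code gates on the macro in the usual way. A sketch, assuming only this vpx_config.h:

#include "vpx_config.h"

static int max_pixel_value(int bit_depth) {
#if CONFIG_VP9_HIGHBITDEPTH
  /* 8-, 10-, or 12-bit samples are all possible in this build. */
  return (1 << bit_depth) - 1;
#else
  (void)bit_depth;
  return 255;  /* 8-bit only */
#endif
}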
diff --git a/config/mips64/vpx_dsp_rtcd.h b/config/mips64/vpx_dsp_rtcd.h
index 1b15aadba..fbb38953d 100644
--- a/config/mips64/vpx_dsp_rtcd.h
+++ b/config/mips64/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_DSP_RTCD_H_
#define VPX_DSP_RTCD_H_
@@ -13,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#ifdef __cplusplus
@@ -28,28 +30,28 @@ unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
#define vpx_comp_avg_pred vpx_comp_avg_pred_c
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8 vpx_convolve8_c
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg vpx_convolve8_avg_c
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_horiz vpx_convolve8_horiz_c
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_vert vpx_convolve8_vert_c
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_avg vpx_convolve_avg_c
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_copy vpx_convolve_copy_c
void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -229,15 +231,843 @@ void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_16x16 vpx_hadamard_16x16_c
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_8x8 vpx_hadamard_8x8_c
void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+
+void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+
+unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c
+
+unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+
+unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+
+unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c
+
+unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c
+
+unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c
+
+unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c
+
+unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c
+
+unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c
+
+unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+
+unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+
+unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c
+
+unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c
+
+unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c
+
+unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+
+unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c
+
+void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+
+void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+
+unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c
+
+unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+
+unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+
+unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c
+
+unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c
+
+unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c
+
+unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c
+
+unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c
+
+unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c
+
+unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c
+
+unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c
+
+unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c
+
+unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c
+
+unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c
+
+unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c
+
+unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c
+
+void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c
+
+void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c
+
+unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c
+
+unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c
+
+unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c
+
+unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c
+
+unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c
+
+unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c
+
+unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c
+
+unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c
+
+unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c
+
+unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c
+
+unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c
+
+unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c
+
+unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c
+
+unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c
+
+unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c
+
+unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c
+
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p);
+#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c
+
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
+#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c
+
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
+#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
+
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8 vpx_highbd_convolve8_c
+
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c
+
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c
+
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c
+
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c
+
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c
+
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c
+
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c
+
+void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
+
+void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c
+
+void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c
+
+void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c
+
+void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_c
+
+void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_c
+
+void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_c
+
+void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_c
+
+void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c
+
+void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c
+
+void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c
+
+void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c
+
+void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c
+
+void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c
+
+void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c
+
+void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c
+
+void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_c
+
+void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_c
+
+void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_c
+
+void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_c
+
+void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c
+
+void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c
+
+void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c
+
+void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c
+
+void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_c
+
+void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_c
+
+void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_c
+
+void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_c
+
+void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_c
+
+void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_c
+
+void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_c
+
+void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_c
+
+void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_c
+
+void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_c
+
+void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_c
+
+void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_c
+
+void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_c
+
+void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_c
+
+void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_c
+
+void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_c
+
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c
+
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c
+
+void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_c
+
+void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_c
+
+void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_c
+
+void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c
+
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
+
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
+
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c
+
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_c
+
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_c
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_c
+
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_c
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_c
+
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_c
+
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_c
+
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_c
+
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_c
+
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_c
+
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_c
+
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_c
+
+void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
+
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
+
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
+
+unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c
+
+unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
+
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
+
+unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c
+
+unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
+
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
+
+unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c
+
+unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
+
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
+
+unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c
+
+unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
+
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
+
+unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c
+
+unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
+
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
+
+unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c
+
+unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
+
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
+
+unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c
+
+unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
+
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
+
+unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c
+
+unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
+
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
+
+unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c
+
+unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
+
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
+
+unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c
+
+unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
+
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
+
+unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c
+
+unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
+
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
+
+unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c
+
+unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
+
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
+
+unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c
+
+unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
+
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
+
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd);
+#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c
+
+void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_c
+
+void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_c
+
+void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_c
+
+void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_c
+
+void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_c
+
+void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_c
+
+void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_c
+
+void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c
+
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
@@ -400,15 +1230,9 @@ unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_
unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad32x32_avg vpx_sad32x32_avg_c
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
-
void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad32x32x4d vpx_sad32x32x4d_c
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad32x64 vpx_sad32x64_c
@@ -442,9 +1266,6 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad4x8x4d vpx_sad4x8x4d_c
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
-
unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad64x32 vpx_sad64x32_c
@@ -460,15 +1281,9 @@ unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_
unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad64x64_avg vpx_sad64x64_avg_c
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
-
void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad64x64x4d vpx_sad64x64x4d_c
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
-
unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x16 vpx_sad8x16_c
@@ -493,9 +1308,6 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad8x4x4d vpx_sad8x4x4d_c
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
-
unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x8 vpx_sad8x8_c
@@ -511,25 +1323,25 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x8x8 vpx_sad8x8x8_c
-int vpx_satd_c(const int16_t *coeff, int length);
+int vpx_satd_c(const tran_low_t *coeff, int length);
#define vpx_satd vpx_satd_c
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_2d vpx_scaled_2d_c
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_horiz vpx_scaled_horiz_c
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_vert vpx_scaled_vert_c
uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
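
The mips64 hunks above also show two prototype migrations that recur throughout this patch: vpx_satd now consumes tran_low_t coefficients instead of int16_t, and the scaled-convolve family takes a single InterpKernel filter table plus explicit x0_q4/y0_q4 subpixel phases in place of separate filter_x/filter_y pointers. A minimal caller-side sketch, not part of the patch, assuming InterpKernel is the 8-tap row type from vpx_dsp/vpx_filter.h:

/* Hedged sketch: adapting a caller to the InterpKernel-based
 * vpx_scaled_2d signature declared above. A step of 16 in q4 fixed
 * point means one full pixel per output sample (i.e. no scaling),
 * with phase 0 in both axes; the kernel table is caller-supplied. */
#include "vpx_dsp/vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_filter.h"  /* typedef int16_t InterpKernel[SUBPEL_TAPS]; */

static void copy_through_scaler(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const InterpKernel *kernels, int w, int h) {
  vpx_scaled_2d(src, src_stride, dst, dst_stride, kernels,
                /*x0_q4=*/0, /*x_step_q4=*/16,
                /*y0_q4=*/0, /*y_step_q4=*/16, w, h);
}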
diff --git a/config/mips64/vpx_scale_rtcd.h b/config/mips64/vpx_scale_rtcd.h
index ea70efc9d..eb6c009e1 100644
--- a/config/mips64/vpx_scale_rtcd.h
+++ b/config/mips64/vpx_scale_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_SCALE_RTCD_H_
#define VPX_SCALE_RTCD_H_
@@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
#define vpx_yv12_copy_y vpx_yv12_copy_y_c
diff --git a/config/mips64/vpx_version.h b/config/mips64/vpx_version.h
index 24da169b4..6078bae90 100644
--- a/config/mips64/vpx_version.h
+++ b/config/mips64/vpx_version.h
@@ -1,7 +1,8 @@
+// This file is generated. Do not edit.
#define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 1
+#define VERSION_MINOR 7
+#define VERSION_PATCH 0
#define VERSION_EXTRA ""
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING " v1.6.1"
+#define VERSION_STRING_NOSP "v1.7.0"
+#define VERSION_STRING " v1.7.0"
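
For the packing arithmetic above, v1.7.0 works out to (1 << 16) | (7 << 8) | 0 = 0x010700 = 67328, so client code can gate on a minimum library version with a single integer compare. A build-time sanity check, illustrative only:

/* Not part of the patch: a C89-compatible static assertion that the
 * generated header packs v1.7.0 as expected (negative array size on
 * mismatch forces a compile error). */
#include "vpx_version.h"
typedef char version_packed_is_1_7_0[VERSION_PACKED == 0x010700 ? 1 : -1];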
diff --git a/config/x86/vp8_rtcd.h b/config/x86/vp8_rtcd.h
index 77479a23b..3afbea668 100644
--- a/config/x86/vp8_rtcd.h
+++ b/config/x86/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP8_RTCD_H_
#define VP8_RTCD_H_
diff --git a/config/x86/vp9_rtcd.h b/config/x86/vp9_rtcd.h
index b19d6cc5a..49e9885aa 100644
--- a/config/x86/vp9_rtcd.h
+++ b/config/x86/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP9_RTCD_H_
#define VP9_RTCD_H_
@@ -34,15 +35,14 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, in
int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_sse2
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-int64_t vp9_block_error_fp_sse2(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
#define vp9_block_error_fp vp9_block_error_fp_sse2
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
#define vp9_diamond_search_sad vp9_diamond_search_sad_c
void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_fdct8x8_quant vp9_fdct8x8_quant_ssse3
@@ -58,14 +58,44 @@ void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
#define vp9_fht8x8 vp9_fht8x8_sse2
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sadx3
-
void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
#define vp9_fwht4x4 vp9_fwht4x4_sse2
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+int64_t vp9_highbd_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_sse2
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
+
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2
@@ -89,9 +119,6 @@ void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct y
void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_ssse3
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
void vp9_rtcd(void);
#ifdef RTCD_C
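
The vp9_rtcd()/RTCD_C pair above is the standard libvpx run-time-CPU-detection hook: each module exposes a *_rtcd() initializer that callers invoke once before using any dispatched symbol. Because this Android build configures with --disable-runtime-cpu-detect (see the configure line in vpx_config.c below), the #define lines bind every symbol at compile time and the initializers have essentially nothing left to do, but the call order remains the safe convention. A hedged sketch of that convention:

/* Sketch only, assuming the usual RTCD contract: one-time dispatch
 * setup before any codec work. Under --disable-runtime-cpu-detect
 * these calls are effectively no-ops, yet portable code still makes
 * them so the same source works with dynamic dispatch builds. */
#include "vp8_rtcd.h"
#include "vp9_rtcd.h"
#include "vpx_dsp_rtcd.h"
#include "vpx_scale_rtcd.h"

static void init_codec_dispatch(void) {
  vp8_rtcd();
  vp9_rtcd();
  vpx_dsp_rtcd();
  vpx_scale_rtcd();
}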
diff --git a/config/x86/vpx_config.asm b/config/x86/vpx_config.asm
index e9ff1c42f..d9848bace 100644
--- a/config/x86/vpx_config.asm
+++ b/config/x86/vpx_config.asm
@@ -17,7 +17,9 @@
%define HAVE_SSE4_1 0
%define HAVE_AVX 0
%define HAVE_AVX2 0
+%define HAVE_AVX512 0
%define HAVE_VSX 0
+%define HAVE_MMI 0
%define HAVE_VPX_PORTS 1
%define HAVE_PTHREAD_H 1
%define HAVE_UNISTD_H 1
@@ -71,10 +73,11 @@
%define CONFIG_TEMPORAL_DENOISING 1
%define CONFIG_VP9_TEMPORAL_DENOISING 0
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-%define CONFIG_VP9_HIGHBITDEPTH 0
+%define CONFIG_VP9_HIGHBITDEPTH 1
%define CONFIG_BETTER_HW_COMPATIBILITY 0
%define CONFIG_EXPERIMENTAL 0
%define CONFIG_SIZE_LIMIT 1
+%define CONFIG_ALWAYS_ADJUST_BPM 0
%define CONFIG_SPATIAL_SVC 0
%define CONFIG_FP_MB_STATS 0
%define CONFIG_EMULATE_HARDWARE 0
diff --git a/config/x86/vpx_config.c b/config/x86/vpx_config.c
index 77a386493..2d3f0f735 100644
--- a/config/x86/vpx_config.c
+++ b/config/x86/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=x86-linux-gcc --disable-sse4_1 --disable-avx --disable-avx2 --as=yasm --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=x86-linux-gcc --disable-sse4_1 --disable-avx --disable-avx2 --disable-avx512 --as=yasm --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
const char *vpx_codec_build_config(void) {return cfg;}
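
The cfg string above is compiled into the library and exposed through the public API, which makes it the quickest way to confirm at run time that a given binary really was built with --enable-vp9-highbitdepth. A minimal usage sketch, not part of the patch:

/* Prints the library version and the exact configure line baked in
 * above; both accessors are declared in vpx/vpx_codec.h. */
#include <stdio.h>
#include "vpx/vpx_codec.h"

int main(void) {
  printf("libvpx %s\nconfig: %s\n",
         vpx_codec_version_str(), vpx_codec_build_config());
  return 0;
}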
diff --git a/config/x86/vpx_config.h b/config/x86/vpx_config.h
index 11a5e94ad..5b0fed08f 100644
--- a/config/x86/vpx_config.h
+++ b/config/x86/vpx_config.h
@@ -29,7 +29,9 @@
#define HAVE_SSE4_1 0
#define HAVE_AVX 0
#define HAVE_AVX2 0
+#define HAVE_AVX512 0
#define HAVE_VSX 0
+#define HAVE_MMI 0
#define HAVE_VPX_PORTS 1
#define HAVE_PTHREAD_H 1
#define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
#define CONFIG_BETTER_HW_COMPATIBILITY 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
#define CONFIG_SPATIAL_SVC 0
#define CONFIG_FP_MB_STATS 0
#define CONFIG_EMULATE_HARDWARE 0
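
Flipping CONFIG_VP9_HIGHBITDEPTH to 1 is what drives most of this patch: it is the switch that widens transform coefficients and pulls every vpx_highbd_* prototype into the generated headers. Paraphrasing the definition in vpx_dsp/vpx_dsp_common.h (not part of this diff), the coefficient type those prototypes rely on looks like:

/* Why vpx_satd_c, the hadamard functions, and the quantizers now take
 * tran_low_t: 10- and 12-bit pipelines overflow int16_t
 * intermediates, so the coefficient type widens when high bit depth
 * is enabled. */
#if CONFIG_VP9_HIGHBITDEPTH
typedef int32_t tran_low_t;
#else
typedef int16_t tran_low_t;
#endif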
diff --git a/config/x86/vpx_dsp_rtcd.h b/config/x86/vpx_dsp_rtcd.h
index adc43df7b..69f2b43b5 100644
--- a/config/x86/vpx_dsp_rtcd.h
+++ b/config/x86/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_DSP_RTCD_H_
#define VPX_DSP_RTCD_H_
@@ -13,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#ifdef __cplusplus
@@ -31,42 +33,42 @@ void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int
void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
#define vpx_comp_avg_pred vpx_comp_avg_pred_sse2
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8 vpx_convolve8_ssse3
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg vpx_convolve8_avg_ssse3
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_ssse3
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_ssse3
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_horiz vpx_convolve8_horiz_ssse3
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_vert vpx_convolve8_vert_ssse3
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_avg vpx_convolve_avg_sse2
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_copy vpx_convolve_copy_sse2
void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -294,17 +296,1068 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_sse2
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_16x16_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_16x16 vpx_hadamard_16x16_sse2
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_8x8 vpx_hadamard_8x8_sse2
void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+
+void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+
+unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_sse2
+
+unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+
+unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+
+unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_sse2
+
+unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_sse2
+
+unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_sse2
+
+unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_sse2
+
+unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_sse2
+
+unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_sse2
+
+unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_sse2
+
+unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+
+unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+
+unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_sse2
+
+unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_sse2
+
+unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_sse2
+
+unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+
+unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_sse2
+
+void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+
+void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+
+unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_sse2
+
+unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+
+unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+
+unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_sse2
+
+unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_sse2
+
+unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_sse2
+
+unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_sse2
+
+unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_sse2
+
+unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_sse2
+
+unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_sse2
+
+unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c
+
+unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c
+
+unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_sse2
+
+unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_sse2
+
+unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_sse2
+
+unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c
+
+unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_sse2
+
+void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c
+
+void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c
+
+unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_sse2
+
+unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c
+
+unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c
+
+unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_sse2
+
+unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_sse2
+
+unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_sse2
+
+unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_sse2
+
+unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_sse2
+
+unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_sse2
+
+unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_sse2
+
+unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c
+
+unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c
+
+unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_sse2
+
+unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_sse2
+
+unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_sse2
+
+unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c
+
+unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_sse2
+
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p);
+#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c
+
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
+#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c
+
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
+#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
+
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8 vpx_highbd_convolve8_c
+
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c
+
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c
+
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c
+
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c
+
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c
+
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_sse2
+
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_sse2
+
+void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_ssse3
+
+void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_ssse3
+
+void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_sse2
+
+void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_ssse3
+
+void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_ssse3
+
+void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_ssse3
+
+void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_sse2
+
+void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_ssse3
+
+void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_ssse3
+
+void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_ssse3
+
+void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_sse2
+
+void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_ssse3
+
+void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_ssse3
+
+void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_ssse3
+
+void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d207_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_sse2
+
+void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_ssse3
+
+void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_ssse3
+
+void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_ssse3
+
+void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_ssse3
+
+void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_ssse3
+
+void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_ssse3
+
+void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_ssse3
+
+void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d63_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_sse2
+
+void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_ssse3
+
+void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_sse2
+
+void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_sse2
+
+void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_sse2
+
+void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_sse2
+
+void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_sse2
+
+void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_sse2
+
+void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_sse2
+
+void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_sse2
+
+void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_sse2
+
+void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_sse2
+
+void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_sse2
+
+void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_sse2
+
+void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_sse2
+
+void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_sse2
+
+void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_sse2
+
+void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_sse2
+
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_highbd_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_sse2
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_highbd_fdct32x32_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_sse2
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_highbd_fdct32x32_rd_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_sse2
+
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_highbd_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_sse2
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_highbd_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_sse2
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c
+
+void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_sse2
+
+void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_sse2
+
+void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_sse2
+
+void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_sse2
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_sse2
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_sse2
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_sse2
+
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_sse2
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1024_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_sse2
+
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_135_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_sse2
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_sse2
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_34_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_sse2
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_sse2
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_sse2
+
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_sse2
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_sse2
+
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_sse2
+
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_sse2
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_sse2
+
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_sse2
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_sse2
+
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_sse2
+
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_sse2
+
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_sse2
+
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_sse2
+
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_sse2
+
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_sse2
+
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_sse2
+
+void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
+
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b vpx_highbd_quantize_b_sse2
+
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_sse2
+
+unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_sse2
+
+unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_sse2
+
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_sse2
+
+unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_sse2
+
+unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_sse2
+
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_sse2
+
+unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_sse2
+
+unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_sse2
+
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_sse2
+
+unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_sse2
+
+unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_sse2
+
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_sse2
+
+unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_sse2
+
+unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_sse2
+
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_sse2
+
+unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_sse2
+
+unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_sse2
+
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_sse2
+
+unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c
+
+unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
+
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_sse2
+
+unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c
+
+unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
+
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_sse2
+
+unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_sse2
+
+unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_sse2
+
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_sse2
+
+unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_sse2
+
+unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_sse2
+
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_sse2
+
+unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_sse2
+
+unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_sse2
+
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_sse2
+
+unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_sse2
+
+unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_sse2
+
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_sse2
+
+unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_sse2
+
+unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_sse2
+
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_sse2
+
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd);
+#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c
+
+void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_sse2
+
+void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_sse2
+
+void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_sse2
+
+void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_sse2
+
+void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_sse2
+
+void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_sse2
+
+void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_sse2
+
+void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_sse2
+
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2
@@ -318,16 +1371,15 @@ void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stri
#define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2
void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_38_add vpx_idct16x16_256_add_sse2
+void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_38_add vpx_idct16x16_38_add_sse2
void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_ssse3
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_sse2
void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_135_add vpx_idct32x32_135_add_ssse3
@@ -359,8 +1411,7 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct8x8_64_add vpx_idct8x8_64_add_ssse3
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_sse2
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width);
@@ -463,10 +1514,12 @@ void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char
void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_quantize_b vpx_quantize_b_sse2
+void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b vpx_quantize_b_ssse3
void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_ssse3
unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -540,16 +1593,10 @@ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad32x32_avg vpx_sad32x32_avg_sse2
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
-
void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad32x32x4d vpx_sad32x32x4d_sse2
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad32x64 vpx_sad32x64_sse2
@@ -593,9 +1640,6 @@ void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad4x8x4d vpx_sad4x8x4d_sse2
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
-
unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad64x32 vpx_sad64x32_sse2
@@ -616,16 +1660,10 @@ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad64x64_avg vpx_sad64x64_avg_sse2
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
-
void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad64x64x4d vpx_sad64x64x4d_sse2
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
-
unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x16 vpx_sad8x16_sse2
@@ -657,9 +1695,6 @@ void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad8x4x4d vpx_sad8x4x4d_sse2
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
-
unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x8 vpx_sad8x8_sse2
@@ -679,27 +1714,27 @@ void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *
void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x8x8 vpx_sad8x8x8_c
-int vpx_satd_c(const int16_t *coeff, int length);
-int vpx_satd_sse2(const int16_t *coeff, int length);
+int vpx_satd_c(const tran_low_t *coeff, int length);
+int vpx_satd_sse2(const tran_low_t *coeff, int length);
#define vpx_satd vpx_satd_sse2
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_2d vpx_scaled_2d_ssse3
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_horiz vpx_scaled_horiz_c
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_vert vpx_scaled_vert_c
uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
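
The convolve and scale entries above move from separate filter_x/filter_y pointers to a single InterpKernel table plus explicit x0_q4/y0_q4 phase arguments (the vpx_dsp/vpx_filter.h include added to the x86_64 header further below supplies the type). A rough sketch of what that type means, assuming upstream libvpx's vpx_filter.h definitions rather than anything in this patch:

    #include <stdint.h>

    /* Assumed from upstream vpx_dsp/vpx_filter.h: 16 subpel phases, 8 taps each. */
    #define SUBPEL_BITS 4
    #define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
    #define SUBPEL_TAPS 8
    typedef int16_t InterpKernel[SUBPEL_TAPS];

    /* A convolver picks the 8-tap row for the current subpel phase: */
    static const int16_t *taps_for_phase(const InterpKernel *filter, int x_q4) {
      return filter[x_q4 & (SUBPEL_SHIFTS - 1)];
    }

Under that reading, x0_q4/y0_q4 carry the starting phase and x_step_q4/y_step_q4 the per-pixel increment, both in 1/16-pel units.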
diff --git a/config/x86/vpx_scale_rtcd.h b/config/x86/vpx_scale_rtcd.h
index ddf7d01cc..5f09104ea 100644
--- a/config/x86/vpx_scale_rtcd.h
+++ b/config/x86/vpx_scale_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_SCALE_RTCD_H_
#define VPX_SCALE_RTCD_H_
@@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
#define vpx_yv12_copy_y vpx_yv12_copy_y_c
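
vpx_yv12_copy_frame is newly routed through the scale RTCD table, resolving to the C implementation on x86. A minimal usage sketch, with the buffer type assumed from upstream vpx_scale/yv12config.h:

    #include "vpx_scale/yv12config.h"
    #include "vpx_scale_rtcd.h"

    /* Copies all three planes (and their borders) of src into an already
       allocated dst of matching dimensions. */
    static void snapshot_frame(const struct yv12_buffer_config *src,
                               struct yv12_buffer_config *dst) {
      vpx_yv12_copy_frame(src, dst);
    }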
diff --git a/config/x86/vpx_version.h b/config/x86/vpx_version.h
index 24da169b4..6078bae90 100644
--- a/config/x86/vpx_version.h
+++ b/config/x86/vpx_version.h
@@ -1,7 +1,8 @@
+// This file is generated. Do not edit.
#define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 1
+#define VERSION_MINOR 7
+#define VERSION_PATCH 0
#define VERSION_EXTRA ""
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING " v1.6.1"
+#define VERSION_STRING_NOSP "v1.7.0"
+#define VERSION_STRING " v1.7.0"
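
With the bump above, the packed version works out as follows (computed directly from the VERSION_PACKED formula):

    /* v1.7.0:     (1 << 16) | (7 << 8) | 0 == 0x010700 == 67328 */
    /* v1.6.1 was: (1 << 16) | (6 << 8) | 1 == 0x010601 == 67073 */

so any consumer check of the form VERSION_PACKED >= 0x010700 now passes.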
diff --git a/config/x86_64/vp8_rtcd.h b/config/x86_64/vp8_rtcd.h
index 77479a23b..3afbea668 100644
--- a/config/x86_64/vp8_rtcd.h
+++ b/config/x86_64/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP8_RTCD_H_
#define VP8_RTCD_H_
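
Every regenerated header now carries the "This file is generated. Do not edit." marker. Because this build passes --disable-runtime-cpu-detect (see the configure line in vpx_config.c further below), rtcd.pl emits the plain #define dispatch seen throughout these headers. For contrast, a rough sketch of the form the same generator emits when runtime detection is enabled, with names assumed from upstream libvpx rather than taken from this patch:

    /* Assumed upstream pattern, not part of this build: */
    RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input,
                                             uint8_t *dest, int stride);

    static void setup_rtcd_internal(void) {
      int flags = x86_simd_caps();  /* probe CPU features once at init */
      vpx_idct16x16_10_add = vpx_idct16x16_10_add_c;
      if (flags & HAS_SSE2)
        vpx_idct16x16_10_add = vpx_idct16x16_10_add_sse2;
    }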
diff --git a/config/x86_64/vp9_rtcd.h b/config/x86_64/vp9_rtcd.h
index f77b2a5c2..2a13e5d5c 100644
--- a/config/x86_64/vp9_rtcd.h
+++ b/config/x86_64/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP9_RTCD_H_
#define VP9_RTCD_H_
@@ -34,15 +35,14 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, in
int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_sse2
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-int64_t vp9_block_error_fp_sse2(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
#define vp9_block_error_fp vp9_block_error_fp_sse2
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
#define vp9_diamond_search_sad vp9_diamond_search_sad_c
void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_fdct8x8_quant vp9_fdct8x8_quant_ssse3
@@ -58,14 +58,44 @@ void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
#define vp9_fht8x8 vp9_fht8x8_sse2
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sadx3
-
void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
#define vp9_fwht4x4 vp9_fwht4x4_sse2
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+int64_t vp9_highbd_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_sse2
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
+
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2
@@ -91,9 +121,6 @@ void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct y
void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_ssse3
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
void vp9_rtcd(void);
#ifdef RTCD_C
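
The int16_t to tran_low_t widening in vp9_block_error_fp here (and in vpx_satd above) tracks the CONFIG_VP9_HIGHBITDEPTH=1 flip in the config files below: coefficients from 10/12-bit sources no longer fit in 16 bits. A sketch of the underlying typedef, assumed from upstream vpx_dsp/vpx_dsp_common.h:

    #include <stdint.h>

    /* Assumed from upstream vpx_dsp/vpx_dsp_common.h: */
    #if CONFIG_VP9_HIGHBITDEPTH
    typedef int32_t tran_low_t;  /* transform coeffs need headroom beyond 16 bits */
    #else
    typedef int16_t tran_low_t;  /* 8-bit-only builds keep the narrow type */
    #endif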
diff --git a/config/x86_64/vpx_config.asm b/config/x86_64/vpx_config.asm
index 798a81a1c..747981e62 100644
--- a/config/x86_64/vpx_config.asm
+++ b/config/x86_64/vpx_config.asm
@@ -17,7 +17,9 @@
%define HAVE_SSE4_1 0
%define HAVE_AVX 0
%define HAVE_AVX2 0
+%define HAVE_AVX512 0
%define HAVE_VSX 0
+%define HAVE_MMI 0
%define HAVE_VPX_PORTS 1
%define HAVE_PTHREAD_H 1
%define HAVE_UNISTD_H 1
@@ -71,10 +73,11 @@
%define CONFIG_TEMPORAL_DENOISING 1
%define CONFIG_VP9_TEMPORAL_DENOISING 0
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-%define CONFIG_VP9_HIGHBITDEPTH 0
+%define CONFIG_VP9_HIGHBITDEPTH 1
%define CONFIG_BETTER_HW_COMPATIBILITY 0
%define CONFIG_EXPERIMENTAL 0
%define CONFIG_SIZE_LIMIT 1
+%define CONFIG_ALWAYS_ADJUST_BPM 0
%define CONFIG_SPATIAL_SVC 0
%define CONFIG_FP_MB_STATS 0
%define CONFIG_EMULATE_HARDWARE 0
diff --git a/config/x86_64/vpx_config.c b/config/x86_64/vpx_config.c
index 9aa0640aa..a13a1d2e2 100644
--- a/config/x86_64/vpx_config.c
+++ b/config/x86_64/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=x86_64-linux-gcc --disable-sse4_1 --disable-avx --disable-avx2 --as=yasm --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=x86_64-linux-gcc --disable-sse4_1 --disable-avx --disable-avx2 --disable-avx512 --as=yasm --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
const char *vpx_codec_build_config(void) {return cfg;}
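
The quoted cfg string is what vpx_codec_build_config() returns, so the flag change is observable at runtime. A minimal check:

    #include <stdio.h>
    #include "vpx/vpx_codec.h"

    int main(void) {
      /* Prints the configure flags this library was built with; after this
         patch the output ends in "--enable-vp9-highbitdepth". */
      printf("%s\n", vpx_codec_build_config());
      return 0;
    }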
diff --git a/config/x86_64/vpx_config.h b/config/x86_64/vpx_config.h
index d24e047ba..75d7e9900 100644
--- a/config/x86_64/vpx_config.h
+++ b/config/x86_64/vpx_config.h
@@ -29,7 +29,9 @@
#define HAVE_SSE4_1 0
#define HAVE_AVX 0
#define HAVE_AVX2 0
+#define HAVE_AVX512 0
#define HAVE_VSX 0
+#define HAVE_MMI 0
#define HAVE_VPX_PORTS 1
#define HAVE_PTHREAD_H 1
#define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
#define CONFIG_BETTER_HW_COMPATIBILITY 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
#define CONFIG_SPATIAL_SVC 0
#define CONFIG_FP_MB_STATS 0
#define CONFIG_EMULATE_HARDWARE 0
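
With CONFIG_VP9_HIGHBITDEPTH now 1, the vpx_highbd_* entry points added to the rtcd headers above become reachable, and callers select them with compile-time guards. A hedged sketch of the usual call-site pattern, with CONVERT_TO_SHORTPTR assumed from upstream vpx_dsp/vpx_dsp_common.h (in highbitdepth builds, 16-bit samples travel behind uint8_t pointers and are unpacked at the call site):

    #include <stddef.h>
    #include <stdint.h>
    #include "vpx_dsp/vpx_dsp_common.h"  /* CONVERT_TO_SHORTPTR (assumed) */
    #include "vpx_dsp_rtcd.h"

    static void predict_dc_8x8(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left,
                               int bd) {
    #if CONFIG_VP9_HIGHBITDEPTH
      vpx_highbd_dc_predictor_8x8(CONVERT_TO_SHORTPTR(dst), stride,
                                  CONVERT_TO_SHORTPTR(above),
                                  CONVERT_TO_SHORTPTR(left), bd);
    #else
      (void)bd;
      vpx_dc_predictor_8x8(dst, stride, above, left);
    #endif
    }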
diff --git a/config/x86_64/vpx_dsp_rtcd.h b/config/x86_64/vpx_dsp_rtcd.h
index 4e55439ca..a382a5a05 100644
--- a/config/x86_64/vpx_dsp_rtcd.h
+++ b/config/x86_64/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_DSP_RTCD_H_
#define VPX_DSP_RTCD_H_
@@ -13,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#ifdef __cplusplus
@@ -31,42 +33,42 @@ void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int
void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
#define vpx_comp_avg_pred vpx_comp_avg_pred_sse2
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8 vpx_convolve8_ssse3
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg vpx_convolve8_avg_ssse3
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_ssse3
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_ssse3
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_horiz vpx_convolve8_horiz_ssse3
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_vert vpx_convolve8_vert_ssse3
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_avg vpx_convolve_avg_sse2
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_copy vpx_convolve_copy_sse2
void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -257,8 +259,7 @@ void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride);
void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
void vpx_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct8x8_ssse3(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct8x8 vpx_fdct8x8_ssse3
+#define vpx_fdct8x8 vpx_fdct8x8_sse2
void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride);
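
Editor's note: every entry in the large hunk that follows uses the same configure-time dispatch pattern as the declarations above: a C reference implementation plus optional SIMD variants, with a #define binding the generic name to the best variant available in this x86_64 build. A hedged, self-contained illustration with hypothetical names:

/* hypothetical op, mirroring the pattern of the generated declarations */
unsigned int vpx_example_op_c(const uint8_t *src, int stride);
unsigned int vpx_example_op_sse2(const uint8_t *src, int stride);
/* SSE2 is part of the x86_64 baseline, so the macro selects it statically;
 * ops with no SIMD variant bind to the _c symbol instead. */
#define vpx_example_op vpx_example_op_sse2
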
@@ -295,18 +296,1075 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_sse2
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_16x16_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_16x16 vpx_hadamard_16x16_sse2
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_8x8 vpx_hadamard_8x8_ssse3
void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+
+void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+
+unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_sse2
+
+unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+
+unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+
+unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_sse2
+
+unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_sse2
+
+unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_sse2
+
+unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_sse2
+
+unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_sse2
+
+unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_sse2
+
+unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_sse2
+
+unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+
+unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+
+unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_sse2
+
+unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_sse2
+
+unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_sse2
+
+unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+
+unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_sse2
+
+void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+
+void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+
+unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_sse2
+
+unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+
+unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+
+unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_12_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_sse2
+
+unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_sse2
+
+unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_sse2
+
+unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_sse2
+
+unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_sse2
+
+unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_sse2
+
+unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_sse2
+
+unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c
+
+unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c
+
+unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_sse2
+
+unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_sse2
+
+unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_sse2
+
+unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c
+
+unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_sse2
+
+void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c
+
+void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c
+
+unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_sse2
+
+unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c
+
+unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c
+
+unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_sse2
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_8_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_sse2
+
+unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_sse2
+
+unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_sse2
+
+unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_sse2
+
+unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_sse2
+
+unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_sse2
+
+unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_sse2
+
+unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c
+
+unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c
+
+unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_sse2
+
+unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_sse2
+
+unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_sse2
+
+unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c
+
+unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_8_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_sse2
+
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p);
+#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c
+
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
+#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c
+
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
+#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
+
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8 vpx_highbd_convolve8_sse2
+
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_sse2
+
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_sse2
+
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_sse2
+
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_sse2
+
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_sse2
+
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_sse2
+
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_sse2
+
+void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_ssse3
+
+void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_ssse3
+
+void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_sse2
+
+void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_ssse3
+
+void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_ssse3
+
+void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_ssse3
+
+void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_sse2
+
+void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_ssse3
+
+void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_ssse3
+
+void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_ssse3
+
+void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_sse2
+
+void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_ssse3
+
+void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_ssse3
+
+void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_ssse3
+
+void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d207_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_sse2
+
+void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_ssse3
+
+void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_ssse3
+
+void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_ssse3
+
+void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_ssse3
+
+void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_ssse3
+
+void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_ssse3
+
+void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_ssse3
+
+void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d63_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_sse2
+
+void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_ssse3
+
+void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_sse2
+
+void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_sse2
+
+void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_sse2
+
+void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_sse2
+
+void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_sse2
+
+void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_sse2
+
+void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_sse2
+
+void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_sse2
+
+void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_sse2
+
+void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_sse2
+
+void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_sse2
+
+void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_sse2
+
+void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_sse2
+
+void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_sse2
+
+void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_sse2
+
+void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_sse2
+
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_highbd_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_sse2
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_highbd_fdct32x32_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_sse2
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_highbd_fdct32x32_rd_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_sse2
+
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_highbd_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_sse2
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_highbd_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_sse2
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c
+
+void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_sse2
+
+void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_sse2
+
+void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_sse2
+
+void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_sse2
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_sse2
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_sse2
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_sse2
+
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_sse2
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1024_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_sse2
+
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_135_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_sse2
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_sse2
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_34_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_sse2
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_sse2
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_sse2
+
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_sse2
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_sse2
+
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_sse2
+
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_sse2
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_sse2
+
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_sse2
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_sse2
+
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_sse2
+
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_sse2
+
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_sse2
+
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_sse2
+
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_sse2
+
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_sse2
+
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_sse2
+
+void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
+
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b vpx_highbd_quantize_b_sse2
+
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_sse2
+
+unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_sse2
+
+unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_sse2
+
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_sse2
+
+unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_sse2
+
+unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_sse2
+
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_sse2
+
+unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_sse2
+
+unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_sse2
+
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_sse2
+
+unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_sse2
+
+unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_sse2
+
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_sse2
+
+unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_sse2
+
+unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_sse2
+
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_sse2
+
+unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_sse2
+
+unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_sse2
+
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_sse2
+
+unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c
+
+unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
+
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_sse2
+
+unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c
+
+unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
+
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_sse2
+
+unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_sse2
+
+unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_sse2
+
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_sse2
+
+unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_sse2
+
+unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_sse2
+
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_sse2
+
+unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_sse2
+
+unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_sse2
+
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_sse2
+
+unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_sse2
+
+unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_sse2
+
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_sse2
+
+unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_highbd_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_sse2
+
+unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_highbd_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_sse2
+
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_sse2
+
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd);
+#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c
+
+void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_sse2
+
+void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_sse2
+
+void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_sse2
+
+void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_sse2
+
+void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_sse2
+
+void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_sse2
+
+void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_sse2
+
+void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_sse2
+
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2
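/*
 * Editor's sketch, not part of the patch: every entry in this generated RTCD
 * header follows one pattern -- a portable _c reference, optional SIMD
 * variants, and (since this is a static per-target config) a #define binding
 * the public name straight to the best variant, so calls compile to direct
 * calls with no function-pointer indirection.  A minimal, self-contained
 * illustration with hypothetical kernel names:
 */
#include <stddef.h>
#include <stdint.h>

/* Hypothetical stand-ins for the _c/_sse2 pairs above. */
static void kernel_c(uint16_t *dst, ptrdiff_t stride) { (void)stride; dst[0] = 0; }
static void kernel_sse2(uint16_t *dst, ptrdiff_t stride) { (void)stride; dst[0] = 1; }

/* Static dispatch: callers use the bare name; it resolves at compile time. */
#define kernel kernel_sse2

int main(void) {
  uint16_t row[4] = { 9, 9, 9, 9 };
  kernel_c(row, 4); /* the reference path stays callable by its full name */
  kernel(row, 4);   /* expands to kernel_sse2(row, 4) */
  return row[0] == 1 ? 0 : 1; /* 0: the SSE2 variant ran */
}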
@@ -320,16 +1378,15 @@ void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stri
#define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2
void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_38_add vpx_idct16x16_256_add_sse2
+void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_38_add vpx_idct16x16_38_add_sse2
void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_ssse3
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_sse2
void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_135_add vpx_idct32x32_135_add_ssse3
@@ -361,8 +1418,7 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct8x8_64_add vpx_idct8x8_64_add_ssse3
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_sse2
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width);
@@ -544,16 +1600,10 @@ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad32x32_avg vpx_sad32x32_avg_sse2
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
-
void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad32x32x4d vpx_sad32x32x4d_sse2
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad32x64 vpx_sad32x64_sse2
@@ -597,9 +1647,6 @@ void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad4x8x4d vpx_sad4x8x4d_sse2
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
-
unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad64x32 vpx_sad64x32_sse2
@@ -620,16 +1667,10 @@ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad64x64_avg vpx_sad64x64_avg_sse2
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
-
void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad64x64x4d vpx_sad64x64x4d_sse2
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
-
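/*
 * Editor's sketch, not part of the patch: these hunks delete the unused
 * x3/x8 multi-SAD prototypes and keep only the x4d form, which scores one
 * source block against four candidate references per call.  A reference
 * loop consistent with the x4d prototypes above (block size passed
 * explicitly here for illustration):
 */
#include <stdint.h>
#include <stdlib.h>

static void sad_x4d_ref(const uint8_t *src, int src_stride,
                        const uint8_t *const ref[4], int ref_stride,
                        uint32_t *sad, int w, int h) {
  for (int k = 0; k < 4; ++k) {
    uint32_t acc = 0;
    for (int r = 0; r < h; ++r)
      for (int c = 0; c < w; ++c)
        acc += (uint32_t)abs(src[r * src_stride + c] - ref[k][r * ref_stride + c]);
    sad[k] = acc;
  }
}

int main(void) {
  const uint8_t s[4] = { 1, 2, 3, 4 };
  const uint8_t r1[4] = { 0, 0, 0, 0 };
  const uint8_t *const refs[4] = { s, r1, s, r1 };
  uint32_t sad[4];
  sad_x4d_ref(s, 4, refs, 4, sad, 4, 1); /* one 4x1 block, four references */
  return (sad[0] == 0 && sad[1] == 10) ? 0 : 1;
}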
unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x16 vpx_sad8x16_sse2
@@ -661,9 +1702,6 @@ void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad8x4x4d vpx_sad8x4x4d_sse2
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
-
unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x8 vpx_sad8x8_sse2
@@ -683,27 +1721,27 @@ void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *
void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x8x8 vpx_sad8x8x8_c
-int vpx_satd_c(const int16_t *coeff, int length);
-int vpx_satd_sse2(const int16_t *coeff, int length);
+int vpx_satd_c(const tran_low_t *coeff, int length);
+int vpx_satd_sse2(const tran_low_t *coeff, int length);
#define vpx_satd vpx_satd_sse2
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_2d vpx_scaled_2d_ssse3
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_horiz vpx_scaled_horiz_c
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_vert vpx_scaled_vert_c
uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
diff --git a/config/x86_64/vpx_scale_rtcd.h b/config/x86_64/vpx_scale_rtcd.h
index ddf7d01cc..5f09104ea 100644
--- a/config/x86_64/vpx_scale_rtcd.h
+++ b/config/x86_64/vpx_scale_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_SCALE_RTCD_H_
#define VPX_SCALE_RTCD_H_
@@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
#define vpx_yv12_copy_y vpx_yv12_copy_y_c
diff --git a/config/x86_64/vpx_version.h b/config/x86_64/vpx_version.h
index 24da169b4..6078bae90 100644
--- a/config/x86_64/vpx_version.h
+++ b/config/x86_64/vpx_version.h
@@ -1,7 +1,8 @@
+// This file is generated. Do not edit.
#define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 1
+#define VERSION_MINOR 7
+#define VERSION_PATCH 0
#define VERSION_EXTRA ""
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING " v1.6.1"
+#define VERSION_STRING_NOSP "v1.7.0"
+#define VERSION_STRING " v1.7.0"
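VERSION_PACKED folds the three version fields into a single integer, so the bump to v1.7.0 packs as 0x010700. A quick shell check (illustration only; not part of the patch):

    # (major << 16) | (minor << 8) | patch for v1.7.0
    echo $(( (1 << 16) | (7 << 8) | 0 ))   # 67328 == 0x010700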
diff --git a/generate_config.sh b/generate_config.sh
index 97e5a0026..351feb87c 100755
--- a/generate_config.sh
+++ b/generate_config.sh
@@ -139,23 +139,76 @@ function gen_config_files {
rm -rf vpx_config.* vpx_version.h
}
+# Generate a text file containing sources for a config
+# $1 - Config
+function gen_source_list {
+ make_clean
+ if [[ "$1" = "mips"* ]] || [[ "$1" = "generic" ]]; then
+ config=$(print_config_basic $1)
+ else
+ config=$(print_config $1)
+ fi
+ make libvpx_srcs.txt target=libs $config > /dev/null
+ mv libvpx_srcs.txt libvpx_srcs_$1.txt
+}
+
+# Extract a list of C sources from a libvpx_srcs.txt file
+# $1 - path to libvpx_srcs.txt
+function libvpx_srcs_txt_to_c_srcs {
+ grep ".c$" $1 | grep -v "^vpx_config.c$" | awk '$0="\"libvpx/"$0"\","' | sort
+}
+
+# Extract a list of ASM sources from a libvpx_srcs.txt file
+# $1 - path to libvpx_srcs.txt
+function libvpx_srcs_txt_to_asm_srcs {
+ grep ".asm$" $1 | awk '$0="\"libvpx/"$0"\","' | sort
+}
+
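The two helpers above turn each matching source path into a quoted, comma-terminated Android.bp entry and sort the result. A minimal sketch of the pipeline with a hypothetical input list:

    $ printf 'vp8/common/alloccommon.c\nvpx_config.c\n' |
        grep ".c$" | grep -v "^vpx_config.c$" | awk '$0="\"libvpx/"$0"\","'
    "libvpx/vp8/common/alloccommon.c",

vpx_config.c is filtered out because each config supplies its own copy from $LIBVPX_CONFIG_DIR.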
# Convert a list of sources to a blueprint file containing a variable
# assignment.
-# $1 - Variable name prefix.
-# $2 - Input file.
-# $3 - Config directory.
+# $1 - Config
function gen_bp_srcs {
- echo "$1_c_srcs = ["
- grep ".c$" $2 | grep -v "^vpx_config.c$" | awk '$0="\"libvpx/"$0"\","'
- echo "\"$3/vpx_config.c\","
- echo "]"
- if grep -q ".asm$" $2; then
+ (
+ varprefix=libvpx_${1//-/_}
+ echo "${varprefix}_c_srcs = ["
+ libvpx_srcs_txt_to_c_srcs libvpx_srcs_$1.txt
+ echo "\"$LIBVPX_CONFIG_DIR/$1/vpx_config.c\","
+ echo "]"
+ if grep -q ".asm$" libvpx_srcs_$1.txt; then
+ echo
+ echo "${varprefix}_asm_srcs = ["
+ libvpx_srcs_txt_to_asm_srcs libvpx_srcs_$1.txt
+ echo "]"
+ fi
echo
- echo "$1_asm_srcs = ["
- grep ".asm$" $2 | awk '$0="\"libvpx/"$0"\","'
+ ) > config_$1.bp
+}
+
+# Convert a list of sources to a blueprint file containing a variable
+# assignment, relative to a reference config.
+# $1 - Config
+# $2 - Reference config
+function gen_bp_srcs_with_excludes {
+ (
+ varprefix=libvpx_${1//-/_}
+ echo "${varprefix}_c_srcs = ["
+ comm -23 <(libvpx_srcs_txt_to_c_srcs libvpx_srcs_$1.txt) <(libvpx_srcs_txt_to_c_srcs libvpx_srcs_$2.txt)
+ echo "\"$LIBVPX_CONFIG_DIR/$1/vpx_config.c\","
echo "]"
- fi
- echo
+ echo
+ echo "${varprefix}_exclude_c_srcs = ["
+ comm -13 <(libvpx_srcs_txt_to_c_srcs libvpx_srcs_$1.txt) <(libvpx_srcs_txt_to_c_srcs libvpx_srcs_$2.txt)
+ echo "\"$LIBVPX_CONFIG_DIR/$2/vpx_config.c\","
+ echo "]"
+ echo
+ if grep -q ".asm$" libvpx_srcs_$1.txt; then
+ echo
+ echo "${varprefix}_asm_srcs = ["
+ libvpx_srcs_txt_to_asm_srcs libvpx_srcs_$1.txt
+ echo "]"
+ fi
+ echo
+ ) > config_$1.bp
}
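comm(1) does the heavy lifting in the function above: given two sorted lists, comm -23 keeps lines unique to the first (sources only the specialized config builds) and comm -13 keeps lines unique to the second (baseline sources the specialized config must exclude). A sketch with hypothetical lists:

    $ comm -23 <(printf 'a.c\nb.c\n') <(printf 'b.c\nc.c\n')
    a.c
    $ comm -13 <(printf 'a.c\nb.c\n') <(printf 'b.c\nc.c\n')
    c.c

Both inputs are already sorted because libvpx_srcs_txt_to_c_srcs ends in sort, which is what comm requires.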
echo "Create temporary directory."
@@ -165,8 +218,10 @@ cp -R $LIBVPX_SRC_DIR $TEMP_DIR
cd $TEMP_DIR
echo "Generate config files."
-all_platforms="--enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072"
-intel="--disable-sse4_1 --disable-avx --disable-avx2 --as=yasm"
+all_platforms="--enable-external-build --enable-realtime-only --enable-pic"
+all_platforms+=" --disable-runtime-cpu-detect --disable-install-docs"
+all_platforms+=" --size-limit=4096x3072 --enable-vp9-highbitdepth"
+intel="--disable-sse4_1 --disable-avx --disable-avx2 --disable-avx512 --as=yasm"
gen_config_files x86 "--target=x86-linux-gcc ${intel} ${all_platforms}"
gen_config_files x86_64 "--target=x86_64-linux-gcc ${intel} ${all_platforms}"
gen_config_files arm "--target=armv7-linux-gcc --disable-neon ${all_platforms}"
@@ -218,71 +273,31 @@ echo "Prepare Makefile."
./configure --target=generic-gnu > /dev/null
make_clean
-echo "Generate X86 source list."
-config=$(print_config x86)
-make_clean
-make libvpx_srcs.txt target=libs $config > /dev/null
-gen_bp_srcs libvpx_x86 libvpx_srcs.txt $LIBVPX_CONFIG_DIR/x86 > config_x86.bp
-
-echo "Generate X86_64 source list."
-config=$(print_config x86_64)
-make_clean
-make libvpx_srcs.txt target=libs $config > /dev/null
-gen_bp_srcs libvpx_x86_64 libvpx_srcs.txt $LIBVPX_CONFIG_DIR/x86_64 > config_x86_64.bp
-
-echo "Generate ARM source list."
-config=$(print_config arm)
-make_clean
-make libvpx_srcs.txt target=libs $config > /dev/null
-gen_bp_srcs libvpx_arm libvpx_srcs.txt $LIBVPX_CONFIG_DIR/arm > config_arm.bp
-
-echo "Generate ARM NEON source list."
-config=$(print_config arm-neon)
-make_clean
-make libvpx_srcs.txt target=libs $config > /dev/null
-gen_bp_srcs libvpx_arm_neon libvpx_srcs.txt $LIBVPX_CONFIG_DIR/arm-neon > config_arm-neon.bp
-
-echo "Generate ARM64 source list."
-config=$(print_config arm64)
-make_clean
-make libvpx_srcs.txt target=libs $config > /dev/null
-gen_bp_srcs libvpx_arm64 libvpx_srcs.txt $LIBVPX_CONFIG_DIR/arm64 > config_arm64.bp
-
-echo "Generate MIPS source list."
-config=$(print_config_basic mips32)
-make_clean
-make libvpx_srcs.txt target=libs $config > /dev/null
-gen_bp_srcs libvpx_mips32 libvpx_srcs.txt $LIBVPX_CONFIG_DIR/mips32 > config_mips32.bp
-
-echo "Generate MIPS DSPR2 source list."
-config=$(print_config_basic mips32-dspr2)
-make_clean
-make libvpx_srcs.txt target=libs $config > /dev/null
-gen_bp_srcs libvpx_mips32_dspr2 libvpx_srcs.txt $LIBVPX_CONFIG_DIR/mips32-dspr2 > config_mips32-dispr2.bp
-
-echo "Generate MIPS MSA source list."
-config=$(print_config_basic mips32-msa)
-make_clean
-make libvpx_srcs.txt target=libs $config > /dev/null
-gen_bp_srcs libvpx_mips32_msa libvpx_srcs.txt $LIBVPX_CONFIG_DIR/mips32-msa > config_mips32-msa.bp
-
-echo "Generate MIPS64 source list."
-config=$(print_config_basic mips64)
-make_clean
-make libvpx_srcs.txt target=libs $config > /dev/null
-gen_bp_srcs libvpx_mips64 libvpx_srcs.txt $LIBVPX_CONFIG_DIR/mips64 > config_mips64.bp
-
-echo "Generate MIPS64 MSA source list."
-config=$(print_config_basic mips64-msa)
-make_clean
-make libvpx_srcs.txt target=libs $config > /dev/null
-gen_bp_srcs libvpx_mips64_msa libvpx_srcs.txt $LIBVPX_CONFIG_DIR/mips64-msa > config_mips64-msa.bp
-
-echo "Generate GENERIC source list."
-config=$(print_config_basic generic)
-make_clean
-make libvpx_srcs.txt target=libs $config > /dev/null
-gen_bp_srcs libvpx_generic libvpx_srcs.txt $LIBVPX_CONFIG_DIR/generic > config_generic.bp
+echo "Generate source lists"
+gen_source_list x86
+gen_source_list x86_64
+gen_source_list arm
+gen_source_list arm-neon
+gen_source_list arm64
+gen_source_list mips32
+gen_source_list mips32-dspr2
+gen_source_list mips32-msa
+gen_source_list mips64
+gen_source_list mips64-msa
+gen_source_list generic
+
+echo "Convert to bp"
+gen_bp_srcs x86
+gen_bp_srcs x86_64
+gen_bp_srcs arm
+gen_bp_srcs_with_excludes arm-neon arm
+gen_bp_srcs arm64
+gen_bp_srcs mips32
+gen_bp_srcs_with_excludes mips32-dspr2 mips32
+gen_bp_srcs_with_excludes mips32-msa mips32
+gen_bp_srcs mips64
+gen_bp_srcs_with_excludes mips64-msa mips64
+gen_bp_srcs generic
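Each *_with_excludes call pairs a SIMD config with its baseline, so the generated blueprint fragment carries both an add list and an exclude list. A hypothetical excerpt of what config_arm-neon.bp would contain (the file name and the expansion of $LIBVPX_CONFIG_DIR are illustrative):

    libvpx_arm_neon_c_srcs = [
        "libvpx/vpx_dsp/arm/intrapred_neon.c",
        "config/arm-neon/vpx_config.c",
    ]

    libvpx_arm_neon_exclude_c_srcs = [
        "config/arm/vpx_config.c",
    ]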
rm -f $BASE_DIR/Android.bp
(
diff --git a/libvpx/.clang-format b/libvpx/.clang-format
index 7837b7704..c1483199e 100644
--- a/libvpx/.clang-format
+++ b/libvpx/.clang-format
@@ -1,7 +1,7 @@
---
Language: Cpp
# BasedOnStyle: Google
-# Generated with clang-format 3.9.1
+# Generated with clang-format 4.0.1
AccessModifierOffset: -1
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
@@ -60,6 +60,8 @@ IncludeIsMainRegex: '([-_](test|unittest))?$'
IndentCaseLabels: true
IndentWidth: 2
IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
@@ -78,6 +80,7 @@ PointerAlignment: Right
ReflowComments: true
SortIncludes: false
SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
diff --git a/libvpx/.mailmap b/libvpx/.mailmap
index 166c45ee8..29af51065 100644
--- a/libvpx/.mailmap
+++ b/libvpx/.mailmap
@@ -3,6 +3,7 @@ Aℓex Converse <aconverse@google.com>
Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com>
Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
Alpha Lam <hclam@google.com> <hclam@chromium.org>
+Chris Cunningham <chcunningham@chromium.org>
Daniele Castagna <dcastagna@chromium.org> <dcastagna@google.com>
Deb Mukherjee <debargha@google.com>
Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
@@ -21,18 +22,21 @@ Marco Paniconi <marpan@google.com>
Marco Paniconi <marpan@google.com> <marpan@chromium.org>
Pascal Massimino <pascal.massimino@gmail.com>
Paul Wilkins <paulwilkins@google.com>
+Peter Boström <pbos@chromium.org> <pbos@google.com>
Peter de Rivaz <peter.derivaz@gmail.com>
Peter de Rivaz <peter.derivaz@gmail.com> <peter.derivaz@argondesign.com>
Ralph Giles <giles@xiph.org> <giles@entropywave.com>
Ralph Giles <giles@xiph.org> <giles@mozilla.com>
Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
Sami Pietilä <samipietila@google.com>
+Shiyou Yin <yinshiyou-hf@loongson.cn>
Tamar Levy <tamar.levy@intel.com>
Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
Timothy B. Terriberry <tterribe@xiph.org> <tterriberry@mozilla.com>
Tom Finegan <tomfinegan@google.com>
Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
+Urvang Joshi <urvang@google.com> <urvang@chromium.org>
Yaowu Xu <yaowu@google.com> <adam@xuyaowu.com>
Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
Yaowu Xu <yaowu@google.com> <Yaowu Xu>
diff --git a/libvpx/AUTHORS b/libvpx/AUTHORS
index 87a5e845c..04c287243 100644
--- a/libvpx/AUTHORS
+++ b/libvpx/AUTHORS
@@ -3,13 +3,13 @@
Aaron Watry <awatry@gmail.com>
Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
-Adam Xu <adam@xuyaowu.com>
Adrian Grange <agrange@google.com>
Aℓex Converse <aconverse@google.com>
Ahmad Sharif <asharif@google.com>
Aleksey Vasenev <margtu-fivt@ya.ru>
Alexander Potapenko <glider@google.com>
Alexander Voronov <avoronov@graphics.cs.msu.ru>
+Alexandra Hájková <alexandra.khirnova@gmail.com>
Alexis Ballier <aballier@gentoo.org>
Alok Ahuja <waveletcoeff@gmail.com>
Alpha Lam <hclam@google.com>
@@ -17,6 +17,7 @@ A.Mahfoodh <ab.mahfoodh@gmail.com>
Ami Fischman <fischman@chromium.org>
Andoni Morales Alastruey <ylatuya@gmail.com>
Andres Mejia <mcitadel@gmail.com>
+Andrew Lewis <andrewlewis@google.com>
Andrew Russell <anrussell@google.com>
Angie Chiang <angiebird@google.com>
Aron Rosenberg <arosenberg@logitech.com>
@@ -24,7 +25,9 @@ Attila Nagy <attilanagy@google.com>
Brion Vibber <bvibber@wikimedia.org>
changjun.yang <changjun.yang@intel.com>
Charles 'Buck' Krasic <ckrasic@google.com>
+Cheng Chen <chengchen@google.com>
chm <chm@rock-chips.com>
+Chris Cunningham <chcunningham@chromium.org>
Christian Duvivier <cduvivier@google.com>
Daniele Castagna <dcastagna@chromium.org>
Daniel Kang <ddkang@google.com>
@@ -46,10 +49,12 @@ Geza Lore <gezalore@gmail.com>
Ghislain MARY <ghislainmary2@gmail.com>
Giuseppe Scrivano <gscrivano@gnu.org>
Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
+Gregor Jasny <gjasny@gmail.com>
Guillaume Martres <gmartres@google.com>
Guillermo Ballester Valor <gbvalor@gmail.com>
Hangyu Kuang <hkuang@google.com>
Hanno Böck <hanno@hboeck.de>
+Han Shen <shenhan@google.com>
Henrik Lundin <hlundin@google.com>
Hui Su <huisu@google.com>
Ivan Krasin <krasin@chromium.org>
@@ -83,6 +88,7 @@ Justin Clift <justin@salasaga.org>
Justin Lebar <justin.lebar@gmail.com>
Kaustubh Raste <kaustubh.raste@imgtec.com>
KO Myung-Hun <komh@chollian.net>
+Kyle Siefring <kylesiefring@gmail.com>
Lawrence Velázquez <larryv@macports.org>
Linfeng Zhang <linfengz@google.com>
Lou Quillio <louquillio@google.com>
@@ -101,6 +107,7 @@ Mikhal Shemer <mikhal@google.com>
Min Chen <chenm003@gmail.com>
Minghai Shang <minghai@google.com>
Min Ye <yeemmi@google.com>
+Moriyoshi Koizumi <mozo@mozo.jp>
Morton Jonuschat <yabawock@gmail.com>
Nathan E. Egge <negge@mozilla.com>
Nico Weber <thakis@chromium.org>
@@ -111,12 +118,15 @@ Paul Wilkins <paulwilkins@google.com>
Pavol Rusnak <stick@gk2.sk>
Paweł Hajdan <phajdan@google.com>
Pengchong Jin <pengchong@google.com>
-Peter Boström <pbos@google.com>
+Peter Boström <pbos@chromium.org>
+Peter Collingbourne <pcc@chromium.org>
Peter de Rivaz <peter.derivaz@gmail.com>
Philip Jägenstedt <philipj@opera.com>
Priit Laes <plaes@plaes.org>
Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
Rafaël Carré <funman@videolan.org>
+Rafael de Lucena Valle <rafaeldelucena@gmail.com>
+Rahul Chaudhry <rahulchaudhry@google.com>
Ralph Giles <giles@xiph.org>
Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com>
Rob Bradford <rob@linux.intel.com>
@@ -131,9 +141,11 @@ Sean McGovern <gseanmcg@gmail.com>
Sergey Kolomenkin <kolomenkin@gmail.com>
Sergey Ulanov <sergeyu@chromium.org>
Shimon Doodkin <helpmepro1@gmail.com>
+Shiyou Yin <yinshiyou-hf@loongson.cn>
Shunyao Li <shunyaoli@google.com>
Stefan Holmer <holmer@google.com>
Suman Sunkara <sunkaras@google.com>
+Sylvestre Ledru <sylvestre@mozilla.com>
Taekhyun Kim <takim@nvidia.com>
Takanori MATSUURA <t.matsuu@gmail.com>
Tamar Levy <tamar.levy@intel.com>
@@ -146,6 +158,7 @@ Tom Finegan <tomfinegan@google.com>
Tristan Matthews <le.businessman@gmail.com>
Urvang Joshi <urvang@google.com>
Vignesh Venkatasubramanian <vigneshv@google.com>
+Vlad Tsyrklevich <vtsyrklevich@chromium.org>
Yaowu Xu <yaowu@google.com>
Yi Luo <luoyi@google.com>
Yongzhe Wang <yongzhe@google.com>
diff --git a/libvpx/CHANGELOG b/libvpx/CHANGELOG
index 7e7aec67a..2281394c8 100644
--- a/libvpx/CHANGELOG
+++ b/libvpx/CHANGELOG
@@ -1,3 +1,28 @@
+2018-01-04 v1.7.0 "Mandarin Duck"

+ This release focused on high bit depth performance (10/12 bit) and vp9
+ encoding improvements.
+
+ - Upgrading:
+ This release is ABI incompatible due to new vp9 encoder features.
+
+ Frame parallel decoding for vp9 has been removed.
+
+ - Enhancements:
+ vp9 encoding supports additional threads with --row-mt. This can be greater
+ than the number of tiles.
+
+ Two new vp9 encoder options have been added:
+ --corpus-complexity
+ --tune-content=film
+
+ Additional tooling for respecting the vp9 "level" profiles has been added.
+
+ - Bug fixes:
+ A variety of fuzzing issues.
+ vp8 threading fix for ARM.
+ Codec control VP9_SET_SKIP_LOOP_FILTER fixed.
+ Reject invalid multi resolution configurations.
+
2017-01-09 v1.6.1 "Long Tailed Duck"
This release improves upon the VP9 encoder and speeds up the encoding and
decoding processes.
diff --git a/libvpx/README b/libvpx/README
index f910ce761..73304dd62 100644
--- a/libvpx/README
+++ b/libvpx/README
@@ -1,4 +1,4 @@
-README - 26 January 2017
+README - 24 January 2018
Welcome to the WebM VP8/VP9 Codec SDK!
@@ -63,6 +63,8 @@ COMPILING THE APPLICATIONS/LIBRARIES:
armv8-linux-gcc
mips32-linux-gcc
mips64-linux-gcc
+ ppc64-linux-gcc
+ ppc64le-linux-gcc
sparc-solaris-gcc
x86-android-gcc
x86-darwin8-gcc
diff --git a/libvpx/build/make/Makefile b/libvpx/build/make/Makefile
index 90522e5f6..f6b3f0630 100644
--- a/libvpx/build/make/Makefile
+++ b/libvpx/build/make/Makefile
@@ -139,6 +139,8 @@ $(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx
$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx
$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2
$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2
+$(BUILD_PFX)%_avx512.c.d: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl
+$(BUILD_PFX)%_avx512.c.o: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl
# POWER
$(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx
diff --git a/libvpx/build/make/configure.sh b/libvpx/build/make/configure.sh
index fbe8b1b45..683b43037 100755
--- a/libvpx/build/make/configure.sh
+++ b/libvpx/build/make/configure.sh
@@ -403,6 +403,23 @@ check_gcc_machine_option() {
fi
}
+# tests for -m$2, -m$3, -m$4... toggling the feature given in $1.
+check_gcc_machine_options() {
+ feature="$1"
+ shift
+ flags="-m$1"
+ shift
+ for opt in $*; do
+ flags="$flags -m$opt"
+ done
+
+ if enabled gcc && ! disabled "$feature" && ! check_cflags $flags; then
+ RTCD_OPTIONS="${RTCD_OPTIONS}--disable-$feature "
+ else
+ soft_enable "$feature"
+ fi
+}
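Unlike check_gcc_machine_option, this helper probes all of the -m flags as one group, so a single unsupported option disables the whole feature. The avx512 hookup later in this patch calls it as:

    # one feature toggled by five machine flags
    check_gcc_machine_options avx512 avx512f avx512cd avx512bw avx512dq avx512vl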
+
write_common_config_banner() {
print_webm_license config.mk "##" ""
echo '# This file automatically generated by configure. Do not edit!' >> config.mk
@@ -702,6 +719,12 @@ process_common_toolchain() {
power*)
tgt_isa=ppc
;;
+ *mips64el*)
+ tgt_isa=mips64
+ ;;
+ *mips32el*)
+ tgt_isa=mips32
+ ;;
esac
# detect tgt_os
@@ -1163,6 +1186,11 @@ EOF
fi
fi
+ if enabled mmi; then
+ tgt_isa=loongson3a
+ check_add_ldflags -march=loongson3a
+ fi
+
check_add_cflags -march=${tgt_isa}
check_add_asflags -march=${tgt_isa}
check_add_asflags -KPIC
@@ -1227,6 +1255,13 @@ EOF
msvs_arch_dir=x86-msvs
vc_version=${tgt_cc##vs}
case $vc_version in
+ 7|8|9|10|11|12|13|14)
+ echo "${tgt_cc} does not support avx512, disabling....."
+ RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx512 "
+ soft_disable avx512
+ ;;
+ esac
+ case $vc_version in
7|8|9|10)
echo "${tgt_cc} does not support avx/avx2, disabling....."
RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx --disable-avx2 "
@@ -1270,9 +1305,18 @@ EOF
elif disabled $ext; then
disable_exts="yes"
else
- # use the shortened version for the flag: sse4_1 -> sse4
- check_gcc_machine_option ${ext%_*} $ext
+ if [ "$ext" = "avx512" ]; then
+ check_gcc_machine_options $ext avx512f avx512cd avx512bw avx512dq avx512vl
+ else
+ # use the shortened version for the flag: sse4_1 -> sse4
+ check_gcc_machine_option ${ext%_*} $ext
+ fi
fi
+
+ # https://bugs.chromium.org/p/webm/issues/detail?id=1464
+ # The assembly optimizations for vpx_sub_pixel_variance do not link with
+ # gcc 6.
+ enabled sse2 && soft_enable pic
done
if enabled external_build; then
@@ -1297,7 +1341,6 @@ EOF
esac
log_echo " using $AS"
fi
- [ "${AS##*/}" = nasm ] && add_asflags -Ox
AS_SFX=.asm
case ${tgt_os} in
win32)
@@ -1306,7 +1349,7 @@ EOF
EXE_SFX=.exe
;;
win64)
- add_asflags -f x64
+ add_asflags -f win64
enabled debug && add_asflags -g cv8
EXE_SFX=.exe
;;
@@ -1440,6 +1483,10 @@ EOF
echo "msa optimizations are available only for little endian platforms"
disable_feature msa
fi
+ if enabled mmi; then
+ echo "mmi optimizations are available only for little endian platforms"
+ disable_feature mmi
+ fi
fi
;;
esac
diff --git a/libvpx/build/make/gen_msvs_sln.sh b/libvpx/build/make/gen_msvs_sln.sh
index 8b68038b3..401223a0b 100755
--- a/libvpx/build/make/gen_msvs_sln.sh
+++ b/libvpx/build/make/gen_msvs_sln.sh
@@ -240,10 +240,10 @@ case "${vs_ver:-10}" in
12) sln_vers="12.00"
sln_vers_str="Visual Studio 2013"
;;
- 14) sln_vers="14.00"
+ 14) sln_vers="12.00"
sln_vers_str="Visual Studio 2015"
;;
- 15) sln_vers="15.00"
+ 15) sln_vers="12.00"
sln_vers_str="Visual Studio 2017"
;;
esac
diff --git a/libvpx/build/make/rtcd.pl b/libvpx/build/make/rtcd.pl
index ce88e6480..68e92b52c 100755
--- a/libvpx/build/make/rtcd.pl
+++ b/libvpx/build/make/rtcd.pl
@@ -1,4 +1,13 @@
#!/usr/bin/env perl
+##
+## Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
no strict 'refs';
use warnings;
@@ -200,6 +209,7 @@ sub filter {
sub common_top() {
my $include_guard = uc($opts{sym})."_H_";
print <<EOF;
+// This file is generated. Do not edit.
#ifndef ${include_guard}
#define ${include_guard}
@@ -391,10 +401,10 @@ EOF
&require("c");
if ($opts{arch} eq 'x86') {
- @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/);
+ @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2 avx512/);
x86;
} elsif ($opts{arch} eq 'x86_64') {
- @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/);
+ @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2 avx512/);
@REQUIRES = filter(keys %required ? keys %required : qw/mmx sse sse2/);
&require(@REQUIRES);
x86;
@@ -411,6 +421,10 @@ if ($opts{arch} eq 'x86') {
@ALL_ARCHS = filter("$opts{arch}", qw/msa/);
last;
}
+ if (/HAVE_MMI=yes/) {
+ @ALL_ARCHS = filter("$opts{arch}", qw/mmi/);
+ last;
+ }
}
close CONFIG_FILE;
mips;
diff --git a/libvpx/build/make/version.sh b/libvpx/build/make/version.sh
index 696752777..f36ede10f 100755
--- a/libvpx/build/make/version.sh
+++ b/libvpx/build/make/version.sh
@@ -60,6 +60,7 @@ if [ ${bare} ]; then
echo "${changelog_version}${git_version_id}" > $$.tmp
else
cat<<EOF>$$.tmp
+// This file is generated. Do not edit.
#define VERSION_MAJOR $major_version
#define VERSION_MINOR $minor_version
#define VERSION_PATCH $patch_version
diff --git a/libvpx/configure b/libvpx/configure
index 090d3fb1e..e5a74c6f2 100755
--- a/libvpx/configure
+++ b/libvpx/configure
@@ -170,11 +170,14 @@ for t in ${all_targets}; do
[ -f "${source_path}/${t}.mk" ] && enable_feature ${t}
done
+if ! diff --version >/dev/null; then
+ die "diff missing: Try installing diffutils via your package manager."
+fi
+
if ! perl --version >/dev/null; then
die "Perl is required to build"
fi
-
if [ "`cd \"${source_path}\" && pwd`" != "`pwd`" ]; then
# test to see if source_path already configured
if [ -f "${source_path}/vpx_config.h" ]; then
@@ -241,7 +244,13 @@ ARCH_EXT_LIST_X86="
sse4_1
avx
avx2
+ avx512
"
+
+ARCH_EXT_LIST_LOONGSON="
+ mmi
+"
+
ARCH_EXT_LIST="
neon
neon_asm
@@ -254,6 +263,8 @@ ARCH_EXT_LIST="
${ARCH_EXT_LIST_X86}
vsx
+
+ ${ARCH_EXT_LIST_LOONGSON}
"
HAVE_LIST="
${ARCH_EXT_LIST}
@@ -319,6 +330,7 @@ CONFIG_LIST="
better_hw_compatibility
experimental
size_limit
+ always_adjust_bpm
${EXPERIMENT_LIST}
"
CMDLINE_SELECT="
@@ -378,6 +390,7 @@ CMDLINE_SELECT="
better_hw_compatibility
vp9_highbitdepth
experimental
+ always_adjust_bpm
"
process_cmdline() {
@@ -579,6 +592,7 @@ process_toolchain() {
check_add_cflags -Wdeclaration-after-statement
check_add_cflags -Wdisabled-optimization
check_add_cflags -Wfloat-conversion
+ check_add_cflags -Wparentheses-equality
check_add_cflags -Wpointer-arith
check_add_cflags -Wtype-limits
check_add_cflags -Wcast-qual
@@ -651,7 +665,7 @@ process_toolchain() {
gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh
enabled werror && gen_vcproj_cmd="${gen_vcproj_cmd} --enable-werror"
all_targets="${all_targets} solution"
- INLINE="__forceinline"
+ INLINE="__inline"
;;
esac
diff --git a/libvpx/examples/vp8_multi_resolution_encoder.c b/libvpx/examples/vp8_multi_resolution_encoder.c
index 0b9663c77..b14b1ff39 100644
--- a/libvpx/examples/vp8_multi_resolution_encoder.c
+++ b/libvpx/examples/vp8_multi_resolution_encoder.c
@@ -151,7 +151,7 @@ static void write_ivf_frame_header(FILE *outfile,
if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return;
pts = pkt->data.frame.pts;
- mem_put_le32(header, pkt->data.frame.sz);
+ mem_put_le32(header, (int)pkt->data.frame.sz);
mem_put_le32(header + 4, pts & 0xFFFFFFFF);
mem_put_le32(header + 8, pts >> 32);
@@ -190,7 +190,7 @@ static void set_temporal_layer_pattern(int num_temporal_layers,
cfg->ts_layer_id[0] = 0;
cfg->ts_layer_id[1] = 1;
// Use 60/40 bit allocation as example.
- cfg->ts_target_bitrate[0] = 0.6f * bitrate;
+ cfg->ts_target_bitrate[0] = (int)(0.6f * bitrate);
cfg->ts_target_bitrate[1] = bitrate;
/* 0=L, 1=GF */
@@ -241,8 +241,8 @@ static void set_temporal_layer_pattern(int num_temporal_layers,
cfg->ts_layer_id[2] = 1;
cfg->ts_layer_id[3] = 2;
// Use 45/20/35 bit allocation as example.
- cfg->ts_target_bitrate[0] = 0.45f * bitrate;
- cfg->ts_target_bitrate[1] = 0.65f * bitrate;
+ cfg->ts_target_bitrate[0] = (int)(0.45f * bitrate);
+ cfg->ts_target_bitrate[1] = (int)(0.65f * bitrate);
cfg->ts_target_bitrate[2] = bitrate;
/* 0=L, 1=GF, 2=ARF */
@@ -294,8 +294,8 @@ int main(int argc, char **argv) {
vpx_codec_err_t res[NUM_ENCODERS];
int i;
- long width;
- long height;
+ int width;
+ int height;
int length_frame;
int frame_avail;
int got_data;
@@ -347,9 +347,9 @@ int main(int argc, char **argv) {
printf("Using %s\n", vpx_codec_iface_name(interface));
- width = strtol(argv[1], NULL, 0);
- height = strtol(argv[2], NULL, 0);
- framerate = strtol(argv[3], NULL, 0);
+ width = (int)strtol(argv[1], NULL, 0);
+ height = (int)strtol(argv[2], NULL, 0);
+ framerate = (int)strtol(argv[3], NULL, 0);
if (width < 16 || width % 2 || height < 16 || height % 2)
die("Invalid resolution: %ldx%ld", width, height);
@@ -371,12 +371,13 @@ int main(int argc, char **argv) {
// Bitrates per spatial layer: overwrite default rates above.
for (i = 0; i < NUM_ENCODERS; i++) {
- target_bitrate[i] = strtol(argv[NUM_ENCODERS + 5 + i], NULL, 0);
+ target_bitrate[i] = (int)strtol(argv[NUM_ENCODERS + 5 + i], NULL, 0);
}
// Temporal layers per spatial layers: overwrite default settings above.
for (i = 0; i < NUM_ENCODERS; i++) {
- num_temporal_layers[i] = strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0);
+ num_temporal_layers[i] =
+ (int)strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0);
if (num_temporal_layers[i] < 1 || num_temporal_layers[i] > 3)
die("Invalid temporal layers: %d, Must be 1, 2, or 3. \n",
num_temporal_layers);
@@ -391,9 +392,9 @@ int main(int argc, char **argv) {
downsampled_input[i] = fopen(filename, "wb");
}
- key_frame_insert = strtol(argv[3 * NUM_ENCODERS + 5], NULL, 0);
+ key_frame_insert = (int)strtol(argv[3 * NUM_ENCODERS + 5], NULL, 0);
- show_psnr = strtol(argv[3 * NUM_ENCODERS + 6], NULL, 0);
+ show_psnr = (int)strtol(argv[3 * NUM_ENCODERS + 6], NULL, 0);
/* Populate default encoder configuration */
for (i = 0; i < NUM_ENCODERS; i++) {
@@ -469,7 +470,7 @@ int main(int argc, char **argv) {
if (!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32))
die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h);
- if (raw[0].stride[VPX_PLANE_Y] == raw[0].d_w)
+ if (raw[0].stride[VPX_PLANE_Y] == (int)raw[0].d_w)
read_frame_p = read_frame;
else
read_frame_p = read_frame_by_row;
@@ -558,7 +559,8 @@ int main(int argc, char **argv) {
/* Write out down-sampled input. */
length_frame = cfg[i].g_w * cfg[i].g_h * 3 / 2;
if (fwrite(raw[i].planes[0], 1, length_frame,
- downsampled_input[NUM_ENCODERS - i - 1]) != length_frame) {
+ downsampled_input[NUM_ENCODERS - i - 1]) !=
+ (unsigned int)length_frame) {
return EXIT_FAILURE;
}
}
@@ -619,10 +621,6 @@ int main(int argc, char **argv) {
break;
default: break;
}
- printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT &&
- (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY)
- ? "K"
- : "");
fflush(stdout);
}
}
@@ -663,7 +661,6 @@ int main(int argc, char **argv) {
write_ivf_file_header(outfile[i], &cfg[i], frame_cnt - 1);
fclose(outfile[i]);
}
- printf("\n");
return EXIT_SUCCESS;
}
diff --git a/libvpx/examples/vp9_spatial_svc_encoder.c b/libvpx/examples/vp9_spatial_svc_encoder.c
index 1f5078aad..0987cbfb8 100644
--- a/libvpx/examples/vp9_spatial_svc_encoder.c
+++ b/libvpx/examples/vp9_spatial_svc_encoder.c
@@ -168,7 +168,7 @@ void usage_exit(void) {
static void parse_command_line(int argc, const char **argv_,
AppInput *app_input, SvcContext *svc_ctx,
vpx_codec_enc_cfg_t *enc_cfg) {
- struct arg arg = { 0 };
+ struct arg arg;
char **argv = NULL;
char **argi = NULL;
char **argj = NULL;
@@ -509,7 +509,7 @@ static void printout_rate_control_summary(struct RateControlStats *rc,
}
vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz,
- uint32_t sizes[8], int *count) {
+ uint64_t sizes[8], int *count) {
// A chunk ending with a byte matching 0xc0 is an invalid chunk unless
// it is a super frame index. If the last byte of real video compression
// data is 0xc0 the encoder must add a 0 byte. If we have the marker but
@@ -606,9 +606,9 @@ void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers,
}
int main(int argc, const char **argv) {
- AppInput app_input = { 0 };
+ AppInput app_input;
VpxVideoWriter *writer = NULL;
- VpxVideoInfo info = { 0 };
+ VpxVideoInfo info;
vpx_codec_ctx_t codec;
vpx_codec_enc_cfg_t enc_cfg;
SvcContext svc_ctx;
@@ -640,8 +640,9 @@ int main(int argc, const char **argv) {
// Allocate image buffer
#if CONFIG_VP9_HIGHBITDEPTH
- if (!vpx_img_alloc(&raw, enc_cfg.g_input_bit_depth == 8 ? VPX_IMG_FMT_I420
- : VPX_IMG_FMT_I42016,
+ if (!vpx_img_alloc(&raw,
+ enc_cfg.g_input_bit_depth == 8 ? VPX_IMG_FMT_I420
+ : VPX_IMG_FMT_I42016,
enc_cfg.g_w, enc_cfg.g_h, 32)) {
die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h);
}
@@ -699,12 +700,16 @@ int main(int argc, const char **argv) {
vpx_codec_control(&codec, VP8E_SET_CPUUSED, svc_ctx.speed);
if (svc_ctx.threads) {
vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1));
- vpx_codec_control(&codec, VP9E_SET_ROW_MT, 0);
+ if (svc_ctx.threads > 1)
+ vpx_codec_control(&codec, VP9E_SET_ROW_MT, 1);
+ else
+ vpx_codec_control(&codec, VP9E_SET_ROW_MT, 0);
}
if (svc_ctx.speed >= 5 && svc_ctx.aqmode == 1)
vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
if (svc_ctx.speed >= 5)
vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
+ vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, 900);
// Encode frames
while (!end_of_stream) {
@@ -765,7 +770,7 @@ int main(int argc, const char **argv) {
SvcInternal_t *const si = (SvcInternal_t *)svc_ctx.internal;
if (cx_pkt->data.frame.sz > 0) {
#if OUTPUT_RC_STATS
- uint32_t sizes[8];
+ uint64_t sizes[8];
int count = 0;
#endif
vpx_video_writer_write_frame(writer, cx_pkt->data.frame.buf,
@@ -777,6 +782,8 @@ int main(int argc, const char **argv) {
vpx_codec_control(&codec, VP9E_GET_SVC_LAYER_ID, &layer_id);
parse_superframe_index(cx_pkt->data.frame.buf,
cx_pkt->data.frame.sz, sizes, &count);
+ if (enc_cfg.ss_number_layers == 1)
+ sizes[0] = cx_pkt->data.frame.sz;
// Note computing input_layer_frames here won't account for frame
// drops in rate control stats.
// TODO(marpan): Fix this for non-bypass mode so we can get stats
diff --git a/libvpx/examples/vpx_temporal_svc_encoder.c b/libvpx/examples/vpx_temporal_svc_encoder.c
index c34673b05..f5736ea45 100644
--- a/libvpx/examples/vpx_temporal_svc_encoder.c
+++ b/libvpx/examples/vpx_temporal_svc_encoder.c
@@ -26,17 +26,27 @@
#include "../tools_common.h"
#include "../video_writer.h"
+#define VP8_ROI_MAP 0
+
static const char *exec_name;
void usage_exit(void) { exit(EXIT_FAILURE); }
-// Denoiser states, for temporal denoising.
-enum denoiserState {
- kDenoiserOff,
- kDenoiserOnYOnly,
- kDenoiserOnYUV,
- kDenoiserOnYUVAggressive,
- kDenoiserOnAdaptive
+// Denoiser states for vp8, for temporal denoising.
+enum denoiserStateVp8 {
+ kVp8DenoiserOff,
+ kVp8DenoiserOnYOnly,
+ kVp8DenoiserOnYUV,
+ kVp8DenoiserOnYUVAggressive,
+ kVp8DenoiserOnAdaptive
+};
+
+// Denoiser states for vp9, for temporal denoising.
+enum denoiserStateVp9 {
+ kVp9DenoiserOff,
+ kVp9DenoiserOnYOnly,
+ // For SVC: denoise the top two spatial layers.
+ kVp9DenoiserOnYTwoSpatialLayers
};
static int mode_to_num_layers[13] = { 1, 2, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3, 3 };
@@ -154,6 +164,53 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc,
die("Error: Number of input frames not equal to output! \n");
}
+#if VP8_ROI_MAP
+static void vp8_set_roi_map(vpx_codec_enc_cfg_t *cfg, vpx_roi_map_t *roi) {
+ unsigned int i, j;
+ memset(roi, 0, sizeof(*roi));
+
+  // ROI is based on segments (4 for vp8, 8 for vp9); the smallest unit per
+  // segment is 16x16 for vp8 and 8x8 for vp9.
+ roi->rows = (cfg->g_h + 15) / 16;
+ roi->cols = (cfg->g_w + 15) / 16;
+
+ // Applies delta QP on the segment blocks, varies from -63 to 63.
+ // Setting to negative means lower QP (better quality).
+ // Below we set delta_q to the extreme (-63) to show strong effect.
+ roi->delta_q[0] = 0;
+ roi->delta_q[1] = -63;
+ roi->delta_q[2] = 0;
+ roi->delta_q[3] = 0;
+
+ // Applies delta loopfilter strength on the segment blocks, varies from -63 to
+ // 63. Setting to positive means stronger loopfilter.
+ roi->delta_lf[0] = 0;
+ roi->delta_lf[1] = 0;
+ roi->delta_lf[2] = 0;
+ roi->delta_lf[3] = 0;
+
+ // Applies skip encoding threshold on the segment blocks, varies from 0 to
+ // UINT_MAX. Larger value means more skipping of encoding is possible.
+ // This skip threshold only applies on delta frames.
+ roi->static_threshold[0] = 0;
+ roi->static_threshold[1] = 0;
+ roi->static_threshold[2] = 0;
+ roi->static_threshold[3] = 0;
+
+ // Use 2 states: 1 is center square, 0 is the rest.
+ roi->roi_map =
+ (uint8_t *)calloc(roi->rows * roi->cols, sizeof(*roi->roi_map));
+ for (i = 0; i < roi->rows; ++i) {
+ for (j = 0; j < roi->cols; ++j) {
+ if (i > (roi->rows >> 2) && i < ((roi->rows * 3) >> 2) &&
+ j > (roi->cols >> 2) && j < ((roi->cols * 3) >> 2)) {
+ roi->roi_map[i * roi->cols + j] = 1;
+ }
+ }
+ }
+}
+#endif
+
// Temporal scaling parameters:
// NOTE: The 3 prediction frames cannot be used interchangeably due to
// differences in the way they are handled throughout the code. The
@@ -506,11 +563,10 @@ int main(int argc, char **argv) {
int layering_mode = 0;
int layer_flags[VPX_TS_MAX_PERIODICITY] = { 0 };
int flag_periodicity = 1;
-#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
- vpx_svc_layer_id_t layer_id = { 0, 0 };
-#else
- vpx_svc_layer_id_t layer_id = { 0 };
+#if VP8_ROI_MAP
+ vpx_roi_map_t roi;
#endif
+ vpx_svc_layer_id_t layer_id = { 0, 0 };
const VpxInterface *encoder = NULL;
FILE *infile = NULL;
struct RateControlMetrics rc;
@@ -637,7 +693,7 @@ int main(int argc, char **argv) {
if (strncmp(encoder->name, "vp9", 3) == 0) cfg.rc_max_quantizer = 52;
cfg.rc_undershoot_pct = 50;
cfg.rc_overshoot_pct = 50;
- cfg.rc_buf_initial_sz = 500;
+ cfg.rc_buf_initial_sz = 600;
cfg.rc_buf_optimal_sz = 600;
cfg.rc_buf_sz = 1000;
@@ -707,9 +763,15 @@ int main(int argc, char **argv) {
if (strncmp(encoder->name, "vp8", 3) == 0) {
vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed);
- vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff);
+ vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kVp8DenoiserOff);
vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0);
+#if VP8_ROI_MAP
+ vp8_set_roi_map(&cfg, &roi);
+ if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi))
+ die_codec(&codec, "Failed to set ROI map");
+#endif
+
} else if (strncmp(encoder->name, "vp9", 3) == 0) {
vpx_svc_extra_cfg_t svc_params;
memset(&svc_params, 0, sizeof(svc_params));
@@ -718,7 +780,7 @@ int main(int argc, char **argv) {
vpx_codec_control(&codec, VP9E_SET_GF_CBR_BOOST_PCT, 0);
vpx_codec_control(&codec, VP9E_SET_FRAME_PARALLEL_DECODING, 0);
vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0);
- vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kDenoiserOff);
+ vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kVp9DenoiserOff);
vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0);
vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1));
@@ -746,7 +808,7 @@ int main(int argc, char **argv) {
// For generating smaller key frames, use a smaller max_intra_size_pct
// value, like 100 or 200.
{
- const int max_intra_size_pct = 900;
+ const int max_intra_size_pct = 1000;
vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT,
max_intra_size_pct);
}
@@ -756,10 +818,8 @@ int main(int argc, char **argv) {
struct vpx_usec_timer timer;
vpx_codec_iter_t iter = NULL;
const vpx_codec_cx_pkt_t *pkt;
-#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
// Update the temporal layer_id. No spatial layers in this test.
layer_id.spatial_layer_id = 0;
-#endif
layer_id.temporal_layer_id =
cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity];
if (strncmp(encoder->name, "vp9", 3) == 0) {
diff --git a/libvpx/libs.mk b/libvpx/libs.mk
index f1e924253..a3e2f9d0e 100644
--- a/libvpx/libs.mk
+++ b/libvpx/libs.mk
@@ -188,6 +188,13 @@ libvpx_srcs.txt:
@echo $(CODEC_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@
CLEAN-OBJS += libvpx_srcs.txt
+# Assembly files that are included, but don't define symbols themselves.
+# Filtered out to avoid Windows build warnings.
+ASM_INCLUDES := \
+ third_party/x86inc/x86inc.asm \
+ vpx_config.asm \
+ vpx_ports/x86_abi_support.asm \
+ vpx_dsp/x86/bitdepth_conversion_sse2.asm \
ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
ifeq ($(CONFIG_MSVS),yes)
@@ -199,14 +206,6 @@ vpx.def: $(call enabled,CODEC_EXPORTS)
--out=$@ $^
CLEAN-OBJS += vpx.def
-# Assembly files that are included, but don't define symbols themselves.
-# Filtered out to avoid Visual Studio build warnings.
-ASM_INCLUDES := \
- third_party/x86inc/x86inc.asm \
- vpx_config.asm \
- vpx_ports/x86_abi_support.asm \
- vpx_dsp/x86/bitdepth_conversion_sse2.asm \
-
vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def
@echo " [CREATE] $@"
$(qexec)$(GEN_VCPROJ) \
@@ -229,13 +228,13 @@ vpx.$(VCPROJ_SFX): $(RTCD)
endif
else
-LIBVPX_OBJS=$(call objs,$(CODEC_SRCS))
+LIBVPX_OBJS=$(call objs, $(filter-out $(ASM_INCLUDES), $(CODEC_SRCS)))
OBJS-yes += $(LIBVPX_OBJS)
LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
$(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
-SO_VERSION_MAJOR := 4
-SO_VERSION_MINOR := 1
+SO_VERSION_MAJOR := 5
+SO_VERSION_MINOR := 0
SO_VERSION_PATCH := 0
ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib
@@ -406,8 +405,16 @@ CLEAN-OBJS += libvpx_test_srcs.txt
$(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1
@echo " [DOWNLOAD] $@"
- $(qexec)trap 'rm -f $@' INT TERM &&\
- curl --retry 1 -L -o $@ $(call libvpx_test_data_url,$(@F))
+ # Attempt to download the file using curl, retrying once if it fails for a
+ # partial file (18).
+ $(qexec)( \
+ trap 'rm -f $@' INT TERM; \
+ curl="curl --retry 1 -L -o $@ $(call libvpx_test_data_url,$(@F))"; \
+ $$curl; \
+ case "$$?" in \
+ 18) $$curl -C -;; \
+ esac \
+ )
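curl exits with status 18 (CURLE_PARTIAL_FILE) when only part of the file arrived; re-running with -C - makes curl inspect the partial download and resume from its current length. The recipe above, unescaped from make into plain shell (hypothetical output name and URL):

    curl="curl --retry 1 -L -o test.webm https://example.com/test.webm"
    $curl
    case "$?" in
      18) $curl -C - ;;   # resume the partial transfer
    esac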
testdata:: $(LIBVPX_TEST_DATA)
$(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\
diff --git a/libvpx/test/acm_random.h b/libvpx/test/acm_random.h
index c2f6b0e41..d915cf913 100644
--- a/libvpx/test/acm_random.h
+++ b/libvpx/test/acm_random.h
@@ -11,6 +11,10 @@
#ifndef TEST_ACM_RANDOM_H_
#define TEST_ACM_RANDOM_H_
+#include <assert.h>
+
+#include <limits>
+
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "vpx/vpx_integer.h"
@@ -50,6 +54,13 @@ class ACMRandom {
return r < 128 ? r << 4 : r >> 4;
}
+ uint32_t RandRange(const uint32_t range) {
+ // testing::internal::Random::Generate provides values in the range
+ // testing::internal::Random::kMaxRange.
+ assert(range <= testing::internal::Random::kMaxRange);
+ return random_.Generate(range);
+ }
+
int PseudoUniform(int range) { return random_.Generate(range); }
int operator()(int n) { return PseudoUniform(n); }
diff --git a/libvpx/test/android/Android.mk b/libvpx/test/android/Android.mk
index 48872a2b6..7318de2fc 100644
--- a/libvpx/test/android/Android.mk
+++ b/libvpx/test/android/Android.mk
@@ -32,6 +32,7 @@ LOCAL_CPP_EXTENSION := .cc
LOCAL_MODULE := gtest
LOCAL_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/
LOCAL_C_INCLUDES += $(LOCAL_PATH)/third_party/googletest/src/include/
+LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/include/
LOCAL_SRC_FILES := ./third_party/googletest/src/src/gtest-all.cc
include $(BUILD_STATIC_LIBRARY)
diff --git a/libvpx/test/avg_test.cc b/libvpx/test/avg_test.cc
index c570bbc22..ad21198e4 100644
--- a/libvpx/test/avg_test.cc
+++ b/libvpx/test/avg_test.cc
@@ -23,6 +23,7 @@
#include "test/register_state_check.h"
#include "test/util.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_timer.h"
using libvpx_test::ACMRandom;
@@ -367,6 +368,21 @@ TEST_P(SatdTest, Random) {
Check(expected);
}
+TEST_P(SatdTest, DISABLED_Speed) {
+ const int kCountSpeedTestBlock = 20000;
+ vpx_usec_timer timer;
+ DECLARE_ALIGNED(16, tran_low_t, coeff[1024]);
+ const int blocksize = GET_PARAM(0);
+
+ vpx_usec_timer_start(&timer);
+ for (int i = 0; i < kCountSpeedTestBlock; ++i) {
+ GET_PARAM(1)(coeff, blocksize);
+ }
+ vpx_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+ printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time);
+}
+
TEST_P(BlockErrorTestFP, MinValue) {
const int64_t kMin = -32640;
const int64_t expected = kMin * kMin * txfm_size_;
@@ -396,6 +412,22 @@ TEST_P(BlockErrorTestFP, Random) {
Check(expected);
}
+TEST_P(BlockErrorTestFP, DISABLED_Speed) {
+ const int kCountSpeedTestBlock = 20000;
+ vpx_usec_timer timer;
+ DECLARE_ALIGNED(16, tran_low_t, coeff[1024]);
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[1024]);
+ const int blocksize = GET_PARAM(0);
+
+ vpx_usec_timer_start(&timer);
+ for (int i = 0; i < kCountSpeedTestBlock; ++i) {
+ GET_PARAM(1)(coeff, dqcoeff, blocksize);
+ }
+ vpx_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+ printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time);
+}
+
using std::tr1::make_tuple;
INSTANTIATE_TEST_CASE_P(
@@ -454,6 +486,21 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(1024, &vp9_block_error_fp_sse2)));
#endif // HAVE_SSE2
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, SatdTest,
+ ::testing::Values(make_tuple(16, &vpx_satd_avx2),
+ make_tuple(64, &vpx_satd_avx2),
+ make_tuple(256, &vpx_satd_avx2),
+ make_tuple(1024, &vpx_satd_avx2)));
+
+INSTANTIATE_TEST_CASE_P(
+ AVX2, BlockErrorTestFP,
+ ::testing::Values(make_tuple(16, &vp9_block_error_fp_avx2),
+ make_tuple(64, &vp9_block_error_fp_avx2),
+ make_tuple(256, &vp9_block_error_fp_avx2),
+ make_tuple(1024, &vp9_block_error_fp_avx2)));
+#endif
+
#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(
NEON, AverageTest,
diff --git a/libvpx/test/buffer.h b/libvpx/test/buffer.h
index 75016c91e..2175dad9d 100644
--- a/libvpx/test/buffer.h
+++ b/libvpx/test/buffer.h
@@ -19,6 +19,7 @@
#include "test/acm_random.h"
#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
namespace libvpx_test {
@@ -29,29 +30,55 @@ class Buffer {
int right_padding, int bottom_padding)
: width_(width), height_(height), top_padding_(top_padding),
left_padding_(left_padding), right_padding_(right_padding),
- bottom_padding_(bottom_padding) {
- Init();
- }
+ bottom_padding_(bottom_padding), alignment_(0), padding_value_(0),
+ stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {}
+
+ Buffer(int width, int height, int top_padding, int left_padding,
+ int right_padding, int bottom_padding, unsigned int alignment)
+ : width_(width), height_(height), top_padding_(top_padding),
+ left_padding_(left_padding), right_padding_(right_padding),
+ bottom_padding_(bottom_padding), alignment_(alignment),
+ padding_value_(0), stride_(0), raw_size_(0), num_elements_(0),
+ raw_buffer_(NULL) {}
Buffer(int width, int height, int padding)
: width_(width), height_(height), top_padding_(padding),
left_padding_(padding), right_padding_(padding),
- bottom_padding_(padding) {
- Init();
- }
+ bottom_padding_(padding), alignment_(0), padding_value_(0), stride_(0),
+ raw_size_(0), num_elements_(0), raw_buffer_(NULL) {}
- ~Buffer() { delete[] raw_buffer_; }
+ Buffer(int width, int height, int padding, unsigned int alignment)
+ : width_(width), height_(height), top_padding_(padding),
+ left_padding_(padding), right_padding_(padding),
+ bottom_padding_(padding), alignment_(alignment), padding_value_(0),
+ stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {}
+
+ ~Buffer() {
+ if (alignment_) {
+ vpx_free(raw_buffer_);
+ } else {
+ delete[] raw_buffer_;
+ }
+ }
T *TopLeftPixel() const;
int stride() const { return stride_; }
// Set the buffer (excluding padding) to 'value'.
- void Set(const int value);
+ void Set(const T value);
- // Set the buffer (excluding padding) to the output of ACMRandom function 'b'.
+ // Set the buffer (excluding padding) to the output of ACMRandom function
+ // 'rand_func'.
void Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)());
+ // Set the buffer (excluding padding) to the output of ACMRandom function
+ // 'RandRange' with range 'low' to 'high' which typically must be within
+ // testing::internal::Random::kMaxRange (1u << 31). However, because we want
+ // to allow negative low (and high) values, it is restricted to INT32_MAX
+ // here.
+ void Set(ACMRandom *rand_class, const T low, const T high);
+
// Copy the contents of Buffer 'a' (excluding padding).
void CopyFrom(const Buffer<T> &a);
@@ -63,11 +90,11 @@ class Buffer {
bool HasPadding() const;
// Sets all the values in the buffer to 'padding_value'.
- void SetPadding(const int padding_value);
+ void SetPadding(const T padding_value);
// Checks if all the values (excluding padding) are equal to 'value' if the
// Buffers are the same size.
- bool CheckValues(const int value) const;
+ bool CheckValues(const T value) const;
// Check that padding matches the expected value or there is no padding.
bool CheckPadding() const;
@@ -75,21 +102,36 @@ class Buffer {
// Compare the non-padding portion of two buffers if they are the same size.
bool CheckValues(const Buffer<T> &a) const;
- private:
- void Init() {
- ASSERT_GT(width_, 0);
- ASSERT_GT(height_, 0);
- ASSERT_GE(top_padding_, 0);
- ASSERT_GE(left_padding_, 0);
- ASSERT_GE(right_padding_, 0);
- ASSERT_GE(bottom_padding_, 0);
+ bool Init() {
+ if (raw_buffer_ != NULL) return false;
+ EXPECT_GT(width_, 0);
+ EXPECT_GT(height_, 0);
+ EXPECT_GE(top_padding_, 0);
+ EXPECT_GE(left_padding_, 0);
+ EXPECT_GE(right_padding_, 0);
+ EXPECT_GE(bottom_padding_, 0);
stride_ = left_padding_ + width_ + right_padding_;
- raw_size_ = stride_ * (top_padding_ + height_ + bottom_padding_);
- raw_buffer_ = new (std::nothrow) T[raw_size_];
- ASSERT_TRUE(raw_buffer_ != NULL);
+ num_elements_ = stride_ * (top_padding_ + height_ + bottom_padding_);
+ raw_size_ = num_elements_ * sizeof(T);
+ if (alignment_) {
+ EXPECT_GE(alignment_, sizeof(T));
+ // Ensure alignment of the first value will be preserved.
+ EXPECT_EQ((left_padding_ * sizeof(T)) % alignment_, 0u);
+ // Ensure alignment of the subsequent rows will be preserved when there is
+ // a stride.
+ if (stride_ != width_) {
+ EXPECT_EQ((stride_ * sizeof(T)) % alignment_, 0u);
+ }
+ raw_buffer_ = reinterpret_cast<T *>(vpx_memalign(alignment_, raw_size_));
+ } else {
+ raw_buffer_ = new (std::nothrow) T[num_elements_];
+ }
+ EXPECT_TRUE(raw_buffer_ != NULL);
SetPadding(std::numeric_limits<T>::max());
+ return !::testing::Test::HasFailure();
}
+ private:
bool BufferSizesMatch(const Buffer<T> &a) const;
const int width_;
@@ -98,44 +140,70 @@ class Buffer {
const int left_padding_;
const int right_padding_;
const int bottom_padding_;
- int padding_value_;
+ const unsigned int alignment_;
+ T padding_value_;
int stride_;
int raw_size_;
+ int num_elements_;
T *raw_buffer_;
};
template <typename T>
T *Buffer<T>::TopLeftPixel() const {
- return raw_buffer_ + (top_padding_ * stride()) + left_padding_;
+ if (!raw_buffer_) return NULL;
+ return raw_buffer_ + (top_padding_ * stride_) + left_padding_;
}
template <typename T>
-void Buffer<T>::Set(const int value) {
+void Buffer<T>::Set(const T value) {
+ if (!raw_buffer_) return;
T *src = TopLeftPixel();
for (int height = 0; height < height_; ++height) {
for (int width = 0; width < width_; ++width) {
src[width] = value;
}
- src += stride();
+ src += stride_;
}
}
template <typename T>
void Buffer<T>::Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)()) {
+ if (!raw_buffer_) return;
T *src = TopLeftPixel();
for (int height = 0; height < height_; ++height) {
for (int width = 0; width < width_; ++width) {
src[width] = (*rand_class.*rand_func)();
}
- src += stride();
+ src += stride_;
}
}
template <typename T>
-void Buffer<T>::CopyFrom(const Buffer<T> &a) {
- if (!BufferSizesMatch(a)) {
- return;
+void Buffer<T>::Set(ACMRandom *rand_class, const T low, const T high) {
+ if (!raw_buffer_) return;
+
+ EXPECT_LE(low, high);
+ EXPECT_LE(static_cast<int64_t>(high) - low,
+ std::numeric_limits<int32_t>::max());
+
+ T *src = TopLeftPixel();
+ for (int height = 0; height < height_; ++height) {
+ for (int width = 0; width < width_; ++width) {
+ // 'low' will be promoted to unsigned given the return type of RandRange.
+ // Store the value as an int to avoid unsigned overflow warnings when
+ // 'low' is negative.
+ const int32_t value =
+ static_cast<int32_t>((*rand_class).RandRange(high - low));
+ src[width] = static_cast<T>(value + low);
+ }
+ src += stride_;
}
+}
+
+template <typename T>
+void Buffer<T>::CopyFrom(const Buffer<T> &a) {
+ if (!raw_buffer_) return;
+ if (!BufferSizesMatch(a)) return;
T *a_src = a.TopLeftPixel();
T *b_src = this->TopLeftPixel();
@@ -150,10 +218,11 @@ void Buffer<T>::CopyFrom(const Buffer<T> &a) {
template <typename T>
void Buffer<T>::DumpBuffer() const {
+ if (!raw_buffer_) return;
for (int height = 0; height < height_ + top_padding_ + bottom_padding_;
++height) {
- for (int width = 0; width < stride(); ++width) {
- printf("%4d", raw_buffer_[height + width * stride()]);
+ for (int width = 0; width < stride_; ++width) {
+ printf("%4d", raw_buffer_[height + width * stride_]);
}
printf("\n");
}
@@ -161,14 +230,14 @@ void Buffer<T>::DumpBuffer() const {
template <typename T>
bool Buffer<T>::HasPadding() const {
+ if (!raw_buffer_) return false;
return top_padding_ || left_padding_ || right_padding_ || bottom_padding_;
}
template <typename T>
void Buffer<T>::PrintDifference(const Buffer<T> &a) const {
- if (!BufferSizesMatch(a)) {
- return;
- }
+ if (!raw_buffer_) return;
+ if (!BufferSizesMatch(a)) return;
T *a_src = a.TopLeftPixel();
T *b_src = TopLeftPixel();
@@ -206,17 +275,19 @@ void Buffer<T>::PrintDifference(const Buffer<T> &a) const {
}
template <typename T>
-void Buffer<T>::SetPadding(const int padding_value) {
+void Buffer<T>::SetPadding(const T padding_value) {
+ if (!raw_buffer_) return;
padding_value_ = padding_value;
T *src = raw_buffer_;
- for (int i = 0; i < raw_size_; ++i) {
+ for (int i = 0; i < num_elements_; ++i) {
src[i] = padding_value;
}
}
template <typename T>
-bool Buffer<T>::CheckValues(const int value) const {
+bool Buffer<T>::CheckValues(const T value) const {
+ if (!raw_buffer_) return false;
T *src = TopLeftPixel();
for (int height = 0; height < height_; ++height) {
for (int width = 0; width < width_; ++width) {
@@ -224,20 +295,19 @@ bool Buffer<T>::CheckValues(const int value) const {
return false;
}
}
- src += stride();
+ src += stride_;
}
return true;
}
template <typename T>
bool Buffer<T>::CheckPadding() const {
- if (!HasPadding()) {
- return true;
- }
+ if (!raw_buffer_) return false;
+ if (!HasPadding()) return true;
// Top padding.
T const *top = raw_buffer_;
- for (int i = 0; i < stride() * top_padding_; ++i) {
+ for (int i = 0; i < stride_ * top_padding_; ++i) {
if (padding_value_ != top[i]) {
return false;
}
@@ -251,7 +321,7 @@ bool Buffer<T>::CheckPadding() const {
return false;
}
}
- left += stride();
+ left += stride_;
}
// Right padding.
@@ -262,12 +332,12 @@ bool Buffer<T>::CheckPadding() const {
return false;
}
}
- right += stride();
+ right += stride_;
}
// Bottom padding
- T const *bottom = raw_buffer_ + (top_padding_ + height_) * stride();
- for (int i = 0; i < stride() * bottom_padding_; ++i) {
+ T const *bottom = raw_buffer_ + (top_padding_ + height_) * stride_;
+ for (int i = 0; i < stride_ * bottom_padding_; ++i) {
if (padding_value_ != bottom[i]) {
return false;
}
@@ -278,9 +348,8 @@ bool Buffer<T>::CheckPadding() const {
template <typename T>
bool Buffer<T>::CheckValues(const Buffer<T> &a) const {
- if (!BufferSizesMatch(a)) {
- return false;
- }
+ if (!raw_buffer_) return false;
+ if (!BufferSizesMatch(a)) return false;
T *a_src = a.TopLeftPixel();
T *b_src = this->TopLeftPixel();
@@ -298,6 +367,7 @@ bool Buffer<T>::CheckValues(const Buffer<T> &a) const {
template <typename T>
bool Buffer<T>::BufferSizesMatch(const Buffer<T> &a) const {
+ if (!raw_buffer_) return false;
if (a.width_ != this->width_ || a.height_ != this->height_) {
printf(
"Reference buffer of size %dx%d does not match this buffer which is "
diff --git a/libvpx/test/byte_alignment_test.cc b/libvpx/test/byte_alignment_test.cc
index d78294d10..5a058b275 100644
--- a/libvpx/test/byte_alignment_test.cc
+++ b/libvpx/test/byte_alignment_test.cc
@@ -128,8 +128,8 @@ class ByteAlignmentTest
// TODO(fgalligan): Move the MD5 testing code into another class.
void OpenMd5File(const std::string &md5_file_name_) {
md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_);
- ASSERT_TRUE(md5_file_ != NULL) << "MD5 file open failed. Filename: "
- << md5_file_name_;
+ ASSERT_TRUE(md5_file_ != NULL)
+ << "MD5 file open failed. Filename: " << md5_file_name_;
}
void CheckMd5(const vpx_image_t &img) {
diff --git a/libvpx/test/comp_avg_pred_test.cc b/libvpx/test/comp_avg_pred_test.cc
index 3feba7127..110e06583 100644
--- a/libvpx/test/comp_avg_pred_test.cc
+++ b/libvpx/test/comp_avg_pred_test.cc
@@ -15,7 +15,6 @@
#include "test/acm_random.h"
#include "test/buffer.h"
#include "test/register_state_check.h"
-#include "vpx_ports/mem.h"
#include "vpx_ports/vpx_timer.h"
namespace {
@@ -28,12 +27,13 @@ typedef void (*AvgPredFunc)(uint8_t *a, const uint8_t *b, int w, int h,
uint8_t avg_with_rounding(uint8_t a, uint8_t b) { return (a + b + 1) >> 1; }
-void reference_pred(const uint8_t *pred, const Buffer<uint8_t> &ref, int width,
- int height, uint8_t *avg) {
+void reference_pred(const Buffer<uint8_t> &pred, const Buffer<uint8_t> &ref,
+ int width, int height, Buffer<uint8_t> *avg) {
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
- avg[y * width + x] = avg_with_rounding(
- pred[y * width + x], ref.TopLeftPixel()[y * ref.stride() + x]);
+ avg->TopLeftPixel()[y * avg->stride() + x] =
+ avg_with_rounding(pred.TopLeftPixel()[y * pred.stride() + x],
+ ref.TopLeftPixel()[y * ref.stride() + x]);
}
}
}
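
reference_pred() now reads and writes through Buffer<uint8_t> so that all three operands can carry their own strides. The rounding convention is unchanged: (a + b + 1) >> 1 computes the average rounded half up, in int arithmetic, so two uint8_t inputs cannot overflow. Illustrative values:

// avg_with_rounding(1, 2):     (1 + 2 + 1) >> 1 == 2    (1.5 rounds up)
// avg_with_rounding(255, 255): (255 + 255 + 1) >> 1 == 255
//                              (no overflow: operands promote to int)
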
@@ -50,22 +50,10 @@ class AvgPredTest : public ::testing::TestWithParam<AvgPredFunc> {
ACMRandom rnd_;
};
-void fill(ACMRandom *r, uint8_t *a, const int width, const int height) {
- for (int y = 0; y < height; ++y) {
- for (int x = 0; x < width; ++x) {
- a[x + width * y] = r->Rand8();
- }
- }
-}
-
TEST_P(AvgPredTest, SizeCombinations) {
// This is called as part of the sub pixel variance. As such it must be one of
// the variance block sizes.
- DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]);
- DECLARE_ALIGNED(16, uint8_t, avg_ref[64 * 64]);
- DECLARE_ALIGNED(16, uint8_t, avg_chk[64 * 64]);
-
for (int width_pow = 2; width_pow <= 6; ++width_pow) {
for (int height_pow = width_pow - 1; height_pow <= width_pow + 1;
++height_pow) {
@@ -80,15 +68,28 @@ TEST_P(AvgPredTest, SizeCombinations) {
// Only the reference buffer may have a stride not equal to width.
Buffer<uint8_t> ref =
Buffer<uint8_t>(width, height, ref_padding ? 8 : 0);
+ ASSERT_TRUE(ref.Init());
+ Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16);
+ ASSERT_TRUE(pred.Init());
+ Buffer<uint8_t> avg_ref = Buffer<uint8_t>(width, height, 0, 16);
+ ASSERT_TRUE(avg_ref.Init());
+ Buffer<uint8_t> avg_chk = Buffer<uint8_t>(width, height, 0, 16);
+ ASSERT_TRUE(avg_chk.Init());
- fill(&rnd_, pred, width, height);
ref.Set(&rnd_, &ACMRandom::Rand8);
-
- reference_pred(pred, ref, width, height, avg_ref);
- ASM_REGISTER_STATE_CHECK(avg_pred_func_(
- avg_chk, pred, width, height, ref.TopLeftPixel(), ref.stride()));
- ASSERT_EQ(memcmp(avg_ref, avg_chk, sizeof(*avg_ref) * width * height),
- 0);
+ pred.Set(&rnd_, &ACMRandom::Rand8);
+
+ reference_pred(pred, ref, width, height, &avg_ref);
+ ASM_REGISTER_STATE_CHECK(
+ avg_pred_func_(avg_chk.TopLeftPixel(), pred.TopLeftPixel(), width,
+ height, ref.TopLeftPixel(), ref.stride()));
+
+ EXPECT_TRUE(avg_chk.CheckValues(avg_ref));
+ if (HasFailure()) {
+ printf("Width: %d Height: %d\n", width, height);
+ avg_chk.PrintDifference(avg_ref);
+ return;
+ }
}
}
}
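
The memcmp() comparison had to go because the outputs are no longer plain arrays: with padding, the pixels of interest are not contiguous, so equality must be checked row by row honoring each buffer's stride, which is what CheckValues(const Buffer<T> &) does. A stride-aware comparison reduces to something like this sketch (CheckValues() is the real API; variable names as in the test above):

bool equal = true;
for (int y = 0; y < height && equal; ++y) {
  equal = 0 == memcmp(avg_chk.TopLeftPixel() + y * avg_chk.stride(),
                      avg_ref.TopLeftPixel() + y * avg_ref.stride(),
                      sizeof(uint8_t) * width);
}
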
@@ -98,25 +99,32 @@ TEST_P(AvgPredTest, CompareReferenceRandom) {
const int width = 64;
const int height = 32;
Buffer<uint8_t> ref = Buffer<uint8_t>(width, height, 8);
- DECLARE_ALIGNED(16, uint8_t, pred[width * height]);
- DECLARE_ALIGNED(16, uint8_t, avg_ref[width * height]);
- DECLARE_ALIGNED(16, uint8_t, avg_chk[width * height]);
+ ASSERT_TRUE(ref.Init());
+ Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16);
+ ASSERT_TRUE(pred.Init());
+ Buffer<uint8_t> avg_ref = Buffer<uint8_t>(width, height, 0, 16);
+ ASSERT_TRUE(avg_ref.Init());
+ Buffer<uint8_t> avg_chk = Buffer<uint8_t>(width, height, 0, 16);
+ ASSERT_TRUE(avg_chk.Init());
for (int i = 0; i < 500; ++i) {
- fill(&rnd_, pred, width, height);
ref.Set(&rnd_, &ACMRandom::Rand8);
+ pred.Set(&rnd_, &ACMRandom::Rand8);
- reference_pred(pred, ref, width, height, avg_ref);
- ASM_REGISTER_STATE_CHECK(avg_pred_func_(avg_chk, pred, width, height,
+ reference_pred(pred, ref, width, height, &avg_ref);
+ ASM_REGISTER_STATE_CHECK(avg_pred_func_(avg_chk.TopLeftPixel(),
+ pred.TopLeftPixel(), width, height,
ref.TopLeftPixel(), ref.stride()));
- ASSERT_EQ(memcmp(avg_ref, avg_chk, sizeof(*avg_ref) * width * height), 0);
+ EXPECT_TRUE(avg_chk.CheckValues(avg_ref));
+ if (HasFailure()) {
+ printf("Width: %d Height: %d\n", width, height);
+ avg_chk.PrintDifference(avg_ref);
+ return;
+ }
}
}
TEST_P(AvgPredTest, DISABLED_Speed) {
- DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]);
- DECLARE_ALIGNED(16, uint8_t, avg[64 * 64]);
-
for (int width_pow = 2; width_pow <= 6; ++width_pow) {
for (int height_pow = width_pow - 1; height_pow <= width_pow + 1;
++height_pow) {
@@ -128,15 +136,20 @@ TEST_P(AvgPredTest, DISABLED_Speed) {
const int height = 1 << height_pow;
Buffer<uint8_t> ref =
Buffer<uint8_t>(width, height, ref_padding ? 8 : 0);
+ ASSERT_TRUE(ref.Init());
+ Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16);
+ ASSERT_TRUE(pred.Init());
+ Buffer<uint8_t> avg = Buffer<uint8_t>(width, height, 0, 16);
+ ASSERT_TRUE(avg.Init());
- fill(&rnd_, pred, width, height);
ref.Set(&rnd_, &ACMRandom::Rand8);
+ pred.Set(&rnd_, &ACMRandom::Rand8);
vpx_usec_timer timer;
vpx_usec_timer_start(&timer);
for (int i = 0; i < 10000000 / (width * height); ++i) {
- avg_pred_func_(avg, pred, width, height, ref.TopLeftPixel(),
- ref.stride());
+ avg_pred_func_(avg.TopLeftPixel(), pred.TopLeftPixel(), width, height,
+ ref.TopLeftPixel(), ref.stride());
}
vpx_usec_timer_mark(&timer);
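
The iteration count 10000000 / (width * height) normalizes the speed test so every block size processes roughly ten million output pixels: a 4x4 block runs 625000 iterations, a 64x64 block runs 2441, which keeps the per-size timings directly comparable.
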
@@ -156,6 +169,12 @@ INSTANTIATE_TEST_CASE_P(C, AvgPredTest,
INSTANTIATE_TEST_CASE_P(SSE2, AvgPredTest,
::testing::Values(&vpx_comp_avg_pred_sse2));
#endif // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, AvgPredTest,
+ ::testing::Values(&vpx_comp_avg_pred_neon));
+#endif // HAVE_NEON
+
#if HAVE_VSX
INSTANTIATE_TEST_CASE_P(VSX, AvgPredTest,
::testing::Values(&vpx_comp_avg_pred_vsx));
diff --git a/libvpx/test/convolve_test.cc b/libvpx/test/convolve_test.cc
index 535b9b07f..70f0b11a7 100644
--- a/libvpx/test/convolve_test.cc
+++ b/libvpx/test/convolve_test.cc
@@ -33,9 +33,9 @@ static const unsigned int kMaxDimension = 64;
typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h);
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h);
typedef void (*WrapperFilterBlock2d8Func)(
const uint8_t *src_ptr, const unsigned int src_stride,
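
The ConvolveFunc signature change mirrors the upstream vpx_convolve API: instead of one prepicked 8-tap kernel per axis, callers pass the whole 16-phase kernel table plus a starting position and step in Q4 fixed point. Inside a C implementation the arguments are consumed roughly as in this sketch, in the style of vpx_convolve8_horiz_c (clip_pixel() and ROUND_POWER_OF_TWO() are the vpx_dsp helpers, FILTER_BITS == 7, and src is assumed pre-offset by SUBPEL_TAPS / 2 - 1 taps):

for (int x = 0; x < w; ++x) {
  const int x_q4 = x0_q4 + x * x_step_q4;
  const uint8_t *const src_x = &src[x_q4 >> 4];      // integer pixel position
  const int16_t *const kernel = filter[x_q4 & 0xf];  // 1 of 16 sub-pel phases
  int sum = 0;
  for (int k = 0; k < 8; ++k) sum += src_x[k] * kernel[k];
  dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
}
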
@@ -550,7 +550,7 @@ TEST_P(ConvolveTest, DISABLED_Copy_Speed) {
vpx_usec_timer_start(&timer);
for (int n = 0; n < kNumTests; ++n) {
- UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0,
+ UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0,
width, height);
}
vpx_usec_timer_mark(&timer);
@@ -570,7 +570,7 @@ TEST_P(ConvolveTest, DISABLED_Avg_Speed) {
vpx_usec_timer_start(&timer);
for (int n = 0; n < kNumTests; ++n) {
- UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0,
+ UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0,
width, height);
}
vpx_usec_timer_mark(&timer);
@@ -580,12 +580,127 @@ TEST_P(ConvolveTest, DISABLED_Avg_Speed) {
UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
}
+TEST_P(ConvolveTest, DISABLED_Scale_Speed) {
+ const uint8_t *const in = input();
+ uint8_t *const out = output();
+ const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP];
+ const int kNumTests = 5000000;
+ const int width = Width();
+ const int height = Height();
+ vpx_usec_timer timer;
+
+ SetConstantInput(127);
+
+ vpx_usec_timer_start(&timer);
+ for (int n = 0; n < kNumTests; ++n) {
+ UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+ width, height);
+ }
+ vpx_usec_timer_mark(&timer);
+
+ const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+ printf("convolve_scale_%dx%d_%d: %d us\n", width, height,
+ UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
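
In this and the following speed tests, x0_q4 = y0_q4 = 8 with x_step_q4 = y_step_q4 = 16 requests a constant half-pel phase and no scaling: output pixel n reads source position (8 + 16n) >> 4 = n with sub-pel phase (8 + 16n) & 15 = 8, which exercises the full 8-tap filtering path on every pixel.
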
+
+TEST_P(ConvolveTest, DISABLED_8Tap_Speed) {
+ const uint8_t *const in = input();
+ uint8_t *const out = output();
+ const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
+ const int kNumTests = 5000000;
+ const int width = Width();
+ const int height = Height();
+ vpx_usec_timer timer;
+
+ SetConstantInput(127);
+
+ vpx_usec_timer_start(&timer);
+ for (int n = 0; n < kNumTests; ++n) {
+ UUT_->hv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+ width, height);
+ }
+ vpx_usec_timer_mark(&timer);
+
+ const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+ printf("convolve8_%dx%d_%d: %d us\n", width, height,
+ UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_8Tap_Horiz_Speed) {
+ const uint8_t *const in = input();
+ uint8_t *const out = output();
+ const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
+ const int kNumTests = 5000000;
+ const int width = Width();
+ const int height = Height();
+ vpx_usec_timer timer;
+
+ SetConstantInput(127);
+
+ vpx_usec_timer_start(&timer);
+ for (int n = 0; n < kNumTests; ++n) {
+ UUT_->h8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+ width, height);
+ }
+ vpx_usec_timer_mark(&timer);
+
+ const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+ printf("convolve8_horiz_%dx%d_%d: %d us\n", width, height,
+ UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_8Tap_Vert_Speed) {
+ const uint8_t *const in = input();
+ uint8_t *const out = output();
+ const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
+ const int kNumTests = 5000000;
+ const int width = Width();
+ const int height = Height();
+ vpx_usec_timer timer;
+
+ SetConstantInput(127);
+
+ vpx_usec_timer_start(&timer);
+ for (int n = 0; n < kNumTests; ++n) {
+ UUT_->v8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+ width, height);
+ }
+ vpx_usec_timer_mark(&timer);
+
+ const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+ printf("convolve8_vert_%dx%d_%d: %d us\n", width, height,
+ UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_8Tap_Avg_Speed) {
+ const uint8_t *const in = input();
+ uint8_t *const out = output();
+ const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
+ const int kNumTests = 5000000;
+ const int width = Width();
+ const int height = Height();
+ vpx_usec_timer timer;
+
+ SetConstantInput(127);
+
+ vpx_usec_timer_start(&timer);
+ for (int n = 0; n < kNumTests; ++n) {
+ UUT_->hv8_[1](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+ width, height);
+ }
+ vpx_usec_timer_mark(&timer);
+
+ const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+ printf("convolve8_avg_%dx%d_%d: %d us\n", width, height,
+ UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
TEST_P(ConvolveTest, Copy) {
uint8_t *const in = input();
uint8_t *const out = output();
ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out, kOutputStride,
- NULL, 0, NULL, 0, Width(), Height()));
+ NULL, 0, 0, 0, 0, Width(), Height()));
CheckGuardBlocks();
@@ -604,7 +719,7 @@ TEST_P(ConvolveTest, Avg) {
CopyOutputToRef();
ASM_REGISTER_STATE_CHECK(UUT_->copy_[1](in, kInputStride, out, kOutputStride,
- NULL, 0, NULL, 0, Width(), Height()));
+ NULL, 0, 0, 0, 0, Width(), Height()));
CheckGuardBlocks();
@@ -621,12 +736,10 @@ TEST_P(ConvolveTest, Avg) {
TEST_P(ConvolveTest, CopyHoriz) {
uint8_t *const in = input();
uint8_t *const out = output();
- DECLARE_ALIGNED(256, const int16_t,
- filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };
ASM_REGISTER_STATE_CHECK(UUT_->sh8_[0](in, kInputStride, out, kOutputStride,
- filter8, 16, filter8, 16, Width(),
- Height()));
+ vp9_filter_kernels[0], 0, 16, 0, 16,
+ Width(), Height()));
CheckGuardBlocks();
@@ -641,12 +754,10 @@ TEST_P(ConvolveTest, CopyHoriz) {
TEST_P(ConvolveTest, CopyVert) {
uint8_t *const in = input();
uint8_t *const out = output();
- DECLARE_ALIGNED(256, const int16_t,
- filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };
ASM_REGISTER_STATE_CHECK(UUT_->sv8_[0](in, kInputStride, out, kOutputStride,
- filter8, 16, filter8, 16, Width(),
- Height()));
+ vp9_filter_kernels[0], 0, 16, 0, 16,
+ Width(), Height()));
CheckGuardBlocks();
@@ -661,12 +772,10 @@ TEST_P(ConvolveTest, CopyVert) {
TEST_P(ConvolveTest, Copy2D) {
uint8_t *const in = input();
uint8_t *const out = output();
- DECLARE_ALIGNED(256, const int16_t,
- filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };
ASM_REGISTER_STATE_CHECK(UUT_->shv8_[0](in, kInputStride, out, kOutputStride,
- filter8, 16, filter8, 16, Width(),
- Height()));
+ vp9_filter_kernels[0], 0, 16, 0, 16,
+ Width(), Height()));
CheckGuardBlocks();
@@ -702,7 +811,6 @@ TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
}
}
-const int16_t kInvalidFilter[8] = { 0 };
const WrapperFilterBlock2d8Func wrapper_filter_block2d_8[2] = {
wrapper_filter_block2d_8_c, wrapper_filter_average_block2d_8_c
};
@@ -755,21 +863,21 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
Width(), Height(), UUT_->use_highbd_);
if (filter_x && filter_y)
- ASM_REGISTER_STATE_CHECK(UUT_->hv8_[i](
- in, kInputStride, out, kOutputStride, filters[filter_x], 16,
- filters[filter_y], 16, Width(), Height()));
+ ASM_REGISTER_STATE_CHECK(
+ UUT_->hv8_[i](in, kInputStride, out, kOutputStride, filters,
+ filter_x, 16, filter_y, 16, Width(), Height()));
else if (filter_y)
- ASM_REGISTER_STATE_CHECK(UUT_->v8_[i](
- in, kInputStride, out, kOutputStride, kInvalidFilter, 16,
- filters[filter_y], 16, Width(), Height()));
+ ASM_REGISTER_STATE_CHECK(
+ UUT_->v8_[i](in, kInputStride, out, kOutputStride, filters, 0,
+ 16, filter_y, 16, Width(), Height()));
else if (filter_x)
- ASM_REGISTER_STATE_CHECK(UUT_->h8_[i](
- in, kInputStride, out, kOutputStride, filters[filter_x], 16,
- kInvalidFilter, 16, Width(), Height()));
+ ASM_REGISTER_STATE_CHECK(
+ UUT_->h8_[i](in, kInputStride, out, kOutputStride, filters,
+ filter_x, 16, 0, 16, Width(), Height()));
else
- ASM_REGISTER_STATE_CHECK(UUT_->copy_[i](
- in, kInputStride, out, kOutputStride, kInvalidFilter, 0,
- kInvalidFilter, 0, Width(), Height()));
+ ASM_REGISTER_STATE_CHECK(UUT_->copy_[i](in, kInputStride, out,
+ kOutputStride, NULL, 0, 0,
+ 0, 0, Width(), Height()));
CheckGuardBlocks();
@@ -853,21 +961,21 @@ TEST_P(ConvolveTest, FilterExtremes) {
filters[filter_y], ref, kOutputStride,
Width(), Height(), UUT_->use_highbd_);
if (filter_x && filter_y)
- ASM_REGISTER_STATE_CHECK(UUT_->hv8_[0](
- in, kInputStride, out, kOutputStride, filters[filter_x], 16,
- filters[filter_y], 16, Width(), Height()));
+ ASM_REGISTER_STATE_CHECK(
+ UUT_->hv8_[0](in, kInputStride, out, kOutputStride, filters,
+ filter_x, 16, filter_y, 16, Width(), Height()));
else if (filter_y)
- ASM_REGISTER_STATE_CHECK(UUT_->v8_[0](
- in, kInputStride, out, kOutputStride, kInvalidFilter, 16,
- filters[filter_y], 16, Width(), Height()));
+ ASM_REGISTER_STATE_CHECK(
+ UUT_->v8_[0](in, kInputStride, out, kOutputStride, filters, 0,
+ 16, filter_y, 16, Width(), Height()));
else if (filter_x)
- ASM_REGISTER_STATE_CHECK(UUT_->h8_[0](
- in, kInputStride, out, kOutputStride, filters[filter_x], 16,
- kInvalidFilter, 16, Width(), Height()));
+ ASM_REGISTER_STATE_CHECK(
+ UUT_->h8_[0](in, kInputStride, out, kOutputStride, filters,
+ filter_x, 16, 0, 16, Width(), Height()));
else
- ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](
- in, kInputStride, out, kOutputStride, kInvalidFilter, 0,
- kInvalidFilter, 0, Width(), Height()));
+ ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out,
+ kOutputStride, NULL, 0, 0,
+ 0, 0, Width(), Height()));
for (int y = 0; y < Height(); ++y) {
for (int x = 0; x < Width(); ++x)
@@ -886,45 +994,63 @@ TEST_P(ConvolveTest, FilterExtremes) {
/* This test exercises that enough rows and columns are filtered with every
   possible initial fractional position and scaling step. */
+#if !CONFIG_VP9_HIGHBITDEPTH
+static const ConvolveFunc scaled_2d_c_funcs[2] = { vpx_scaled_2d_c,
+ vpx_scaled_avg_2d_c };
+
TEST_P(ConvolveTest, CheckScalingFiltering) {
uint8_t *const in = input();
uint8_t *const out = output();
- const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP];
+ uint8_t ref[kOutputStride * kMaxDimension];
- SetConstantInput(127);
+ ::libvpx_test::ACMRandom prng;
+ for (int y = 0; y < Height(); ++y) {
+ for (int x = 0; x < Width(); ++x) {
+ const uint16_t r = prng.Rand8Extremes();
+ assign_val(in, y * kInputStride + x, r);
+ }
+ }
+
+ for (int i = 0; i < 2; ++i) {
+ for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) {
+ const InterpKernel *const eighttap = vp9_filter_kernels[filter_type];
+ for (int frac = 0; frac < 16; ++frac) {
+ for (int step = 1; step <= 32; ++step) {
+ /* Test the horizontal and vertical filters in combination. */
+ scaled_2d_c_funcs[i](in, kInputStride, ref, kOutputStride, eighttap,
+ frac, step, frac, step, Width(), Height());
+ ASM_REGISTER_STATE_CHECK(
+ UUT_->shv8_[i](in, kInputStride, out, kOutputStride, eighttap,
+ frac, step, frac, step, Width(), Height()));
- for (int frac = 0; frac < 16; ++frac) {
- for (int step = 1; step <= 32; ++step) {
- /* Test the horizontal and vertical filters in combination. */
- ASM_REGISTER_STATE_CHECK(
- UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap[frac],
- step, eighttap[frac], step, Width(), Height()));
-
- CheckGuardBlocks();
-
- for (int y = 0; y < Height(); ++y) {
- for (int x = 0; x < Width(); ++x) {
- ASSERT_EQ(lookup(in, y * kInputStride + x),
- lookup(out, y * kOutputStride + x))
- << "x == " << x << ", y == " << y << ", frac == " << frac
- << ", step == " << step;
+ CheckGuardBlocks();
+
+ for (int y = 0; y < Height(); ++y) {
+ for (int x = 0; x < Width(); ++x) {
+ ASSERT_EQ(lookup(ref, y * kOutputStride + x),
+ lookup(out, y * kOutputStride + x))
+ << "x == " << x << ", y == " << y << ", frac == " << frac
+ << ", step == " << step;
+ }
+ }
}
}
}
}
}
+#endif
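
The rewritten CheckScalingFiltering no longer expects a pass-through copy; it compares the unit under test against the C reference (vpx_scaled_2d_c / vpx_scaled_avg_2d_c) over 2 variants x 4 filter types x 16 initial fractions x 32 steps = 4096 combinations per block size, with Rand8Extremes() input to stress filter saturation.
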
using std::tr1::make_tuple;
#if CONFIG_VP9_HIGHBITDEPTH
-#define WRAP(func, bd) \
- void wrap_##func##_##bd( \
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
- ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, \
- const int16_t *filter_y, int filter_y_stride, int w, int h) { \
- vpx_highbd_##func(reinterpret_cast<const uint16_t *>(src), src_stride, \
- reinterpret_cast<uint16_t *>(dst), dst_stride, filter_x, \
- filter_x_stride, filter_y, filter_y_stride, w, h, bd); \
+#define WRAP(func, bd) \
+ void wrap_##func##_##bd( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
+ int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \
+ vpx_highbd_##func(reinterpret_cast<const uint16_t *>(src), src_stride, \
+ reinterpret_cast<uint16_t *>(dst), dst_stride, filter, \
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \
}
#if HAVE_SSE2 && ARCH_X86_64
@@ -1161,8 +1287,8 @@ INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest,
#else // !CONFIG_VP9_HIGHBITDEPTH
const ConvolveFunctions convolve8_avx2(
vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_avx2,
- vpx_convolve8_avg_horiz_ssse3, vpx_convolve8_vert_avx2,
- vpx_convolve8_avg_vert_ssse3, vpx_convolve8_avx2, vpx_convolve8_avg_ssse3,
+ vpx_convolve8_avg_horiz_avx2, vpx_convolve8_vert_avx2,
+ vpx_convolve8_avg_vert_avx2, vpx_convolve8_avx2, vpx_convolve8_avg_avx2,
vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) };
@@ -1206,7 +1332,7 @@ const ConvolveFunctions convolve8_neon(
vpx_convolve8_avg_horiz_neon, vpx_convolve8_vert_neon,
vpx_convolve8_avg_vert_neon, vpx_convolve8_neon, vpx_convolve8_avg_neon,
vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
- vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+ vpx_scaled_avg_vert_c, vpx_scaled_2d_neon, vpx_scaled_avg_2d_c, 0);
const ConvolveParam kArrayConvolve_neon[] = { ALL_SIZES(convolve8_neon) };
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -1233,7 +1359,7 @@ const ConvolveFunctions convolve8_msa(
vpx_convolve8_avg_horiz_msa, vpx_convolve8_vert_msa,
vpx_convolve8_avg_vert_msa, vpx_convolve8_msa, vpx_convolve8_avg_msa,
vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
- vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+ vpx_scaled_avg_vert_c, vpx_scaled_2d_msa, vpx_scaled_avg_2d_c, 0);
const ConvolveParam kArrayConvolve8_msa[] = { ALL_SIZES(convolve8_msa) };
INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest,
diff --git a/libvpx/test/datarate_test.cc b/libvpx/test/datarate_test.cc
index a120a88d2..31a8523d2 100644
--- a/libvpx/test/datarate_test.cc
+++ b/libvpx/test/datarate_test.cc
@@ -44,6 +44,7 @@ class DatarateTestLarge
denoiser_offon_test_ = 0;
denoiser_offon_period_ = -1;
gf_boost_ = 0;
+ use_roi_ = 0;
}
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
@@ -54,6 +55,12 @@ class DatarateTestLarge
encoder->Control(VP8E_SET_GF_CBR_BOOST_PCT, gf_boost_);
}
+#if CONFIG_VP8_ENCODER
+ if (use_roi_ == 1) {
+ encoder->Control(VP8E_SET_ROI_MAP, &roi_);
+ }
+#endif
+
if (denoiser_offon_test_) {
ASSERT_GT(denoiser_offon_period_, 0)
<< "denoiser_offon_period_ is not positive.";
@@ -91,8 +98,8 @@ class DatarateTestLarge
const bool key_frame =
(pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false;
if (!key_frame) {
- ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
- << pkt->data.frame.pts;
+ ASSERT_GE(bits_in_buffer_model_, 0)
+ << "Buffer Underrun at frame " << pkt->data.frame.pts;
}
const int64_t frame_size_in_bits = pkt->data.frame.sz * 8;
@@ -145,6 +152,8 @@ class DatarateTestLarge
int denoiser_offon_period_;
int set_cpu_used_;
int gf_boost_;
+ int use_roi_;
+ vpx_roi_map_t roi_;
};
#if CONFIG_TEMPORAL_DENOISING
@@ -258,14 +267,6 @@ TEST_P(DatarateTestLarge, ChangingDropFrameThresh) {
}
}
-// Disabled for tsan, see:
-// https://bugs.chromium.org/p/webm/issues/detail?id=1049
-#if defined(__has_feature)
-#if __has_feature(thread_sanitizer)
-#define BUILDING_WITH_TSAN
-#endif
-#endif
-#ifndef BUILDING_WITH_TSAN
TEST_P(DatarateTestLarge, DropFramesMultiThreads) {
denoiser_on_ = 0;
cfg_.rc_buf_initial_sz = 500;
@@ -285,7 +286,6 @@ TEST_P(DatarateTestLarge, DropFramesMultiThreads) {
ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
<< " The datarate for the file missed the target!";
}
-#endif // !BUILDING_WITH_TSAN
class DatarateTestRealTime : public DatarateTestLarge {
public:
@@ -402,10 +402,6 @@ TEST_P(DatarateTestRealTime, ChangingDropFrameThresh) {
}
}
-// Disabled for tsan, see:
-// https://bugs.chromium.org/p/webm/issues/detail?id=1049
-
-#ifndef BUILDING_WITH_TSAN
TEST_P(DatarateTestRealTime, DropFramesMultiThreads) {
denoiser_on_ = 0;
cfg_.rc_buf_initial_sz = 500;
@@ -426,7 +422,67 @@ TEST_P(DatarateTestRealTime, DropFramesMultiThreads) {
ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
<< " The datarate for the file missed the target!";
}
-#endif
+
+TEST_P(DatarateTestRealTime, RegionOfInterest) {
+ denoiser_on_ = 0;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_end_usage = VPX_CBR;
+ // Encode using multiple threads.
+ cfg_.g_threads = 2;
+
+ ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 300);
+ cfg_.rc_target_bitrate = 450;
+ cfg_.g_w = 352;
+ cfg_.g_h = 288;
+
+ ResetModel();
+
+ // Set ROI parameters
+ use_roi_ = 1;
+ memset(&roi_, 0, sizeof(roi_));
+
+ roi_.rows = (cfg_.g_h + 15) / 16;
+ roi_.cols = (cfg_.g_w + 15) / 16;
+
+ roi_.delta_q[0] = 0;
+ roi_.delta_q[1] = -20;
+ roi_.delta_q[2] = 0;
+ roi_.delta_q[3] = 0;
+
+ roi_.delta_lf[0] = 0;
+ roi_.delta_lf[1] = -20;
+ roi_.delta_lf[2] = 0;
+ roi_.delta_lf[3] = 0;
+
+ roi_.static_threshold[0] = 0;
+ roi_.static_threshold[1] = 1000;
+ roi_.static_threshold[2] = 0;
+ roi_.static_threshold[3] = 0;
+
+ // Use 2 states: 1 is center square, 0 is the rest.
+ roi_.roi_map =
+ (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map));
+ for (unsigned int i = 0; i < roi_.rows; ++i) {
+ for (unsigned int j = 0; j < roi_.cols; ++j) {
+ if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) &&
+ j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) {
+ roi_.roi_map[i * roi_.cols + j] = 1;
+ }
+ }
+ }
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+ << " The datarate for the file exceeds the target!";
+
+ ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+ << " The datarate for the file missed the target!";
+
+ free(roi_.roi_map);
+}
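
For the 352x288 clip the ROI grid is (288 + 15) / 16 = 18 rows by (352 + 15) / 16 = 22 columns of 16x16 macroblocks. The center-square condition selects i in 5..12 and j in 6..15, so 80 of the 396 macroblocks land in segment 1, where delta_q[1] = -20 and delta_lf[1] = -20 give that region a finer quantizer and lighter loop filtering than the surrounding segment 0.
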
TEST_P(DatarateTestRealTime, GFBoost) {
denoiser_on_ = 0;
@@ -482,6 +538,7 @@ class DatarateTestVP9Large
}
denoiser_offon_test_ = 0;
denoiser_offon_period_ = -1;
+ frame_parallel_decoding_mode_ = 1;
}
//
@@ -561,6 +618,8 @@ class DatarateTestVP9Large
encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
encoder->Control(VP9E_SET_TILE_COLUMNS, (cfg_.g_threads >> 1));
+ encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING,
+ frame_parallel_decoding_mode_);
if (cfg_.ts_number_layers > 1) {
if (video->frame() == 0) {
@@ -599,8 +658,8 @@ class DatarateTestVP9Large
duration * timebase_ * cfg_.rc_target_bitrate * 1000);
// Buffer should not go negative.
- ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
- << pkt->data.frame.pts;
+ ASSERT_GE(bits_in_buffer_model_, 0)
+ << "Buffer Underrun at frame " << pkt->data.frame.pts;
const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
@@ -641,6 +700,7 @@ class DatarateTestVP9Large
int denoiser_on_;
int denoiser_offon_test_;
int denoiser_offon_period_;
+ int frame_parallel_decoding_mode_;
};
// Check basic rate targeting for VBR mode with 0 lag.
@@ -659,7 +719,7 @@ TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagZero) {
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
<< " The datarate for the file is lower than target by too much!";
- ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25)
+ ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30)
<< " The datarate for the file is greater than target by too much!";
}
}
@@ -686,7 +746,37 @@ TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagNonZero) {
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
<< " The datarate for the file is lower than target by too much!";
- ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25)
+ ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30)
+ << " The datarate for the file is greater than target by too much!";
+ }
+}
+
+// Check basic rate targeting for VBR mode with non-zero lag, with
+// frame_parallel_decoding_mode off. This enables the adapt_coeff/mode/mv probs
+// since error_resilience is off.
+TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagNonZeroFrameParDecOff) {
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.g_error_resilient = 0;
+ cfg_.rc_end_usage = VPX_VBR;
+ // For non-zero lag, rate control will work (be within bounds) for
+ // real-time mode.
+ if (deadline_ == VPX_DL_REALTIME) {
+ cfg_.g_lag_in_frames = 15;
+ } else {
+ cfg_.g_lag_in_frames = 0;
+ }
+
+ ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 300);
+ for (int i = 400; i <= 800; i += 400) {
+ cfg_.rc_target_bitrate = i;
+ ResetModel();
+ frame_parallel_decoding_mode_ = 0;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30)
<< " The datarate for the file is greater than target by too much!";
}
}
@@ -715,6 +805,33 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting) {
}
}
+// Check basic rate targeting for CBR mode, with frame_parallel_decoding_mode
+// off (and error_resilience off).
+TEST_P(DatarateTestVP9Large, BasicRateTargetingFrameParDecOff) {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 1;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = VPX_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+
+ ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 140);
+ for (int i = 150; i < 800; i += 200) {
+ cfg_.rc_target_bitrate = i;
+ ResetModel();
+ frame_parallel_decoding_mode_ = 0;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+ << " The datarate for the file is greater than target by too much!";
+ }
+}
+
// Check basic rate targeting for CBR mode, with 2 threads and dropped frames.
TEST_P(DatarateTestVP9Large, BasicRateTargetingDropFramesMultiThreads) {
cfg_.rc_buf_initial_sz = 500;
@@ -1099,16 +1216,17 @@ class DatarateOnePassCbrSvc
}
virtual void ResetModel() {
last_pts_ = 0;
- bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
- frame_number_ = 0;
- first_drop_ = 0;
- bits_total_ = 0;
duration_ = 0.0;
mismatch_psnr_ = 0.0;
mismatch_nframes_ = 0;
denoiser_on_ = 0;
tune_content_ = 0;
base_speed_setting_ = 5;
+ spatial_layer_id_ = 0;
+ temporal_layer_id_ = 0;
+ memset(bits_in_buffer_model_, 0, sizeof(bits_in_buffer_model_));
+ memset(bits_total_, 0, sizeof(bits_total_));
+ memset(layer_target_avg_bandwidth_, 0, sizeof(layer_target_avg_bandwidth_));
}
virtual void BeginPassHook(unsigned int /*pass*/) {}
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
@@ -1139,32 +1257,94 @@ class DatarateOnePassCbrSvc
timebase_ = static_cast<double>(tb.num) / tb.den;
duration_ = 0;
}
+
+ virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) {
+ vpx_svc_layer_id_t layer_id;
+ encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id);
+ spatial_layer_id_ = layer_id.spatial_layer_id;
+ temporal_layer_id_ = layer_id.temporal_layer_id;
+ // Update buffer with per-layer target frame bandwidth, this is done
+ // for every frame passed to the encoder (encoded or dropped).
+ // For temporal layers, update the cumulative buffer level.
+ for (int sl = 0; sl < number_spatial_layers_; ++sl) {
+ for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) {
+ const int layer = sl * number_temporal_layers_ + tl;
+ bits_in_buffer_model_[layer] +=
+ static_cast<int64_t>(layer_target_avg_bandwidth_[layer]);
+ }
+ }
+ }
+
+ vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz,
+ uint32_t sizes[8], int *count) {
+ uint8_t marker;
+ marker = *(data + data_sz - 1);
+ *count = 0;
+ if ((marker & 0xe0) == 0xc0) {
+ const uint32_t frames = (marker & 0x7) + 1;
+ const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+ const size_t index_sz = 2 + mag * frames;
+ // This chunk is marked as having a superframe index but doesn't have
+ // enough data for it, thus it's an invalid superframe index.
+ if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME;
+ {
+ const uint8_t marker2 = *(data + data_sz - index_sz);
+ // This chunk is marked as having a superframe index but doesn't have
+ // the matching marker byte at the front of the index, therefore it's an
+ // invalid chunk.
+ if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME;
+ }
+ {
+ uint32_t i, j;
+ const uint8_t *x = &data[data_sz - index_sz + 1];
+ for (i = 0; i < frames; ++i) {
+ uint32_t this_sz = 0;
+
+ for (j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8);
+ sizes[i] = this_sz;
+ }
+ *count = frames;
+ }
+ }
+ return VPX_CODEC_OK;
+ }
+
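
parse_superframe_index() follows the VP9 superframe layout: the last byte of the chunk is a marker 110 mm fff, where fff + 1 is the frame count and mm + 1 (the "mag") is the number of bytes per size field; the index is the marker, then the little-endian sizes, then the marker repeated. Building an index for two frames of 1200 and 345 bytes with 2-byte size fields would look like this sketch (the inverse of the parser above):

// marker 0xc9 = 0b11001001: 110 = superframe, 01 -> mag 2, 001 -> 2 frames.
// index_sz = 2 + mag * frames = 6 bytes, appended after the frame data.
const uint8_t marker = 0xc9;
uint8_t index[6];
index[0] = marker;
index[1] = 1200 & 0xff;  // 0xb0 \ little-endian, matching the parser's
index[2] = 1200 >> 8;    // 0x04 /  (*x++) << (j * 8) accumulation
index[3] = 345 & 0xff;   // 0x59
index[4] = 345 >> 8;     // 0x01
index[5] = marker;       // repeated so the parser can validate it
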
virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
- vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
- if (last_pts_ == 0) duration = 1;
- bits_in_buffer_model_ += static_cast<int64_t>(
- duration * timebase_ * cfg_.rc_target_bitrate * 1000);
+ uint32_t sizes[8] = { 0 };
+ int count = 0;
+ last_pts_ = pkt->data.frame.pts;
const bool key_frame =
(pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false;
- if (!key_frame) {
- // TODO(marpan): This check currently fails for some of the SVC tests,
- // re-enable when issue (webm:1350) is resolved.
- // ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
- // << pkt->data.frame.pts;
+ parse_superframe_index(static_cast<const uint8_t *>(pkt->data.frame.buf),
+ pkt->data.frame.sz, sizes, &count);
+ ASSERT_EQ(count, number_spatial_layers_);
+ for (int sl = 0; sl < number_spatial_layers_; ++sl) {
+ sizes[sl] = sizes[sl] << 3;
+ // Update the total encoded bits per layer.
+ // For temporal layers, update the cumulative encoded bits per layer.
+ for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) {
+ const int layer = sl * number_temporal_layers_ + tl;
+ bits_total_[layer] += static_cast<int64_t>(sizes[sl]);
+ // Update the per-layer buffer level with the encoded frame size.
+ bits_in_buffer_model_[layer] -= static_cast<int64_t>(sizes[sl]);
+ // There should be no buffer underrun, except on the base
+ // temporal layer, since there may be key frames there.
+ if (!key_frame && tl > 0) {
+ ASSERT_GE(bits_in_buffer_model_[layer], 0)
+ << "Buffer Underrun at frame " << pkt->data.frame.pts;
+ }
+ }
}
- const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
- bits_in_buffer_model_ -= static_cast<int64_t>(frame_size_in_bits);
- bits_total_ += frame_size_in_bits;
- if (!first_drop_ && duration > 1) first_drop_ = last_pts_ + 1;
- last_pts_ = pkt->data.frame.pts;
- bits_in_last_frame_ = frame_size_in_bits;
- ++frame_number_;
}
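
Together the two hooks implement a per-layer leaky bucket: PostEncodeFrameHook() credits every (spatial, temporal >= current) layer with its average per-frame budget for each source frame, whether or not it was encoded, and FramePktHook() debits the bits actually produced for each spatial layer. In short:

// every source frame:   bits_in_buffer_model_[layer] += target_avg_bandwidth;
// every encoded frame:  bits_in_buffer_model_[layer] -= 8 * bytes_in_layer;
// A negative level on a non-base temporal layer is a buffer underrun and
// fails the test (base temporal layers are exempt because of key frames).
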
+
virtual void EndPassHook(void) {
- if (bits_total_) {
- const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit
- duration_ = (last_pts_ + 1) * timebase_;
- file_datarate_ = file_size_in_kb / duration_;
+ for (int sl = 0; sl < number_spatial_layers_; ++sl) {
+ for (int tl = 0; tl < number_temporal_layers_; ++tl) {
+ const int layer = sl * number_temporal_layers_ + tl;
+ const double file_size_in_kb = bits_total_[layer] / 1000.;
+ duration_ = (last_pts_ + 1) * timebase_;
+ file_datarate_[layer] = file_size_in_kb / duration_;
+ }
}
}
@@ -1177,13 +1357,11 @@ class DatarateOnePassCbrSvc
unsigned int GetMismatchFrames() { return mismatch_nframes_; }
vpx_codec_pts_t last_pts_;
- int64_t bits_in_buffer_model_;
+ int64_t bits_in_buffer_model_[VPX_MAX_LAYERS];
double timebase_;
- int frame_number_;
- vpx_codec_pts_t first_drop_;
- int64_t bits_total_;
+ int64_t bits_total_[VPX_MAX_LAYERS];
double duration_;
- double file_datarate_;
+ double file_datarate_[VPX_MAX_LAYERS];
size_t bits_in_last_frame_;
vpx_svc_extra_cfg_t svc_params_;
int speed_setting_;
@@ -1192,14 +1370,22 @@ class DatarateOnePassCbrSvc
int denoiser_on_;
int tune_content_;
int base_speed_setting_;
+ int spatial_layer_id_;
+ int temporal_layer_id_;
+ int number_spatial_layers_;
+ int number_temporal_layers_;
+ int layer_target_avg_bandwidth_[VPX_MAX_LAYERS];
};
static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg,
const vpx_svc_extra_cfg_t *svc_params,
int spatial_layers, int temporal_layers,
- int temporal_layering_mode) {
+ int temporal_layering_mode,
+ int *layer_target_avg_bandwidth,
+ int64_t *bits_in_buffer_model) {
int sl, spatial_layer_target;
float total = 0;
float alloc_ratio[VPX_MAX_LAYERS] = { 0 };
+ float framerate = 30.0;
for (sl = 0; sl < spatial_layers; ++sl) {
if (svc_params->scaling_factor_den[sl] > 0) {
alloc_ratio[sl] = (float)(svc_params->scaling_factor_num[sl] * 1.0 /
@@ -1219,10 +1405,43 @@ static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg,
} else if (temporal_layering_mode == 2) {
enc_cfg->layer_target_bitrate[index] = spatial_layer_target * 2 / 3;
enc_cfg->layer_target_bitrate[index + 1] = spatial_layer_target;
+ } else if (temporal_layering_mode <= 1) {
+ enc_cfg->layer_target_bitrate[index] = spatial_layer_target;
+ }
+ }
+ for (sl = 0; sl < spatial_layers; ++sl) {
+ for (int tl = 0; tl < temporal_layers; ++tl) {
+ const int layer = sl * temporal_layers + tl;
+ float layer_framerate = framerate;
+ if (temporal_layers == 2 && tl == 0) layer_framerate = framerate / 2;
+ if (temporal_layers == 3 && tl == 0) layer_framerate = framerate / 4;
+ if (temporal_layers == 3 && tl == 1) layer_framerate = framerate / 2;
+ layer_target_avg_bandwidth[layer] = static_cast<int>(
+ enc_cfg->layer_target_bitrate[layer] * 1000.0 / layer_framerate);
+ bits_in_buffer_model[layer] =
+ enc_cfg->layer_target_bitrate[layer] * enc_cfg->rc_buf_initial_sz;
}
}
}
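
The per-layer budgets assume a 30 fps source and cumulative temporal rates: with 3 temporal layers the effective framerates are 7.5 / 15 / 30 fps for TL0 / TL1 / TL2. A layer targeting 240 kbps at 7.5 fps therefore gets 240 * 1000 / 7.5 = 32000 bits per frame, and its initial buffer level is layer_target_bitrate (kbps) times rc_buf_initial_sz (ms), i.e. 240 * 500 = 120000 bits.
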
+static void CheckLayerRateTargeting(vpx_codec_enc_cfg_t *const cfg,
+ int number_spatial_layers,
+ int number_temporal_layers,
+ double *file_datarate,
+ double thresh_overshoot,
+ double thresh_undershoot) {
+ for (int sl = 0; sl < number_spatial_layers; ++sl)
+ for (int tl = 0; tl < number_temporal_layers; ++tl) {
+ const int layer = sl * number_temporal_layers + tl;
+ ASSERT_GE(cfg->layer_target_bitrate[layer],
+ file_datarate[layer] * thresh_overshoot)
+ << " The datarate for the file exceeds the target by too much!";
+ ASSERT_LE(cfg->layer_target_bitrate[layer],
+ file_datarate[layer] * thresh_undershoot)
+ << " The datarate for the file is lower than the target by too much!";
+ }
+}
+
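
With the 0.78 / 1.15 thresholds used below, a layer targeting 400 kbps passes only if its measured datarate lies between 400 / 1.15 (about 348) and 400 / 0.78 (about 513) kbps, i.e. roughly 13% undershoot to 28% overshoot.
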
// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1
// temporal layer, with screen content mode on and same speed setting for all
// layers.
@@ -1246,14 +1465,19 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TLScreenContent1) {
svc_params_.scaling_factor_den[1] = 288;
cfg_.rc_dropframe_thresh = 10;
cfg_.kf_max_dist = 9999;
- ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
+ number_spatial_layers_ = cfg_.ss_number_layers;
+ number_temporal_layers_ = cfg_.ts_number_layers;
+ ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
cfg_.rc_target_bitrate = 500;
ResetModel();
tune_content_ = 1;
base_speed_setting_ = speed_setting_;
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
- cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+ cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+ layer_target_avg_bandwidth_, bits_in_buffer_model_);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+ number_temporal_layers_, file_datarate_, 0.78, 1.15);
EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
}
@@ -1281,26 +1505,28 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL) {
svc_params_.scaling_factor_den[1] = 288;
cfg_.rc_dropframe_thresh = 0;
cfg_.kf_max_dist = 9999;
- ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
- 30, 1, 0, 200);
+ number_spatial_layers_ = cfg_.ss_number_layers;
+ number_temporal_layers_ = cfg_.ts_number_layers;
+ ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+ 0, 400);
// TODO(marpan): Check that effective_datarate for each layer hits the
// layer target_bitrate.
for (int i = 200; i <= 800; i += 200) {
cfg_.rc_target_bitrate = i;
ResetModel();
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
- cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+ cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+ layer_target_avg_bandwidth_, bits_in_buffer_model_);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78)
- << " The datarate for the file exceeds the target by too much!";
- ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
- << " The datarate for the file is lower than the target by too much!";
+ CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+ number_temporal_layers_, file_datarate_, 0.78,
+ 1.15);
#if CONFIG_VP9_DECODER
// Number of temporal layers > 1, so half of the frames in this SVC pattern
// will be non-reference frames and hence encoder will avoid loopfilter.
- // Since frame dropper is off, we can expcet 100 (half of the sequence)
+ // Since frame dropper is off, we can expect 200 (half of the sequence)
// mismatched frames.
- EXPECT_EQ(static_cast<unsigned int>(100), GetMismatchFrames());
+ EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames());
#endif
}
}
@@ -1329,33 +1555,41 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLDenoiserOn) {
svc_params_.scaling_factor_den[1] = 288;
cfg_.rc_dropframe_thresh = 0;
cfg_.kf_max_dist = 9999;
- ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
+ number_spatial_layers_ = cfg_.ss_number_layers;
+ number_temporal_layers_ = cfg_.ts_number_layers;
+ ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+ 0, 400);
// TODO(marpan): Check that effective_datarate for each layer hits the
// layer target_bitrate.
- for (int i = 600; i <= 1000; i += 200) {
- cfg_.rc_target_bitrate = i;
- ResetModel();
- denoiser_on_ = 1;
- assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
- cfg_.ts_number_layers, cfg_.temporal_layering_mode);
- ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78)
- << " The datarate for the file exceeds the target by too much!";
- ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
- << " The datarate for the file is lower than the target by too much!";
+ // For SVC, noise_sen = 1 means denoising only the top spatial layer,
+ // noise_sen = 2 means denoising the two top spatial layers.
+ for (int noise_sen = 1; noise_sen <= 2; noise_sen++) {
+ for (int i = 600; i <= 1000; i += 200) {
+ cfg_.rc_target_bitrate = i;
+ ResetModel();
+ denoiser_on_ = noise_sen;
+ assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+ cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+ layer_target_avg_bandwidth_, bits_in_buffer_model_);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+ number_temporal_layers_, file_datarate_, 0.78,
+ 1.15);
#if CONFIG_VP9_DECODER
- // Number of temporal layers > 1, so half of the frames in this SVC pattern
- // will be non-reference frame and hence encoder will avoid loopfilter.
- // Since frame dropper is off, we can expcet 150 (half of the sequence)
- // mismatched frames.
- EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames());
+ // Number of temporal layers > 1, so half of the frames in this SVC
+ // pattern will be non-reference frames and hence encoder will avoid
+ // loopfilter.
+ // Since frame dropper is off, we can expect 200 (half of the sequence)
+ // mismatched frames.
+ EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames());
#endif
+ }
}
}
// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3
// temporal layers. Run CIF clip with 1 thread, and few short key frame periods.
-TEST_P(DatarateOnePassCbrSvc, DISABLED_OnePassCbrSvc2SL3TLSmallKf) {
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLSmallKf) {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000;
@@ -1376,27 +1610,29 @@ TEST_P(DatarateOnePassCbrSvc, DISABLED_OnePassCbrSvc2SL3TLSmallKf) {
svc_params_.scaling_factor_num[1] = 288;
svc_params_.scaling_factor_den[1] = 288;
cfg_.rc_dropframe_thresh = 10;
- ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
- 30, 1, 0, 200);
cfg_.rc_target_bitrate = 400;
+ number_spatial_layers_ = cfg_.ss_number_layers;
+ number_temporal_layers_ = cfg_.ts_number_layers;
+ ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+ 0, 400);
// For this 3 temporal layer case, pattern repeats every 4 frames, so choose
// 4 neighboring key frame periods (so the key frame will land on 0-2-1-2).
for (int j = 64; j <= 67; j++) {
cfg_.kf_max_dist = j;
ResetModel();
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
- cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+ cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+ layer_target_avg_bandwidth_, bits_in_buffer_model_);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.80)
- << " The datarate for the file exceeds the target by too much!";
- ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
- << " The datarate for the file is lower than the target by too much!";
+ CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+ number_temporal_layers_, file_datarate_, 0.78,
+ 1.15);
}
}
// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
// 3 temporal layers. Run HD clip with 4 threads.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4threads) {
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4Threads) {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000;
@@ -1418,22 +1654,23 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4threads) {
svc_params_.scaling_factor_den[1] = 288;
cfg_.rc_dropframe_thresh = 0;
cfg_.kf_max_dist = 9999;
- ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
+ number_spatial_layers_ = cfg_.ss_number_layers;
+ number_temporal_layers_ = cfg_.ts_number_layers;
+ ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
cfg_.rc_target_bitrate = 800;
ResetModel();
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
- cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+ cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+ layer_target_avg_bandwidth_, bits_in_buffer_model_);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78)
- << " The datarate for the file exceeds the target by too much!";
- ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
- << " The datarate for the file is lower than the target by too much!";
+ CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+ number_temporal_layers_, file_datarate_, 0.78, 1.15);
#if CONFIG_VP9_DECODER
// Number of temporal layers > 1, so half of the frames in this SVC pattern
// will be non-reference frames and hence encoder will avoid loopfilter.
- // Since frame dropper is off, we can expcet 150 (half of the sequence)
+ // Since frame dropper is off, we can expect 30 (half of the sequence)
// mismatched frames.
- EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames());
+ EXPECT_EQ(static_cast<unsigned int>(30), GetMismatchFrames());
#endif
}
@@ -1463,22 +1700,24 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL) {
svc_params_.scaling_factor_den[2] = 288;
cfg_.rc_dropframe_thresh = 0;
cfg_.kf_max_dist = 9999;
- ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
+ number_spatial_layers_ = cfg_.ss_number_layers;
+ number_temporal_layers_ = cfg_.ts_number_layers;
+ ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+ 0, 400);
cfg_.rc_target_bitrate = 800;
ResetModel();
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
- cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+ cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+ layer_target_avg_bandwidth_, bits_in_buffer_model_);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78)
- << " The datarate for the file exceeds the target by too much!";
- ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22)
- << " The datarate for the file is lower than the target by too much!";
+ CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+ number_temporal_layers_, file_datarate_, 0.78, 1.15);
#if CONFIG_VP9_DECODER
// Number of temporal layers > 1, so half of the frames in this SVC pattern
// will be non-reference frames and hence encoder will avoid loopfilter.
- // Since frame dropper is off, we can expcet 150 (half of the sequence)
+ // Since frame dropper is off, we can expect 200 (half of the sequence)
// mismatched frames.
- EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames());
+ EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames());
#endif
}
@@ -1507,20 +1746,23 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TLSmallKf) {
svc_params_.scaling_factor_num[2] = 288;
svc_params_.scaling_factor_den[2] = 288;
cfg_.rc_dropframe_thresh = 10;
- ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
cfg_.rc_target_bitrate = 800;
+ number_spatial_layers_ = cfg_.ss_number_layers;
+ number_temporal_layers_ = cfg_.ts_number_layers;
+ ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+ 0, 400);
// For this 3 temporal layer case, pattern repeats every 4 frames, so choose
// 4 neighboring key frame periods (so the key frame will land on 0-2-1-2).
for (int j = 32; j <= 35; j++) {
cfg_.kf_max_dist = j;
ResetModel();
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
- cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+ cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+ layer_target_avg_bandwidth_, bits_in_buffer_model_);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.80)
- << " The datarate for the file exceeds the target by too much!";
- ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.30)
- << " The datarate for the file is lower than the target by too much!";
+ CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+ number_temporal_layers_, file_datarate_, 0.78,
+ 1.15);
}
}
@@ -1550,22 +1792,23 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL4threads) {
svc_params_.scaling_factor_den[2] = 288;
cfg_.rc_dropframe_thresh = 0;
cfg_.kf_max_dist = 9999;
- ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
+ number_spatial_layers_ = cfg_.ss_number_layers;
+ number_temporal_layers_ = cfg_.ts_number_layers;
+ ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
cfg_.rc_target_bitrate = 800;
ResetModel();
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
- cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+ cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+ layer_target_avg_bandwidth_, bits_in_buffer_model_);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78)
- << " The datarate for the file exceeds the target by too much!";
- ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22)
- << " The datarate for the file is lower than the target by too much!";
+ CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+ number_temporal_layers_, file_datarate_, 0.78, 1.15);
#if CONFIG_VP9_DECODER
// Number of temporal layers > 1, so half of the frames in this SVC pattern
// will be non-reference frames and hence encoder will avoid loopfilter.
- // Since frame dropper is off, we can expcet 150 (half of the sequence)
+ // Since frame dropper is off, we can expect 30 (half of the sequence)
// mismatched frames.
- EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames());
+ EXPECT_EQ(static_cast<unsigned int>(30), GetMismatchFrames());
#endif
}
@@ -1597,9 +1840,19 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TL5x5MultipleRuns) {
cfg_.layer_target_bitrate[0] = 300;
cfg_.layer_target_bitrate[1] = 1400;
cfg_.rc_target_bitrate = 1700;
- ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
+ number_spatial_layers_ = cfg_.ss_number_layers;
+ number_temporal_layers_ = cfg_.ts_number_layers;
ResetModel();
+ layer_target_avg_bandwidth_[0] = cfg_.layer_target_bitrate[0] * 1000 / 30;
+ bits_in_buffer_model_[0] =
+ cfg_.layer_target_bitrate[0] * cfg_.rc_buf_initial_sz;
+ layer_target_avg_bandwidth_[1] = cfg_.layer_target_bitrate[1] * 1000 / 30;
+ bits_in_buffer_model_[1] =
+ cfg_.layer_target_bitrate[1] * cfg_.rc_buf_initial_sz;
+ ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+ number_temporal_layers_, file_datarate_, 0.78, 1.15);
EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
}
diff --git a/libvpx/test/dct16x16_test.cc b/libvpx/test/dct16x16_test.cc
index 6ea77fde2..ce0bd37b3 100644
--- a/libvpx/test/dct16x16_test.cc
+++ b/libvpx/test/dct16x16_test.cc
@@ -542,8 +542,8 @@ class Trans16x16TestBase {
const uint32_t diff = dst[j] - src[j];
#endif // CONFIG_VP9_HIGHBITDEPTH
const uint32_t error = diff * diff;
- EXPECT_GE(1u, error) << "Error: 16x16 IDCT has error " << error
- << " at index " << j;
+ EXPECT_GE(1u, error)
+ << "Error: 16x16 IDCT has error " << error << " at index " << j;
}
}
}
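
(The bound EXPECT_GE(1u, error) with error = diff * diff allows each reconstructed pixel to differ from the source by at most 1 after the forward/inverse round trip, since diff^2 <= 1 implies |diff| <= 1.)
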
@@ -744,66 +744,6 @@ TEST_P(InvTrans16x16DCT, CompareReference) {
CompareInvReference(ref_txfm_, thresh_);
}
-class PartialTrans16x16Test : public ::testing::TestWithParam<
- std::tr1::tuple<FdctFunc, vpx_bit_depth_t> > {
- public:
- virtual ~PartialTrans16x16Test() {}
- virtual void SetUp() {
- fwd_txfm_ = GET_PARAM(0);
- bit_depth_ = GET_PARAM(1);
- }
-
- virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
- vpx_bit_depth_t bit_depth_;
- FdctFunc fwd_txfm_;
-};
-
-TEST_P(PartialTrans16x16Test, Extremes) {
-#if CONFIG_VP9_HIGHBITDEPTH
- const int16_t maxval =
- static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
-#else
- const int16_t maxval = 255;
-#endif
- const int minval = -maxval;
- DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
-
- for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval;
- output[0] = 0;
- ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
- EXPECT_EQ((maxval * kNumCoeffs) >> 1, output[0]);
-
- for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval;
- output[0] = 0;
- ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
- EXPECT_EQ((minval * kNumCoeffs) >> 1, output[0]);
-}
-
-TEST_P(PartialTrans16x16Test, Random) {
-#if CONFIG_VP9_HIGHBITDEPTH
- const int16_t maxval =
- static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
-#else
- const int16_t maxval = 255;
-#endif
- DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
- ACMRandom rnd(ACMRandom::DeterministicSeed());
-
- int sum = 0;
- for (int i = 0; i < kNumCoeffs; ++i) {
- const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1);
- input[i] = val;
- sum += val;
- }
- output[0] = 0;
- ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
- EXPECT_EQ(sum >> 1, output[0]);
-}
-
using std::tr1::make_tuple;
#if CONFIG_VP9_HIGHBITDEPTH
@@ -836,11 +776,6 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
- C, PartialTrans16x16Test,
- ::testing::Values(make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_8),
- make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_10),
- make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_12)));
#else
INSTANTIATE_TEST_CASE_P(
C, Trans16x16HT,
@@ -849,17 +784,14 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(C, PartialTrans16x16Test,
- ::testing::Values(make_tuple(&vpx_fdct16x16_1_c,
- VPX_BITS_8)));
#endif // CONFIG_VP9_HIGHBITDEPTH
-#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
NEON, Trans16x16DCT,
- ::testing::Values(make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_neon,
- 0, VPX_BITS_8)));
-#endif
+ ::testing::Values(make_tuple(&vpx_fdct16x16_neon,
+ &vpx_idct16x16_256_add_neon, 0, VPX_BITS_8)));
+#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE
#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
@@ -876,9 +808,6 @@ INSTANTIATE_TEST_CASE_P(
2, VPX_BITS_8),
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2,
3, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test,
- ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2,
- VPX_BITS_8)));
#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -913,9 +842,6 @@ INSTANTIATE_TEST_CASE_P(
&idct16x16_10_add_12_sse2, 3167, VPX_BITS_12),
make_tuple(&idct16x16_12, &idct16x16_256_add_12_sse2,
3167, VPX_BITS_12)));
-INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test,
- ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2,
- VPX_BITS_8)));
#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -931,8 +857,12 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 2, VPX_BITS_8),
make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 3,
VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(MSA, PartialTrans16x16Test,
- ::testing::Values(make_tuple(&vpx_fdct16x16_1_msa,
- VPX_BITS_8)));
#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(VSX, Trans16x16DCT,
+ ::testing::Values(make_tuple(&vpx_fdct16x16_c,
+ &vpx_idct16x16_256_add_vsx,
+ 0, VPX_BITS_8)));
+#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
} // namespace
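
The PartialTrans16x16Test cases deleted above are not lost: they are folded
into the new, size-parameterized PartialFdctTest in test/dct_partial_test.cc
added later in this patch. For reference, the invariants the removed tests
asserted (a summary of the deleted code, in C++ comment form):

    // DC-only 16x16 forward transform (kNumCoeffs == 256):
    //   Extremes: output[0] == (maxval * kNumCoeffs) >> 1
    //   Random:   output[0] == (sum of all inputs) >> 1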
diff --git a/libvpx/test/dct32x32_test.cc b/libvpx/test/dct32x32_test.cc
index d8054c4eb..a95ff9732 100644
--- a/libvpx/test/dct32x32_test.cc
+++ b/libvpx/test/dct32x32_test.cc
@@ -292,67 +292,6 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
}
}
-class PartialTrans32x32Test
- : public ::testing::TestWithParam<
- std::tr1::tuple<FwdTxfmFunc, vpx_bit_depth_t> > {
- public:
- virtual ~PartialTrans32x32Test() {}
- virtual void SetUp() {
- fwd_txfm_ = GET_PARAM(0);
- bit_depth_ = GET_PARAM(1);
- }
-
- virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
- vpx_bit_depth_t bit_depth_;
- FwdTxfmFunc fwd_txfm_;
-};
-
-TEST_P(PartialTrans32x32Test, Extremes) {
-#if CONFIG_VP9_HIGHBITDEPTH
- const int16_t maxval =
- static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
-#else
- const int16_t maxval = 255;
-#endif
- const int minval = -maxval;
- DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
-
- for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval;
- output[0] = 0;
- ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32));
- EXPECT_EQ((maxval * kNumCoeffs) >> 3, output[0]);
-
- for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval;
- output[0] = 0;
- ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32));
- EXPECT_EQ((minval * kNumCoeffs) >> 3, output[0]);
-}
-
-TEST_P(PartialTrans32x32Test, Random) {
-#if CONFIG_VP9_HIGHBITDEPTH
- const int16_t maxval =
- static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
-#else
- const int16_t maxval = 255;
-#endif
- DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
- ACMRandom rnd(ACMRandom::DeterministicSeed());
-
- int sum = 0;
- for (int i = 0; i < kNumCoeffs; ++i) {
- const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1);
- input[i] = val;
- sum += val;
- }
- output[0] = 0;
- ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32));
- EXPECT_EQ(sum >> 3, output[0]);
-}
-
using std::tr1::make_tuple;
#if CONFIG_VP9_HIGHBITDEPTH
@@ -366,11 +305,6 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 0, VPX_BITS_8),
make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c, 1,
VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
- C, PartialTrans32x32Test,
- ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_8),
- make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_10),
- make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_12)));
#else
INSTANTIATE_TEST_CASE_P(
C, Trans32x32Test,
@@ -378,19 +312,16 @@ INSTANTIATE_TEST_CASE_P(
VPX_BITS_8),
make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c,
1, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(C, PartialTrans32x32Test,
- ::testing::Values(make_tuple(&vpx_fdct32x32_1_c,
- VPX_BITS_8)));
#endif // CONFIG_VP9_HIGHBITDEPTH
-#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
NEON, Trans32x32Test,
- ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_neon,
- 0, VPX_BITS_8),
- make_tuple(&vpx_fdct32x32_rd_c,
+ ::testing::Values(make_tuple(&vpx_fdct32x32_neon,
+ &vpx_idct32x32_1024_add_neon, 0, VPX_BITS_8),
+ make_tuple(&vpx_fdct32x32_rd_neon,
&vpx_idct32x32_1024_add_neon, 1, VPX_BITS_8)));
-#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE
#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
@@ -399,9 +330,6 @@ INSTANTIATE_TEST_CASE_P(
&vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
make_tuple(&vpx_fdct32x32_rd_sse2,
&vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test,
- ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2,
- VPX_BITS_8)));
#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -418,9 +346,6 @@ INSTANTIATE_TEST_CASE_P(
VPX_BITS_8),
make_tuple(&vpx_fdct32x32_rd_sse2, &vpx_idct32x32_1024_add_c, 1,
VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test,
- ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2,
- VPX_BITS_8)));
#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -439,8 +364,14 @@ INSTANTIATE_TEST_CASE_P(
&vpx_idct32x32_1024_add_msa, 0, VPX_BITS_8),
make_tuple(&vpx_fdct32x32_rd_msa,
&vpx_idct32x32_1024_add_msa, 1, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(MSA, PartialTrans32x32Test,
- ::testing::Values(make_tuple(&vpx_fdct32x32_1_msa,
- VPX_BITS_8)));
#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+ VSX, Trans32x32Test,
+ ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_vsx,
+ 0, VPX_BITS_8),
+ make_tuple(&vpx_fdct32x32_rd_c,
+ &vpx_idct32x32_1024_add_vsx, 1, VPX_BITS_8)));
+#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
} // namespace
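
As in the 16x16 file, the PartialTrans32x32Test deleted above (DC == block
sum >> 3) is superseded by the new test/dct_partial_test.cc below. The VSX
instantiation added here pairs the C forward transform with the VSX inverse,
so only the inverse path gets VSX coverage; a VSX forward transform, if one is
added later, would slot into the same tuple. A hedged sketch, where
vpx_fdct32x32_vsx is an assumed name and not a function this patch provides:

    #if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
    INSTANTIATE_TEST_CASE_P(
        VSX, Trans32x32Test,
        ::testing::Values(make_tuple(&vpx_fdct32x32_vsx,  // assumed name
                                     &vpx_idct32x32_1024_add_vsx, 0,
                                     VPX_BITS_8)));
    #endif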
diff --git a/libvpx/test/dct_partial_test.cc b/libvpx/test/dct_partial_test.cc
new file mode 100644
index 000000000..4d145f589
--- /dev/null
+++ b/libvpx/test/dct_partial_test.cc
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <limits>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/buffer.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+using libvpx_test::ACMRandom;
+using libvpx_test::Buffer;
+using std::tr1::tuple;
+using std::tr1::make_tuple;
+
+namespace {
+typedef void (*PartialFdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+
+typedef tuple<PartialFdctFunc, int /* size */, vpx_bit_depth_t>
+ PartialFdctParam;
+
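+// Reference DC value for the vpx_*fdct*_1 functions under test: they compute
+// only the first (DC) coefficient, as the block sum times a size-dependent
+// scale factor.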
+tran_low_t partial_fdct_ref(const Buffer<int16_t> &in, int size) {
+ int64_t sum = 0;
+ for (int y = 0; y < size; ++y) {
+ for (int x = 0; x < size; ++x) {
+ sum += in.TopLeftPixel()[y * in.stride() + x];
+ }
+ }
+
+ switch (size) {
+ case 4: sum *= 2; break;
+    case 8: break;  // no scaling for 8x8
+ case 16: sum >>= 1; break;
+ case 32: sum >>= 3; break;
+ }
+
+ return static_cast<tran_low_t>(sum);
+}
+
+class PartialFdctTest : public ::testing::TestWithParam<PartialFdctParam> {
+ public:
+ PartialFdctTest() {
+ fwd_txfm_ = GET_PARAM(0);
+ size_ = GET_PARAM(1);
+ bit_depth_ = GET_PARAM(2);
+ }
+
+ virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+ void RunTest() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int16_t maxvalue =
+ clip_pixel_highbd(std::numeric_limits<int16_t>::max(), bit_depth_);
+ const int16_t minvalue = -maxvalue;
+ Buffer<int16_t> input_block =
+ Buffer<int16_t>(size_, size_, 8, size_ == 4 ? 0 : 16);
+ ASSERT_TRUE(input_block.Init());
+ Buffer<tran_low_t> output_block = Buffer<tran_low_t>(size_, size_, 0, 16);
+ ASSERT_TRUE(output_block.Init());
+
+ for (int i = 0; i < 100; ++i) {
+ if (i == 0) {
+ input_block.Set(maxvalue);
+ } else if (i == 1) {
+ input_block.Set(minvalue);
+ } else {
+ input_block.Set(&rnd, minvalue, maxvalue);
+ }
+
+ ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_block.TopLeftPixel(),
+ output_block.TopLeftPixel(),
+ input_block.stride()));
+
+ EXPECT_EQ(partial_fdct_ref(input_block, size_),
+ output_block.TopLeftPixel()[0]);
+ }
+ }
+
+ PartialFdctFunc fwd_txfm_;
+ vpx_bit_depth_t bit_depth_;
+ int size_;
+};
+
+TEST_P(PartialFdctTest, PartialFdctTest) { RunTest(); }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+ C, PartialFdctTest,
+ ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_c, 32, VPX_BITS_12),
+ make_tuple(&vpx_highbd_fdct32x32_1_c, 32, VPX_BITS_10),
+ make_tuple(&vpx_fdct32x32_1_c, 32, VPX_BITS_8),
+ make_tuple(&vpx_highbd_fdct16x16_1_c, 16, VPX_BITS_12),
+ make_tuple(&vpx_highbd_fdct16x16_1_c, 16, VPX_BITS_10),
+ make_tuple(&vpx_fdct16x16_1_c, 16, VPX_BITS_8),
+ make_tuple(&vpx_highbd_fdct8x8_1_c, 8, VPX_BITS_12),
+ make_tuple(&vpx_highbd_fdct8x8_1_c, 8, VPX_BITS_10),
+ make_tuple(&vpx_fdct8x8_1_c, 8, VPX_BITS_8),
+ make_tuple(&vpx_fdct4x4_1_c, 4, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+ C, PartialFdctTest,
+ ::testing::Values(make_tuple(&vpx_fdct32x32_1_c, 32, VPX_BITS_8),
+ make_tuple(&vpx_fdct16x16_1_c, 16, VPX_BITS_8),
+ make_tuple(&vpx_fdct8x8_1_c, 8, VPX_BITS_8),
+ make_tuple(&vpx_fdct4x4_1_c, 4, VPX_BITS_8)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, PartialFdctTest,
+ ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2, 32, VPX_BITS_8),
+ make_tuple(&vpx_fdct16x16_1_sse2, 16, VPX_BITS_8),
+ make_tuple(&vpx_fdct8x8_1_sse2, 8, VPX_BITS_8),
+ make_tuple(&vpx_fdct4x4_1_sse2, 4, VPX_BITS_8)));
+#endif // HAVE_SSE2
+
+#if HAVE_NEON
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+ NEON, PartialFdctTest,
+ ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8),
+ make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8),
+ make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_12),
+ make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_10),
+ make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8),
+ make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+ NEON, PartialFdctTest,
+ ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8),
+ make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8),
+ make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8),
+ make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_NEON
+
+#if HAVE_MSA
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(MSA, PartialFdctTest,
+ ::testing::Values(make_tuple(&vpx_fdct8x8_1_msa, 8,
+ VPX_BITS_8)));
+#else // !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+ MSA, PartialFdctTest,
+ ::testing::Values(make_tuple(&vpx_fdct32x32_1_msa, 32, VPX_BITS_8),
+ make_tuple(&vpx_fdct16x16_1_msa, 16, VPX_BITS_8),
+ make_tuple(&vpx_fdct8x8_1_msa, 8, VPX_BITS_8)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_MSA
+} // namespace
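
A worked example of the reference scaling in partial_fdct_ref() above,
assuming an all-255 8-bit input block (a standalone sketch, not part of the
patch):

    #include <cstdint>

    int main() {
      const int32_t sum4 = 255 * 4 * 4;    // 4080: sum of a 4x4 block
      const int32_t sum32 = 255 * 32 * 32; // 261120: sum of a 32x32 block
      const int32_t dc4 = sum4 * 2;        // 8160: 4x4 scales the sum up by 2
      const int32_t dc32 = sum32 >> 3;     // 32640: 32x32 scales it down by 8
      return (dc4 == 8160 && dc32 == 32640) ? 0 : 1;
    }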
diff --git a/libvpx/test/dct_test.cc b/libvpx/test/dct_test.cc
new file mode 100644
index 000000000..addbdfb46
--- /dev/null
+++ b/libvpx/test/dct_test.cc
@@ -0,0 +1,737 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/buffer.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+using libvpx_test::ACMRandom;
+using libvpx_test::Buffer;
+using std::tr1::tuple;
+using std::tr1::make_tuple;
+
+namespace {
+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
+ int tx_type);
+typedef void (*FhtFuncRef)(const Buffer<int16_t> &in, Buffer<tran_low_t> *out,
+ int size, int tx_type);
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type);
+
+/* forward transform, inverse transform, size, transform type, bit depth */
+typedef tuple<FdctFunc, IdctFunc, int, int, vpx_bit_depth_t> DctParam;
+typedef tuple<FhtFunc, IhtFunc, int, int, vpx_bit_depth_t> HtParam;
+
+void fdct_ref(const Buffer<int16_t> &in, Buffer<tran_low_t> *out, int size,
+ int /*tx_type*/) {
+ const int16_t *i = in.TopLeftPixel();
+ const int i_stride = in.stride();
+ tran_low_t *o = out->TopLeftPixel();
+ if (size == 4) {
+ vpx_fdct4x4_c(i, o, i_stride);
+ } else if (size == 8) {
+ vpx_fdct8x8_c(i, o, i_stride);
+ } else if (size == 16) {
+ vpx_fdct16x16_c(i, o, i_stride);
+ } else if (size == 32) {
+ vpx_fdct32x32_c(i, o, i_stride);
+ }
+}
+
+void fht_ref(const Buffer<int16_t> &in, Buffer<tran_low_t> *out, int size,
+ int tx_type) {
+ const int16_t *i = in.TopLeftPixel();
+ const int i_stride = in.stride();
+ tran_low_t *o = out->TopLeftPixel();
+ if (size == 4) {
+ vp9_fht4x4_c(i, o, i_stride, tx_type);
+ } else if (size == 8) {
+ vp9_fht8x8_c(i, o, i_stride, tx_type);
+ } else if (size == 16) {
+ vp9_fht16x16_c(i, o, i_stride, tx_type);
+ }
+}
+
+void fwht_ref(const Buffer<int16_t> &in, Buffer<tran_low_t> *out, int size,
+ int /*tx_type*/) {
+ ASSERT_EQ(size, 4);
+ vp9_fwht4x4_c(in.TopLeftPixel(), out->TopLeftPixel(), in.stride());
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
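+// Adapt the high bit depth inverse transforms to the 8-bit IdctFunc/IhtFunc
+// signatures: each generated wrapper bakes in a bit depth and converts the
+// destination pointer with CAST_TO_SHORTPTR(). For example, idctNxN(4, 16, 10)
+// defines idct4x4_10(), which calls vpx_highbd_idct4x4_16_add_c(..., 10).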
+#define idctNxN(n, coeffs, bitdepth) \
+ void idct##n##x##n##_##bitdepth(const tran_low_t *in, uint8_t *out, \
+ int stride) { \
+ vpx_highbd_idct##n##x##n##_##coeffs##_add_c(in, CAST_TO_SHORTPTR(out), \
+ stride, bitdepth); \
+ }
+
+idctNxN(4, 16, 10);
+idctNxN(4, 16, 12);
+idctNxN(8, 64, 10);
+idctNxN(8, 64, 12);
+idctNxN(16, 256, 10);
+idctNxN(16, 256, 12);
+idctNxN(32, 1024, 10);
+idctNxN(32, 1024, 12);
+
+#define ihtNxN(n, coeffs, bitdepth) \
+ void iht##n##x##n##_##bitdepth(const tran_low_t *in, uint8_t *out, \
+ int stride, int tx_type) { \
+ vp9_highbd_iht##n##x##n##_##coeffs##_add_c(in, CAST_TO_SHORTPTR(out), \
+ stride, tx_type, bitdepth); \
+ }
+
+ihtNxN(4, 16, 10);
+ihtNxN(4, 16, 12);
+ihtNxN(8, 64, 10);
+ihtNxN(8, 64, 12);
+ihtNxN(16, 256, 10);
+// ihtNxN(16, 256, 12) is intentionally not generated: the 12-bit 16x16 tuples
+// fail AccuracyCheck, so iht16x16_12 would be unused (see the TODO above the
+// TransHT instantiations).
+
+void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
+ vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
+}
+
+void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
+ vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+class TransTestBase {
+ public:
+ virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+ virtual void RunFwdTxfm(const Buffer<int16_t> &in,
+ Buffer<tran_low_t> *out) = 0;
+
+ virtual void RunInvTxfm(const Buffer<tran_low_t> &in, uint8_t *out) = 0;
+
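+  // Round trip: forward transform a random residual (src - dst) and invert it
+  // back onto dst with the functions under test; bound the worst and average
+  // per-pixel squared error by |limit|.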
+ void RunAccuracyCheck(int limit) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ Buffer<int16_t> test_input_block =
+ Buffer<int16_t>(size_, size_, 8, size_ == 4 ? 0 : 16);
+ ASSERT_TRUE(test_input_block.Init());
+ Buffer<tran_low_t> test_temp_block =
+ Buffer<tran_low_t>(size_, size_, 0, 16);
+ ASSERT_TRUE(test_temp_block.Init());
+ Buffer<uint8_t> dst = Buffer<uint8_t>(size_, size_, 0, 16);
+ ASSERT_TRUE(dst.Init());
+ Buffer<uint8_t> src = Buffer<uint8_t>(size_, size_, 0, 16);
+ ASSERT_TRUE(src.Init());
+#if CONFIG_VP9_HIGHBITDEPTH
+ Buffer<uint16_t> dst16 = Buffer<uint16_t>(size_, size_, 0, 16);
+ ASSERT_TRUE(dst16.Init());
+ Buffer<uint16_t> src16 = Buffer<uint16_t>(size_, size_, 0, 16);
+ ASSERT_TRUE(src16.Init());
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ uint32_t max_error = 0;
+ int64_t total_error = 0;
+ const int count_test_block = 10000;
+ for (int i = 0; i < count_test_block; ++i) {
+ if (bit_depth_ == 8) {
+ src.Set(&rnd, &ACMRandom::Rand8);
+ dst.Set(&rnd, &ACMRandom::Rand8);
+ // Initialize a test block with input range [-255, 255].
+ for (int h = 0; h < size_; ++h) {
+ for (int w = 0; w < size_; ++w) {
+ test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] =
+ src.TopLeftPixel()[h * src.stride() + w] -
+ dst.TopLeftPixel()[h * dst.stride() + w];
+ }
+ }
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ src16.Set(&rnd, 0, max_pixel_value_);
+ dst16.Set(&rnd, 0, max_pixel_value_);
+ for (int h = 0; h < size_; ++h) {
+ for (int w = 0; w < size_; ++w) {
+ test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] =
+ src16.TopLeftPixel()[h * src16.stride() + w] -
+ dst16.TopLeftPixel()[h * dst16.stride() + w];
+ }
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+
+ ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block, &test_temp_block));
+ if (bit_depth_ == VPX_BITS_8) {
+ ASM_REGISTER_STATE_CHECK(
+ RunInvTxfm(test_temp_block, dst.TopLeftPixel()));
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ ASM_REGISTER_STATE_CHECK(
+ RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16.TopLeftPixel())));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+
+ for (int h = 0; h < size_; ++h) {
+ for (int w = 0; w < size_; ++w) {
+ int diff;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (bit_depth_ != 8) {
+ diff = dst16.TopLeftPixel()[h * dst16.stride() + w] -
+ src16.TopLeftPixel()[h * src16.stride() + w];
+ } else {
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ diff = dst.TopLeftPixel()[h * dst.stride() + w] -
+ src.TopLeftPixel()[h * src.stride() + w];
+#if CONFIG_VP9_HIGHBITDEPTH
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ const uint32_t error = diff * diff;
+ if (max_error < error) max_error = error;
+ total_error += error;
+ }
+ }
+ }
+
+    EXPECT_GE(static_cast<uint32_t>(limit), max_error)
+        << "Error: " << size_ << "x" << size_
+        << " transform has an individual round trip error > " << limit;
+
+    EXPECT_GE(count_test_block * limit, total_error)
+        << "Error: " << size_ << "x" << size_
+        << " transform has average round trip error > " << limit
+        << " per block";
+ }
+
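+  // Bit-exactness: the forward transform under test must match the C
+  // reference (fwd_txfm_ref) exactly for random inputs.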
+ void RunCoeffCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 5000;
+ Buffer<int16_t> input_block =
+ Buffer<int16_t>(size_, size_, 8, size_ == 4 ? 0 : 16);
+ ASSERT_TRUE(input_block.Init());
+ Buffer<tran_low_t> output_ref_block = Buffer<tran_low_t>(size_, size_, 0);
+ ASSERT_TRUE(output_ref_block.Init());
+ Buffer<tran_low_t> output_block = Buffer<tran_low_t>(size_, size_, 0, 16);
+ ASSERT_TRUE(output_block.Init());
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-max_pixel_value_,
+ // max_pixel_value_].
+ input_block.Set(&rnd, -max_pixel_value_, max_pixel_value_);
+
+ fwd_txfm_ref(input_block, &output_ref_block, size_, tx_type_);
+ ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, &output_block));
+
+ // The minimum quant value is 4.
+ EXPECT_TRUE(output_block.CheckValues(output_ref_block));
+ if (::testing::Test::HasFailure()) {
+ printf("Size: %d Transform type: %d\n", size_, tx_type_);
+ output_block.PrintDifference(output_ref_block);
+ return;
+ }
+ }
+ }
+
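+  // Overflow check: feed the forward transform extreme (+/-max_pixel_value_)
+  // inputs; it must still match the reference and keep every coefficient
+  // within 4 * DCT_MAX_VALUE.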
+ void RunMemCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 5000;
+ Buffer<int16_t> input_extreme_block =
+ Buffer<int16_t>(size_, size_, 8, size_ == 4 ? 0 : 16);
+ ASSERT_TRUE(input_extreme_block.Init());
+ Buffer<tran_low_t> output_ref_block = Buffer<tran_low_t>(size_, size_, 0);
+ ASSERT_TRUE(output_ref_block.Init());
+ Buffer<tran_low_t> output_block = Buffer<tran_low_t>(size_, size_, 0, 16);
+ ASSERT_TRUE(output_block.Init());
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with -max_pixel_value_ or max_pixel_value_.
+ if (i == 0) {
+ input_extreme_block.Set(max_pixel_value_);
+ } else if (i == 1) {
+ input_extreme_block.Set(-max_pixel_value_);
+ } else {
+ for (int h = 0; h < size_; ++h) {
+ for (int w = 0; w < size_; ++w) {
+ input_extreme_block
+ .TopLeftPixel()[h * input_extreme_block.stride() + w] =
+ rnd.Rand8() % 2 ? max_pixel_value_ : -max_pixel_value_;
+ }
+ }
+ }
+
+ fwd_txfm_ref(input_extreme_block, &output_ref_block, size_, tx_type_);
+ ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block, &output_block));
+
+ // The minimum quant value is 4.
+ EXPECT_TRUE(output_block.CheckValues(output_ref_block));
+ for (int h = 0; h < size_; ++h) {
+ for (int w = 0; w < size_; ++w) {
+ EXPECT_GE(
+ 4 * DCT_MAX_VALUE << (bit_depth_ - 8),
+ abs(output_block.TopLeftPixel()[h * output_block.stride() + w]))
+ << "Error: 4x4 FDCT has coefficient larger than "
+ "4*DCT_MAX_VALUE"
+ << " at " << w << "," << h;
+ if (::testing::Test::HasFailure()) {
+ printf("Size: %d Transform type: %d\n", size_, tx_type_);
+ output_block.DumpBuffer();
+ return;
+ }
+ }
+ }
+ }
+ }
+
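+  // Inverse accuracy: forward transform with the C reference, invert with
+  // the function under test, and bound each pixel's squared error by |limit|.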
+ void RunInvAccuracyCheck(int limit) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 1000;
+ Buffer<int16_t> in = Buffer<int16_t>(size_, size_, 4);
+ ASSERT_TRUE(in.Init());
+ Buffer<tran_low_t> coeff = Buffer<tran_low_t>(size_, size_, 0, 16);
+ ASSERT_TRUE(coeff.Init());
+ Buffer<uint8_t> dst = Buffer<uint8_t>(size_, size_, 0, 16);
+ ASSERT_TRUE(dst.Init());
+ Buffer<uint8_t> src = Buffer<uint8_t>(size_, size_, 0);
+ ASSERT_TRUE(src.Init());
+ Buffer<uint16_t> dst16 = Buffer<uint16_t>(size_, size_, 0, 16);
+ ASSERT_TRUE(dst16.Init());
+ Buffer<uint16_t> src16 = Buffer<uint16_t>(size_, size_, 0);
+ ASSERT_TRUE(src16.Init());
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-max_pixel_value_,
+ // max_pixel_value_].
+ if (bit_depth_ == VPX_BITS_8) {
+ src.Set(&rnd, &ACMRandom::Rand8);
+ dst.Set(&rnd, &ACMRandom::Rand8);
+ for (int h = 0; h < size_; ++h) {
+ for (int w = 0; w < size_; ++w) {
+ in.TopLeftPixel()[h * in.stride() + w] =
+ src.TopLeftPixel()[h * src.stride() + w] -
+ dst.TopLeftPixel()[h * dst.stride() + w];
+ }
+ }
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ src16.Set(&rnd, 0, max_pixel_value_);
+ dst16.Set(&rnd, 0, max_pixel_value_);
+ for (int h = 0; h < size_; ++h) {
+ for (int w = 0; w < size_; ++w) {
+ in.TopLeftPixel()[h * in.stride() + w] =
+ src16.TopLeftPixel()[h * src16.stride() + w] -
+ dst16.TopLeftPixel()[h * dst16.stride() + w];
+ }
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+
+ fwd_txfm_ref(in, &coeff, size_, tx_type_);
+
+ if (bit_depth_ == VPX_BITS_8) {
+ ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst.TopLeftPixel()));
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ ASM_REGISTER_STATE_CHECK(
+ RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16.TopLeftPixel())));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+
+ for (int h = 0; h < size_; ++h) {
+ for (int w = 0; w < size_; ++w) {
+ int diff;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (bit_depth_ != 8) {
+ diff = dst16.TopLeftPixel()[h * dst16.stride() + w] -
+ src16.TopLeftPixel()[h * src16.stride() + w];
+ } else {
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ diff = dst.TopLeftPixel()[h * dst.stride() + w] -
+ src.TopLeftPixel()[h * src.stride() + w];
+#if CONFIG_VP9_HIGHBITDEPTH
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ const uint32_t error = diff * diff;
+ EXPECT_GE(static_cast<uint32_t>(limit), error)
+ << "Error: " << size_ << "x" << size_ << " IDCT has error "
+ << error << " at " << w << "," << h;
+ }
+ }
+ }
+ }
+
+ FhtFuncRef fwd_txfm_ref;
+ vpx_bit_depth_t bit_depth_;
+ int tx_type_;
+ int max_pixel_value_;
+ int size_;
+};
+
+class TransDCT : public TransTestBase,
+ public ::testing::TestWithParam<DctParam> {
+ public:
+ TransDCT() {
+ fwd_txfm_ref = fdct_ref;
+ fwd_txfm_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ size_ = GET_PARAM(2);
+ tx_type_ = GET_PARAM(3);
+ bit_depth_ = GET_PARAM(4);
+ max_pixel_value_ = (1 << bit_depth_) - 1;
+ }
+
+ protected:
+ void RunFwdTxfm(const Buffer<int16_t> &in, Buffer<tran_low_t> *out) {
+ fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride());
+ }
+
+ void RunInvTxfm(const Buffer<tran_low_t> &in, uint8_t *out) {
+ inv_txfm_(in.TopLeftPixel(), out, in.stride());
+ }
+
+ FdctFunc fwd_txfm_;
+ IdctFunc inv_txfm_;
+};
+
+TEST_P(TransDCT, AccuracyCheck) { RunAccuracyCheck(1); }
+
+TEST_P(TransDCT, CoeffCheck) { RunCoeffCheck(); }
+
+TEST_P(TransDCT, MemCheck) { RunMemCheck(); }
+
+TEST_P(TransDCT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+ C, TransDCT,
+ ::testing::Values(
+ make_tuple(&vpx_highbd_fdct32x32_c, &idct32x32_10, 32, 0, VPX_BITS_10),
+ make_tuple(&vpx_highbd_fdct32x32_c, &idct32x32_12, 32, 0, VPX_BITS_10),
+ make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 32, 0,
+ VPX_BITS_8),
+ make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_10, 16, 0, VPX_BITS_10),
+ make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_12, 16, 0, VPX_BITS_10),
+ make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 16, 0,
+ VPX_BITS_8),
+ make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_10, 8, 0, VPX_BITS_10),
+ make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_12, 8, 0, VPX_BITS_10),
+ make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 8, 0, VPX_BITS_8),
+ make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10, 4, 0, VPX_BITS_10),
+ make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12, 4, 0, VPX_BITS_12),
+ make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 4, 0, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+ C, TransDCT,
+ ::testing::Values(
+ make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 32, 0,
+ VPX_BITS_8),
+ make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 16, 0,
+ VPX_BITS_8),
+ make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 8, 0, VPX_BITS_8),
+ make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 4, 0, VPX_BITS_8)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_SSE2
+#if !CONFIG_EMULATE_HARDWARE
+#if CONFIG_VP9_HIGHBITDEPTH
+/* TODO(johannkoenig): Determine why these fail AccuracyCheck
+ make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_12, 32, 0, VPX_BITS_12),
+ make_tuple(&vpx_highbd_fdct16x16_sse2, &idct16x16_12, 16, 0, VPX_BITS_12),
+*/
+INSTANTIATE_TEST_CASE_P(
+ SSE2, TransDCT,
+ ::testing::Values(
+ make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_10, 32, 0,
+ VPX_BITS_10),
+ make_tuple(&vpx_fdct32x32_sse2, &vpx_idct32x32_1024_add_sse2, 32, 0,
+ VPX_BITS_8),
+ make_tuple(&vpx_highbd_fdct16x16_sse2, &idct16x16_10, 16, 0,
+ VPX_BITS_10),
+ make_tuple(&vpx_fdct16x16_sse2, &vpx_idct16x16_256_add_sse2, 16, 0,
+ VPX_BITS_8),
+ make_tuple(&vpx_highbd_fdct8x8_sse2, &idct8x8_10, 8, 0, VPX_BITS_10),
+ make_tuple(&vpx_highbd_fdct8x8_sse2, &idct8x8_12, 8, 0, VPX_BITS_12),
+ make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_sse2, 8, 0,
+ VPX_BITS_8),
+ make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_10, 4, 0, VPX_BITS_10),
+ make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_12, 4, 0, VPX_BITS_12),
+ make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_sse2, 4, 0,
+ VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+ SSE2, TransDCT,
+ ::testing::Values(make_tuple(&vpx_fdct32x32_sse2,
+ &vpx_idct32x32_1024_add_sse2, 32, 0,
+ VPX_BITS_8),
+ make_tuple(&vpx_fdct16x16_sse2,
+ &vpx_idct16x16_256_add_sse2, 16, 0,
+ VPX_BITS_8),
+ make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_sse2, 8,
+ 0, VPX_BITS_8),
+ make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_sse2, 4,
+ 0, VPX_BITS_8)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // !CONFIG_EMULATE_HARDWARE
+#endif // HAVE_SSE2
+
+#if !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_SSSE3 && !CONFIG_EMULATE_HARDWARE
+#if !ARCH_X86_64
+// TODO(johannkoenig): high bit depth fdct8x8.
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, TransDCT,
+ ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_sse2,
+ 32, 0, VPX_BITS_8),
+ make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_sse2, 8, 0,
+ VPX_BITS_8)));
+#else
+// vpx_fdct8x8_ssse3 is only available in 64-bit builds.
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, TransDCT,
+ ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_sse2,
+ 32, 0, VPX_BITS_8),
+ make_tuple(&vpx_fdct8x8_ssse3, &vpx_idct8x8_64_add_sse2,
+ 8, 0, VPX_BITS_8)));
+#endif // !ARCH_X86_64
+#endif // HAVE_SSSE3 && !CONFIG_EMULATE_HARDWARE
+#endif // !CONFIG_VP9_HIGHBITDEPTH
+
+#if !CONFIG_VP9_HIGHBITDEPTH && HAVE_AVX2 && !CONFIG_EMULATE_HARDWARE
+// TODO(johannkoenig): high bit depth fdct32x32.
+INSTANTIATE_TEST_CASE_P(
+ AVX2, TransDCT, ::testing::Values(make_tuple(&vpx_fdct32x32_avx2,
+ &vpx_idct32x32_1024_add_sse2,
+ 32, 0, VPX_BITS_8)));
+
+#endif // !CONFIG_VP9_HIGHBITDEPTH && HAVE_AVX2 && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_NEON
+#if !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+ NEON, TransDCT,
+ ::testing::Values(make_tuple(&vpx_fdct32x32_neon,
+ &vpx_idct32x32_1024_add_neon, 32, 0,
+ VPX_BITS_8),
+ make_tuple(&vpx_fdct16x16_neon,
+ &vpx_idct16x16_256_add_neon, 16, 0,
+ VPX_BITS_8),
+ make_tuple(&vpx_fdct8x8_neon, &vpx_idct8x8_64_add_neon, 8,
+ 0, VPX_BITS_8),
+ make_tuple(&vpx_fdct4x4_neon, &vpx_idct4x4_16_add_neon, 4,
+ 0, VPX_BITS_8)));
+#endif // !CONFIG_EMULATE_HARDWARE
+#endif // HAVE_NEON
+
+#if HAVE_MSA
+#if !CONFIG_VP9_HIGHBITDEPTH
+#if !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+ MSA, TransDCT,
+ ::testing::Values(
+ make_tuple(&vpx_fdct32x32_msa, &vpx_idct32x32_1024_add_msa, 32, 0,
+ VPX_BITS_8),
+ make_tuple(&vpx_fdct16x16_msa, &vpx_idct16x16_256_add_msa, 16, 0,
+ VPX_BITS_8),
+ make_tuple(&vpx_fdct8x8_msa, &vpx_idct8x8_64_add_msa, 8, 0, VPX_BITS_8),
+ make_tuple(&vpx_fdct4x4_msa, &vpx_idct4x4_16_add_msa, 4, 0,
+ VPX_BITS_8)));
+#endif // !CONFIG_EMULATE_HARDWARE
+#endif // !CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_MSA
+
+#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(VSX, TransDCT,
+ ::testing::Values(make_tuple(&vpx_fdct4x4_c,
+ &vpx_idct4x4_16_add_vsx, 4,
+ 0, VPX_BITS_8)));
+#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+class TransHT : public TransTestBase, public ::testing::TestWithParam<HtParam> {
+ public:
+ TransHT() {
+ fwd_txfm_ref = fht_ref;
+ fwd_txfm_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ size_ = GET_PARAM(2);
+ tx_type_ = GET_PARAM(3);
+ bit_depth_ = GET_PARAM(4);
+ max_pixel_value_ = (1 << bit_depth_) - 1;
+ }
+
+ protected:
+ void RunFwdTxfm(const Buffer<int16_t> &in, Buffer<tran_low_t> *out) {
+ fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride(), tx_type_);
+ }
+
+ void RunInvTxfm(const Buffer<tran_low_t> &in, uint8_t *out) {
+ inv_txfm_(in.TopLeftPixel(), out, in.stride(), tx_type_);
+ }
+
+ FhtFunc fwd_txfm_;
+ IhtFunc inv_txfm_;
+};
+
+TEST_P(TransHT, AccuracyCheck) { RunAccuracyCheck(1); }
+
+TEST_P(TransHT, CoeffCheck) { RunCoeffCheck(); }
+
+TEST_P(TransHT, MemCheck) { RunMemCheck(); }
+
+TEST_P(TransHT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
+
+/* TODO(johannkoenig): Determine why these fail AccuracyCheck
+ make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 0, VPX_BITS_12),
+ make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 1, VPX_BITS_12),
+ make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 2, VPX_BITS_12),
+ make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 3, VPX_BITS_12),
+ */
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+ C, TransHT,
+ ::testing::Values(
+ make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 0, VPX_BITS_10),
+ make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 1, VPX_BITS_10),
+ make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 2, VPX_BITS_10),
+ make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 3, VPX_BITS_10),
+ make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 0, VPX_BITS_8),
+ make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 1, VPX_BITS_8),
+ make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 2, VPX_BITS_8),
+ make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 3, VPX_BITS_8),
+ make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 0, VPX_BITS_10),
+ make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 1, VPX_BITS_10),
+ make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 2, VPX_BITS_10),
+ make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 3, VPX_BITS_10),
+ make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 0, VPX_BITS_12),
+ make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 1, VPX_BITS_12),
+ make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 2, VPX_BITS_12),
+ make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 3, VPX_BITS_12),
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 0, VPX_BITS_8),
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 1, VPX_BITS_8),
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 2, VPX_BITS_8),
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 3, VPX_BITS_8),
+ make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 0, VPX_BITS_10),
+ make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 1, VPX_BITS_10),
+ make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 2, VPX_BITS_10),
+ make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 3, VPX_BITS_10),
+ make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 0, VPX_BITS_12),
+ make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 1, VPX_BITS_12),
+ make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 2, VPX_BITS_12),
+ make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 3, VPX_BITS_12),
+ make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 0, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 1, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 2, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 3, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+ C, TransHT,
+ ::testing::Values(
+ make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 0, VPX_BITS_8),
+ make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 1, VPX_BITS_8),
+ make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 2, VPX_BITS_8),
+ make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 3, VPX_BITS_8),
+
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 0, VPX_BITS_8),
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 1, VPX_BITS_8),
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 2, VPX_BITS_8),
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 3, VPX_BITS_8),
+
+ make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 0, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 1, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 2, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 3, VPX_BITS_8)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, TransHT,
+ ::testing::Values(
+ make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 0,
+ VPX_BITS_8),
+ make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 1,
+ VPX_BITS_8),
+ make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 2,
+ VPX_BITS_8),
+ make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 3,
+ VPX_BITS_8),
+
+ make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 0, VPX_BITS_8),
+ make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 1, VPX_BITS_8),
+ make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 2, VPX_BITS_8),
+ make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 3, VPX_BITS_8),
+
+ make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 0, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 1, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 2, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 3,
+ VPX_BITS_8)));
+#endif // HAVE_SSE2
+
+class TransWHT : public TransTestBase,
+ public ::testing::TestWithParam<DctParam> {
+ public:
+ TransWHT() {
+ fwd_txfm_ref = fwht_ref;
+ fwd_txfm_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ size_ = GET_PARAM(2);
+ tx_type_ = GET_PARAM(3);
+ bit_depth_ = GET_PARAM(4);
+ max_pixel_value_ = (1 << bit_depth_) - 1;
+ }
+
+ protected:
+ void RunFwdTxfm(const Buffer<int16_t> &in, Buffer<tran_low_t> *out) {
+ fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride());
+ }
+
+ void RunInvTxfm(const Buffer<tran_low_t> &in, uint8_t *out) {
+ inv_txfm_(in.TopLeftPixel(), out, in.stride());
+ }
+
+ FdctFunc fwd_txfm_;
+ IdctFunc inv_txfm_;
+};
+
+TEST_P(TransWHT, AccuracyCheck) { RunAccuracyCheck(0); }
+
+TEST_P(TransWHT, CoeffCheck) { RunCoeffCheck(); }
+
+TEST_P(TransWHT, MemCheck) { RunMemCheck(); }
+
+TEST_P(TransWHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+ C, TransWHT,
+ ::testing::Values(
+ make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 4, 0, VPX_BITS_10),
+ make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 4, 0, VPX_BITS_12),
+ make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 4, 0, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(C, TransWHT,
+ ::testing::Values(make_tuple(&vp9_fwht4x4_c,
+ &vpx_iwht4x4_16_add_c, 4,
+ 0, VPX_BITS_8)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, TransWHT,
+ ::testing::Values(make_tuple(&vp9_fwht4x4_sse2,
+ &vpx_iwht4x4_16_add_sse2,
+ 4, 0, VPX_BITS_8)));
+#endif // HAVE_SSE2
+} // namespace
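
Both DctParam and HtParam follow the tuple ordering documented near the top of
the file (forward transform, inverse transform, size, transform type, bit
depth), so additional SIMD sweeps plug in without new plumbing. A hedged
sketch of what a future instantiation could look like; vp9_fht4x4_neon and
vp9_iht4x4_16_add_neon are assumed names here, not functions this patch adds:

    #if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
    INSTANTIATE_TEST_CASE_P(
        NEON, TransHT,
        ::testing::Values(make_tuple(&vp9_fht4x4_neon,  // assumed names
                                     &vp9_iht4x4_16_add_neon, 4, 0,
                                     VPX_BITS_8)));
    #endif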
diff --git a/libvpx/test/decode_test_driver.cc b/libvpx/test/decode_test_driver.cc
index b738e0db1..48680eb8e 100644
--- a/libvpx/test/decode_test_driver.cc
+++ b/libvpx/test/decode_test_driver.cc
@@ -53,13 +53,13 @@ void DecoderTest::HandlePeekResult(Decoder *const decoder,
* pass it is not a keyframe, so we only expect VPX_CODEC_OK on the first
* frame, which must be a keyframe. */
if (video->frame_number() == 0)
- ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: "
- << vpx_codec_err_to_string(res_peek);
+ ASSERT_EQ(VPX_CODEC_OK, res_peek)
+ << "Peek return failed: " << vpx_codec_err_to_string(res_peek);
} else {
/* The Vp9 implementation of PeekStream returns an error only if the
* data passed to it isn't a valid Vp9 chunk. */
- ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: "
- << vpx_codec_err_to_string(res_peek);
+ ASSERT_EQ(VPX_CODEC_OK, res_peek)
+ << "Peek return failed: " << vpx_codec_err_to_string(res_peek);
}
}
diff --git a/libvpx/test/encode_api_test.cc b/libvpx/test/encode_api_test.cc
index f685493aa..87e29b61d 100644
--- a/libvpx/test/encode_api_test.cc
+++ b/libvpx/test/encode_api_test.cc
@@ -79,4 +79,117 @@ TEST(EncodeAPI, HighBitDepthCapability) {
#endif
}
+#if CONFIG_VP8_ENCODER
+TEST(EncodeAPI, ImageSizeSetting) {
+ const int width = 711;
+ const int height = 360;
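+  // 12 bits per pixel for VPX_IMG_FMT_I420 (8 luma + 4 chroma), used to size
+  // the buffer wrapped below.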
+ const int bps = 12;
+ vpx_image_t img;
+ vpx_codec_ctx_t enc;
+ vpx_codec_enc_cfg_t cfg;
+ uint8_t *img_buf = reinterpret_cast<uint8_t *>(
+ calloc(width * height * bps / 8, sizeof(*img_buf)));
+ vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &cfg, 0);
+
+ cfg.g_w = width;
+ cfg.g_h = height;
+
+ vpx_img_wrap(&img, VPX_IMG_FMT_I420, width, height, 1, img_buf);
+
+ vpx_codec_enc_init(&enc, vpx_codec_vp8_cx(), &cfg, 0);
+
+ EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, &img, 0, 1, 0, 0));
+
+ free(img_buf);
+
+ vpx_codec_destroy(&enc);
+}
+#endif
+
+// Set up 2 spatial streams with 2 temporal layers per stream, and generate an
+// invalid configuration by setting the temporal layer rate allocation
+// (ts_target_bitrate[]) to 0 for both layers. This should fail independently
+// of CONFIG_MULTI_RES_ENCODING.
+TEST(EncodeAPI, MultiResEncode) {
+ static const vpx_codec_iface_t *kCodecs[] = {
+#if CONFIG_VP8_ENCODER
+ &vpx_codec_vp8_cx_algo,
+#endif
+#if CONFIG_VP9_ENCODER
+ &vpx_codec_vp9_cx_algo,
+#endif
+ };
+ const int width = 1280;
+ const int height = 720;
+ const int width_down = width / 2;
+ const int height_down = height / 2;
+ const int target_bitrate = 1000;
+ const int framerate = 30;
+
+ for (int c = 0; c < NELEMENTS(kCodecs); ++c) {
+ const vpx_codec_iface_t *const iface = kCodecs[c];
+ vpx_codec_ctx_t enc[2];
+ vpx_codec_enc_cfg_t cfg[2];
+ vpx_rational_t dsf[2] = { { 2, 1 }, { 2, 1 } };
+
+ memset(enc, 0, sizeof(enc));
+
+ for (int i = 0; i < 2; i++) {
+ vpx_codec_enc_config_default(iface, &cfg[i], 0);
+ }
+
+ /* Highest-resolution encoder settings */
+ cfg[0].g_w = width;
+ cfg[0].g_h = height;
+ cfg[0].rc_dropframe_thresh = 0;
+ cfg[0].rc_end_usage = VPX_CBR;
+ cfg[0].rc_resize_allowed = 0;
+ cfg[0].rc_min_quantizer = 2;
+ cfg[0].rc_max_quantizer = 56;
+ cfg[0].rc_undershoot_pct = 100;
+ cfg[0].rc_overshoot_pct = 15;
+ cfg[0].rc_buf_initial_sz = 500;
+ cfg[0].rc_buf_optimal_sz = 600;
+ cfg[0].rc_buf_sz = 1000;
+ cfg[0].g_error_resilient = 1; /* Enable error resilient mode */
+ cfg[0].g_lag_in_frames = 0;
+
+ cfg[0].kf_mode = VPX_KF_AUTO;
+ cfg[0].kf_min_dist = 3000;
+ cfg[0].kf_max_dist = 3000;
+
+ cfg[0].rc_target_bitrate = target_bitrate; /* Set target bitrate */
+ cfg[0].g_timebase.num = 1; /* Set fps */
+ cfg[0].g_timebase.den = framerate;
+
+ memcpy(&cfg[1], &cfg[0], sizeof(cfg[0]));
+ cfg[1].rc_target_bitrate = 500;
+ cfg[1].g_w = width_down;
+ cfg[1].g_h = height_down;
+
+ for (int i = 0; i < 2; i++) {
+ cfg[i].ts_number_layers = 2;
+ cfg[i].ts_periodicity = 2;
+ cfg[i].ts_rate_decimator[0] = 2;
+ cfg[i].ts_rate_decimator[1] = 1;
+ cfg[i].ts_layer_id[0] = 0;
+ cfg[i].ts_layer_id[1] = 1;
+ // Invalid parameters.
+ cfg[i].ts_target_bitrate[0] = 0;
+ cfg[i].ts_target_bitrate[1] = 0;
+ }
+
+ // VP9 should report incapable, VP8 invalid for all configurations.
+ const char kVP9Name[] = "WebM Project VP9";
+ const bool is_vp9 = strncmp(kVP9Name, vpx_codec_iface_name(iface),
+ sizeof(kVP9Name) - 1) == 0;
+ EXPECT_EQ(is_vp9 ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM,
+ vpx_codec_enc_init_multi(&enc[0], iface, &cfg[0], 2, 0, &dsf[0]));
+
+ for (int i = 0; i < 2; i++) {
+ vpx_codec_destroy(&enc[i]);
+ }
+ }
+}
+
} // namespace
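
For contrast with the invalid configuration above, a valid temporal-layer
allocation gives every layer a nonzero rate; in libvpx's temporal-layer
convention the ts_target_bitrate[] entries are cumulative, so the top layer
carries the full stream rate. The numbers below are illustrative only:

    // Sketch: 60% of the rate in the base layer, 100% with the enhancement.
    cfg[i].ts_target_bitrate[0] = 600;   // base layer alone (kbit/s)
    cfg[i].ts_target_bitrate[1] = 1000;  // base + enhancement layer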
diff --git a/libvpx/test/encode_test_driver.cc b/libvpx/test/encode_test_driver.cc
index 5d2b4008a..b2cbc3f05 100644
--- a/libvpx/test/encode_test_driver.cc
+++ b/libvpx/test/encode_test_driver.cc
@@ -201,6 +201,8 @@ void EncoderTest::RunLoop(VideoSource *video) {
PreEncodeFrameHook(video, encoder.get());
encoder->EncodeFrame(video, frame_flags_);
+ PostEncodeFrameHook(encoder.get());
+
CxDataIterator iter = encoder->GetCxData();
bool has_cxdata = false;
diff --git a/libvpx/test/encode_test_driver.h b/libvpx/test/encode_test_driver.h
index 08a57ad77..89a3b1767 100644
--- a/libvpx/test/encode_test_driver.h
+++ b/libvpx/test/encode_test_driver.h
@@ -139,6 +139,13 @@ class Encoder {
}
#endif
+#if CONFIG_VP8_ENCODER
+ void Control(int ctrl_id, vpx_roi_map_t *arg) {
+ const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+ ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+ }
+#endif
+
void Config(const vpx_codec_enc_cfg_t *cfg) {
const vpx_codec_err_t res = vpx_codec_enc_config_set(&encoder_, cfg);
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
@@ -212,6 +219,8 @@ class EncoderTest {
virtual void PreEncodeFrameHook(VideoSource * /*video*/,
Encoder * /*encoder*/) {}
+ virtual void PostEncodeFrameHook(Encoder * /*encoder*/) {}
+
// Hook to be called on every compressed data packet.
virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {}
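
The new PostEncodeFrameHook() fires after every EncodeFrame() call (see the
encode_test_driver.cc hunk above). A minimal sketch of a consumer; the class
name and body are illustrative, not part of this patch:

    class MyEncoderTest : public ::libvpx_test::EncoderTest {
     protected:
      explicit MyEncoderTest(const ::libvpx_test::CodecFactory *codec)
          : EncoderTest(codec) {}

      virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) {
        // Query per-frame encoder state here, e.g. via encoder->Control(),
        // now that the frame has been submitted.
      }
    };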
diff --git a/libvpx/test/external_frame_buffer_test.cc b/libvpx/test/external_frame_buffer_test.cc
index f9686695a..dbf297119 100644
--- a/libvpx/test/external_frame_buffer_test.cc
+++ b/libvpx/test/external_frame_buffer_test.cc
@@ -34,7 +34,8 @@ struct ExternalFrameBuffer {
// Class to manipulate a list of external frame buffers.
class ExternalFrameBufferList {
public:
- ExternalFrameBufferList() : num_buffers_(0), ext_fb_list_(NULL) {}
+ ExternalFrameBufferList()
+ : num_buffers_(0), num_used_buffers_(0), ext_fb_list_(NULL) {}
virtual ~ExternalFrameBufferList() {
for (int i = 0; i < num_buffers_; ++i) {
@@ -71,6 +72,8 @@ class ExternalFrameBufferList {
}
SetFrameBuffer(idx, fb);
+
+ num_used_buffers_++;
return 0;
}
@@ -106,6 +109,7 @@ class ExternalFrameBufferList {
}
EXPECT_EQ(1, ext_fb->in_use);
ext_fb->in_use = 0;
+ num_used_buffers_--;
return 0;
}
@@ -121,6 +125,8 @@ class ExternalFrameBufferList {
}
}
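+  // Number of buffers currently handed to the decoder and not yet released.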
+ int num_used_buffers() const { return num_used_buffers_; }
+
private:
// Returns the index of the first free frame buffer. Returns |num_buffers_|
// if there are no free frame buffers.
@@ -145,6 +151,7 @@ class ExternalFrameBufferList {
}
int num_buffers_;
+ int num_used_buffers_;
ExternalFrameBuffer *ext_fb_list_;
};
@@ -220,8 +227,8 @@ class ExternalFrameBufferMD5Test
void OpenMD5File(const std::string &md5_file_name_) {
md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_);
- ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: "
- << md5_file_name_;
+ ASSERT_TRUE(md5_file_ != NULL)
+ << "Md5 file open failed. Filename: " << md5_file_name_;
}
virtual void DecompressedFrameHook(const vpx_image_t &img,
@@ -273,6 +280,7 @@ class ExternalFrameBufferMD5Test
#if CONFIG_WEBM_IO
const char kVP9TestFile[] = "vp90-2-02-size-lf-1920x1080.webm";
+const char kVP9NonRefTestFile[] = "vp90-2-22-svc_1280x720_1.webm";
// Class for testing passing in external frame buffers to libvpx.
class ExternalFrameBufferTest : public ::testing::Test {
@@ -292,7 +300,9 @@ class ExternalFrameBufferTest : public ::testing::Test {
virtual void TearDown() {
delete decoder_;
+ decoder_ = NULL;
delete video_;
+ video_ = NULL;
}
// Passes the external frame buffer information to libvpx.
@@ -325,7 +335,7 @@ class ExternalFrameBufferTest : public ::testing::Test {
return VPX_CODEC_OK;
}
- private:
+ protected:
void CheckDecodedFrames() {
libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData();
const vpx_image_t *img = NULL;
@@ -341,6 +351,25 @@ class ExternalFrameBufferTest : public ::testing::Test {
int num_buffers_;
ExternalFrameBufferList fb_list_;
};
+
+class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest {
+ protected:
+ virtual void SetUp() {
+ video_ = new libvpx_test::WebMVideoSource(kVP9NonRefTestFile);
+ ASSERT_TRUE(video_ != NULL);
+ video_->Init();
+ video_->Begin();
+
+ vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+ decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
+ ASSERT_TRUE(decoder_ != NULL);
+ }
+
+ virtual void CheckFrameBufferRelease() {
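+    // TearDown() destroys the decoder, which must hand back every external
+    // buffer, including those that backed non-displayed (non-reference)
+    // frames.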
+ TearDown();
+ ASSERT_EQ(0, fb_list_.num_used_buffers());
+ }
+};
#endif // CONFIG_WEBM_IO
// This test runs through the set of test vectors, and decodes them.
@@ -419,6 +448,8 @@ TEST_F(ExternalFrameBufferTest, NotEnoughBuffers) {
SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer,
release_vp9_frame_buffer));
ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame());
+ // Only run this on long clips. Decoding a very short clip will return
+ // VPX_CODEC_OK even with only 2 buffers.
ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeRemainingFrames());
}
@@ -467,6 +498,15 @@ TEST_F(ExternalFrameBufferTest, SetAfterDecode) {
SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer,
release_vp9_frame_buffer));
}
+
+TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) {
+ const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(VPX_CODEC_OK,
+ SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer,
+ release_vp9_frame_buffer));
+ ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames());
+ CheckFrameBufferRelease();
+}
#endif // CONFIG_WEBM_IO
VP9_INSTANTIATE_TEST_CASE(
diff --git a/libvpx/test/fdct4x4_test.cc b/libvpx/test/fdct4x4_test.cc
deleted file mode 100644
index aa90bfa18..000000000
--- a/libvpx/test/fdct4x4_test.cc
+++ /dev/null
@@ -1,511 +0,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
-#include "./vp9_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "vp9/common/vp9_entropy.h"
-#include "vpx/vpx_codec.h"
-#include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"
-
-using libvpx_test::ACMRandom;
-
-namespace {
-const int kNumCoeffs = 16;
-typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
-typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
- int tx_type);
-typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
- int tx_type);
-
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct4x4Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht4x4Param;
-
-void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride,
- int /*tx_type*/) {
- vpx_fdct4x4_c(in, out, stride);
-}
-
-void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
- vp9_fht4x4_c(in, out, stride, tx_type);
-}
-
-void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
- int /*tx_type*/) {
- vp9_fwht4x4_c(in, out, stride);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
-}
-
-void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
-}
-
-void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
- vp9_highbd_iht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10);
-}
-
-void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
- vp9_highbd_iht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12);
-}
-
-void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
-}
-
-void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
-}
-
-#if HAVE_SSE2
-void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct4x4_16_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
-}
-
-void idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct4x4_16_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
-}
-#endif // HAVE_SSE2
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-class Trans4x4TestBase {
- public:
- virtual ~Trans4x4TestBase() {}
-
- protected:
- virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0;
-
- virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0;
-
- void RunAccuracyCheck(int limit) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- uint32_t max_error = 0;
- int64_t total_error = 0;
- const int count_test_block = 10000;
- for (int i = 0; i < count_test_block; ++i) {
- DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]);
- DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
- DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
-#if CONFIG_VP9_HIGHBITDEPTH
- DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
- DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
-#endif
-
- // Initialize a test block with input range [-255, 255].
- for (int j = 0; j < kNumCoeffs; ++j) {
- if (bit_depth_ == VPX_BITS_8) {
- src[j] = rnd.Rand8();
- dst[j] = rnd.Rand8();
- test_input_block[j] = src[j] - dst[j];
-#if CONFIG_VP9_HIGHBITDEPTH
- } else {
- src16[j] = rnd.Rand16() & mask_;
- dst16[j] = rnd.Rand16() & mask_;
- test_input_block[j] = src16[j] - dst16[j];
-#endif
- }
- }
-
- ASM_REGISTER_STATE_CHECK(
- RunFwdTxfm(test_input_block, test_temp_block, pitch_));
- if (bit_depth_ == VPX_BITS_8) {
- ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
-#if CONFIG_VP9_HIGHBITDEPTH
- } else {
- ASM_REGISTER_STATE_CHECK(
- RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_));
-#endif
- }
-
- for (int j = 0; j < kNumCoeffs; ++j) {
-#if CONFIG_VP9_HIGHBITDEPTH
- const int diff =
- bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
-#else
- ASSERT_EQ(VPX_BITS_8, bit_depth_);
- const int diff = dst[j] - src[j];
-#endif
- const uint32_t error = diff * diff;
- if (max_error < error) max_error = error;
- total_error += error;
- }
- }
-
- EXPECT_GE(static_cast<uint32_t>(limit), max_error)
- << "Error: 4x4 FHT/IHT has an individual round trip error > " << limit;
-
- EXPECT_GE(count_test_block * limit, total_error)
- << "Error: 4x4 FHT/IHT has average round trip error > " << limit
- << " per block";
- }
-
- void RunCoeffCheck() {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- const int count_test_block = 5000;
- DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
-
- for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-mask_, mask_].
- for (int j = 0; j < kNumCoeffs; ++j) {
- input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
- }
-
- fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
- ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
-
- // The minimum quant value is 4.
- for (int j = 0; j < kNumCoeffs; ++j)
- EXPECT_EQ(output_block[j], output_ref_block[j]);
- }
- }
-
- void RunMemCheck() {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- const int count_test_block = 5000;
- DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
-
- for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-mask_, mask_].
- for (int j = 0; j < kNumCoeffs; ++j) {
- input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
- }
- if (i == 0) {
- for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = mask_;
- } else if (i == 1) {
- for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = -mask_;
- }
-
- fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
- ASM_REGISTER_STATE_CHECK(
- RunFwdTxfm(input_extreme_block, output_block, pitch_));
-
- // The minimum quant value is 4.
- for (int j = 0; j < kNumCoeffs; ++j) {
- EXPECT_EQ(output_block[j], output_ref_block[j]);
- EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
- << "Error: 4x4 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
- }
- }
- }
-
- void RunInvAccuracyCheck(int limit) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- const int count_test_block = 1000;
- DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
- DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
- DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
-#if CONFIG_VP9_HIGHBITDEPTH
- DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
- DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
-#endif
-
- for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-mask_, mask_].
- for (int j = 0; j < kNumCoeffs; ++j) {
- if (bit_depth_ == VPX_BITS_8) {
- src[j] = rnd.Rand8();
- dst[j] = rnd.Rand8();
- in[j] = src[j] - dst[j];
-#if CONFIG_VP9_HIGHBITDEPTH
- } else {
- src16[j] = rnd.Rand16() & mask_;
- dst16[j] = rnd.Rand16() & mask_;
- in[j] = src16[j] - dst16[j];
-#endif
- }
- }
-
- fwd_txfm_ref(in, coeff, pitch_, tx_type_);
-
- if (bit_depth_ == VPX_BITS_8) {
- ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
-#if CONFIG_VP9_HIGHBITDEPTH
- } else {
- ASM_REGISTER_STATE_CHECK(
- RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));
-#endif
- }
-
- for (int j = 0; j < kNumCoeffs; ++j) {
-#if CONFIG_VP9_HIGHBITDEPTH
- const int diff =
- bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
-#else
- const int diff = dst[j] - src[j];
-#endif
- const uint32_t error = diff * diff;
- EXPECT_GE(static_cast<uint32_t>(limit), error)
- << "Error: 4x4 IDCT has error " << error << " at index " << j;
- }
- }
- }
-
- int pitch_;
- int tx_type_;
- FhtFunc fwd_txfm_ref;
- vpx_bit_depth_t bit_depth_;
- int mask_;
-};
-
-class Trans4x4DCT : public Trans4x4TestBase,
- public ::testing::TestWithParam<Dct4x4Param> {
- public:
- virtual ~Trans4x4DCT() {}
-
- virtual void SetUp() {
- fwd_txfm_ = GET_PARAM(0);
- inv_txfm_ = GET_PARAM(1);
- tx_type_ = GET_PARAM(2);
- pitch_ = 4;
- fwd_txfm_ref = fdct4x4_ref;
- bit_depth_ = GET_PARAM(3);
- mask_ = (1 << bit_depth_) - 1;
- }
- virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
- void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
- fwd_txfm_(in, out, stride);
- }
- void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
- inv_txfm_(out, dst, stride);
- }
-
- FdctFunc fwd_txfm_;
- IdctFunc inv_txfm_;
-};
-
-TEST_P(Trans4x4DCT, AccuracyCheck) { RunAccuracyCheck(1); }
-
-TEST_P(Trans4x4DCT, CoeffCheck) { RunCoeffCheck(); }
-
-TEST_P(Trans4x4DCT, MemCheck) { RunMemCheck(); }
-
-TEST_P(Trans4x4DCT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
-
-class Trans4x4HT : public Trans4x4TestBase,
- public ::testing::TestWithParam<Ht4x4Param> {
- public:
- virtual ~Trans4x4HT() {}
-
- virtual void SetUp() {
- fwd_txfm_ = GET_PARAM(0);
- inv_txfm_ = GET_PARAM(1);
- tx_type_ = GET_PARAM(2);
- pitch_ = 4;
- fwd_txfm_ref = fht4x4_ref;
- bit_depth_ = GET_PARAM(3);
- mask_ = (1 << bit_depth_) - 1;
- }
- virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
- void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
- fwd_txfm_(in, out, stride, tx_type_);
- }
-
- void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
- inv_txfm_(out, dst, stride, tx_type_);
- }
-
- FhtFunc fwd_txfm_;
- IhtFunc inv_txfm_;
-};
-
-TEST_P(Trans4x4HT, AccuracyCheck) { RunAccuracyCheck(1); }
-
-TEST_P(Trans4x4HT, CoeffCheck) { RunCoeffCheck(); }
-
-TEST_P(Trans4x4HT, MemCheck) { RunMemCheck(); }
-
-TEST_P(Trans4x4HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
-
-class Trans4x4WHT : public Trans4x4TestBase,
- public ::testing::TestWithParam<Dct4x4Param> {
- public:
- virtual ~Trans4x4WHT() {}
-
- virtual void SetUp() {
- fwd_txfm_ = GET_PARAM(0);
- inv_txfm_ = GET_PARAM(1);
- tx_type_ = GET_PARAM(2);
- pitch_ = 4;
- fwd_txfm_ref = fwht4x4_ref;
- bit_depth_ = GET_PARAM(3);
- mask_ = (1 << bit_depth_) - 1;
- }
- virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
- void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
- fwd_txfm_(in, out, stride);
- }
- void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
- inv_txfm_(out, dst, stride);
- }
-
- FdctFunc fwd_txfm_;
- IdctFunc inv_txfm_;
-};
-
-TEST_P(Trans4x4WHT, AccuracyCheck) { RunAccuracyCheck(0); }
-
-TEST_P(Trans4x4WHT, CoeffCheck) { RunCoeffCheck(); }
-
-TEST_P(Trans4x4WHT, MemCheck) { RunMemCheck(); }
-
-TEST_P(Trans4x4WHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
-using std::tr1::make_tuple;
-
-#if CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
- C, Trans4x4DCT,
- ::testing::Values(
- make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10, 0, VPX_BITS_10),
- make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12, 0, VPX_BITS_12),
- make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8)));
-#else
-INSTANTIATE_TEST_CASE_P(C, Trans4x4DCT,
- ::testing::Values(make_tuple(&vpx_fdct4x4_c,
- &vpx_idct4x4_16_add_c, 0,
- VPX_BITS_8)));
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-#if CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
- C, Trans4x4HT,
- ::testing::Values(
- make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 0, VPX_BITS_10),
- make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 1, VPX_BITS_10),
- make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 2, VPX_BITS_10),
- make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 3, VPX_BITS_10),
- make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 0, VPX_BITS_12),
- make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 1, VPX_BITS_12),
- make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 2, VPX_BITS_12),
- make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 3, VPX_BITS_12),
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
-#else
-INSTANTIATE_TEST_CASE_P(
- C, Trans4x4HT,
- ::testing::Values(
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-#if CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
- C, Trans4x4WHT,
- ::testing::Values(
- make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 0, VPX_BITS_10),
- make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 0, VPX_BITS_12),
- make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8)));
-#else
-INSTANTIATE_TEST_CASE_P(C, Trans4x4WHT,
- ::testing::Values(make_tuple(&vp9_fwht4x4_c,
- &vpx_iwht4x4_16_add_c, 0,
- VPX_BITS_8)));
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(NEON, Trans4x4DCT,
- ::testing::Values(make_tuple(&vpx_fdct4x4_neon,
- &vpx_idct4x4_16_add_neon,
- 0, VPX_BITS_8)));
-#if !CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
- NEON, Trans4x4HT,
- ::testing::Values(
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0, VPX_BITS_8),
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8),
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8),
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));
-#endif // !CONFIG_VP9_HIGHBITDEPTH
-#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE
-
-#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(
- SSE2, Trans4x4WHT,
- ::testing::Values(
- make_tuple(&vp9_fwht4x4_sse2, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8),
- make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0, VPX_BITS_8)));
-#endif
-
-#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(SSE2, Trans4x4DCT,
- ::testing::Values(make_tuple(&vpx_fdct4x4_sse2,
- &vpx_idct4x4_16_add_sse2,
- 0, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
- SSE2, Trans4x4HT,
- ::testing::Values(
- make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0, VPX_BITS_8),
- make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1, VPX_BITS_8),
- make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2, VPX_BITS_8),
- make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8)));
-#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-
-#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(
- SSE2, Trans4x4DCT,
- ::testing::Values(
- make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10_sse2, 0, VPX_BITS_10),
- make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0, VPX_BITS_10),
- make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12_sse2, 0, VPX_BITS_12),
- make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0, VPX_BITS_12),
- make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8)));
-
-INSTANTIATE_TEST_CASE_P(
- SSE2, Trans4x4HT,
- ::testing::Values(
- make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
- make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
- make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
- make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
-#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-
-#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(MSA, Trans4x4DCT,
- ::testing::Values(make_tuple(&vpx_fdct4x4_msa,
- &vpx_idct4x4_16_add_msa, 0,
- VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
- MSA, Trans4x4HT,
- ::testing::Values(
- make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 0, VPX_BITS_8),
- make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 1, VPX_BITS_8),
- make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 2, VPX_BITS_8),
- make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 3, VPX_BITS_8)));
-#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-} // namespace
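Note: every Trans4x4 test deleted above exercises the same round-trip contract: build a residual block, forward-transform it, inverse-transform back onto the prediction, and bound the per-pixel squared error (limit 1 for the DCT/HT variants, 0 for the lossless WHT). A minimal sketch of that contract, using a placeholder identity transform pair rather than the libvpx kernels:

    #include <cstdint>

    namespace {
    const int kNumCoeffs = 16;  // 4x4

    // Placeholder transform pair; the tests plug in fdct4x4/idct4x4 etc.
    void fwd_txfm(const int16_t *in, int32_t *out, int stride) {
      for (int r = 0; r < 4; ++r)
        for (int c = 0; c < 4; ++c) out[r * 4 + c] = in[r * stride + c];
    }
    void inv_txfm_add(const int32_t *in, uint8_t *dst, int stride) {
      for (int r = 0; r < 4; ++r)
        for (int c = 0; c < 4; ++c) {
          const int v = dst[r * stride + c] + in[r * 4 + c];
          dst[r * stride + c] =
              static_cast<uint8_t>(v < 0 ? 0 : v > 255 ? 255 : v);
        }
    }
    }  // namespace

    // True when every reconstructed pixel lands within the squared-error
    // limit, mirroring the check in RunInvAccuracyCheck().
    bool RoundTripWithinLimit(const uint8_t *src, uint8_t *dst,
                              uint32_t limit) {
      int16_t in[kNumCoeffs];
      int32_t coeff[kNumCoeffs];
      for (int j = 0; j < kNumCoeffs; ++j)
        in[j] = static_cast<int16_t>(src[j] - dst[j]);
      fwd_txfm(in, coeff, 4);
      inv_txfm_add(coeff, dst, 4);
      for (int j = 0; j < kNumCoeffs; ++j) {
        const int diff = dst[j] - src[j];
        if (static_cast<uint32_t>(diff * diff) > limit) return false;
      }
      return true;
    }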
diff --git a/libvpx/test/fdct8x8_test.cc b/libvpx/test/fdct8x8_test.cc
index dfbb5dc3d..5021dda9b 100644
--- a/libvpx/test/fdct8x8_test.cc
+++ b/libvpx/test/fdct8x8_test.cc
@@ -511,8 +511,8 @@ class FwdTrans8x8TestBase {
const int diff = dst[j] - ref[j];
#endif
const uint32_t error = diff * diff;
- EXPECT_EQ(0u, error) << "Error: 8x8 IDCT has error " << error
- << " at index " << j;
+ EXPECT_EQ(0u, error)
+ << "Error: 8x8 IDCT has error " << error << " at index " << j;
}
}
}
@@ -739,7 +739,7 @@ INSTANTIATE_TEST_CASE_P(
!CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(SSSE3, FwdTrans8x8DCT,
::testing::Values(make_tuple(&vpx_fdct8x8_ssse3,
- &vpx_idct8x8_64_add_ssse3,
+ &vpx_idct8x8_64_add_sse2,
0, VPX_BITS_8)));
#endif
@@ -756,4 +756,11 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 2, VPX_BITS_8),
make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 3, VPX_BITS_8)));
#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(VSX, FwdTrans8x8DCT,
+ ::testing::Values(make_tuple(&vpx_fdct8x8_c,
+ &vpx_idct8x8_64_add_vsx, 0,
+ VPX_BITS_8)));
+#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
} // namespace
diff --git a/libvpx/test/hadamard_test.cc b/libvpx/test/hadamard_test.cc
index a55b15ad0..3b7cfeddc 100644
--- a/libvpx/test/hadamard_test.cc
+++ b/libvpx/test/hadamard_test.cc
@@ -22,7 +22,8 @@ namespace {
using ::libvpx_test::ACMRandom;
-typedef void (*HadamardFunc)(const int16_t *a, int a_stride, tran_low_t *b);
+typedef void (*HadamardFunc)(const int16_t *a, ptrdiff_t a_stride,
+ tran_low_t *b);
void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) {
int16_t b[8];
@@ -268,6 +269,11 @@ INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test,
::testing::Values(&vpx_hadamard_16x16_sse2));
#endif // HAVE_SSE2
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, Hadamard16x16Test,
+ ::testing::Values(&vpx_hadamard_16x16_avx2));
+#endif // HAVE_AVX2
+
#if HAVE_VSX
INSTANTIATE_TEST_CASE_P(VSX, Hadamard16x16Test,
::testing::Values(&vpx_hadamard_16x16_vsx));
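Note: the HadamardFunc signature change above (int to ptrdiff_t for the stride) tracks the updated vpx_dsp prototypes. Strides feed directly into pointer arithmetic, and ptrdiff_t keeps that arithmetic at pointer width on LP64 targets instead of widening from a 32-bit int. A small sketch of the distinction (toy function, not a libvpx API):

    #include <cstddef>
    #include <cstdint>

    // With a ptrdiff_t stride, r * stride is computed at pointer width,
    // so indexing into large buffers cannot overflow 32-bit int math.
    int32_t ColumnSum(const int16_t *a, ptrdiff_t stride, int rows) {
      int32_t sum = 0;
      for (ptrdiff_t r = 0; r < rows; ++r) sum += a[r * stride];
      return sum;
    }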
diff --git a/libvpx/test/idct_test.cc b/libvpx/test/idct_test.cc
index 084b2ed0c..3700374d7 100644
--- a/libvpx/test/idct_test.cc
+++ b/libvpx/test/idct_test.cc
@@ -30,12 +30,15 @@ class IDCTTest : public ::testing::TestWithParam<IdctFunc> {
virtual void SetUp() {
UUT = GetParam();
- input = new (std::nothrow) Buffer<int16_t>(4, 4, 0);
+ input = new Buffer<int16_t>(4, 4, 0);
ASSERT_TRUE(input != NULL);
- predict = new (std::nothrow) Buffer<uint8_t>(4, 4, 3);
+ ASSERT_TRUE(input->Init());
+ predict = new Buffer<uint8_t>(4, 4, 3);
ASSERT_TRUE(predict != NULL);
- output = new (std::nothrow) Buffer<uint8_t>(4, 4, 3);
+ ASSERT_TRUE(predict->Init());
+ output = new Buffer<uint8_t>(4, 4, 3);
ASSERT_TRUE(output != NULL);
+ ASSERT_TRUE(output->Init());
}
virtual void TearDown() {
@@ -166,4 +169,9 @@ INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,
INSTANTIATE_TEST_CASE_P(MSA, IDCTTest,
::testing::Values(vp8_short_idct4x4llm_msa));
#endif // HAVE_MSA
+
+#if HAVE_MMI
+INSTANTIATE_TEST_CASE_P(MMI, IDCTTest,
+ ::testing::Values(vp8_short_idct4x4llm_mmi));
+#endif // HAVE_MMI
}
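Note: the idct_test.cc change above (and the matching ASSERT_TRUE(x.Init()) calls added in pp_filter, temporal_filter, and other tests below) moves Buffer's fallible allocation out of the constructor into an Init() method that returns a bool, so construction itself can no longer fail and new (std::nothrow) is unnecessary. A sketch of the two-phase pattern, with a hypothetical stand-in for the test Buffer<T>:

    #include <cstdlib>

    template <typename T>
    class ToyBuffer {  // stand-in for the test support Buffer<T>
     public:
      ToyBuffer(int w, int h) : w_(w), h_(h), data_(NULL) {}
      ~ToyBuffer() { std::free(data_); }
      // The only step that can fail; callers ASSERT_TRUE on the result.
      bool Init() {
        data_ = static_cast<T *>(std::malloc(sizeof(T) * w_ * h_));
        return data_ != NULL;
      }
     private:
      int w_, h_;
      T *data_;
    };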
diff --git a/libvpx/test/invalid_file_test.cc b/libvpx/test/invalid_file_test.cc
index eae81faa1..79220b0f6 100644
--- a/libvpx/test/invalid_file_test.cc
+++ b/libvpx/test/invalid_file_test.cc
@@ -45,8 +45,8 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest,
void OpenResFile(const std::string &res_file_name_) {
res_file_ = libvpx_test::OpenTestDataFile(res_file_name_);
- ASSERT_TRUE(res_file_ != NULL) << "Result file open failed. Filename: "
- << res_file_name_;
+ ASSERT_TRUE(res_file_ != NULL)
+ << "Result file open failed. Filename: " << res_file_name_;
}
virtual bool HandleDecodeResult(
@@ -120,11 +120,23 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest,
TEST_P(InvalidFileTest, ReturnCode) { RunTest(); }
+#if CONFIG_VP8_DECODER
+const DecodeParam kVP8InvalidFileTests[] = {
+ { 1, "invalid-bug-1443.ivf" },
+};
+
+VP8_INSTANTIATE_TEST_CASE(InvalidFileTest,
+ ::testing::ValuesIn(kVP8InvalidFileTests));
+#endif // CONFIG_VP8_DECODER
+
#if CONFIG_VP9_DECODER
const DecodeParam kVP9InvalidFileTests[] = {
{ 1, "invalid-vp90-02-v2.webm" },
#if CONFIG_VP9_HIGHBITDEPTH
{ 1, "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf" },
+ { 1,
+ "invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-."
+ "ivf" },
#endif
{ 1, "invalid-vp90-03-v3.webm" },
{ 1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf" },
@@ -164,12 +176,12 @@ class InvalidFileInvalidPeekTest : public InvalidFileTest {
TEST_P(InvalidFileInvalidPeekTest, ReturnCode) { RunTest(); }
#if CONFIG_VP8_DECODER
-const DecodeParam kVP8InvalidFileTests[] = {
+const DecodeParam kVP8InvalidPeekTests[] = {
{ 1, "invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf" },
};
VP8_INSTANTIATE_TEST_CASE(InvalidFileInvalidPeekTest,
- ::testing::ValuesIn(kVP8InvalidFileTests));
+ ::testing::ValuesIn(kVP8InvalidPeekTests));
#endif // CONFIG_VP8_DECODER
#if CONFIG_VP9_DECODER
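Note: the rename further down this file (kVP8InvalidFileTests to kVP8InvalidPeekTests) is forced by the new kVP8InvalidFileTests array added above; both arrays live in the same anonymous namespace of one translation unit, so reusing the name would be a redefinition. Sketch of the collision:

    namespace {
    const int kVP8InvalidFileTests[] = { 1 };     // list added for InvalidFileTest
    // const int kVP8InvalidFileTests[] = { 2 };  // error: redefinition
    const int kVP8InvalidPeekTests[] = { 2 };     // renamed list compiles
    }  // namespace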
diff --git a/libvpx/test/ivf_video_source.h b/libvpx/test/ivf_video_source.h
index b87624a11..5862d2649 100644
--- a/libvpx/test/ivf_video_source.h
+++ b/libvpx/test/ivf_video_source.h
@@ -47,8 +47,8 @@ class IVFVideoSource : public CompressedVideoSource {
virtual void Begin() {
input_file_ = OpenTestDataFile(file_name_);
- ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
- << file_name_;
+ ASSERT_TRUE(input_file_ != NULL)
+ << "Input file open failed. Filename: " << file_name_;
// Read file header
uint8_t file_hdr[kIvfFileHdrSize];
diff --git a/libvpx/test/keyframe_test.cc b/libvpx/test/keyframe_test.cc
index 38bd923b7..ee75f401c 100644
--- a/libvpx/test/keyframe_test.cc
+++ b/libvpx/test/keyframe_test.cc
@@ -135,8 +135,8 @@ TEST_P(KeyframeTest, TestAutoKeyframe) {
for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
iter != kf_pts_list_.end(); ++iter) {
if (deadline_ == VPX_DL_REALTIME && *iter > 0)
- EXPECT_EQ(0, (*iter - 1) % 30) << "Unexpected keyframe at frame "
- << *iter;
+ EXPECT_EQ(0, (*iter - 1) % 30)
+ << "Unexpected keyframe at frame " << *iter;
else
EXPECT_EQ(0, *iter % 30) << "Unexpected keyframe at frame " << *iter;
}
diff --git a/libvpx/test/level_test.cc b/libvpx/test/level_test.cc
index 85097e94b..26935a81b 100644
--- a/libvpx/test/level_test.cc
+++ b/libvpx/test/level_test.cc
@@ -73,7 +73,7 @@ TEST_P(LevelTest, TestTargetLevel11Large) {
target_level_ = 11;
cfg_.rc_target_bitrate = 150;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- ASSERT_EQ(target_level_, level_);
+ ASSERT_GE(target_level_, level_);
}
TEST_P(LevelTest, TestTargetLevel20Large) {
@@ -83,7 +83,7 @@ TEST_P(LevelTest, TestTargetLevel20Large) {
target_level_ = 20;
cfg_.rc_target_bitrate = 1200;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- ASSERT_EQ(target_level_, level_);
+ ASSERT_GE(target_level_, level_);
}
TEST_P(LevelTest, TestTargetLevel31Large) {
@@ -93,7 +93,7 @@ TEST_P(LevelTest, TestTargetLevel31Large) {
target_level_ = 31;
cfg_.rc_target_bitrate = 8000;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- ASSERT_EQ(target_level_, level_);
+ ASSERT_GE(target_level_, level_);
}
// Test for keeping level stats only
@@ -103,11 +103,11 @@ TEST_P(LevelTest, TestTargetLevel0) {
target_level_ = 0;
min_gf_internal_ = 4;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- ASSERT_EQ(11, level_);
+ ASSERT_GE(11, level_);
cfg_.rc_target_bitrate = 1600;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- ASSERT_EQ(20, level_);
+ ASSERT_GE(20, level_);
}
// Test for level control being turned off
@@ -130,7 +130,7 @@ TEST_P(LevelTest, TestTargetLevelApi) {
if (level == 10 || level == 11 || level == 20 || level == 21 ||
level == 30 || level == 31 || level == 40 || level == 41 ||
level == 50 || level == 51 || level == 52 || level == 60 ||
- level == 61 || level == 62 || level == 0 || level == 255)
+ level == 61 || level == 62 || level == 0 || level == 1 || level == 255)
EXPECT_EQ(VPX_CODEC_OK,
vpx_codec_control(&enc, VP9E_SET_TARGET_LEVEL, level));
else
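Note: the switch from ASSERT_EQ to ASSERT_GE above treats the target level as a ceiling rather than an exact outcome: when the bitrate and resolution are modest, the encoder may produce a stream that already fits a smaller-numbered (stricter) level. Levels are encoded as integers (11 = level 1.1, 20 = 2.0, and so on), so the relaxed check is a plain ordering. A hypothetical helper mirroring the assertion:

    #include <cassert>

    // A stream that satisfies level 2.0 also satisfies every
    // larger-numbered level, so achieving a level at or below the
    // target is a pass, not a failure.
    void ExpectLevelWithinTarget(int target_level, int achieved_level) {
      assert(achieved_level <= target_level);
    }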
diff --git a/libvpx/test/lpf_test.cc b/libvpx/test/lpf_test.cc
index 4fca7d49c..e04b996cd 100644
--- a/libvpx/test/lpf_test.cc
+++ b/libvpx/test/lpf_test.cc
@@ -114,6 +114,18 @@ void InitInput(Pixel *s, Pixel *ref_s, ACMRandom *rnd, const uint8_t limit,
}
}
+uint8_t GetOuterThresh(ACMRandom *rnd) {
+ return static_cast<uint8_t>(rnd->RandRange(3 * MAX_LOOP_FILTER + 5));
+}
+
+uint8_t GetInnerThresh(ACMRandom *rnd) {
+ return static_cast<uint8_t>(rnd->RandRange(MAX_LOOP_FILTER + 1));
+}
+
+uint8_t GetHevThresh(ACMRandom *rnd) {
+ return static_cast<uint8_t>(rnd->RandRange(MAX_LOOP_FILTER + 1) >> 4);
+}
+
class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
public:
virtual ~Loop8Test6Param() {}
@@ -162,15 +174,15 @@ TEST_P(Loop8Test6Param, OperationCheck) {
int first_failure = -1;
for (int i = 0; i < count_test_block; ++i) {
int err_count = 0;
- uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+ uint8_t tmp = GetOuterThresh(&rnd);
DECLARE_ALIGNED(16, const uint8_t,
blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
- tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+ tmp = GetInnerThresh(&rnd);
DECLARE_ALIGNED(16, const uint8_t,
limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
- tmp = rnd.Rand8();
+ tmp = GetHevThresh(&rnd);
DECLARE_ALIGNED(16, const uint8_t,
thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
@@ -221,15 +233,15 @@ TEST_P(Loop8Test6Param, ValueCheck) {
for (int i = 0; i < count_test_block; ++i) {
int err_count = 0;
- uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+ uint8_t tmp = GetOuterThresh(&rnd);
DECLARE_ALIGNED(16, const uint8_t,
blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
- tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+ tmp = GetInnerThresh(&rnd);
DECLARE_ALIGNED(16, const uint8_t,
limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
- tmp = rnd.Rand8();
+ tmp = GetHevThresh(&rnd);
DECLARE_ALIGNED(16, const uint8_t,
thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
@@ -271,27 +283,27 @@ TEST_P(Loop8Test9Param, OperationCheck) {
int first_failure = -1;
for (int i = 0; i < count_test_block; ++i) {
int err_count = 0;
- uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+ uint8_t tmp = GetOuterThresh(&rnd);
DECLARE_ALIGNED(16, const uint8_t,
blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
- tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+ tmp = GetInnerThresh(&rnd);
DECLARE_ALIGNED(16, const uint8_t,
limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
- tmp = rnd.Rand8();
+ tmp = GetHevThresh(&rnd);
DECLARE_ALIGNED(16, const uint8_t,
thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
- tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+ tmp = GetOuterThresh(&rnd);
DECLARE_ALIGNED(16, const uint8_t,
blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
- tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+ tmp = GetInnerThresh(&rnd);
DECLARE_ALIGNED(16, const uint8_t,
limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
- tmp = rnd.Rand8();
+ tmp = GetHevThresh(&rnd);
DECLARE_ALIGNED(16, const uint8_t,
thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
@@ -334,27 +346,27 @@ TEST_P(Loop8Test9Param, ValueCheck) {
int first_failure = -1;
for (int i = 0; i < count_test_block; ++i) {
int err_count = 0;
- uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+ uint8_t tmp = GetOuterThresh(&rnd);
DECLARE_ALIGNED(16, const uint8_t,
blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
- tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+ tmp = GetInnerThresh(&rnd);
DECLARE_ALIGNED(16, const uint8_t,
limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
- tmp = rnd.Rand8();
+ tmp = GetHevThresh(&rnd);
DECLARE_ALIGNED(16, const uint8_t,
thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
- tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+ tmp = GetOuterThresh(&rnd);
DECLARE_ALIGNED(16, const uint8_t,
blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
- tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+ tmp = GetInnerThresh(&rnd);
DECLARE_ALIGNED(16, const uint8_t,
limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
- tmp = rnd.Rand8();
+ tmp = GetHevThresh(&rnd);
DECLARE_ALIGNED(16, const uint8_t,
thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
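Note: the three helpers introduced at the top of this file confine the random thresholds to the ranges a loop filter can actually encode (MAX_LOOP_FILTER is 63 in libvpx). Previously the HEV threshold came from a raw Rand8() and could take values no bitstream can signal. With RandRange(n) returning a value in [0, n-1], the resulting ranges are:

    GetOuterThresh: RandRange(3 * 63 + 5)   -> [0, 193]
    GetInnerThresh: RandRange(63 + 1)       -> [0, 63]
    GetHevThresh:   RandRange(63 + 1) >> 4  -> [0, 3]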
diff --git a/libvpx/test/minmax_test.cc b/libvpx/test/minmax_test.cc
index e5c93ed7d..9c119116a 100644
--- a/libvpx/test/minmax_test.cc
+++ b/libvpx/test/minmax_test.cc
@@ -107,10 +107,10 @@ TEST_P(MinMaxTest, CompareReferenceAndVaryStride) {
int min_ref, max_ref, min, max;
reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref);
ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max));
- EXPECT_EQ(max_ref, max) << "when a_stride = " << a_stride
- << " and b_stride = " << b_stride;
- EXPECT_EQ(min_ref, min) << "when a_stride = " << a_stride
- << " and b_stride = " << b_stride;
+ EXPECT_EQ(max_ref, max)
+ << "when a_stride = " << a_stride << " and b_stride = " << b_stride;
+ EXPECT_EQ(min_ref, min)
+ << "when a_stride = " << a_stride << " and b_stride = " << b_stride;
}
}
}
diff --git a/libvpx/test/partial_idct_test.cc b/libvpx/test/partial_idct_test.cc
index 740d7e202..f7b50f53a 100644
--- a/libvpx/test/partial_idct_test.cc
+++ b/libvpx/test/partial_idct_test.cc
@@ -62,9 +62,9 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
virtual ~PartialIDctTest() {}
virtual void SetUp() {
rnd_.Reset(ACMRandom::DeterministicSeed());
- ftxfm_ = GET_PARAM(0);
- full_itxfm_ = GET_PARAM(1);
- partial_itxfm_ = GET_PARAM(2);
+ fwd_txfm_ = GET_PARAM(0);
+ full_inv_txfm_ = GET_PARAM(1);
+ partial_inv_txfm_ = GET_PARAM(2);
tx_size_ = GET_PARAM(3);
last_nonzero_ = GET_PARAM(4);
bit_depth_ = GET_PARAM(5);
@@ -128,12 +128,12 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
}
void InitInput() {
- const int max_coeff = (32766 << (bit_depth_ - 8)) / 4;
- int max_energy_leftover = max_coeff * max_coeff;
+ const int64_t max_coeff = (32766 << (bit_depth_ - 8)) / 4;
+ int64_t max_energy_leftover = max_coeff * max_coeff;
for (int j = 0; j < last_nonzero_; ++j) {
tran_low_t coeff = static_cast<tran_low_t>(
sqrt(1.0 * max_energy_leftover) * (rnd_.Rand16() - 32768) / 65536);
- max_energy_leftover -= coeff * coeff;
+ max_energy_leftover -= static_cast<int64_t>(coeff) * coeff;
if (max_energy_leftover < 0) {
max_energy_leftover = 0;
coeff = 0;
@@ -161,6 +161,14 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
}
}
}
+
+ printf("\ninput_block_:\n");
+ for (int y = 0; y < size_; y++) {
+ for (int x = 0; x < size_; x++) {
+ printf("%6d,", input_block_[y * size_ + x]);
+ }
+ printf("\n");
+ }
}
}
@@ -177,9 +185,9 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
int output_block_size_;
int bit_depth_;
int mask_;
- FwdTxfmFunc ftxfm_;
- InvTxfmWithBdFunc full_itxfm_;
- InvTxfmWithBdFunc partial_itxfm_;
+ FwdTxfmFunc fwd_txfm_;
+ InvTxfmWithBdFunc full_inv_txfm_;
+ InvTxfmWithBdFunc partial_inv_txfm_;
ACMRandom rnd_;
};
@@ -213,7 +221,7 @@ TEST_P(PartialIDctTest, RunQuantCheck) {
}
}
- ftxfm_(input_extreme_block, output_ref_block, size_);
+ fwd_txfm_(input_extreme_block, output_ref_block, size_);
// quantization with minimum allowed step sizes
input_block_[0] = (output_ref_block[0] / 4) * 4;
@@ -223,9 +231,9 @@ TEST_P(PartialIDctTest, RunQuantCheck) {
}
ASM_REGISTER_STATE_CHECK(
- full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_));
+ full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_));
ASM_REGISTER_STATE_CHECK(
- partial_itxfm_(input_block_, output_block_, stride_, bit_depth_));
+ partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_));
ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
pixel_size_ * output_block_size_))
<< "Error: partial inverse transform produces different results";
@@ -238,9 +246,9 @@ TEST_P(PartialIDctTest, ResultsMatch) {
InitInput();
ASM_REGISTER_STATE_CHECK(
- full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_));
+ full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_));
ASM_REGISTER_STATE_CHECK(
- partial_itxfm_(input_block_, output_block_, stride_, bit_depth_));
+ partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_));
ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
pixel_size_ * output_block_size_))
<< "Error: partial inverse transform produces different results";
@@ -255,9 +263,9 @@ TEST_P(PartialIDctTest, AddOutputBlock) {
}
ASM_REGISTER_STATE_CHECK(
- full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_));
+ full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_));
ASM_REGISTER_STATE_CHECK(
- partial_itxfm_(input_block_, output_block_, stride_, bit_depth_));
+ partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_));
ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
pixel_size_ * output_block_size_))
<< "Error: Transform results are not correctly added to output.";
@@ -278,9 +286,9 @@ TEST_P(PartialIDctTest, SingleExtremeCoeff) {
input_block_[vp9_default_scan_orders[tx_size_].scan[i]] = coeff;
ASM_REGISTER_STATE_CHECK(
- full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_));
+ full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_));
ASM_REGISTER_STATE_CHECK(
- partial_itxfm_(input_block_, output_block_, stride_, bit_depth_));
+ partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_));
ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
pixel_size_ * output_block_size_))
<< "Error: Fails with single coeff of " << coeff << " at " << i
@@ -297,12 +305,12 @@ TEST_P(PartialIDctTest, DISABLED_Speed) {
for (int i = 0; i < kCountSpeedTestBlock; ++i) {
ASM_REGISTER_STATE_CHECK(
- full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_));
+ full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_));
}
vpx_usec_timer timer;
vpx_usec_timer_start(&timer);
for (int i = 0; i < kCountSpeedTestBlock; ++i) {
- partial_itxfm_(input_block_, output_block_, stride_, bit_depth_);
+ partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_);
}
libvpx_test::ClearSystemState();
vpx_usec_timer_mark(&timer);
@@ -469,7 +477,9 @@ const PartialInvTxfmParam c_partial_idct_tests[] = {
INSTANTIATE_TEST_CASE_P(C, PartialIDctTest,
::testing::ValuesIn(c_partial_idct_tests));
-#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
+#if !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_NEON
const PartialInvTxfmParam neon_partial_idct_tests[] = {
#if CONFIG_VP9_HIGHBITDEPTH
make_tuple(&vpx_highbd_fdct32x32_c,
@@ -617,12 +627,42 @@ const PartialInvTxfmParam neon_partial_idct_tests[] = {
INSTANTIATE_TEST_CASE_P(NEON, PartialIDctTest,
::testing::ValuesIn(neon_partial_idct_tests));
-#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE
+#endif // HAVE_NEON
-#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
+#if HAVE_SSE2
// 32x32_135_ is implemented using the 1024 version.
const PartialInvTxfmParam sse2_partial_idct_tests[] = {
#if CONFIG_VP9_HIGHBITDEPTH
+ make_tuple(&vpx_highbd_fdct32x32_c,
+ &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+ &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse2>, TX_32X32,
+ 1024, 8, 2),
+ make_tuple(&vpx_highbd_fdct32x32_c,
+ &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+ &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse2>, TX_32X32,
+ 1024, 10, 2),
+ make_tuple(&vpx_highbd_fdct32x32_c,
+ &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+ &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse2>, TX_32X32,
+ 1024, 12, 2),
+ make_tuple(
+ &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>,
+ &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse2>, TX_32X32, 135, 8, 2),
+ make_tuple(
+ &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>,
+ &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse2>, TX_32X32, 135, 10, 2),
+ make_tuple(
+ &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>,
+ &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse2>, TX_32X32, 135, 12, 2),
+ make_tuple(
+ &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>,
+ &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse2>, TX_32X32, 34, 8, 2),
+ make_tuple(
+ &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>,
+ &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse2>, TX_32X32, 34, 10, 2),
+ make_tuple(
+ &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>,
+ &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse2>, TX_32X32, 34, 12, 2),
make_tuple(
&vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1_add_c>,
&highbd_wrapper<vpx_highbd_idct32x32_1_add_sse2>, TX_32X32, 1, 8, 2),
@@ -642,6 +682,15 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = {
&vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
&highbd_wrapper<vpx_highbd_idct16x16_256_add_sse2>, TX_16X16, 256, 12, 2),
make_tuple(
+ &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
+ &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse2>, TX_16X16, 38, 8, 2),
+ make_tuple(
+ &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
+ &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse2>, TX_16X16, 38, 10, 2),
+ make_tuple(
+ &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
+ &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse2>, TX_16X16, 38, 12, 2),
+ make_tuple(
&vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
&highbd_wrapper<vpx_highbd_idct16x16_10_add_sse2>, TX_16X16, 10, 8, 2),
make_tuple(
@@ -701,12 +750,16 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = {
#endif // CONFIG_VP9_HIGHBITDEPTH
make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
&wrapper<vpx_idct32x32_1024_add_sse2>, TX_32X32, 1024, 8, 1),
+ make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_135_add_c>,
+ &wrapper<vpx_idct32x32_135_add_sse2>, TX_32X32, 135, 8, 1),
make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_34_add_c>,
&wrapper<vpx_idct32x32_34_add_sse2>, TX_32X32, 34, 8, 1),
make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1_add_c>,
&wrapper<vpx_idct32x32_1_add_sse2>, TX_32X32, 1, 8, 1),
make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
&wrapper<vpx_idct16x16_256_add_sse2>, TX_16X16, 256, 8, 1),
+ make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_38_add_c>,
+ &wrapper<vpx_idct16x16_38_add_sse2>, TX_16X16, 38, 8, 1),
make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_10_add_c>,
&wrapper<vpx_idct16x16_10_add_sse2>, TX_16X16, 10, 8, 1),
make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_1_add_c>,
@@ -726,27 +779,121 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = {
INSTANTIATE_TEST_CASE_P(SSE2, PartialIDctTest,
::testing::ValuesIn(sse2_partial_idct_tests));
-#endif // HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
+#endif // HAVE_SSE2
-#if HAVE_SSSE3 && !CONFIG_EMULATE_HARDWARE
+#if HAVE_SSSE3
const PartialInvTxfmParam ssse3_partial_idct_tests[] = {
- make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
- &wrapper<vpx_idct32x32_1024_add_ssse3>, TX_32X32, 1024, 8, 1),
make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_135_add_c>,
&wrapper<vpx_idct32x32_135_add_ssse3>, TX_32X32, 135, 8, 1),
make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_34_add_c>,
&wrapper<vpx_idct32x32_34_add_ssse3>, TX_32X32, 34, 8, 1),
- make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
- &wrapper<vpx_idct8x8_64_add_ssse3>, TX_8X8, 64, 8, 1),
make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_12_add_c>,
&wrapper<vpx_idct8x8_12_add_ssse3>, TX_8X8, 12, 8, 1)
};
INSTANTIATE_TEST_CASE_P(SSSE3, PartialIDctTest,
::testing::ValuesIn(ssse3_partial_idct_tests));
-#endif // HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE
+#endif // HAVE_SSSE3
-#if HAVE_DSPR2 && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+const PartialInvTxfmParam sse4_1_partial_idct_tests[] = {
+ make_tuple(&vpx_highbd_fdct32x32_c,
+ &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+ &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse4_1>, TX_32X32,
+ 1024, 8, 2),
+ make_tuple(&vpx_highbd_fdct32x32_c,
+ &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+ &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse4_1>, TX_32X32,
+ 1024, 10, 2),
+ make_tuple(&vpx_highbd_fdct32x32_c,
+ &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+ &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse4_1>, TX_32X32,
+ 1024, 12, 2),
+ make_tuple(&vpx_highbd_fdct32x32_c,
+ &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>,
+ &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse4_1>, TX_32X32,
+ 135, 8, 2),
+ make_tuple(&vpx_highbd_fdct32x32_c,
+ &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>,
+ &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse4_1>, TX_32X32,
+ 135, 10, 2),
+ make_tuple(&vpx_highbd_fdct32x32_c,
+ &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>,
+ &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse4_1>, TX_32X32,
+ 135, 12, 2),
+ make_tuple(
+ &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>,
+ &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse4_1>, TX_32X32, 34, 8, 2),
+ make_tuple(
+ &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>,
+ &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse4_1>, TX_32X32, 34, 10, 2),
+ make_tuple(
+ &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>,
+ &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse4_1>, TX_32X32, 34, 12, 2),
+ make_tuple(&vpx_highbd_fdct16x16_c,
+ &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+ &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse4_1>, TX_16X16,
+ 256, 8, 2),
+ make_tuple(&vpx_highbd_fdct16x16_c,
+ &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+ &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse4_1>, TX_16X16,
+ 256, 10, 2),
+ make_tuple(&vpx_highbd_fdct16x16_c,
+ &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+ &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse4_1>, TX_16X16,
+ 256, 12, 2),
+ make_tuple(
+ &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
+ &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse4_1>, TX_16X16, 38, 8, 2),
+ make_tuple(
+ &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
+ &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse4_1>, TX_16X16, 38, 10, 2),
+ make_tuple(
+ &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
+ &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse4_1>, TX_16X16, 38, 12, 2),
+ make_tuple(
+ &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
+ &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse4_1>, TX_16X16, 10, 8, 2),
+ make_tuple(
+ &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
+ &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse4_1>, TX_16X16, 10, 10, 2),
+ make_tuple(
+ &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
+ &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse4_1>, TX_16X16, 10, 12, 2),
+ make_tuple(
+ &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
+ &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse4_1>, TX_8X8, 64, 8, 2),
+ make_tuple(
+ &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
+ &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse4_1>, TX_8X8, 64, 10, 2),
+ make_tuple(
+ &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
+ &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse4_1>, TX_8X8, 64, 12, 2),
+ make_tuple(
+ &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,
+ &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse4_1>, TX_8X8, 12, 8, 2),
+ make_tuple(
+ &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,
+ &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse4_1>, TX_8X8, 12, 10, 2),
+ make_tuple(
+ &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,
+ &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse4_1>, TX_8X8, 12, 12, 2),
+ make_tuple(
+ &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
+ &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse4_1>, TX_4X4, 16, 8, 2),
+ make_tuple(
+ &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
+ &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse4_1>, TX_4X4, 16, 10, 2),
+ make_tuple(
+ &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
+ &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse4_1>, TX_4X4, 16, 12, 2)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, PartialIDctTest,
+ ::testing::ValuesIn(sse4_1_partial_idct_tests));
+#endif // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH
const PartialInvTxfmParam dspr2_partial_idct_tests[] = {
make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
&wrapper<vpx_idct32x32_1024_add_dspr2>, TX_32X32, 1024, 8, 1),
@@ -774,9 +921,9 @@ const PartialInvTxfmParam dspr2_partial_idct_tests[] = {
INSTANTIATE_TEST_CASE_P(DSPR2, PartialIDctTest,
::testing::ValuesIn(dspr2_partial_idct_tests));
-#endif // HAVE_DSPR2 && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH
-#if HAVE_MSA && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH
// 32x32_135_ is implemented using the 1024 version.
const PartialInvTxfmParam msa_partial_idct_tests[] = {
make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
@@ -805,6 +952,8 @@ const PartialInvTxfmParam msa_partial_idct_tests[] = {
INSTANTIATE_TEST_CASE_P(MSA, PartialIDctTest,
::testing::ValuesIn(msa_partial_idct_tests));
-#endif // HAVE_MSA && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH
+
+#endif // !CONFIG_EMULATE_HARDWARE
} // namespace
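Note: besides the ftxfm_/itxfm_ renames, two substantive things change in partial_idct_test.cc: the per-SIMD-set CONFIG_EMULATE_HARDWARE conditions are hoisted into a single #if !CONFIG_EMULATE_HARDWARE wrapping all the instantiations, and InitInput()'s energy budget moves to int64_t. The overflow the latter fixes is easy to reproduce at 12-bit depth:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const int bit_depth = 12;
      const int64_t max_coeff = (32766 << (bit_depth - 8)) / 4;  // 131064
      const int64_t max_energy = max_coeff * max_coeff;  // 17,177,772,096
      // INT32_MAX is 2,147,483,647, so the old `int` budget overflowed at
      // 12 bits; the 10-bit square (32766^2 = 1,073,610,756) still fit.
      std::printf("budget %lld vs INT32_MAX %d\n",
                  static_cast<long long>(max_energy), INT32_MAX);
      return 0;
    }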
diff --git a/libvpx/test/pp_filter_test.cc b/libvpx/test/pp_filter_test.cc
index 95da09c31..5a2ade1ef 100644
--- a/libvpx/test/pp_filter_test.cc
+++ b/libvpx/test/pp_filter_test.cc
@@ -57,12 +57,14 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) {
// 5-tap filter needs 2 padding rows above and below the block in the input.
Buffer<uint8_t> src_image = Buffer<uint8_t>(block_width, block_height, 2);
+ ASSERT_TRUE(src_image.Init());
// Filter extends output block by 8 samples at left and right edges.
// Though the left padding is only 8 bytes, the assembly code tries to
// read 16 bytes before the pointer.
Buffer<uint8_t> dst_image =
Buffer<uint8_t>(block_width, block_height, 8, 16, 8, 8);
+ ASSERT_TRUE(dst_image.Init());
uint8_t *const flimits =
reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
@@ -88,8 +90,8 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) {
uint8_t *pixel_ptr = dst_image.TopLeftPixel();
for (int i = 0; i < block_height; ++i) {
for (int j = 0; j < block_width; ++j) {
- ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j]) << "at (" << i << ", " << j
- << ")";
+ ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j])
+ << "at (" << i << ", " << j << ")";
}
pixel_ptr += dst_image.stride();
}
@@ -108,6 +110,7 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
// SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16.
Buffer<uint8_t> src_image =
Buffer<uint8_t>(block_width, block_height, 2, 2, 10, 2);
+ ASSERT_TRUE(src_image.Init());
// Filter extends output block by 8 samples at left and right edges.
// Though the left padding is only 8 bytes, there is 'above' padding as well
@@ -116,7 +119,9 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
// SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16.
Buffer<uint8_t> dst_image =
Buffer<uint8_t>(block_width, block_height, 8, 8, 16, 8);
+ ASSERT_TRUE(dst_image.Init());
Buffer<uint8_t> dst_image_ref = Buffer<uint8_t>(block_width, block_height, 8);
+ ASSERT_TRUE(dst_image_ref.Init());
// Filter values are set in blocks of 16 for Y and 8 for U/V. Each macroblock
// can have a different filter. SSE2 assembly reads flimits in blocks of 16 so
@@ -177,8 +182,8 @@ class VpxMbPostProcAcrossIpTest
int rows, int cols, int src_pitch) {
for (int r = 0; r < rows; r++) {
for (int c = 0; c < cols; c++) {
- ASSERT_EQ(expected_output[c], src_c[c]) << "at (" << r << ", " << c
- << ")";
+ ASSERT_EQ(expected_output[c], src_c[c])
+ << "at (" << r << ", " << c << ")";
}
src_c += src_pitch;
}
@@ -197,10 +202,12 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckLowFilterOutput) {
const int cols = 16;
Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+ ASSERT_TRUE(src.Init());
src.SetPadding(10);
SetCols(src.TopLeftPixel(), rows, cols, src.stride());
Buffer<uint8_t> expected_output = Buffer<uint8_t>(cols, rows, 0);
+ ASSERT_TRUE(expected_output.Init());
SetCols(expected_output.TopLeftPixel(), rows, cols, expected_output.stride());
RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), q2mbl(0),
@@ -212,6 +219,7 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckMediumFilterOutput) {
const int cols = 16;
Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+ ASSERT_TRUE(src.Init());
src.SetPadding(10);
SetCols(src.TopLeftPixel(), rows, cols, src.stride());
@@ -228,6 +236,7 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckHighFilterOutput) {
const int cols = 16;
Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+ ASSERT_TRUE(src.Init());
src.SetPadding(10);
SetCols(src.TopLeftPixel(), rows, cols, src.stride());
@@ -249,7 +258,9 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckCvsAssembly) {
const int cols = 16;
Buffer<uint8_t> c_mem = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+ ASSERT_TRUE(c_mem.Init());
Buffer<uint8_t> asm_mem = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+ ASSERT_TRUE(asm_mem.Init());
// When level >= 100, the filter behaves the same as the level = INT_MAX
// When level < 20, it behaves the same as the level = 0
@@ -285,8 +296,8 @@ class VpxMbPostProcDownTest
int rows, int cols, int src_pitch) {
for (int r = 0; r < rows; r++) {
for (int c = 0; c < cols; c++) {
- ASSERT_EQ(expected_output[r * rows + c], src_c[c]) << "at (" << r
- << ", " << c << ")";
+ ASSERT_EQ(expected_output[r * rows + c], src_c[c])
+ << "at (" << r << ", " << c << ")";
}
src_c += src_pitch;
}
@@ -305,6 +316,7 @@ TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) {
const int cols = 16;
Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17);
+ ASSERT_TRUE(src_c.Init());
src_c.SetPadding(10);
SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride());
@@ -340,6 +352,7 @@ TEST_P(VpxMbPostProcDownTest, CheckMediumFilterOutput) {
const int cols = 16;
Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17);
+ ASSERT_TRUE(src_c.Init());
src_c.SetPadding(10);
SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride());
@@ -370,6 +383,7 @@ TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) {
const int cols = 16;
Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17);
+ ASSERT_TRUE(src_c.Init());
src_c.SetPadding(10);
SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride());
@@ -392,7 +406,9 @@ TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) {
rnd.Reset(ACMRandom::DeterministicSeed());
Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17);
+ ASSERT_TRUE(src_c.Init());
Buffer<uint8_t> src_asm = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17);
+ ASSERT_TRUE(src_asm.Init());
for (int level = 0; level < 100; level++) {
src_c.SetPadding(10);
diff --git a/libvpx/test/predict_test.cc b/libvpx/test/predict_test.cc
index a6e2b3cf3..9f366ae52 100644
--- a/libvpx/test/predict_test.cc
+++ b/libvpx/test/predict_test.cc
@@ -324,6 +324,15 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(4, 4, &vp8_sixtap_predict4x4_msa)));
#endif
+#if HAVE_MMI
+INSTANTIATE_TEST_CASE_P(
+ MMI, SixtapPredictTest,
+ ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmi),
+ make_tuple(8, 8, &vp8_sixtap_predict8x8_mmi),
+ make_tuple(8, 4, &vp8_sixtap_predict8x4_mmi),
+ make_tuple(4, 4, &vp8_sixtap_predict4x4_mmi)));
+#endif
+
class BilinearPredictTest : public PredictTestBase {};
TEST_P(BilinearPredictTest, TestWithRandomData) {
diff --git a/libvpx/test/quantize_test.cc b/libvpx/test/quantize_test.cc
index 69da8994c..40bb2642e 100644
--- a/libvpx/test/quantize_test.cc
+++ b/libvpx/test/quantize_test.cc
@@ -200,4 +200,12 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp8_fast_quantize_b_msa, &vp8_fast_quantize_b_c),
make_tuple(&vp8_regular_quantize_b_msa, &vp8_regular_quantize_b_c)));
#endif // HAVE_MSA
+
+#if HAVE_MMI
+INSTANTIATE_TEST_CASE_P(
+ MMI, QuantizeTest,
+ ::testing::Values(
+ make_tuple(&vp8_fast_quantize_b_mmi, &vp8_fast_quantize_b_c),
+ make_tuple(&vp8_regular_quantize_b_mmi, &vp8_regular_quantize_b_c)));
+#endif // HAVE_MMI
} // namespace
diff --git a/libvpx/test/register_state_check.h b/libvpx/test/register_state_check.h
index 84641c8e9..a779e5c06 100644
--- a/libvpx/test/register_state_check.h
+++ b/libvpx/test/register_state_check.h
@@ -113,8 +113,8 @@ class RegisterStateCheck {
int64_t post_store[8];
vpx_push_neon(post_store);
for (int i = 0; i < 8; ++i) {
- EXPECT_EQ(pre_store_[i], post_store[i]) << "d" << i + 8
- << " has been modified";
+ EXPECT_EQ(pre_store_[i], post_store[i])
+ << "d" << i + 8 << " has been modified";
}
}
diff --git a/libvpx/test/resize_test.cc b/libvpx/test/resize_test.cc
index c9950dd43..e95dc6651 100644
--- a/libvpx/test/resize_test.cc
+++ b/libvpx/test/resize_test.cc
@@ -298,10 +298,10 @@ TEST_P(ResizeTest, TestExternalResizeWorks) {
unsigned int expected_h;
ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w,
&expected_h, 0);
- EXPECT_EQ(expected_w, info->w) << "Frame " << frame
- << " had unexpected width";
- EXPECT_EQ(expected_h, info->h) << "Frame " << frame
- << " had unexpected height";
+ EXPECT_EQ(expected_w, info->w)
+ << "Frame " << frame << " had unexpected width";
+ EXPECT_EQ(expected_h, info->h)
+ << "Frame " << frame << " had unexpected height";
}
}
@@ -513,10 +513,10 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
unsigned int expected_h;
ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w,
&expected_h, 1);
- EXPECT_EQ(expected_w, info->w) << "Frame " << frame
- << " had unexpected width";
- EXPECT_EQ(expected_h, info->h) << "Frame " << frame
- << " had unexpected height";
+ EXPECT_EQ(expected_w, info->w)
+ << "Frame " << frame << " had unexpected width";
+ EXPECT_EQ(expected_h, info->h)
+ << "Frame " << frame << " had unexpected height";
EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
}
}
diff --git a/libvpx/test/sad_test.cc b/libvpx/test/sad_test.cc
index fe3983eb7..67c3c5315 100644
--- a/libvpx/test/sad_test.cc
+++ b/libvpx/test/sad_test.cc
@@ -644,19 +644,50 @@ INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
#if HAVE_NEON
const SadMxNParam neon_tests[] = {
SadMxNParam(64, 64, &vpx_sad64x64_neon),
+ SadMxNParam(64, 32, &vpx_sad64x32_neon),
SadMxNParam(32, 32, &vpx_sad32x32_neon),
+ SadMxNParam(16, 32, &vpx_sad16x32_neon),
SadMxNParam(16, 16, &vpx_sad16x16_neon),
SadMxNParam(16, 8, &vpx_sad16x8_neon),
SadMxNParam(8, 16, &vpx_sad8x16_neon),
SadMxNParam(8, 8, &vpx_sad8x8_neon),
+ SadMxNParam(8, 4, &vpx_sad8x4_neon),
+ SadMxNParam(4, 8, &vpx_sad4x8_neon),
SadMxNParam(4, 4, &vpx_sad4x4_neon),
};
INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests));
+const SadMxNAvgParam avg_neon_tests[] = {
+ SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon),
+ SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon),
+ SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_neon),
+ SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_neon),
+ SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_neon),
+ SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_neon),
+ SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_neon),
+ SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_neon),
+ SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_neon),
+ SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_neon),
+ SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_neon),
+ SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_neon),
+ SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_neon),
+};
+INSTANTIATE_TEST_CASE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests));
+
const SadMxNx4Param x4d_neon_tests[] = {
SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon),
+ SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon),
+ SadMxNx4Param(32, 64, &vpx_sad32x64x4d_neon),
SadMxNx4Param(32, 32, &vpx_sad32x32x4d_neon),
+ SadMxNx4Param(32, 16, &vpx_sad32x16x4d_neon),
+ SadMxNx4Param(16, 32, &vpx_sad16x32x4d_neon),
SadMxNx4Param(16, 16, &vpx_sad16x16x4d_neon),
+ SadMxNx4Param(16, 8, &vpx_sad16x8x4d_neon),
+ SadMxNx4Param(8, 16, &vpx_sad8x16x4d_neon),
+ SadMxNx4Param(8, 8, &vpx_sad8x8x4d_neon),
+ SadMxNx4Param(8, 4, &vpx_sad8x4x4d_neon),
+ SadMxNx4Param(4, 8, &vpx_sad4x8x4d_neon),
+ SadMxNx4Param(4, 4, &vpx_sad4x4x4d_neon),
};
INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
#endif // HAVE_NEON
@@ -865,6 +896,14 @@ const SadMxNx4Param x4d_avx2_tests[] = {
INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
#endif // HAVE_AVX2
+#if HAVE_AVX512
+const SadMxNx4Param x4d_avx512_tests[] = {
+ SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx512),
+};
+INSTANTIATE_TEST_CASE_P(AVX512, SADx4Test,
+ ::testing::ValuesIn(x4d_avx512_tests));
+#endif // HAVE_AVX512
+
//------------------------------------------------------------------------------
// MIPS functions
#if HAVE_MSA
@@ -934,5 +973,84 @@ const SadMxNParam vsx_tests[] = {
SadMxNParam(16, 8, &vpx_sad16x8_vsx),
};
INSTANTIATE_TEST_CASE_P(VSX, SADTest, ::testing::ValuesIn(vsx_tests));
+
+const SadMxNAvgParam avg_vsx_tests[] = {
+ SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_vsx),
+ SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_vsx),
+ SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_vsx),
+ SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_vsx),
+ SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_vsx),
+ SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_vsx),
+ SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_vsx),
+ SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_vsx),
+};
+INSTANTIATE_TEST_CASE_P(VSX, SADavgTest, ::testing::ValuesIn(avg_vsx_tests));
+
+const SadMxNx4Param x4d_vsx_tests[] = {
+ SadMxNx4Param(64, 64, &vpx_sad64x64x4d_vsx),
+ SadMxNx4Param(64, 32, &vpx_sad64x32x4d_vsx),
+ SadMxNx4Param(32, 64, &vpx_sad32x64x4d_vsx),
+ SadMxNx4Param(32, 32, &vpx_sad32x32x4d_vsx),
+ SadMxNx4Param(32, 16, &vpx_sad32x16x4d_vsx),
+ SadMxNx4Param(16, 32, &vpx_sad16x32x4d_vsx),
+ SadMxNx4Param(16, 16, &vpx_sad16x16x4d_vsx),
+ SadMxNx4Param(16, 8, &vpx_sad16x8x4d_vsx),
+};
+INSTANTIATE_TEST_CASE_P(VSX, SADx4Test, ::testing::ValuesIn(x4d_vsx_tests));
#endif // HAVE_VSX
+
+//------------------------------------------------------------------------------
+// Loongson functions
+#if HAVE_MMI
+const SadMxNParam mmi_tests[] = {
+ SadMxNParam(64, 64, &vpx_sad64x64_mmi),
+ SadMxNParam(64, 32, &vpx_sad64x32_mmi),
+ SadMxNParam(32, 64, &vpx_sad32x64_mmi),
+ SadMxNParam(32, 32, &vpx_sad32x32_mmi),
+ SadMxNParam(32, 16, &vpx_sad32x16_mmi),
+ SadMxNParam(16, 32, &vpx_sad16x32_mmi),
+ SadMxNParam(16, 16, &vpx_sad16x16_mmi),
+ SadMxNParam(16, 8, &vpx_sad16x8_mmi),
+ SadMxNParam(8, 16, &vpx_sad8x16_mmi),
+ SadMxNParam(8, 8, &vpx_sad8x8_mmi),
+ SadMxNParam(8, 4, &vpx_sad8x4_mmi),
+ SadMxNParam(4, 8, &vpx_sad4x8_mmi),
+ SadMxNParam(4, 4, &vpx_sad4x4_mmi),
+};
+INSTANTIATE_TEST_CASE_P(MMI, SADTest, ::testing::ValuesIn(mmi_tests));
+
+const SadMxNAvgParam avg_mmi_tests[] = {
+ SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_mmi),
+ SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_mmi),
+ SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_mmi),
+ SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_mmi),
+ SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_mmi),
+ SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_mmi),
+ SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_mmi),
+ SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_mmi),
+ SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_mmi),
+ SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_mmi),
+ SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_mmi),
+ SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_mmi),
+ SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_mmi),
+};
+INSTANTIATE_TEST_CASE_P(MMI, SADavgTest, ::testing::ValuesIn(avg_mmi_tests));
+
+const SadMxNx4Param x4d_mmi_tests[] = {
+ SadMxNx4Param(64, 64, &vpx_sad64x64x4d_mmi),
+ SadMxNx4Param(64, 32, &vpx_sad64x32x4d_mmi),
+ SadMxNx4Param(32, 64, &vpx_sad32x64x4d_mmi),
+ SadMxNx4Param(32, 32, &vpx_sad32x32x4d_mmi),
+ SadMxNx4Param(32, 16, &vpx_sad32x16x4d_mmi),
+ SadMxNx4Param(16, 32, &vpx_sad16x32x4d_mmi),
+ SadMxNx4Param(16, 16, &vpx_sad16x16x4d_mmi),
+ SadMxNx4Param(16, 8, &vpx_sad16x8x4d_mmi),
+ SadMxNx4Param(8, 16, &vpx_sad8x16x4d_mmi),
+ SadMxNx4Param(8, 8, &vpx_sad8x8x4d_mmi),
+ SadMxNx4Param(8, 4, &vpx_sad8x4x4d_mmi),
+ SadMxNx4Param(4, 8, &vpx_sad4x8x4d_mmi),
+ SadMxNx4Param(4, 4, &vpx_sad4x4x4d_mmi),
+};
+INSTANTIATE_TEST_CASE_P(MMI, SADx4Test, ::testing::ValuesIn(x4d_mmi_tests));
+#endif // HAVE_MMI
} // namespace
diff --git a/libvpx/test/set_roi.cc b/libvpx/test/set_roi.cc
index 38711a806..f63954752 100644
--- a/libvpx/test/set_roi.cc
+++ b/libvpx/test/set_roi.cc
@@ -146,14 +146,6 @@ TEST(VP8RoiMapTest, ParameterCheck) {
if (deltas_valid != roi_retval) break;
}
-  // Test that we report an error if cyclic refresh is enabled.
- cpi.cyclic_refresh_mode_enabled = 1;
- roi_retval =
- vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows, cpi.common.mb_cols,
- delta_q, delta_lf, threshold);
- EXPECT_EQ(-1, roi_retval) << "cyclic refresh check error";
- cpi.cyclic_refresh_mode_enabled = 0;
-
  // Test invalid number of rows or columns.
roi_retval =
vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows + 1,
diff --git a/libvpx/test/temporal_filter_test.cc b/libvpx/test/temporal_filter_test.cc
index 8615ba45a..655a36be9 100644
--- a/libvpx/test/temporal_filter_test.cc
+++ b/libvpx/test/temporal_filter_test.cc
@@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <limits>
+
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vp9_rtcd.h"
@@ -35,6 +37,7 @@ void reference_filter(const Buffer<uint8_t> &a, const Buffer<uint8_t> &b, int w,
Buffer<unsigned int> *accumulator,
Buffer<uint16_t> *count) {
Buffer<int> diff_sq = Buffer<int>(w, h, 0);
+ ASSERT_TRUE(diff_sq.Init());
diff_sq.Set(0);
int rounding = 0;
@@ -119,6 +122,7 @@ TEST_P(TemporalFilterTest, SizeCombinations) {
// Depending on subsampling this function may be called with values of 8 or 16
// for width and height, in any combination.
Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);
+ ASSERT_TRUE(a.Init());
const int filter_weight = 2;
const int filter_strength = 6;
@@ -127,13 +131,20 @@ TEST_P(TemporalFilterTest, SizeCombinations) {
for (int height = 8; height <= 16; height += 8) {
// The second buffer must not have any border.
Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
+ ASSERT_TRUE(b.Init());
Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
+ ASSERT_TRUE(accum_ref.Init());
Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
+ ASSERT_TRUE(accum_chk.Init());
Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
+ ASSERT_TRUE(count_ref.Init());
Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
+ ASSERT_TRUE(count_chk.Init());
- a.Set(&rnd_, &ACMRandom::Rand8);
- b.Set(&rnd_, &ACMRandom::Rand8);
+ // The difference between the buffers must be small to pass the threshold
+ // to apply the filter.
+ a.Set(&rnd_, 0, 7);
+ b.Set(&rnd_, 0, 7);
accum_ref.Set(rnd_.Rand8());
accum_chk.CopyFrom(accum_ref);
@@ -161,18 +172,32 @@ TEST_P(TemporalFilterTest, CompareReferenceRandom) {
for (int width = 8; width <= 16; width += 8) {
for (int height = 8; height <= 16; height += 8) {
Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8);
+ ASSERT_TRUE(a.Init());
// The second buffer must not have any border.
Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
+ ASSERT_TRUE(b.Init());
Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
+ ASSERT_TRUE(accum_ref.Init());
Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
+ ASSERT_TRUE(accum_chk.Init());
Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
+ ASSERT_TRUE(count_ref.Init());
Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
+ ASSERT_TRUE(count_chk.Init());
for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) {
for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) {
- for (int repeat = 0; repeat < 10; ++repeat) {
- a.Set(&rnd_, &ACMRandom::Rand8);
- b.Set(&rnd_, &ACMRandom::Rand8);
+ for (int repeat = 0; repeat < 100; ++repeat) {
+ if (repeat < 50) {
+ a.Set(&rnd_, 0, 7);
+ b.Set(&rnd_, 0, 7);
+ } else {
+ // Check large (but close) values as well.
+ a.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7,
+ std::numeric_limits<uint8_t>::max());
+ b.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7,
+ std::numeric_limits<uint8_t>::max());
+ }
accum_ref.Set(rnd_.Rand8());
accum_chk.CopyFrom(accum_ref);
@@ -202,6 +227,7 @@ TEST_P(TemporalFilterTest, CompareReferenceRandom) {
TEST_P(TemporalFilterTest, DISABLED_Speed) {
Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);
+ ASSERT_TRUE(a.Init());
const int filter_weight = 2;
const int filter_strength = 6;
@@ -210,13 +236,18 @@ TEST_P(TemporalFilterTest, DISABLED_Speed) {
for (int height = 8; height <= 16; height += 8) {
// The second buffer must not have any border.
Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
+ ASSERT_TRUE(b.Init());
Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
+ ASSERT_TRUE(accum_ref.Init());
Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
+ ASSERT_TRUE(accum_chk.Init());
Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
+ ASSERT_TRUE(count_ref.Init());
Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
+ ASSERT_TRUE(count_chk.Init());
- a.Set(&rnd_, &ACMRandom::Rand8);
- b.Set(&rnd_, &ACMRandom::Rand8);
+ a.Set(&rnd_, 0, 7);
+ b.Set(&rnd_, 0, 7);
accum_chk.Set(0);
count_chk.Set(0);
diff --git a/libvpx/test/test-data.mk b/libvpx/test/test-data.mk
index b39ab8763..f405e4ef1 100644
--- a/libvpx/test/test-data.mk
+++ b/libvpx/test/test-data.mk
@@ -732,6 +732,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv444.webm.md5
endif # CONFIG_VP9_HIGHBITDEPTH
# Invalid files for testing libvpx error checking.
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm
@@ -772,6 +774,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s367
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-1.webm
@@ -874,3 +878,5 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_1.webm.md5
diff --git a/libvpx/test/test-data.sha1 b/libvpx/test/test-data.sha1
index 22ca6f564..99b4e1e46 100644
--- a/libvpx/test/test-data.sha1
+++ b/libvpx/test/test-data.sha1
@@ -6,6 +6,8 @@ b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv
456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
c123d1f9f02fb4143abb5e271916e3a3080de8f6 *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
+efafb92b7567bc04c3f1432ea6c268c1c31affd5 *invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf
+5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf.res
fe346136b9b8c1e6f6084cc106485706915795e4 *invalid-vp90-01-v3.webm
5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-vp90-01-v3.webm.res
d78e2fceba5ac942246503ec8366f879c4775ca5 *invalid-vp90-02-v2.webm
@@ -848,3 +850,7 @@ a000d568431d07379dd5a8ec066061c07e560b47 *invalid-vp90-2-00-quantizer-63.ivf.kf_
6fa3d3ac306a3d9ce1d610b78441dc00d2c2d4b9 *tos_vp8.webm
e402cbbf9e550ae017a1e9f1f73931c1d18474e8 *invalid-crbug-667044.webm
d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-crbug-667044.webm.res
+fd9df7f3f6992af1d7a9dde975c9a0d6f28c053d *invalid-bug-1443.ivf
+fd3020fa6e9ca5966206738654c97dec313b0a95 *invalid-bug-1443.ivf.res
+17696cd21e875f1d6e5d418cbf89feab02c8850a *vp90-2-22-svc_1280x720_1.webm
+e2f9e1e47a791b4e939a9bdc50bf7a25b3761f77 *vp90-2-22-svc_1280x720_1.webm.md5
diff --git a/libvpx/test/test.mk b/libvpx/test/test.mk
index c51e645c1..a3716be60 100644
--- a/libvpx/test/test.mk
+++ b/libvpx/test/test.mk
@@ -39,7 +39,6 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += byte_alignment_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += decode_svc_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_frame_parallel_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_refresh_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
@@ -124,6 +123,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
LIBVPX_TEST_SRCS-yes += idct_test.cc
LIBVPX_TEST_SRCS-yes += predict_test.cc
LIBVPX_TEST_SRCS-yes += vpx_scale_test.cc
+LIBVPX_TEST_SRCS-yes += vpx_scale_test.h
ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_TEMPORAL_DENOISING),yesyes)
LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp8_denoiser_sse2_test.cc
@@ -154,11 +154,15 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += avg_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += comp_avg_pred_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct_partial_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += hadamard_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_scale_test.cc
+ifneq ($(CONFIG_REALTIME_ONLY),yes)
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += temporal_filter_test.cc
+endif
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
diff --git a/libvpx/test/test_intra_pred_speed.cc b/libvpx/test/test_intra_pred_speed.cc
index 23fce335a..1cdeda410 100644
--- a/libvpx/test/test_intra_pred_speed.cc
+++ b/libvpx/test/test_intra_pred_speed.cc
@@ -480,29 +480,70 @@ HIGHBD_INTRA_PRED_TEST(
vpx_highbd_d63_predictor_32x32_c, vpx_highbd_tm_predictor_32x32_c)
#if HAVE_SSE2
-HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred4,
- vpx_highbd_dc_predictor_4x4_sse2, NULL, NULL, NULL,
- vpx_highbd_v_predictor_4x4_sse2, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, vpx_highbd_tm_predictor_4x4_c)
+HIGHBD_INTRA_PRED_TEST(
+ SSE2, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_sse2,
+ vpx_highbd_dc_left_predictor_4x4_sse2, vpx_highbd_dc_top_predictor_4x4_sse2,
+ vpx_highbd_dc_128_predictor_4x4_sse2, vpx_highbd_v_predictor_4x4_sse2,
+ vpx_highbd_h_predictor_4x4_sse2, NULL, vpx_highbd_d135_predictor_4x4_sse2,
+ vpx_highbd_d117_predictor_4x4_sse2, vpx_highbd_d153_predictor_4x4_sse2,
+ vpx_highbd_d207_predictor_4x4_sse2, vpx_highbd_d63_predictor_4x4_sse2,
+ vpx_highbd_tm_predictor_4x4_c)
HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred8,
- vpx_highbd_dc_predictor_8x8_sse2, NULL, NULL, NULL,
- vpx_highbd_v_predictor_8x8_sse2, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2)
+ vpx_highbd_dc_predictor_8x8_sse2,
+ vpx_highbd_dc_left_predictor_8x8_sse2,
+ vpx_highbd_dc_top_predictor_8x8_sse2,
+ vpx_highbd_dc_128_predictor_8x8_sse2,
+ vpx_highbd_v_predictor_8x8_sse2,
+ vpx_highbd_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL,
+ NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2)
HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred16,
- vpx_highbd_dc_predictor_16x16_sse2, NULL, NULL, NULL,
- vpx_highbd_v_predictor_16x16_sse2, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL,
- vpx_highbd_tm_predictor_16x16_sse2)
+ vpx_highbd_dc_predictor_16x16_sse2,
+ vpx_highbd_dc_left_predictor_16x16_sse2,
+ vpx_highbd_dc_top_predictor_16x16_sse2,
+ vpx_highbd_dc_128_predictor_16x16_sse2,
+ vpx_highbd_v_predictor_16x16_sse2,
+ vpx_highbd_h_predictor_16x16_sse2, NULL, NULL, NULL,
+ NULL, NULL, NULL, vpx_highbd_tm_predictor_16x16_sse2)
HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32,
- vpx_highbd_dc_predictor_32x32_sse2, NULL, NULL, NULL,
- vpx_highbd_v_predictor_32x32_sse2, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL,
- vpx_highbd_tm_predictor_32x32_sse2)
+ vpx_highbd_dc_predictor_32x32_sse2,
+ vpx_highbd_dc_left_predictor_32x32_sse2,
+ vpx_highbd_dc_top_predictor_32x32_sse2,
+ vpx_highbd_dc_128_predictor_32x32_sse2,
+ vpx_highbd_v_predictor_32x32_sse2,
+ vpx_highbd_h_predictor_32x32_sse2, NULL, NULL, NULL,
+ NULL, NULL, NULL, vpx_highbd_tm_predictor_32x32_sse2)
#endif // HAVE_SSE2
+#if HAVE_SSSE3
+HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred4, NULL, NULL, NULL, NULL,
+ NULL, NULL, vpx_highbd_d45_predictor_4x4_ssse3, NULL,
+ NULL, NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred8, NULL, NULL, NULL, NULL,
+ NULL, NULL, vpx_highbd_d45_predictor_8x8_ssse3,
+ vpx_highbd_d135_predictor_8x8_ssse3,
+ vpx_highbd_d117_predictor_8x8_ssse3,
+ vpx_highbd_d153_predictor_8x8_ssse3,
+ vpx_highbd_d207_predictor_8x8_ssse3,
+ vpx_highbd_d63_predictor_8x8_ssse3, NULL)
+HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred16, NULL, NULL, NULL, NULL,
+ NULL, NULL, vpx_highbd_d45_predictor_16x16_ssse3,
+ vpx_highbd_d135_predictor_16x16_ssse3,
+ vpx_highbd_d117_predictor_16x16_ssse3,
+ vpx_highbd_d153_predictor_16x16_ssse3,
+ vpx_highbd_d207_predictor_16x16_ssse3,
+ vpx_highbd_d63_predictor_16x16_ssse3, NULL)
+HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred32, NULL, NULL, NULL, NULL,
+ NULL, NULL, vpx_highbd_d45_predictor_32x32_ssse3,
+ vpx_highbd_d135_predictor_32x32_ssse3,
+ vpx_highbd_d117_predictor_32x32_ssse3,
+ vpx_highbd_d153_predictor_32x32_ssse3,
+ vpx_highbd_d207_predictor_32x32_ssse3,
+ vpx_highbd_d63_predictor_32x32_ssse3, NULL)
+#endif // HAVE_SSSE3
+
#if HAVE_NEON
HIGHBD_INTRA_PRED_TEST(
NEON, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_neon,
diff --git a/libvpx/test/test_libvpx.cc b/libvpx/test/test_libvpx.cc
index 8a70b4e28..30641ae8c 100644
--- a/libvpx/test/test_libvpx.cc
+++ b/libvpx/test/test_libvpx.cc
@@ -53,6 +53,9 @@ int main(int argc, char **argv) {
}
if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*");
if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter(":AVX2.*:AVX2/*");
+ if (!(simd_caps & HAS_AVX512)) {
+ append_negative_gtest_filter(":AVX512.*:AVX512/*");
+ }
#endif // ARCH_X86 || ARCH_X86_64
#if !CONFIG_SHARED
diff --git a/libvpx/test/test_vector_test.cc b/libvpx/test/test_vector_test.cc
index 14c509d5c..1879b3d27 100644
--- a/libvpx/test/test_vector_test.cc
+++ b/libvpx/test/test_vector_test.cc
@@ -28,13 +28,10 @@
namespace {
-enum DecodeMode { kSerialMode, kFrameParallelMode };
+const int kThreads = 0;
+const int kFileName = 1;
-const int kDecodeMode = 0;
-const int kThreads = 1;
-const int kFileName = 2;
-
-typedef std::tr1::tuple<int, int, const char *> DecodeParam;
+typedef std::tr1::tuple<int, const char *> DecodeParam;
class TestVectorTest : public ::libvpx_test::DecoderTest,
public ::libvpx_test::CodecTestWithParam<DecodeParam> {
@@ -53,8 +50,8 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,
void OpenMD5File(const std::string &md5_file_name_) {
md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_);
- ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: "
- << md5_file_name_;
+ ASSERT_TRUE(md5_file_ != NULL)
+ << "Md5 file open failed. Filename: " << md5_file_name_;
}
virtual void DecompressedFrameHook(const vpx_image_t &img,
@@ -92,29 +89,14 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,
TEST_P(TestVectorTest, MD5Match) {
const DecodeParam input = GET_PARAM(1);
const std::string filename = std::tr1::get<kFileName>(input);
- const int threads = std::tr1::get<kThreads>(input);
- const int mode = std::tr1::get<kDecodeMode>(input);
vpx_codec_flags_t flags = 0;
vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
char str[256];
- if (mode == kFrameParallelMode) {
- flags |= VPX_CODEC_USE_FRAME_THREADING;
-#if CONFIG_VP9_DECODER
- // TODO(hkuang): Fix frame parallel decode bug. See issue 1086.
- if (resize_clips_.find(filename) != resize_clips_.end()) {
- printf("Skipping the test file: %s, due to frame parallel decode bug.\n",
- filename.c_str());
- return;
- }
-#endif
- }
-
- cfg.threads = threads;
+ cfg.threads = std::tr1::get<kThreads>(input);
- snprintf(str, sizeof(str) / sizeof(str[0]) - 1,
- "file: %s mode: %s threads: %d", filename.c_str(),
- mode == 0 ? "Serial" : "Parallel", threads);
+ snprintf(str, sizeof(str) / sizeof(str[0]) - 1, "file: %s threads: %d",
+ filename.c_str(), cfg.threads);
SCOPED_TRACE(str);
// Open compressed video file.
@@ -145,13 +127,10 @@ TEST_P(TestVectorTest, MD5Match) {
ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg));
}
-// Test VP8 decode in serial mode with single thread.
-// NOTE: VP8 only support serial mode.
#if CONFIG_VP8_DECODER
VP8_INSTANTIATE_TEST_CASE(
TestVectorTest,
::testing::Combine(
- ::testing::Values(0), // Serial Mode.
::testing::Values(1), // Single thread.
::testing::ValuesIn(libvpx_test::kVP8TestVectors,
libvpx_test::kVP8TestVectors +
@@ -164,33 +143,28 @@ INSTANTIATE_TEST_CASE_P(
::testing::Values(
static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP8)),
::testing::Combine(
- ::testing::Values(0), // Serial Mode.
- ::testing::Range(1, 8), // With 1 ~ 8 threads.
+ ::testing::Range(2, 9), // With 2 ~ 8 threads.
::testing::ValuesIn(libvpx_test::kVP8TestVectors,
libvpx_test::kVP8TestVectors +
libvpx_test::kNumVP8TestVectors))));
#endif // CONFIG_VP8_DECODER
-// Test VP9 decode in serial mode with single thread.
#if CONFIG_VP9_DECODER
VP9_INSTANTIATE_TEST_CASE(
TestVectorTest,
::testing::Combine(
- ::testing::Values(0), // Serial Mode.
::testing::Values(1), // Single thread.
::testing::ValuesIn(libvpx_test::kVP9TestVectors,
libvpx_test::kVP9TestVectors +
libvpx_test::kNumVP9TestVectors)));
-// Test VP9 decode in frame parallel mode with different number of threads.
INSTANTIATE_TEST_CASE_P(
- DISABLED_VP9MultiThreadedFrameParallel, TestVectorTest,
+ VP9MultiThreaded, TestVectorTest,
::testing::Combine(
::testing::Values(
static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
::testing::Combine(
- ::testing::Values(1), // Frame Parallel mode.
::testing::Range(2, 9), // With 2 ~ 8 threads.
::testing::ValuesIn(libvpx_test::kVP9TestVectors,
libvpx_test::kVP9TestVectors +
diff --git a/libvpx/test/test_vectors.cc b/libvpx/test/test_vectors.cc
index def78da28..3ffc3efc4 100644
--- a/libvpx/test/test_vectors.cc
+++ b/libvpx/test/test_vectors.cc
@@ -371,6 +371,7 @@ const char *const kVP9TestVectors[] = {
#endif // CONFIG_VP9_HIGHBITDEPTH
"vp90-2-20-big_superframe-01.webm",
"vp90-2-20-big_superframe-02.webm",
+ "vp90-2-22-svc_1280x720_1.webm",
RESIZE_TEST_VECTORS
};
const char *const kVP9TestVectorsSvc[] = { "vp90-2-22-svc_1280x720_3.ivf" };
diff --git a/libvpx/test/twopass_encoder.sh b/libvpx/test/twopass_encoder.sh
index 7a223f2af..eaeaabdfd 100755
--- a/libvpx/test/twopass_encoder.sh
+++ b/libvpx/test/twopass_encoder.sh
@@ -54,7 +54,10 @@ twopass_encoder_vp9() {
fi
}
-twopass_encoder_tests="twopass_encoder_vp8
- twopass_encoder_vp9"
-run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}"
+if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" != "yes" ]; then
+ twopass_encoder_tests="twopass_encoder_vp8
+ twopass_encoder_vp9"
+
+ run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}"
+fi
diff --git a/libvpx/test/variance_test.cc b/libvpx/test/variance_test.cc
index d607a097d..421024ad8 100644
--- a/libvpx/test/variance_test.cc
+++ b/libvpx/test/variance_test.cc
@@ -492,7 +492,7 @@ void MainTestClass<VarianceFunctionType>::SpeedTest() {
vpx_usec_timer timer;
vpx_usec_timer_start(&timer);
- for (int i = 0; i < 100000000 / block_size(); ++i) {
+ for (int i = 0; i < (1 << 30) / block_size(); ++i) {
const uint32_t variance = params_.func(src_, width(), ref_, width(), &sse);
// Ignore return value.
(void)variance;
@@ -561,46 +561,26 @@ void MainTestClass<FunctionType>::MaxTestSse() {
////////////////////////////////////////////////////////////////////////////////
-using ::std::tr1::get;
-using ::std::tr1::make_tuple;
-using ::std::tr1::tuple;
-
-template <typename SubpelVarianceFunctionType>
+template <typename FunctionType>
class SubpelVarianceTest
- : public ::testing::TestWithParam<
- tuple<int, int, SubpelVarianceFunctionType, int> > {
+ : public ::testing::TestWithParam<TestParams<FunctionType> > {
public:
virtual void SetUp() {
- const tuple<int, int, SubpelVarianceFunctionType, int> &params =
- this->GetParam();
- log2width_ = get<0>(params);
- width_ = 1 << log2width_;
- log2height_ = get<1>(params);
- height_ = 1 << log2height_;
- subpel_variance_ = get<2>(params);
- if (get<3>(params)) {
- bit_depth_ = (vpx_bit_depth_t)get<3>(params);
- use_high_bit_depth_ = true;
- } else {
- bit_depth_ = VPX_BITS_8;
- use_high_bit_depth_ = false;
- }
- mask_ = (1 << bit_depth_) - 1;
+ params_ = this->GetParam();
rnd_.Reset(ACMRandom::DeterministicSeed());
- block_size_ = width_ * height_;
- if (!use_high_bit_depth_) {
- src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
- sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
- ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
+ if (!use_high_bit_depth()) {
+ src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size()));
+ sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size()));
+ ref_ = new uint8_t[block_size() + width() + height() + 1];
#if CONFIG_VP9_HIGHBITDEPTH
} else {
src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
- vpx_memalign(16, block_size_ * sizeof(uint16_t))));
+ vpx_memalign(16, block_size() * sizeof(uint16_t))));
sec_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
- vpx_memalign(16, block_size_ * sizeof(uint16_t))));
- ref_ =
- CONVERT_TO_BYTEPTR(new uint16_t[block_size_ + width_ + height_ + 1]);
+ vpx_memalign(16, block_size() * sizeof(uint16_t))));
+ ref_ = CONVERT_TO_BYTEPTR(
+ new uint16_t[block_size() + width() + height() + 1]);
#endif // CONFIG_VP9_HIGHBITDEPTH
}
ASSERT_TRUE(src_ != NULL);
@@ -609,7 +589,7 @@ class SubpelVarianceTest
}
virtual void TearDown() {
- if (!use_high_bit_depth_) {
+ if (!use_high_bit_depth()) {
vpx_free(src_);
delete[] ref_;
vpx_free(sec_);
@@ -631,42 +611,45 @@ class SubpelVarianceTest
uint8_t *src_;
uint8_t *ref_;
uint8_t *sec_;
- bool use_high_bit_depth_;
- vpx_bit_depth_t bit_depth_;
- int width_, log2width_;
- int height_, log2height_;
- int block_size_, mask_;
- SubpelVarianceFunctionType subpel_variance_;
+ TestParams<FunctionType> params_;
+
+ // Helpers that relay values from params_.
+ bool use_high_bit_depth() const { return params_.use_high_bit_depth; }
+ int byte_shift() const { return params_.bit_depth - 8; }
+ int block_size() const { return params_.block_size; }
+ int width() const { return params_.width; }
+ int height() const { return params_.height; }
+ uint32_t mask() const { return params_.mask; }
};
template <typename SubpelVarianceFunctionType>
void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
for (int x = 0; x < 8; ++x) {
for (int y = 0; y < 8; ++y) {
- if (!use_high_bit_depth_) {
- for (int j = 0; j < block_size_; j++) {
+ if (!use_high_bit_depth()) {
+ for (int j = 0; j < block_size(); j++) {
src_[j] = rnd_.Rand8();
}
- for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+ for (int j = 0; j < block_size() + width() + height() + 1; j++) {
ref_[j] = rnd_.Rand8();
}
#if CONFIG_VP9_HIGHBITDEPTH
} else {
- for (int j = 0; j < block_size_; j++) {
- CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
+ for (int j = 0; j < block_size(); j++) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
}
- for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
- CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
+ for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+ CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
}
#endif // CONFIG_VP9_HIGHBITDEPTH
}
unsigned int sse1, sse2;
unsigned int var1;
ASM_REGISTER_STATE_CHECK(
- var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1));
- const unsigned int var2 =
- subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2,
- use_high_bit_depth_, bit_depth_);
+ var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1));
+ const unsigned int var2 = subpel_variance_ref(
+ ref_, src_, params_.log2width, params_.log2height, x, y, &sse2,
+ use_high_bit_depth(), params_.bit_depth);
EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
}
@@ -680,28 +663,28 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() {
// Ref: Set the first half of values to the maximum, the second half to 0.
for (int x = 0; x < 8; ++x) {
for (int y = 0; y < 8; ++y) {
- const int half = block_size_ / 2;
- if (!use_high_bit_depth_) {
+ const int half = block_size() / 2;
+ if (!use_high_bit_depth()) {
memset(src_, 0, half);
memset(src_ + half, 255, half);
memset(ref_, 255, half);
- memset(ref_ + half, 0, half + width_ + height_ + 1);
+ memset(ref_ + half, 0, half + width() + height() + 1);
#if CONFIG_VP9_HIGHBITDEPTH
} else {
- vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask_, half);
+ vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask(), half);
vpx_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half);
vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half);
- vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask_,
- half + width_ + height_ + 1);
+ vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask(),
+ half + width() + height() + 1);
#endif // CONFIG_VP9_HIGHBITDEPTH
}
unsigned int sse1, sse2;
unsigned int var1;
ASM_REGISTER_STATE_CHECK(
- var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1));
- const unsigned int var2 =
- subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2,
- use_high_bit_depth_, bit_depth_);
+ var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1));
+ const unsigned int var2 = subpel_variance_ref(
+ ref_, src_, params_.log2width, params_.log2height, x, y, &sse2,
+ use_high_bit_depth(), params_.bit_depth);
EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y;
EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y;
}
@@ -712,33 +695,32 @@ template <>
void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
for (int x = 0; x < 8; ++x) {
for (int y = 0; y < 8; ++y) {
- if (!use_high_bit_depth_) {
- for (int j = 0; j < block_size_; j++) {
+ if (!use_high_bit_depth()) {
+ for (int j = 0; j < block_size(); j++) {
src_[j] = rnd_.Rand8();
sec_[j] = rnd_.Rand8();
}
- for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+ for (int j = 0; j < block_size() + width() + height() + 1; j++) {
ref_[j] = rnd_.Rand8();
}
#if CONFIG_VP9_HIGHBITDEPTH
} else {
- for (int j = 0; j < block_size_; j++) {
- CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
- CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask_;
+ for (int j = 0; j < block_size(); j++) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+ CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask();
}
- for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
- CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
+ for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+ CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
}
#endif // CONFIG_VP9_HIGHBITDEPTH
}
uint32_t sse1, sse2;
uint32_t var1, var2;
- ASM_REGISTER_STATE_CHECK(var1 =
- subpel_variance_(ref_, width_ + 1, x, y,
- src_, width_, &sse1, sec_));
- var2 = subpel_avg_variance_ref(ref_, src_, sec_, log2width_, log2height_,
- x, y, &sse2, use_high_bit_depth_,
- static_cast<vpx_bit_depth_t>(bit_depth_));
+ ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 1, x, y,
+ src_, width(), &sse1, sec_));
+ var2 = subpel_avg_variance_ref(ref_, src_, sec_, params_.log2width,
+ params_.log2height, x, y, &sse2,
+ use_high_bit_depth(), params_.bit_depth);
EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
}
@@ -798,37 +780,41 @@ INSTANTIATE_TEST_CASE_P(
VarianceParams(2, 3, &vpx_variance4x8_c),
VarianceParams(2, 2, &vpx_variance4x4_c)));
+typedef TestParams<SubpixVarMxNFunc> SubpelVarianceParams;
INSTANTIATE_TEST_CASE_P(
C, VpxSubpelVarianceTest,
- ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_c, 0),
- make_tuple(6, 5, &vpx_sub_pixel_variance64x32_c, 0),
- make_tuple(5, 6, &vpx_sub_pixel_variance32x64_c, 0),
- make_tuple(5, 5, &vpx_sub_pixel_variance32x32_c, 0),
- make_tuple(5, 4, &vpx_sub_pixel_variance32x16_c, 0),
- make_tuple(4, 5, &vpx_sub_pixel_variance16x32_c, 0),
- make_tuple(4, 4, &vpx_sub_pixel_variance16x16_c, 0),
- make_tuple(4, 3, &vpx_sub_pixel_variance16x8_c, 0),
- make_tuple(3, 4, &vpx_sub_pixel_variance8x16_c, 0),
- make_tuple(3, 3, &vpx_sub_pixel_variance8x8_c, 0),
- make_tuple(3, 2, &vpx_sub_pixel_variance8x4_c, 0),
- make_tuple(2, 3, &vpx_sub_pixel_variance4x8_c, 0),
- make_tuple(2, 2, &vpx_sub_pixel_variance4x4_c, 0)));
-
+ ::testing::Values(
+ SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_c, 0),
+ SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_c, 0),
+ SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_c, 0),
+ SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_c, 0),
+ SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_c, 0),
+ SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_c, 0),
+ SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_c, 0),
+ SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_c, 0),
+ SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_c, 0),
+ SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_c, 0),
+ SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_c, 0),
+ SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_c, 0),
+ SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_c, 0)));
+
+typedef TestParams<SubpixAvgVarMxNFunc> SubpelAvgVarianceParams;
INSTANTIATE_TEST_CASE_P(
C, VpxSubpelAvgVarianceTest,
- ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0),
- make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0),
- make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0),
- make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0),
- make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0),
- make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0),
- make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0),
- make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0),
- make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0),
- make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0),
- make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0),
- make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0),
- make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0)));
+ ::testing::Values(
+ SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0),
+ SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0),
+ SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0),
+ SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0),
+ SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0),
+ SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0),
+ SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0),
+ SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0),
+ SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0),
+ SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0),
+ SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0),
+ SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0),
+ SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0)));
#if CONFIG_VP9_HIGHBITDEPTH
typedef MainTestClass<VarianceMxNFunc> VpxHBDMseTest;
@@ -850,18 +836,18 @@ TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
/* TODO(debargha): This test does not support the highbd version
INSTANTIATE_TEST_CASE_P(
C, VpxHBDMseTest,
- ::testing::Values(make_tuple(4, 4, &vpx_highbd_12_mse16x16_c),
- make_tuple(4, 4, &vpx_highbd_12_mse16x8_c),
- make_tuple(4, 4, &vpx_highbd_12_mse8x16_c),
- make_tuple(4, 4, &vpx_highbd_12_mse8x8_c),
- make_tuple(4, 4, &vpx_highbd_10_mse16x16_c),
- make_tuple(4, 4, &vpx_highbd_10_mse16x8_c),
- make_tuple(4, 4, &vpx_highbd_10_mse8x16_c),
- make_tuple(4, 4, &vpx_highbd_10_mse8x8_c),
- make_tuple(4, 4, &vpx_highbd_8_mse16x16_c),
- make_tuple(4, 4, &vpx_highbd_8_mse16x8_c),
- make_tuple(4, 4, &vpx_highbd_8_mse8x16_c),
- make_tuple(4, 4, &vpx_highbd_8_mse8x8_c)));
+ ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c),
+ MseParams(4, 4, &vpx_highbd_12_mse16x8_c),
+ MseParams(4, 4, &vpx_highbd_12_mse8x16_c),
+ MseParams(4, 4, &vpx_highbd_12_mse8x8_c),
+ MseParams(4, 4, &vpx_highbd_10_mse16x16_c),
+ MseParams(4, 4, &vpx_highbd_10_mse16x8_c),
+ MseParams(4, 4, &vpx_highbd_10_mse8x16_c),
+ MseParams(4, 4, &vpx_highbd_10_mse8x8_c),
+ MseParams(4, 4, &vpx_highbd_8_mse16x16_c),
+ MseParams(4, 4, &vpx_highbd_8_mse16x8_c),
+ MseParams(4, 4, &vpx_highbd_8_mse8x16_c),
+ MseParams(4, 4, &vpx_highbd_8_mse8x8_c)));
*/
INSTANTIATE_TEST_CASE_P(
@@ -909,88 +895,161 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
C, VpxHBDSubpelVarianceTest,
::testing::Values(
- make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8),
- make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8),
- make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8),
- make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8),
- make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8),
- make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8),
- make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8),
- make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8),
- make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8),
- make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8),
- make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8),
- make_tuple(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8),
- make_tuple(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8),
- make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c, 10),
- make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c, 10),
- make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c, 10),
- make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c, 10),
- make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c, 10),
- make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c, 10),
- make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_c, 10),
- make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10),
- make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10),
- make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10),
- make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10),
- make_tuple(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10),
- make_tuple(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10),
- make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c, 12),
- make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c, 12),
- make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c, 12),
- make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c, 12),
- make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c, 12),
- make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c, 12),
- make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c, 12),
- make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12),
- make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12),
- make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12),
- make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12),
- make_tuple(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12),
- make_tuple(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, 12)));
+ SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8),
+ SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8),
+ SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8),
+ SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8),
+ SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8),
+ SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8),
+ SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8),
+ SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8),
+ SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8),
+ SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8),
+ SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8),
+ SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8),
+ SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8),
+ SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c,
+ 10),
+ SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c,
+ 10),
+ SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c,
+ 10),
+ SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c,
+ 10),
+ SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c,
+ 10),
+ SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c,
+ 10),
+ SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_c,
+ 10),
+ SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10),
+ SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10),
+ SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10),
+ SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10),
+ SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10),
+ SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10),
+ SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c,
+ 12),
+ SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c,
+ 12),
+ SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c,
+ 12),
+ SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c,
+ 12),
+ SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c,
+ 12),
+ SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c,
+ 12),
+ SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c,
+ 12),
+ SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12),
+ SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12),
+ SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12),
+ SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12),
+ SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12),
+ SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c,
+ 12)));
INSTANTIATE_TEST_CASE_P(
C, VpxHBDSubpelAvgVarianceTest,
::testing::Values(
- make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8),
- make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8),
- make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8),
- make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8),
- make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8),
- make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8),
- make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8),
- make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8),
- make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8),
- make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c, 8),
- make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c, 8),
- make_tuple(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c, 8),
- make_tuple(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c, 8),
- make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_c, 10),
- make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_c, 10),
- make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_c, 10),
- make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_c, 10),
- make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_c, 10),
- make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_c, 10),
- make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_c, 10),
- make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_c, 10),
- make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_c, 10),
- make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10),
- make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10),
- make_tuple(2, 3, &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10),
- make_tuple(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10),
- make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_c, 12),
- make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_c, 12),
- make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_c, 12),
- make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_c, 12),
- make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_c, 12),
- make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_c, 12),
- make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_c, 12),
- make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_c, 12),
- make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_c, 12),
- make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12),
- make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12),
- make_tuple(2, 3, &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12),
- make_tuple(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_c, 12)));
+ SubpelAvgVarianceParams(6, 6,
+ &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8),
+ SubpelAvgVarianceParams(6, 5,
+ &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8),
+ SubpelAvgVarianceParams(5, 6,
+ &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8),
+ SubpelAvgVarianceParams(5, 5,
+ &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8),
+ SubpelAvgVarianceParams(5, 4,
+ &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8),
+ SubpelAvgVarianceParams(4, 5,
+ &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8),
+ SubpelAvgVarianceParams(4, 4,
+ &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8),
+ SubpelAvgVarianceParams(4, 3,
+ &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8),
+ SubpelAvgVarianceParams(3, 4,
+ &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8),
+ SubpelAvgVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c,
+ 8),
+ SubpelAvgVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c,
+ 8),
+ SubpelAvgVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c,
+ 8),
+ SubpelAvgVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c,
+ 8),
+ SubpelAvgVarianceParams(6, 6,
+ &vpx_highbd_10_sub_pixel_avg_variance64x64_c,
+ 10),
+ SubpelAvgVarianceParams(6, 5,
+ &vpx_highbd_10_sub_pixel_avg_variance64x32_c,
+ 10),
+ SubpelAvgVarianceParams(5, 6,
+ &vpx_highbd_10_sub_pixel_avg_variance32x64_c,
+ 10),
+ SubpelAvgVarianceParams(5, 5,
+ &vpx_highbd_10_sub_pixel_avg_variance32x32_c,
+ 10),
+ SubpelAvgVarianceParams(5, 4,
+ &vpx_highbd_10_sub_pixel_avg_variance32x16_c,
+ 10),
+ SubpelAvgVarianceParams(4, 5,
+ &vpx_highbd_10_sub_pixel_avg_variance16x32_c,
+ 10),
+ SubpelAvgVarianceParams(4, 4,
+ &vpx_highbd_10_sub_pixel_avg_variance16x16_c,
+ 10),
+ SubpelAvgVarianceParams(4, 3,
+ &vpx_highbd_10_sub_pixel_avg_variance16x8_c,
+ 10),
+ SubpelAvgVarianceParams(3, 4,
+ &vpx_highbd_10_sub_pixel_avg_variance8x16_c,
+ 10),
+ SubpelAvgVarianceParams(3, 3,
+ &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10),
+ SubpelAvgVarianceParams(3, 2,
+ &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10),
+ SubpelAvgVarianceParams(2, 3,
+ &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10),
+ SubpelAvgVarianceParams(2, 2,
+ &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10),
+ SubpelAvgVarianceParams(6, 6,
+ &vpx_highbd_12_sub_pixel_avg_variance64x64_c,
+ 12),
+ SubpelAvgVarianceParams(6, 5,
+ &vpx_highbd_12_sub_pixel_avg_variance64x32_c,
+ 12),
+ SubpelAvgVarianceParams(5, 6,
+ &vpx_highbd_12_sub_pixel_avg_variance32x64_c,
+ 12),
+ SubpelAvgVarianceParams(5, 5,
+ &vpx_highbd_12_sub_pixel_avg_variance32x32_c,
+ 12),
+ SubpelAvgVarianceParams(5, 4,
+ &vpx_highbd_12_sub_pixel_avg_variance32x16_c,
+ 12),
+ SubpelAvgVarianceParams(4, 5,
+ &vpx_highbd_12_sub_pixel_avg_variance16x32_c,
+ 12),
+ SubpelAvgVarianceParams(4, 4,
+ &vpx_highbd_12_sub_pixel_avg_variance16x16_c,
+ 12),
+ SubpelAvgVarianceParams(4, 3,
+ &vpx_highbd_12_sub_pixel_avg_variance16x8_c,
+ 12),
+ SubpelAvgVarianceParams(3, 4,
+ &vpx_highbd_12_sub_pixel_avg_variance8x16_c,
+ 12),
+ SubpelAvgVarianceParams(3, 3,
+ &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12),
+ SubpelAvgVarianceParams(3, 2,
+ &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12),
+ SubpelAvgVarianceParams(2, 3,
+ &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12),
+ SubpelAvgVarianceParams(2, 2,
+ &vpx_highbd_12_sub_pixel_avg_variance4x4_c,
+ 12)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#if HAVE_SSE2
@@ -1021,36 +1080,37 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
SSE2, VpxSubpelVarianceTest,
- ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0),
- make_tuple(6, 5, &vpx_sub_pixel_variance64x32_sse2, 0),
- make_tuple(5, 6, &vpx_sub_pixel_variance32x64_sse2, 0),
- make_tuple(5, 5, &vpx_sub_pixel_variance32x32_sse2, 0),
- make_tuple(5, 4, &vpx_sub_pixel_variance32x16_sse2, 0),
- make_tuple(4, 5, &vpx_sub_pixel_variance16x32_sse2, 0),
- make_tuple(4, 4, &vpx_sub_pixel_variance16x16_sse2, 0),
- make_tuple(4, 3, &vpx_sub_pixel_variance16x8_sse2, 0),
- make_tuple(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0),
- make_tuple(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0),
- make_tuple(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0),
- make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0),
- make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0)));
+ ::testing::Values(
+ SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0),
+ SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_sse2, 0),
+ SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_sse2, 0),
+ SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_sse2, 0),
+ SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_sse2, 0),
+ SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_sse2, 0),
+ SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_sse2, 0),
+ SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_sse2, 0),
+ SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0),
+ SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0),
+ SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0),
+ SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0),
+ SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0)));
INSTANTIATE_TEST_CASE_P(
SSE2, VpxSubpelAvgVarianceTest,
::testing::Values(
- make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0),
- make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_sse2, 0),
- make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_sse2, 0),
- make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_sse2, 0),
- make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_sse2, 0),
- make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_sse2, 0),
- make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_sse2, 0),
- make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_sse2, 0),
- make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0),
- make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0),
- make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0),
- make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0),
- make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
+ SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0),
+ SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_sse2, 0),
+ SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_sse2, 0),
+ SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_sse2, 0),
+ SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_sse2, 0),
+ SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_sse2, 0),
+ SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_sse2, 0),
+ SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_sse2, 0),
+ SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0),
+ SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0),
+ SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0),
+ SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0),
+ SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
#if CONFIG_VP9_HIGHBITDEPTH
/* TODO(debargha): This test does not support the highbd version
@@ -1107,112 +1167,219 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
SSE2, VpxHBDSubpelVarianceTest,
::testing::Values(
- make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2, 12),
- make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_sse2, 12),
- make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_sse2, 12),
- make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_sse2, 12),
- make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_sse2, 12),
- make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_sse2, 12),
- make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_sse2, 12),
- make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_sse2, 12),
- make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_sse2, 12),
- make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_sse2, 12),
- make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_sse2, 12),
- make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_sse2, 10),
- make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_sse2, 10),
- make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_sse2, 10),
- make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_sse2, 10),
- make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_sse2, 10),
- make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_sse2, 10),
- make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_sse2, 10),
- make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_sse2, 10),
- make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_sse2, 10),
- make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_sse2, 10),
- make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_sse2, 10),
- make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_sse2, 8),
- make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_sse2, 8),
- make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_sse2, 8),
- make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_sse2, 8),
- make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_sse2, 8),
- make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_sse2, 8),
- make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_sse2, 8),
- make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_sse2, 8),
- make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_sse2, 8),
- make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_sse2, 8),
- make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2, 8)));
+ SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2,
+ 12),
+ SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_sse2,
+ 12),
+ SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_sse2,
+ 12),
+ SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_sse2,
+ 12),
+ SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_sse2,
+ 12),
+ SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_sse2,
+ 12),
+ SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_sse2,
+ 12),
+ SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_sse2,
+ 12),
+ SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_sse2,
+ 12),
+ SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_sse2,
+ 12),
+ SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_sse2,
+ 12),
+ SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_sse2,
+ 10),
+ SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_sse2,
+ 10),
+ SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_sse2,
+ 10),
+ SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_sse2,
+ 10),
+ SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_sse2,
+ 10),
+ SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_sse2,
+ 10),
+ SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_sse2,
+ 10),
+ SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_sse2,
+ 10),
+ SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_sse2,
+ 10),
+ SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_sse2,
+ 10),
+ SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_sse2,
+ 10),
+ SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_sse2,
+ 8),
+ SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_sse2,
+ 8),
+ SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_sse2,
+ 8),
+ SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_sse2,
+ 8),
+ SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_sse2,
+ 8),
+ SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_sse2,
+ 8),
+ SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_sse2,
+ 8),
+ SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_sse2,
+ 8),
+ SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_sse2,
+ 8),
+ SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_sse2, 8),
+ SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2,
+ 8)));
INSTANTIATE_TEST_CASE_P(
SSE2, VpxHBDSubpelAvgVarianceTest,
::testing::Values(
- make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_sse2, 12),
- make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_sse2, 12),
- make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_sse2, 12),
- make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_sse2, 12),
- make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_sse2, 12),
- make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_sse2, 12),
- make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_sse2, 12),
- make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_sse2, 12),
- make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_sse2, 12),
- make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_sse2, 12),
- make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_sse2, 12),
- make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_sse2, 10),
- make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_sse2, 10),
- make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_sse2, 10),
- make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_sse2, 10),
- make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_sse2, 10),
- make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_sse2, 10),
- make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_sse2, 10),
- make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_sse2, 10),
- make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_sse2, 10),
- make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_sse2, 10),
- make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_sse2, 10),
- make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_sse2, 8),
- make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_sse2, 8),
- make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_sse2, 8),
- make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_sse2, 8),
- make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_sse2, 8),
- make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_sse2, 8),
- make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_sse2, 8),
- make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_sse2, 8),
- make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2, 8),
- make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2, 8),
- make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2, 8)));
+ SubpelAvgVarianceParams(6, 6,
+ &vpx_highbd_12_sub_pixel_avg_variance64x64_sse2,
+ 12),
+ SubpelAvgVarianceParams(6, 5,
+ &vpx_highbd_12_sub_pixel_avg_variance64x32_sse2,
+ 12),
+ SubpelAvgVarianceParams(5, 6,
+ &vpx_highbd_12_sub_pixel_avg_variance32x64_sse2,
+ 12),
+ SubpelAvgVarianceParams(5, 5,
+ &vpx_highbd_12_sub_pixel_avg_variance32x32_sse2,
+ 12),
+ SubpelAvgVarianceParams(5, 4,
+ &vpx_highbd_12_sub_pixel_avg_variance32x16_sse2,
+ 12),
+ SubpelAvgVarianceParams(4, 5,
+ &vpx_highbd_12_sub_pixel_avg_variance16x32_sse2,
+ 12),
+ SubpelAvgVarianceParams(4, 4,
+ &vpx_highbd_12_sub_pixel_avg_variance16x16_sse2,
+ 12),
+ SubpelAvgVarianceParams(4, 3,
+ &vpx_highbd_12_sub_pixel_avg_variance16x8_sse2,
+ 12),
+ SubpelAvgVarianceParams(3, 4,
+ &vpx_highbd_12_sub_pixel_avg_variance8x16_sse2,
+ 12),
+ SubpelAvgVarianceParams(3, 3,
+ &vpx_highbd_12_sub_pixel_avg_variance8x8_sse2,
+ 12),
+ SubpelAvgVarianceParams(3, 2,
+ &vpx_highbd_12_sub_pixel_avg_variance8x4_sse2,
+ 12),
+ SubpelAvgVarianceParams(6, 6,
+ &vpx_highbd_10_sub_pixel_avg_variance64x64_sse2,
+ 10),
+ SubpelAvgVarianceParams(6, 5,
+ &vpx_highbd_10_sub_pixel_avg_variance64x32_sse2,
+ 10),
+ SubpelAvgVarianceParams(5, 6,
+ &vpx_highbd_10_sub_pixel_avg_variance32x64_sse2,
+ 10),
+ SubpelAvgVarianceParams(5, 5,
+ &vpx_highbd_10_sub_pixel_avg_variance32x32_sse2,
+ 10),
+ SubpelAvgVarianceParams(5, 4,
+ &vpx_highbd_10_sub_pixel_avg_variance32x16_sse2,
+ 10),
+ SubpelAvgVarianceParams(4, 5,
+ &vpx_highbd_10_sub_pixel_avg_variance16x32_sse2,
+ 10),
+ SubpelAvgVarianceParams(4, 4,
+ &vpx_highbd_10_sub_pixel_avg_variance16x16_sse2,
+ 10),
+ SubpelAvgVarianceParams(4, 3,
+ &vpx_highbd_10_sub_pixel_avg_variance16x8_sse2,
+ 10),
+ SubpelAvgVarianceParams(3, 4,
+ &vpx_highbd_10_sub_pixel_avg_variance8x16_sse2,
+ 10),
+ SubpelAvgVarianceParams(3, 3,
+ &vpx_highbd_10_sub_pixel_avg_variance8x8_sse2,
+ 10),
+ SubpelAvgVarianceParams(3, 2,
+ &vpx_highbd_10_sub_pixel_avg_variance8x4_sse2,
+ 10),
+ SubpelAvgVarianceParams(6, 6,
+ &vpx_highbd_8_sub_pixel_avg_variance64x64_sse2,
+ 8),
+ SubpelAvgVarianceParams(6, 5,
+ &vpx_highbd_8_sub_pixel_avg_variance64x32_sse2,
+ 8),
+ SubpelAvgVarianceParams(5, 6,
+ &vpx_highbd_8_sub_pixel_avg_variance32x64_sse2,
+ 8),
+ SubpelAvgVarianceParams(5, 5,
+ &vpx_highbd_8_sub_pixel_avg_variance32x32_sse2,
+ 8),
+ SubpelAvgVarianceParams(5, 4,
+ &vpx_highbd_8_sub_pixel_avg_variance32x16_sse2,
+ 8),
+ SubpelAvgVarianceParams(4, 5,
+ &vpx_highbd_8_sub_pixel_avg_variance16x32_sse2,
+ 8),
+ SubpelAvgVarianceParams(4, 4,
+ &vpx_highbd_8_sub_pixel_avg_variance16x16_sse2,
+ 8),
+ SubpelAvgVarianceParams(4, 3,
+ &vpx_highbd_8_sub_pixel_avg_variance16x8_sse2,
+ 8),
+ SubpelAvgVarianceParams(3, 4,
+ &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2,
+ 8),
+ SubpelAvgVarianceParams(3, 3,
+ &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2,
+ 8),
+ SubpelAvgVarianceParams(3, 2,
+ &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2,
+ 8)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_SSE2
#if HAVE_SSSE3
INSTANTIATE_TEST_CASE_P(
SSSE3, VpxSubpelVarianceTest,
- ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0),
- make_tuple(6, 5, &vpx_sub_pixel_variance64x32_ssse3, 0),
- make_tuple(5, 6, &vpx_sub_pixel_variance32x64_ssse3, 0),
- make_tuple(5, 5, &vpx_sub_pixel_variance32x32_ssse3, 0),
- make_tuple(5, 4, &vpx_sub_pixel_variance32x16_ssse3, 0),
- make_tuple(4, 5, &vpx_sub_pixel_variance16x32_ssse3, 0),
- make_tuple(4, 4, &vpx_sub_pixel_variance16x16_ssse3, 0),
- make_tuple(4, 3, &vpx_sub_pixel_variance16x8_ssse3, 0),
- make_tuple(3, 4, &vpx_sub_pixel_variance8x16_ssse3, 0),
- make_tuple(3, 3, &vpx_sub_pixel_variance8x8_ssse3, 0),
- make_tuple(3, 2, &vpx_sub_pixel_variance8x4_ssse3, 0),
- make_tuple(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0),
- make_tuple(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0)));
+ ::testing::Values(
+ SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0),
+ SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_ssse3, 0),
+ SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_ssse3, 0),
+ SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_ssse3, 0),
+ SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_ssse3, 0),
+ SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_ssse3, 0),
+ SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_ssse3, 0),
+ SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_ssse3, 0),
+ SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_ssse3, 0),
+ SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_ssse3, 0),
+ SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_ssse3, 0),
+ SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0),
+ SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0)));
INSTANTIATE_TEST_CASE_P(
SSSE3, VpxSubpelAvgVarianceTest,
::testing::Values(
- make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3, 0),
- make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_ssse3, 0),
- make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_ssse3, 0),
- make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_ssse3, 0),
- make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_ssse3, 0),
- make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_ssse3, 0),
- make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_ssse3, 0),
- make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_ssse3, 0),
- make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_ssse3, 0),
- make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_ssse3, 0),
- make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0),
- make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0),
- make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3, 0)));
+ SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3,
+ 0),
+ SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_ssse3,
+ 0),
+ SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_ssse3,
+ 0),
+ SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_ssse3,
+ 0),
+ SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_ssse3,
+ 0),
+ SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_ssse3,
+ 0),
+ SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_ssse3,
+ 0),
+ SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_ssse3, 0),
+ SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_ssse3, 0),
+ SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_ssse3, 0),
+ SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0),
+ SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0),
+ SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3,
+ 0)));
#endif // HAVE_SSSE3
#if HAVE_AVX2
@@ -1229,14 +1396,16 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
AVX2, VpxSubpelVarianceTest,
- ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0),
- make_tuple(5, 5, &vpx_sub_pixel_variance32x32_avx2, 0)));
+ ::testing::Values(
+ SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0),
+ SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_avx2, 0)));
INSTANTIATE_TEST_CASE_P(
AVX2, VpxSubpelAvgVarianceTest,
::testing::Values(
- make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0),
- make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2, 0)));
+ SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0),
+ SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2,
+ 0)));
#endif // HAVE_AVX2
#if HAVE_NEON
@@ -1265,17 +1434,37 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
NEON, VpxSubpelVarianceTest,
- ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_neon, 0),
- make_tuple(6, 5, &vpx_sub_pixel_variance64x32_neon, 0),
- make_tuple(5, 6, &vpx_sub_pixel_variance32x64_neon, 0),
- make_tuple(5, 5, &vpx_sub_pixel_variance32x32_neon, 0),
- make_tuple(5, 4, &vpx_sub_pixel_variance32x16_neon, 0),
- make_tuple(4, 5, &vpx_sub_pixel_variance16x32_neon, 0),
- make_tuple(4, 4, &vpx_sub_pixel_variance16x16_neon, 0),
- make_tuple(4, 3, &vpx_sub_pixel_variance16x8_neon, 0),
- make_tuple(3, 4, &vpx_sub_pixel_variance8x16_neon, 0),
- make_tuple(3, 3, &vpx_sub_pixel_variance8x8_neon, 0),
- make_tuple(3, 2, &vpx_sub_pixel_variance8x4_neon, 0)));
+ ::testing::Values(
+ SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_neon, 0),
+ SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_neon, 0),
+ SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_neon, 0),
+ SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_neon, 0),
+ SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_neon, 0),
+ SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_neon, 0),
+ SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_neon, 0),
+ SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_neon, 0),
+ SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_neon, 0),
+ SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_neon, 0),
+ SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_neon, 0),
+ SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_neon, 0),
+ SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_neon, 0)));
+
+INSTANTIATE_TEST_CASE_P(
+ NEON, VpxSubpelAvgVarianceTest,
+ ::testing::Values(
+ SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_neon, 0),
+ SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_neon, 0),
+ SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_neon, 0),
+ SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_neon, 0),
+ SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_neon, 0),
+ SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_neon, 0),
+ SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_neon, 0),
+ SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_neon, 0),
+ SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_neon, 0),
+ SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_neon, 0),
+ SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_neon, 0),
+ SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_neon, 0),
+ SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0)));
#endif // HAVE_NEON
#if HAVE_MSA
@@ -1310,35 +1499,37 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
MSA, VpxSubpelVarianceTest,
- ::testing::Values(make_tuple(2, 2, &vpx_sub_pixel_variance4x4_msa, 0),
- make_tuple(2, 3, &vpx_sub_pixel_variance4x8_msa, 0),
- make_tuple(3, 2, &vpx_sub_pixel_variance8x4_msa, 0),
- make_tuple(3, 3, &vpx_sub_pixel_variance8x8_msa, 0),
- make_tuple(3, 4, &vpx_sub_pixel_variance8x16_msa, 0),
- make_tuple(4, 3, &vpx_sub_pixel_variance16x8_msa, 0),
- make_tuple(4, 4, &vpx_sub_pixel_variance16x16_msa, 0),
- make_tuple(4, 5, &vpx_sub_pixel_variance16x32_msa, 0),
- make_tuple(5, 4, &vpx_sub_pixel_variance32x16_msa, 0),
- make_tuple(5, 5, &vpx_sub_pixel_variance32x32_msa, 0),
- make_tuple(5, 6, &vpx_sub_pixel_variance32x64_msa, 0),
- make_tuple(6, 5, &vpx_sub_pixel_variance64x32_msa, 0),
- make_tuple(6, 6, &vpx_sub_pixel_variance64x64_msa, 0)));
+ ::testing::Values(
+ SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_msa, 0),
+ SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_msa, 0),
+ SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_msa, 0),
+ SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_msa, 0),
+ SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_msa, 0),
+ SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_msa, 0),
+ SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_msa, 0),
+ SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_msa, 0),
+ SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_msa, 0),
+ SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_msa, 0),
+ SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_msa, 0),
+ SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_msa, 0),
+ SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_msa, 0)));
INSTANTIATE_TEST_CASE_P(
MSA, VpxSubpelAvgVarianceTest,
- ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0),
- make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_msa, 0),
- make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_msa, 0),
- make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_msa, 0),
- make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_msa, 0),
- make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_msa, 0),
- make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_msa, 0),
- make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_msa, 0),
- make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_msa, 0),
- make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_msa, 0),
- make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_msa, 0),
- make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_msa, 0),
- make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_msa, 0)));
+ ::testing::Values(
+ SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0),
+ SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_msa, 0),
+ SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_msa, 0),
+ SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_msa, 0),
+ SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_msa, 0),
+ SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_msa, 0),
+ SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_msa, 0),
+ SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_msa, 0),
+ SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_msa, 0),
+ SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_msa, 0),
+ SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_msa, 0),
+ SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_msa, 0),
+ SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_msa, 0)));
#endif // HAVE_MSA
#if HAVE_VSX
@@ -1349,4 +1540,62 @@ INSTANTIATE_TEST_CASE_P(VSX, VpxSseTest,
::testing::Values(SseParams(2, 2,
&vpx_get4x4sse_cs_vsx)));
#endif // HAVE_VSX
+
+#if HAVE_MMI
+INSTANTIATE_TEST_CASE_P(MMI, VpxMseTest,
+ ::testing::Values(MseParams(4, 4, &vpx_mse16x16_mmi),
+ MseParams(4, 3, &vpx_mse16x8_mmi),
+ MseParams(3, 4, &vpx_mse8x16_mmi),
+ MseParams(3, 3, &vpx_mse8x8_mmi)));
+
+INSTANTIATE_TEST_CASE_P(
+ MMI, VpxVarianceTest,
+ ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_mmi),
+ VarianceParams(6, 5, &vpx_variance64x32_mmi),
+ VarianceParams(5, 6, &vpx_variance32x64_mmi),
+ VarianceParams(5, 5, &vpx_variance32x32_mmi),
+ VarianceParams(5, 4, &vpx_variance32x16_mmi),
+ VarianceParams(4, 5, &vpx_variance16x32_mmi),
+ VarianceParams(4, 4, &vpx_variance16x16_mmi),
+ VarianceParams(4, 3, &vpx_variance16x8_mmi),
+ VarianceParams(3, 4, &vpx_variance8x16_mmi),
+ VarianceParams(3, 3, &vpx_variance8x8_mmi),
+ VarianceParams(3, 2, &vpx_variance8x4_mmi),
+ VarianceParams(2, 3, &vpx_variance4x8_mmi),
+ VarianceParams(2, 2, &vpx_variance4x4_mmi)));
+
+INSTANTIATE_TEST_CASE_P(
+ MMI, VpxSubpelVarianceTest,
+ ::testing::Values(
+ SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_mmi, 0),
+ SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_mmi, 0),
+ SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_mmi, 0),
+ SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_mmi, 0),
+ SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_mmi, 0),
+ SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_mmi, 0),
+ SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_mmi, 0),
+ SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_mmi, 0),
+ SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_mmi, 0),
+ SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_mmi, 0),
+ SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_mmi, 0),
+ SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_mmi, 0),
+ SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_mmi, 0)));
+
+INSTANTIATE_TEST_CASE_P(
+ MMI, VpxSubpelAvgVarianceTest,
+ ::testing::Values(
+ SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_mmi, 0),
+ SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_mmi, 0),
+ SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_mmi, 0),
+ SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_mmi, 0),
+ SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_mmi, 0),
+ SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_mmi, 0),
+ SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_mmi, 0),
+ SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_mmi, 0),
+ SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_mmi, 0),
+ SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_mmi, 0),
+ SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_mmi, 0),
+ SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_mmi, 0),
+ SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_mmi, 0)));
+#endif // HAVE_MMI
} // namespace
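
The hunks above swap bare make_tuple() entries for named parameter types, so
each row in the long INSTANTIATE_TEST_CASE_P() lists is self-describing. A
minimal sketch of that pattern, with assumed names and an assumed function
signature (the real typedefs live in variance_test.cc, outside this hunk):

    // Sketch only: SubpelVarianceFunc's exact signature is assumed here.
    #include <cstdint>

    typedef uint32_t (*SubpelVarianceFunc)(const uint8_t *src, int src_stride,
                                           int x_offset, int y_offset,
                                           const uint8_t *ref, int ref_stride,
                                           uint32_t *sse);

    // Named wrapper over (log2 width, log2 height, function, bit depth);
    // reads far better in ::testing::Values() lists than a raw tuple.
    struct SubpelVarianceParams {
      SubpelVarianceParams(int l2w, int l2h, SubpelVarianceFunc fn, int bd)
          : log2width(l2w), log2height(l2h), func(fn), bit_depth(bd) {}
      int log2width;
      int log2height;
      SubpelVarianceFunc func;
      int bit_depth;
    };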
diff --git a/libvpx/test/vp8_fdct4x4_test.cc b/libvpx/test/vp8_fdct4x4_test.cc
index 9f69ae164..b7697d859 100644
--- a/libvpx/test/vp8_fdct4x4_test.cc
+++ b/libvpx/test/vp8_fdct4x4_test.cc
@@ -199,4 +199,8 @@ INSTANTIATE_TEST_CASE_P(SSE2, FdctTest,
INSTANTIATE_TEST_CASE_P(MSA, FdctTest,
::testing::Values(vp8_short_fdct4x4_msa));
#endif // HAVE_MSA
+#if HAVE_MMI
+INSTANTIATE_TEST_CASE_P(MMI, FdctTest,
+ ::testing::Values(vp8_short_fdct4x4_mmi));
+#endif // HAVE_MMI
} // namespace
diff --git a/libvpx/test/vp9_encoder_parms_get_to_decoder.cc b/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
index 53dc8c9fe..62e8dcb9b 100644
--- a/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
+++ b/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
@@ -99,9 +99,7 @@ class VpxEncoderParmsGetToDecoder
vpx_codec_ctx_t *const vp9_decoder = decoder->GetDecoder();
vpx_codec_alg_priv_t *const priv =
reinterpret_cast<vpx_codec_alg_priv_t *>(vp9_decoder->priv);
- FrameWorkerData *const worker_data =
- reinterpret_cast<FrameWorkerData *>(priv->frame_workers[0].data1);
- VP9_COMMON *const common = &worker_data->pbi->common;
+ VP9_COMMON *const common = &priv->pbi->common;
if (encode_parms.lossless) {
EXPECT_EQ(0, common->base_qindex);
diff --git a/libvpx/test/vp9_ethread_test.cc b/libvpx/test/vp9_ethread_test.cc
index 4e8d814c1..6b7e51211 100644
--- a/libvpx/test/vp9_ethread_test.cc
+++ b/libvpx/test/vp9_ethread_test.cc
@@ -50,7 +50,6 @@ class VPxFirstPassEncoderThreadTest
InitializeConfig();
SetMode(encoding_mode_);
- cfg_.g_lag_in_frames = 3;
cfg_.rc_end_usage = VPX_VBR;
cfg_.rc_2pass_vbr_minsection_pct = 5;
cfg_.rc_2pass_vbr_maxsection_pct = 2000;
@@ -128,8 +127,10 @@ static void compare_fp_stats(vpx_fixed_buf_t *fp_stats, double factor) {
const double *frame_stats2 = reinterpret_cast<double *>(stats2);
for (j = 0; j < kDbl; ++j) {
- EXPECT_LE(fabs(*frame_stats1 - *frame_stats2),
- fabs(*frame_stats1) / factor);
+ ASSERT_LE(fabs(*frame_stats1 - *frame_stats2),
+ fabs(*frame_stats1) / factor)
+ << "First failure @ frame #" << i << " stat #" << j << " ("
+ << *frame_stats1 << " vs. " << *frame_stats2 << ")";
frame_stats1++;
frame_stats2++;
}
@@ -183,7 +184,7 @@ TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) {
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
// Compare to check if using or not using row-mt generates close stats.
- compare_fp_stats(&firstpass_stats_, 1000.0);
+ ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0));
// Test single thread vs multiple threads
row_mt_mode_ = 1;
@@ -197,7 +198,7 @@ TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) {
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
// Compare to check if single-thread and multi-thread stats are close enough.
- compare_fp_stats(&firstpass_stats_, 1000.0);
+ ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0));
// Bit exact test in row_mt mode.
// When row_mt_mode_=1 and using >1 threads, the encoder generates bit exact
@@ -238,7 +239,6 @@ class VPxEncoderThreadTest
SetMode(encoding_mode_);
if (encoding_mode_ != ::libvpx_test::kRealTime) {
- cfg_.g_lag_in_frames = 3;
cfg_.rc_end_usage = VPX_VBR;
cfg_.rc_2pass_vbr_minsection_pct = 5;
cfg_.rc_2pass_vbr_maxsection_pct = 2000;
@@ -340,8 +340,6 @@ TEST_P(VPxEncoderThreadTest, EncoderResultTest) {
ASSERT_EQ(single_thr_md5, multi_thr_md5);
// Part 2: row_mt_mode_ = 0 vs row_mt_mode_ = 1 single thread bit exact test.
- // The first-pass stats are not bit exact here, but that difference doesn't
- // cause a mismatch between the final bitstreams.
row_mt_mode_ = 1;
// Encode using single thread
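
The vp9_ethread_test.cc hunks above also illustrate a gtest subtlety: an
ASSERT_* inside a helper such as compare_fp_stats() aborts only that helper,
so the caller must wrap the call in ASSERT_NO_FATAL_FAILURE() for the test
itself to stop. A self-contained sketch (names here are illustrative):

    #include <cmath>
    #include "gtest/gtest.h"

    // Helper using a fatal assertion: on failure it returns early, but the
    // enclosing TEST keeps running unless the caller checks for the failure.
    static void CheckClose(double a, double b, double factor) {
      ASSERT_LE(std::fabs(a - b), std::fabs(a) / factor)
          << a << " vs. " << b;
    }

    TEST(FatalFailureSketch, Propagates) {
      // ASSERT_NO_FATAL_FAILURE() re-raises the helper's fatal failure in
      // this scope, aborting the test at the right point.
      ASSERT_NO_FATAL_FAILURE(CheckClose(1.0, 1.001, 1000.0));
    }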
diff --git a/libvpx/test/vp9_frame_parallel_test.cc b/libvpx/test/vp9_frame_parallel_test.cc
deleted file mode 100644
index 136557720..000000000
--- a/libvpx/test/vp9_frame_parallel_test.cc
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "./vpx_config.h"
-#include "test/codec_factory.h"
-#include "test/decode_test_driver.h"
-#include "test/ivf_video_source.h"
-#include "test/md5_helper.h"
-#include "test/util.h"
-#if CONFIG_WEBM_IO
-#include "test/webm_video_source.h"
-#endif
-#include "vpx_mem/vpx_mem.h"
-
-namespace {
-
-using std::string;
-
-#if CONFIG_WEBM_IO
-
-struct PauseFileList {
- const char *name;
- // md5 sum for decoded frames which does not include skipped frames.
- const char *expected_md5;
- const int pause_frame_num;
-};
-
-// Decodes |filename| with |num_threads|. Pause at the specified frame_num,
-// seek to next key frame and then continue decoding until the end. Return
-// the md5 of the decoded frames which does not include skipped frames.
-string DecodeFileWithPause(const string &filename, int num_threads,
- int pause_num) {
- libvpx_test::WebMVideoSource video(filename);
- video.Init();
- int in_frames = 0;
- int out_frames = 0;
-
- vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
- cfg.threads = num_threads;
- vpx_codec_flags_t flags = 0;
- flags |= VPX_CODEC_USE_FRAME_THREADING;
- libvpx_test::VP9Decoder decoder(cfg, flags);
-
- libvpx_test::MD5 md5;
- video.Begin();
-
- do {
- ++in_frames;
- const vpx_codec_err_t res =
- decoder.DecodeFrame(video.cxdata(), video.frame_size());
- if (res != VPX_CODEC_OK) {
- EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
- break;
- }
-
- // Pause at specified frame number.
- if (in_frames == pause_num) {
- // Flush the decoder and then seek to next key frame.
- decoder.DecodeFrame(NULL, 0);
- video.SeekToNextKeyFrame();
- } else {
- video.Next();
- }
-
- // Flush the decoder at the end of the video.
- if (!video.cxdata()) decoder.DecodeFrame(NULL, 0);
-
- libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
- const vpx_image_t *img;
-
- // Get decompressed data
- while ((img = dec_iter.Next())) {
- ++out_frames;
- md5.Add(img);
- }
- } while (video.cxdata() != NULL);
-
- EXPECT_EQ(in_frames, out_frames)
- << "Input frame count does not match output frame count";
-
- return string(md5.Get());
-}
-
-void DecodeFilesWithPause(const PauseFileList files[]) {
- for (const PauseFileList *iter = files; iter->name != NULL; ++iter) {
- SCOPED_TRACE(iter->name);
- for (int t = 2; t <= 8; ++t) {
- EXPECT_EQ(iter->expected_md5,
- DecodeFileWithPause(iter->name, t, iter->pause_frame_num))
- << "threads = " << t;
- }
- }
-}
-
-TEST(DISABLED_VP9MultiThreadedFrameParallel, PauseSeekResume) {
- // vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
- // one key frame for every ten frames.
- static const PauseFileList files[] = {
- { "vp90-2-07-frame_parallel-1.webm", "6ea7c3875d67252e7caf2bc6e75b36b1",
- 6 },
- { "vp90-2-07-frame_parallel-1.webm", "4bb634160c7356a8d7d4299b6dc83a45",
- 12 },
- { "vp90-2-07-frame_parallel-1.webm", "89772591e6ef461f9fa754f916c78ed8",
- 26 },
- { NULL, NULL, 0 },
- };
- DecodeFilesWithPause(files);
-}
-
-struct FileList {
- const char *name;
- // md5 sum for decoded frames which does not include corrupted frames.
- const char *expected_md5;
- // Expected number of decoded frames which does not include corrupted frames.
- const int expected_frame_count;
-};
-
-// Decodes |filename| with |num_threads|. Return the md5 of the decoded
-// frames which does not include corrupted frames.
-string DecodeFile(const string &filename, int num_threads,
- int expected_frame_count) {
- libvpx_test::WebMVideoSource video(filename);
- video.Init();
-
- vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
- cfg.threads = num_threads;
- const vpx_codec_flags_t flags = VPX_CODEC_USE_FRAME_THREADING;
- libvpx_test::VP9Decoder decoder(cfg, flags);
-
- libvpx_test::MD5 md5;
- video.Begin();
-
- int out_frames = 0;
- do {
- const vpx_codec_err_t res =
- decoder.DecodeFrame(video.cxdata(), video.frame_size());
- // TODO(hkuang): frame parallel mode should return an error on corruption.
- if (res != VPX_CODEC_OK) {
- EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
- break;
- }
-
- video.Next();
-
- // Flush the decoder at the end of the video.
- if (!video.cxdata()) decoder.DecodeFrame(NULL, 0);
-
- libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
- const vpx_image_t *img;
-
- // Get decompressed data
- while ((img = dec_iter.Next())) {
- ++out_frames;
- md5.Add(img);
- }
- } while (video.cxdata() != NULL);
-
- EXPECT_EQ(expected_frame_count, out_frames)
- << "Input frame count does not match expected output frame count";
-
- return string(md5.Get());
-}
-
-void DecodeFiles(const FileList files[]) {
- for (const FileList *iter = files; iter->name != NULL; ++iter) {
- SCOPED_TRACE(iter->name);
- for (int t = 2; t <= 8; ++t) {
- EXPECT_EQ(iter->expected_md5,
- DecodeFile(iter->name, t, iter->expected_frame_count))
- << "threads = " << t;
- }
- }
-}
-
-TEST(DISABLED_VP9MultiThreadedFrameParallel, InvalidFileTest) {
- static const FileList files[] = {
- // invalid-vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
- // one key frame for every ten frames. The 11th frame has corrupted data.
- { "invalid-vp90-2-07-frame_parallel-1.webm",
- "0549d0f45f60deaef8eb708e6c0eb6cb", 30 },
- // invalid-vp90-2-07-frame_parallel-2.webm is a 40 frame video file with
- // one key frame for every ten frames. The 1st and 31st frames have
- // corrupted data.
- { "invalid-vp90-2-07-frame_parallel-2.webm",
- "6a1f3cf6f9e7a364212fadb9580d525e", 20 },
- // invalid-vp90-2-07-frame_parallel-3.webm is a 40 frame video file with
- // one key frame for every ten frames. The 5th and 13th frames have
- // corrupted data.
- { "invalid-vp90-2-07-frame_parallel-3.webm",
- "8256544308de926b0681e04685b98677", 27 },
- { NULL, NULL, 0 },
- };
- DecodeFiles(files);
-}
-
-TEST(DISABLED_VP9MultiThreadedFrameParallel, ValidFileTest) {
- static const FileList files[] = {
-#if CONFIG_VP9_HIGHBITDEPTH
- { "vp92-2-20-10bit-yuv420.webm", "a16b99df180c584e8db2ffeda987d293", 10 },
-#endif
- { NULL, NULL, 0 },
- };
- DecodeFiles(files);
-}
-#endif // CONFIG_WEBM_IO
-} // namespace
diff --git a/libvpx/test/vp9_intrapred_test.cc b/libvpx/test/vp9_intrapred_test.cc
index bee0213ea..39c5e79eb 100644
--- a/libvpx/test/vp9_intrapred_test.cc
+++ b/libvpx/test/vp9_intrapred_test.cc
@@ -467,10 +467,164 @@ TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) {
RunTest(left_col, above_data, dst, ref_dst);
}
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+ SSSE3_TO_C_8, VP9HighbdIntraPredTest,
+ ::testing::Values(
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3,
+ &vpx_highbd_d45_predictor_4x4_c, 4, 8),
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3,
+ &vpx_highbd_d45_predictor_8x8_c, 8, 8),
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3,
+ &vpx_highbd_d45_predictor_16x16_c, 16, 8),
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3,
+ &vpx_highbd_d45_predictor_32x32_c, 32, 8),
+ HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3,
+ &vpx_highbd_d63_predictor_8x8_c, 8, 8),
+ HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3,
+ &vpx_highbd_d63_predictor_16x16_c, 16, 8),
+ HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c,
+ &vpx_highbd_d63_predictor_32x32_ssse3, 32, 8),
+ HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3,
+ &vpx_highbd_d117_predictor_8x8_c, 8, 8),
+ HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3,
+ &vpx_highbd_d117_predictor_16x16_c, 16, 8),
+ HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c,
+ &vpx_highbd_d117_predictor_32x32_ssse3, 32, 8),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3,
+ &vpx_highbd_d135_predictor_8x8_c, 8, 8),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3,
+ &vpx_highbd_d135_predictor_16x16_c, 16, 8),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3,
+ &vpx_highbd_d135_predictor_32x32_c, 32, 8),
+ HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3,
+ &vpx_highbd_d153_predictor_8x8_c, 8, 8),
+ HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3,
+ &vpx_highbd_d153_predictor_16x16_c, 16, 8),
+ HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3,
+ &vpx_highbd_d153_predictor_32x32_c, 32, 8),
+ HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3,
+ &vpx_highbd_d207_predictor_8x8_c, 8, 8),
+ HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3,
+ &vpx_highbd_d207_predictor_16x16_c, 16, 8),
+ HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3,
+ &vpx_highbd_d207_predictor_32x32_c, 32, 8)));
+
+INSTANTIATE_TEST_CASE_P(
+ SSSE3_TO_C_10, VP9HighbdIntraPredTest,
+ ::testing::Values(
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3,
+ &vpx_highbd_d45_predictor_4x4_c, 4, 10),
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3,
+ &vpx_highbd_d45_predictor_8x8_c, 8, 10),
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3,
+ &vpx_highbd_d45_predictor_16x16_c, 16, 10),
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3,
+ &vpx_highbd_d45_predictor_32x32_c, 32, 10),
+ HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3,
+ &vpx_highbd_d63_predictor_8x8_c, 8, 10),
+ HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3,
+ &vpx_highbd_d63_predictor_16x16_c, 16, 10),
+ HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c,
+ &vpx_highbd_d63_predictor_32x32_ssse3, 32, 10),
+ HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3,
+ &vpx_highbd_d117_predictor_8x8_c, 8, 10),
+ HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3,
+ &vpx_highbd_d117_predictor_16x16_c, 16, 10),
+ HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c,
+ &vpx_highbd_d117_predictor_32x32_ssse3, 32, 10),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3,
+ &vpx_highbd_d135_predictor_8x8_c, 8, 10),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3,
+ &vpx_highbd_d135_predictor_16x16_c, 16, 10),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3,
+ &vpx_highbd_d135_predictor_32x32_c, 32, 10),
+ HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3,
+ &vpx_highbd_d153_predictor_8x8_c, 8, 10),
+ HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3,
+ &vpx_highbd_d153_predictor_16x16_c, 16, 10),
+ HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3,
+ &vpx_highbd_d153_predictor_32x32_c, 32, 10),
+ HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3,
+ &vpx_highbd_d207_predictor_8x8_c, 8, 10),
+ HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3,
+ &vpx_highbd_d207_predictor_16x16_c, 16, 10),
+ HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3,
+ &vpx_highbd_d207_predictor_32x32_c, 32, 10)));
+
+INSTANTIATE_TEST_CASE_P(
+ SSSE3_TO_C_12, VP9HighbdIntraPredTest,
+ ::testing::Values(
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3,
+ &vpx_highbd_d45_predictor_4x4_c, 4, 12),
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3,
+ &vpx_highbd_d45_predictor_8x8_c, 8, 12),
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3,
+ &vpx_highbd_d45_predictor_16x16_c, 16, 12),
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3,
+ &vpx_highbd_d45_predictor_32x32_c, 32, 12),
+ HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3,
+ &vpx_highbd_d63_predictor_8x8_c, 8, 12),
+ HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3,
+ &vpx_highbd_d63_predictor_16x16_c, 16, 12),
+ HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c,
+ &vpx_highbd_d63_predictor_32x32_ssse3, 32, 12),
+ HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3,
+ &vpx_highbd_d117_predictor_8x8_c, 8, 12),
+ HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3,
+ &vpx_highbd_d117_predictor_16x16_c, 16, 12),
+ HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c,
+ &vpx_highbd_d117_predictor_32x32_ssse3, 32, 12),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3,
+ &vpx_highbd_d135_predictor_8x8_c, 8, 12),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3,
+ &vpx_highbd_d135_predictor_16x16_c, 16, 12),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3,
+ &vpx_highbd_d135_predictor_32x32_c, 32, 12),
+ HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3,
+ &vpx_highbd_d153_predictor_8x8_c, 8, 12),
+ HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3,
+ &vpx_highbd_d153_predictor_16x16_c, 16, 12),
+ HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3,
+ &vpx_highbd_d153_predictor_32x32_c, 32, 12),
+ HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3,
+ &vpx_highbd_d207_predictor_8x8_c, 8, 12),
+ HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3,
+ &vpx_highbd_d207_predictor_16x16_c, 16, 12),
+ HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3,
+ &vpx_highbd_d207_predictor_32x32_c, 32, 12)));
+#endif // HAVE_SSSE3
+
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
SSE2_TO_C_8, VP9HighbdIntraPredTest,
::testing::Values(
+ HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2,
+ &vpx_highbd_dc_128_predictor_4x4_c, 4, 8),
+ HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2,
+ &vpx_highbd_dc_128_predictor_8x8_c, 8, 8),
+ HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2,
+ &vpx_highbd_dc_128_predictor_16x16_c, 16, 8),
+ HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2,
+ &vpx_highbd_dc_128_predictor_32x32_c, 32, 8),
+ HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2,
+ &vpx_highbd_d63_predictor_4x4_c, 4, 8),
+ HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2,
+ &vpx_highbd_d117_predictor_4x4_c, 4, 8),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2,
+ &vpx_highbd_d135_predictor_4x4_c, 4, 8),
+ HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2,
+ &vpx_highbd_d153_predictor_4x4_c, 4, 8),
+ HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2,
+ &vpx_highbd_d207_predictor_4x4_c, 4, 8),
+ HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2,
+ &vpx_highbd_dc_left_predictor_4x4_c, 4, 8),
+ HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2,
+ &vpx_highbd_dc_left_predictor_8x8_c, 8, 8),
+ HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2,
+ &vpx_highbd_dc_left_predictor_16x16_c, 16, 8),
+ HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2,
+ &vpx_highbd_dc_left_predictor_32x32_c, 32, 8),
HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2,
&vpx_highbd_dc_predictor_4x4_c, 4, 8),
HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2,
@@ -479,6 +633,14 @@ INSTANTIATE_TEST_CASE_P(
&vpx_highbd_dc_predictor_16x16_c, 16, 8),
HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2,
&vpx_highbd_dc_predictor_32x32_c, 32, 8),
+ HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2,
+ &vpx_highbd_dc_top_predictor_4x4_c, 4, 8),
+ HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2,
+ &vpx_highbd_dc_top_predictor_8x8_c, 8, 8),
+ HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2,
+ &vpx_highbd_dc_top_predictor_16x16_c, 16, 8),
+ HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2,
+ &vpx_highbd_dc_top_predictor_32x32_c, 32, 8),
HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2,
&vpx_highbd_tm_predictor_4x4_c, 4, 8),
HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2,
@@ -487,6 +649,14 @@ INSTANTIATE_TEST_CASE_P(
&vpx_highbd_tm_predictor_16x16_c, 16, 8),
HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2,
&vpx_highbd_tm_predictor_32x32_c, 32, 8),
+ HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2,
+ &vpx_highbd_h_predictor_4x4_c, 4, 8),
+ HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2,
+ &vpx_highbd_h_predictor_8x8_c, 8, 8),
+ HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2,
+ &vpx_highbd_h_predictor_16x16_c, 16, 8),
+ HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2,
+ &vpx_highbd_h_predictor_32x32_c, 32, 8),
HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2,
&vpx_highbd_v_predictor_4x4_c, 4, 8),
HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2,
@@ -499,6 +669,32 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
SSE2_TO_C_10, VP9HighbdIntraPredTest,
::testing::Values(
+ HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2,
+ &vpx_highbd_dc_128_predictor_4x4_c, 4, 10),
+ HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2,
+ &vpx_highbd_dc_128_predictor_8x8_c, 8, 10),
+ HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2,
+ &vpx_highbd_dc_128_predictor_16x16_c, 16, 10),
+ HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2,
+ &vpx_highbd_dc_128_predictor_32x32_c, 32, 10),
+ HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2,
+ &vpx_highbd_d63_predictor_4x4_c, 4, 10),
+ HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2,
+ &vpx_highbd_d117_predictor_4x4_c, 4, 10),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2,
+ &vpx_highbd_d135_predictor_4x4_c, 4, 10),
+ HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2,
+ &vpx_highbd_d153_predictor_4x4_c, 4, 10),
+ HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2,
+ &vpx_highbd_d207_predictor_4x4_c, 4, 10),
+ HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2,
+ &vpx_highbd_dc_left_predictor_4x4_c, 4, 10),
+ HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2,
+ &vpx_highbd_dc_left_predictor_8x8_c, 8, 10),
+ HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2,
+ &vpx_highbd_dc_left_predictor_16x16_c, 16, 10),
+ HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2,
+ &vpx_highbd_dc_left_predictor_32x32_c, 32, 10),
HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2,
&vpx_highbd_dc_predictor_4x4_c, 4, 10),
HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2,
@@ -507,6 +703,14 @@ INSTANTIATE_TEST_CASE_P(
&vpx_highbd_dc_predictor_16x16_c, 16, 10),
HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2,
&vpx_highbd_dc_predictor_32x32_c, 32, 10),
+ HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2,
+ &vpx_highbd_dc_top_predictor_4x4_c, 4, 10),
+ HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2,
+ &vpx_highbd_dc_top_predictor_8x8_c, 8, 10),
+ HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2,
+ &vpx_highbd_dc_top_predictor_16x16_c, 16, 10),
+ HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2,
+ &vpx_highbd_dc_top_predictor_32x32_c, 32, 10),
HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2,
&vpx_highbd_tm_predictor_4x4_c, 4, 10),
HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2,
@@ -515,6 +719,14 @@ INSTANTIATE_TEST_CASE_P(
&vpx_highbd_tm_predictor_16x16_c, 16, 10),
HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2,
&vpx_highbd_tm_predictor_32x32_c, 32, 10),
+ HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2,
+ &vpx_highbd_h_predictor_4x4_c, 4, 10),
+ HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2,
+ &vpx_highbd_h_predictor_8x8_c, 8, 10),
+ HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2,
+ &vpx_highbd_h_predictor_16x16_c, 16, 10),
+ HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2,
+ &vpx_highbd_h_predictor_32x32_c, 32, 10),
HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2,
&vpx_highbd_v_predictor_4x4_c, 4, 10),
HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2,
@@ -527,6 +739,32 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
SSE2_TO_C_12, VP9HighbdIntraPredTest,
::testing::Values(
+ HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2,
+ &vpx_highbd_dc_128_predictor_4x4_c, 4, 12),
+ HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2,
+ &vpx_highbd_dc_128_predictor_8x8_c, 8, 12),
+ HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2,
+ &vpx_highbd_dc_128_predictor_16x16_c, 16, 12),
+ HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2,
+ &vpx_highbd_dc_128_predictor_32x32_c, 32, 12),
+ HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2,
+ &vpx_highbd_d63_predictor_4x4_c, 4, 12),
+ HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2,
+ &vpx_highbd_d117_predictor_4x4_c, 4, 12),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2,
+ &vpx_highbd_d135_predictor_4x4_c, 4, 12),
+ HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2,
+ &vpx_highbd_d153_predictor_4x4_c, 4, 12),
+ HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2,
+ &vpx_highbd_d207_predictor_4x4_c, 4, 12),
+ HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2,
+ &vpx_highbd_dc_left_predictor_4x4_c, 4, 12),
+ HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2,
+ &vpx_highbd_dc_left_predictor_8x8_c, 8, 12),
+ HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2,
+ &vpx_highbd_dc_left_predictor_16x16_c, 16, 12),
+ HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2,
+ &vpx_highbd_dc_left_predictor_32x32_c, 32, 12),
HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2,
&vpx_highbd_dc_predictor_4x4_c, 4, 12),
HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2,
@@ -535,6 +773,14 @@ INSTANTIATE_TEST_CASE_P(
&vpx_highbd_dc_predictor_16x16_c, 16, 12),
HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2,
&vpx_highbd_dc_predictor_32x32_c, 32, 12),
+ HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2,
+ &vpx_highbd_dc_top_predictor_4x4_c, 4, 12),
+ HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2,
+ &vpx_highbd_dc_top_predictor_8x8_c, 8, 12),
+ HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2,
+ &vpx_highbd_dc_top_predictor_16x16_c, 16, 12),
+ HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2,
+ &vpx_highbd_dc_top_predictor_32x32_c, 32, 12),
HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2,
&vpx_highbd_tm_predictor_4x4_c, 4, 12),
HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2,
@@ -543,6 +789,14 @@ INSTANTIATE_TEST_CASE_P(
&vpx_highbd_tm_predictor_16x16_c, 16, 12),
HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2,
&vpx_highbd_tm_predictor_32x32_c, 32, 12),
+ HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2,
+ &vpx_highbd_h_predictor_4x4_c, 4, 12),
+ HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2,
+ &vpx_highbd_h_predictor_8x8_c, 8, 12),
+ HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2,
+ &vpx_highbd_h_predictor_16x16_c, 16, 12),
+ HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2,
+ &vpx_highbd_h_predictor_32x32_c, 32, 12),
HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2,
&vpx_highbd_v_predictor_4x4_c, 4, 12),
HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2,
diff --git a/libvpx/test/vp9_quantize_test.cc b/libvpx/test/vp9_quantize_test.cc
index 464389502..b18d4522c 100644
--- a/libvpx/test/vp9_quantize_test.cc
+++ b/libvpx/test/vp9_quantize_test.cc
@@ -14,9 +14,11 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "test/acm_random.h"
+#include "test/buffer.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
@@ -24,11 +26,12 @@
#include "vp9/common/vp9_scan.h"
#include "vpx/vpx_codec.h"
#include "vpx/vpx_integer.h"
+#include "vpx_ports/vpx_timer.h"
using libvpx_test::ACMRandom;
+using libvpx_test::Buffer;
namespace {
-#if CONFIG_VP9_HIGHBITDEPTH
const int number_of_iterations = 100;
typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
@@ -38,307 +41,494 @@ typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
tran_low_t *dqcoeff, const int16_t *dequant,
uint16_t *eob, const int16_t *scan,
const int16_t *iscan);
-typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t>
+typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t,
+ int /*max_size*/, bool /*is_fp*/>
QuantizeParam;
-class VP9QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
+// Wrapper for FP version which does not use zbin or quant_shift.
+typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count,
+ int skip_block, const int16_t *round,
+ const int16_t *quant, tran_low_t *qcoeff,
+ tran_low_t *dqcoeff, const int16_t *dequant,
+ uint16_t *eob, const int16_t *scan,
+ const int16_t *iscan);
+
+template <QuantizeFPFunc fn>
+void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, int skip_block,
+ const int16_t *zbin, const int16_t *round,
+ const int16_t *quant, const int16_t *quant_shift,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ const int16_t *dequant, uint16_t *eob, const int16_t *scan,
+ const int16_t *iscan) {
+ (void)zbin;
+ (void)quant_shift;
+
+ fn(coeff, count, skip_block, round, quant, qcoeff, dqcoeff, dequant, eob,
+ scan, iscan);
+}
+
+class VP9QuantizeBase {
public:
- virtual ~VP9QuantizeTest() {}
- virtual void SetUp() {
- quantize_op_ = GET_PARAM(0);
- ref_quantize_op_ = GET_PARAM(1);
- bit_depth_ = GET_PARAM(2);
- mask_ = (1 << bit_depth_) - 1;
+ VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp)
+ : bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp) {
+ max_value_ = (1 << bit_depth_) - 1;
+ zbin_ptr_ =
+ reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_)));
+ round_fp_ptr_ = reinterpret_cast<int16_t *>(
+ vpx_memalign(16, 8 * sizeof(*round_fp_ptr_)));
+ quant_fp_ptr_ = reinterpret_cast<int16_t *>(
+ vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_)));
+ round_ptr_ =
+ reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*round_ptr_)));
+ quant_ptr_ =
+ reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*quant_ptr_)));
+ quant_shift_ptr_ = reinterpret_cast<int16_t *>(
+ vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_)));
+ dequant_ptr_ = reinterpret_cast<int16_t *>(
+ vpx_memalign(16, 8 * sizeof(*dequant_ptr_)));
}
- virtual void TearDown() { libvpx_test::ClearSystemState(); }
+ ~VP9QuantizeBase() {
+ vpx_free(zbin_ptr_);
+ vpx_free(round_fp_ptr_);
+ vpx_free(quant_fp_ptr_);
+ vpx_free(round_ptr_);
+ vpx_free(quant_ptr_);
+ vpx_free(quant_shift_ptr_);
+ vpx_free(dequant_ptr_);
+ zbin_ptr_ = NULL;
+ round_fp_ptr_ = NULL;
+ quant_fp_ptr_ = NULL;
+ round_ptr_ = NULL;
+ quant_ptr_ = NULL;
+ quant_shift_ptr_ = NULL;
+ dequant_ptr_ = NULL;
+ libvpx_test::ClearSystemState();
+ }
protected:
- vpx_bit_depth_t bit_depth_;
- int mask_;
- QuantizeFunc quantize_op_;
- QuantizeFunc ref_quantize_op_;
+ int16_t *zbin_ptr_;
+ int16_t *round_fp_ptr_;
+ int16_t *quant_fp_ptr_;
+ int16_t *round_ptr_;
+ int16_t *quant_ptr_;
+ int16_t *quant_shift_ptr_;
+ int16_t *dequant_ptr_;
+ const vpx_bit_depth_t bit_depth_;
+ int max_value_;
+ const int max_size_;
+ const bool is_fp_;
};
-class VP9Quantize32Test : public ::testing::TestWithParam<QuantizeParam> {
+class VP9QuantizeTest : public VP9QuantizeBase,
+ public ::testing::TestWithParam<QuantizeParam> {
public:
- virtual ~VP9Quantize32Test() {}
- virtual void SetUp() {
- quantize_op_ = GET_PARAM(0);
- ref_quantize_op_ = GET_PARAM(1);
- bit_depth_ = GET_PARAM(2);
- mask_ = (1 << bit_depth_) - 1;
- }
-
- virtual void TearDown() { libvpx_test::ClearSystemState(); }
+ VP9QuantizeTest()
+ : VP9QuantizeBase(GET_PARAM(2), GET_PARAM(3), GET_PARAM(4)),
+ quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {}
protected:
- vpx_bit_depth_t bit_depth_;
- int mask_;
- QuantizeFunc quantize_op_;
- QuantizeFunc ref_quantize_op_;
+ const QuantizeFunc quantize_op_;
+ const QuantizeFunc ref_quantize_op_;
};
-TEST_P(VP9QuantizeTest, OperationCheck) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]);
- DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
- DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]);
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]);
- DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]);
- DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]);
- DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
- DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
- DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
- int err_count_total = 0;
- int first_failure = -1;
- for (int i = 0; i < number_of_iterations; ++i) {
- const int skip_block = i == 0;
- const TX_SIZE sz = (TX_SIZE)(i % 3); // TX_4X4, TX_8X8 TX_16X16
- const TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
- const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
- const int count = (4 << sz) * (4 << sz); // 16, 64, 256
- int err_count = 0;
- *eob_ptr = rnd.Rand16();
- *ref_eob_ptr = *eob_ptr;
- for (int j = 0; j < count; j++) {
- coeff_ptr[j] = rnd.Rand16() & mask_;
- }
- for (int j = 0; j < 2; j++) {
- zbin_ptr[j] = rnd.Rand16() & mask_;
- round_ptr[j] = rnd.Rand16();
- quant_ptr[j] = rnd.Rand16();
- quant_shift_ptr[j] = rnd.Rand16();
- dequant_ptr[j] = rnd.Rand16();
+// This quantizer compares the AC coefficients to the quantization step size to
+// determine if further multiplication operations are needed.
+// Based on vp9_quantize_fp_sse2().
+void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *round_ptr,
+ const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ int i, eob = -1;
+ const int thr = dequant_ptr[1] >> 1;
+ (void)iscan;
+ (void)skip_block;
+ assert(!skip_block);
+
+  // Quantization pass: coefficients are processed in rows of 16, matching
+  // the sse2 code. A row past the first may be quantized to zero outright
+  // when every AC magnitude in it is at or below the skip threshold.
+ for (i = 0; i < n_coeffs; i += 16) {
+ int y;
+ int nzflag_cnt = 0;
+ int abs_coeff[16];
+ int coeff_sign[16];
+
+ // count nzflag for each row (16 tran_low_t)
+ for (y = 0; y < 16; ++y) {
+ const int rc = i + y;
+ const int coeff = coeff_ptr[rc];
+ coeff_sign[y] = (coeff >> 31);
+ abs_coeff[y] = (coeff ^ coeff_sign[y]) - coeff_sign[y];
+      // The sse2 code never applies this shortcut to the first 16
+      // coefficients. Do the same here to match.
+ if (i >= 16 && (abs_coeff[y] <= thr)) {
+ nzflag_cnt++;
+ }
}
- ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
- quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
- ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr,
- scan_order->scan, scan_order->iscan);
- ASM_REGISTER_STATE_CHECK(quantize_op_(
- coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
- quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
- scan_order->scan, scan_order->iscan));
- for (int j = 0; j < sz; ++j) {
- err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
- (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+
+ for (y = 0; y < 16; ++y) {
+ const int rc = i + y;
+      // If all of the AC coeffs in a row have magnitude no greater than
+      // the quantization step_size/2, quantize the whole row to zero.
+ if (nzflag_cnt < 16) {
+ int tmp =
+ clamp(abs_coeff[y] + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+ tmp = (tmp * quant_ptr[rc != 0]) >> 16;
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign[y]) - coeff_sign[y];
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+ } else {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
}
- err_count += (*ref_eob_ptr != *eob_ptr);
- if (err_count && !err_count_total) {
- first_failure = i;
+ }
+
+ // Scan for eob.
+ for (i = 0; i < n_coeffs; i++) {
+ // Use the scan order to find the correct eob.
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
}
- err_count_total += err_count;
}
- EXPECT_EQ(0, err_count_total)
- << "Error: Quantization Test, C output doesn't match SSE2 output. "
- << "First failed at test case " << first_failure;
+ *eob_ptr = eob + 1;
+}
+
+void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
+ int16_t *quant, int16_t *quant_shift,
+ int16_t *dequant, int16_t *round_fp,
+ int16_t *quant_fp) {
+  // The fp rounding factor maxes out at 64 when q == 0; otherwise it is 48
+  // for Y and 42 for U/V.
+ const int max_qrounding_factor_fp = 64;
+
+ for (int j = 0; j < 2; j++) {
+ // The range is 4 to 1828 in the VP9 tables.
+ const int qlookup = rnd->RandRange(1825) + 4;
+ round_fp[j] = (max_qrounding_factor_fp * qlookup) >> 7;
+ quant_fp[j] = (1 << 16) / qlookup;
+
+ // Values determined by deconstructing vp9_init_quantizer().
+ // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y
+ // values or U/V values of any bit depth. This is because y_delta is not
+ // factored into the vp9_ac_quant() call.
+ zbin[j] = rnd->RandRange(1200);
+
+ // round may be up to 685 for Y values or 914 for U/V.
+ round[j] = rnd->RandRange(914);
+      // quant ranges from 1 down to -32703 in the codec's tables.
+ quant[j] = static_cast<int>(rnd->RandRange(32704)) - 32703;
+ // quant_shift goes up to 1 << 16.
+ quant_shift[j] = rnd->RandRange(16384);
+ // dequant maxes out at 1828 for all cases.
+ dequant[j] = rnd->RandRange(1828);
+ }
+ for (int j = 2; j < 8; j++) {
+ zbin[j] = zbin[1];
+ round_fp[j] = round_fp[1];
+ quant_fp[j] = quant_fp[1];
+ round[j] = round[1];
+ quant[j] = quant[1];
+ quant_shift[j] = quant_shift[1];
+ dequant[j] = dequant[1];
+ }
}
-TEST_P(VP9Quantize32Test, OperationCheck) {
+TEST_P(VP9QuantizeTest, OperationCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
- DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]);
- DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
- DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]);
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]);
- DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]);
- DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]);
- DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
- DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
- DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
- int err_count_total = 0;
- int first_failure = -1;
+ Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16);
+ ASSERT_TRUE(coeff.Init());
+ Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+ ASSERT_TRUE(qcoeff.Init());
+ Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+ ASSERT_TRUE(dqcoeff.Init());
+ Buffer<tran_low_t> ref_qcoeff =
+ Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+ ASSERT_TRUE(ref_qcoeff.Init());
+ Buffer<tran_low_t> ref_dqcoeff =
+ Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+ ASSERT_TRUE(ref_dqcoeff.Init());
+ uint16_t eob, ref_eob;
+
for (int i = 0; i < number_of_iterations; ++i) {
- const int skip_block = i == 0;
- const TX_SIZE sz = TX_32X32;
- const TX_TYPE tx_type = (TX_TYPE)(i % 4);
- const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
- const int count = (4 << sz) * (4 << sz); // 1024
- int err_count = 0;
- *eob_ptr = rnd.Rand16();
- *ref_eob_ptr = *eob_ptr;
- for (int j = 0; j < count; j++) {
- coeff_ptr[j] = rnd.Rand16() & mask_;
- }
- for (int j = 0; j < 2; j++) {
- zbin_ptr[j] = rnd.Rand16() & mask_;
- round_ptr[j] = rnd.Rand16();
- quant_ptr[j] = rnd.Rand16();
- quant_shift_ptr[j] = rnd.Rand16();
- dequant_ptr[j] = rnd.Rand16();
+    // skip_block is pinned to 0: quantize_fp_nz_c() above asserts
+    // !skip_block, so the skip path is not exercised by this test.
+ const int skip_block = 0;
+ TX_SIZE sz;
+ if (max_size_ == 16) {
+      sz = static_cast<TX_SIZE>(i % 3);  // TX_4X4, TX_8X8, TX_16X16
+ } else {
+ sz = TX_32X32;
}
- ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
- quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
- ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr,
+ const TX_TYPE tx_type = static_cast<TX_TYPE>((i >> 2) % 3);
+ const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
+ const int count = (4 << sz) * (4 << sz);
+ coeff.Set(&rnd, -max_value_, max_value_);
+ GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
+ quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
+ quant_fp_ptr_);
+ int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
+ int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
+ ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr,
+ q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
+ ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
scan_order->scan, scan_order->iscan);
+
ASM_REGISTER_STATE_CHECK(quantize_op_(
- coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
- quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
- scan_order->scan, scan_order->iscan));
- for (int j = 0; j < sz; ++j) {
- err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
- (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
- }
- err_count += (*ref_eob_ptr != *eob_ptr);
- if (err_count && !err_count_total) {
- first_failure = i;
+ coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr,
+ quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
+ dequant_ptr_, &eob, scan_order->scan, scan_order->iscan));
+
+ EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff));
+ EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff));
+
+ EXPECT_EQ(eob, ref_eob);
+
+ if (HasFailure()) {
+ printf("Failure on iteration %d.\n", i);
+ qcoeff.PrintDifference(ref_qcoeff);
+ dqcoeff.PrintDifference(ref_dqcoeff);
+ return;
}
- err_count_total += err_count;
}
- EXPECT_EQ(0, err_count_total)
- << "Error: Quantization Test, C output doesn't match SSE2 output. "
- << "First failed at test case " << first_failure;
}
TEST_P(VP9QuantizeTest, EOBCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
- DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]);
- DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
- DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]);
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]);
- DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]);
- DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]);
- DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
- DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
- DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
- int err_count_total = 0;
- int first_failure = -1;
+ Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16);
+ ASSERT_TRUE(coeff.Init());
+ Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+ ASSERT_TRUE(qcoeff.Init());
+ Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+ ASSERT_TRUE(dqcoeff.Init());
+ Buffer<tran_low_t> ref_qcoeff =
+ Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+ ASSERT_TRUE(ref_qcoeff.Init());
+ Buffer<tran_low_t> ref_dqcoeff =
+ Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+ ASSERT_TRUE(ref_dqcoeff.Init());
+ uint16_t eob, ref_eob;
+
for (int i = 0; i < number_of_iterations; ++i) {
- int skip_block = i == 0;
- TX_SIZE sz = (TX_SIZE)(i % 3); // TX_4X4, TX_8X8 TX_16X16
- TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
+ const int skip_block = 0;
+ TX_SIZE sz;
+ if (max_size_ == 16) {
+      sz = static_cast<TX_SIZE>(i % 3);  // TX_4X4, TX_8X8, TX_16X16
+ } else {
+ sz = TX_32X32;
+ }
+ const TX_TYPE tx_type = static_cast<TX_TYPE>((i >> 2) % 3);
const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
- int count = (4 << sz) * (4 << sz); // 16, 64, 256
- int err_count = 0;
- *eob_ptr = rnd.Rand16();
- *ref_eob_ptr = *eob_ptr;
+ int count = (4 << sz) * (4 << sz);
// Two random entries
- for (int j = 0; j < count; j++) {
- coeff_ptr[j] = 0;
- }
- coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
- coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
- for (int j = 0; j < 2; j++) {
- zbin_ptr[j] = rnd.Rand16() & mask_;
- round_ptr[j] = rnd.Rand16();
- quant_ptr[j] = rnd.Rand16();
- quant_shift_ptr[j] = rnd.Rand16();
- dequant_ptr[j] = rnd.Rand16();
- }
-
- ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
- quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
- ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr,
+ coeff.Set(0);
+ coeff.TopLeftPixel()[rnd(count)] =
+ static_cast<int>(rnd.RandRange(max_value_ * 2)) - max_value_;
+ coeff.TopLeftPixel()[rnd(count)] =
+ static_cast<int>(rnd.RandRange(max_value_ * 2)) - max_value_;
+ GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
+ quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
+ quant_fp_ptr_);
+ int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
+ int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
+ ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr,
+ q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
+ ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
scan_order->scan, scan_order->iscan);
+
ASM_REGISTER_STATE_CHECK(quantize_op_(
- coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
- quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
- scan_order->scan, scan_order->iscan));
+ coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr,
+ quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
+ dequant_ptr_, &eob, scan_order->scan, scan_order->iscan));
- for (int j = 0; j < sz; ++j) {
- err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
- (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
- }
- err_count += (*ref_eob_ptr != *eob_ptr);
- if (err_count && !err_count_total) {
- first_failure = i;
+ EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff));
+ EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff));
+
+ EXPECT_EQ(eob, ref_eob);
+
+ if (HasFailure()) {
+ printf("Failure on iteration %d.\n", i);
+ qcoeff.PrintDifference(ref_qcoeff);
+ dqcoeff.PrintDifference(ref_dqcoeff);
+ return;
}
- err_count_total += err_count;
}
- EXPECT_EQ(0, err_count_total)
- << "Error: Quantization Test, C output doesn't match SSE2 output. "
- << "First failed at test case " << first_failure;
}
-TEST_P(VP9Quantize32Test, EOBCheck) {
+TEST_P(VP9QuantizeTest, DISABLED_Speed) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
- DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]);
- DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
- DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]);
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]);
- DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]);
- DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]);
- DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
- DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
- DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
- int err_count_total = 0;
- int first_failure = -1;
- for (int i = 0; i < number_of_iterations; ++i) {
- int skip_block = i == 0;
- TX_SIZE sz = TX_32X32;
- TX_TYPE tx_type = (TX_TYPE)(i % 4);
- const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
- int count = (4 << sz) * (4 << sz); // 1024
- int err_count = 0;
- *eob_ptr = rnd.Rand16();
- *ref_eob_ptr = *eob_ptr;
- for (int j = 0; j < count; j++) {
- coeff_ptr[j] = 0;
- }
- // Two random entries
- coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
- coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
- for (int j = 0; j < 2; j++) {
- zbin_ptr[j] = rnd.Rand16() & mask_;
- round_ptr[j] = rnd.Rand16();
- quant_ptr[j] = rnd.Rand16();
- quant_shift_ptr[j] = rnd.Rand16();
- dequant_ptr[j] = rnd.Rand16();
- }
+ Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16);
+ ASSERT_TRUE(coeff.Init());
+ Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+ ASSERT_TRUE(qcoeff.Init());
+ Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+ ASSERT_TRUE(dqcoeff.Init());
+ uint16_t eob;
+ TX_SIZE starting_sz, ending_sz;
- ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
- quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
- ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr,
- scan_order->scan, scan_order->iscan);
- ASM_REGISTER_STATE_CHECK(quantize_op_(
- coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
- quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
- scan_order->scan, scan_order->iscan));
+ if (max_size_ == 16) {
+ starting_sz = TX_4X4;
+ ending_sz = TX_16X16;
+ } else {
+ starting_sz = TX_32X32;
+ ending_sz = TX_32X32;
+ }
- for (int j = 0; j < sz; ++j) {
- err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
- (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
- }
- err_count += (*ref_eob_ptr != *eob_ptr);
- if (err_count && !err_count_total) {
- first_failure = i;
+ for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) {
+ // zbin > coeff, zbin < coeff.
+ for (int i = 0; i < 2; ++i) {
+ const int skip_block = 0;
+ // TX_TYPE defines the scan order. That is not relevant to the speed test.
+ // Pick the first one.
+ const TX_TYPE tx_type = DCT_DCT;
+ const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
+ const int count = (4 << sz) * (4 << sz);
+
+ GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
+ quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
+ quant_fp_ptr_);
+ int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
+ int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
+
+ if (i == 0) {
+ // When the |coeff| values are less than zbin, the results are 0.
+ int threshold = 100;
+ if (max_size_ == 32) {
+ // For 32x32, the zbin threshold is halved internally. Double it here to
+ // keep the coefficient values from exceeding it.
+ threshold = 200;
+ }
+ for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold;
+ coeff.Set(&rnd, -99, 99);
+ } else if (i == 1) {
+ for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50;
+ coeff.Set(&rnd, -500, 500);
+ }
+
+ vpx_usec_timer timer;
+ vpx_usec_timer_start(&timer);
+ for (int j = 0; j < 100000000 / count; ++j) {
+ quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr,
+ q_ptr, quant_shift_ptr_, qcoeff.TopLeftPixel(),
+ dqcoeff.TopLeftPixel(), dequant_ptr_, &eob,
+ scan_order->scan, scan_order->iscan);
+ }
+ vpx_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+ if (i == 0) printf("Bypass calculations.\n");
+ if (i == 1) printf("Full calculations.\n");
+ printf("Quantize %dx%d time: %5d ms\n", 4 << sz, 4 << sz,
+ elapsed_time / 1000);
}
- err_count_total += err_count;
+ printf("\n");
}
- EXPECT_EQ(0, err_count_total)
- << "Error: Quantization Test, C output doesn't match SSE2 output. "
- << "First failed at test case " << first_failure;
}
+
using std::tr1::make_tuple;
#if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH
+// TODO(johannkoenig): Fix vpx_quantize_b_sse2 in highbitdepth builds.
+// make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, VPX_BITS_8),
INSTANTIATE_TEST_CASE_P(
SSE2, VP9QuantizeTest,
- ::testing::Values(make_tuple(&vpx_highbd_quantize_b_sse2,
- &vpx_highbd_quantize_b_c, VPX_BITS_8),
- make_tuple(&vpx_highbd_quantize_b_sse2,
- &vpx_highbd_quantize_b_c, VPX_BITS_10),
- make_tuple(&vpx_highbd_quantize_b_sse2,
- &vpx_highbd_quantize_b_c, VPX_BITS_12)));
+ ::testing::Values(
+ make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
+ VPX_BITS_8, 16, false),
+ make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
+ VPX_BITS_10, 16, false),
+ make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
+ VPX_BITS_12, 16, false),
+ make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
+ &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
+ make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
+ &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
+ make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
+ &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false)));
+
+#else
INSTANTIATE_TEST_CASE_P(
- SSE2, VP9Quantize32Test,
- ::testing::Values(make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
- &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8),
- make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
- &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10),
- make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
- &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12)));
-#endif // HAVE_SSE2
+ SSE2, VP9QuantizeTest,
+ ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c,
+ VPX_BITS_8, 16, false),
+ make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>,
+ &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
+ 16, true)));
#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_SSE2
+
+#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH
+#if ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, VP9QuantizeTest,
+ ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
+ VPX_BITS_8, 16, false),
+ make_tuple(&QuantFPWrapper<vp9_quantize_fp_ssse3>,
+ &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
+ 16, true)));
+#else
+INSTANTIATE_TEST_CASE_P(SSSE3, VP9QuantizeTest,
+ ::testing::Values(make_tuple(&vpx_quantize_b_ssse3,
+ &vpx_quantize_b_c,
+ VPX_BITS_8, 16, false)));
+#endif
+
+#if ARCH_X86_64
+// TODO(johannkoenig): SSSE3 optimizations do not yet pass this test.
+INSTANTIATE_TEST_CASE_P(
+ DISABLED_SSSE3, VP9QuantizeTest,
+ ::testing::Values(make_tuple(&vpx_quantize_b_32x32_ssse3,
+ &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
+ false),
+ make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>,
+ &QuantFPWrapper<vp9_quantize_fp_32x32_c>,
+ VPX_BITS_8, 32, true)));
+#endif // ARCH_X86_64
+#endif // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH
+
+// TODO(johannkoenig): AVX optimizations do not yet pass the 32x32 test or
+// highbitdepth configurations.
+#if HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+ AVX, VP9QuantizeTest,
+ ::testing::Values(make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c,
+ VPX_BITS_8, 16, false),
+ // Even though SSSE3 and AVX do not match the reference
+ // code, we can keep them in sync with each other.
+ make_tuple(&vpx_quantize_b_32x32_avx,
+ &vpx_quantize_b_32x32_ssse3, VPX_BITS_8, 32,
+ false)));
+#endif // HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH
+
+// TODO(webm:1448): dqcoeff is not handled correctly in HBD builds.
+#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+ NEON, VP9QuantizeTest,
+ ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c,
+ VPX_BITS_8, 16, false),
+ make_tuple(&vpx_quantize_b_32x32_neon,
+ &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
+ false),
+ make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
+ &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8,
+ 16, true),
+ make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
+ &QuantFPWrapper<vp9_quantize_fp_32x32_c>,
+ VPX_BITS_8, 32, true)));
+#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH
+
+// Only useful to compare "Speed" test results.
+INSTANTIATE_TEST_CASE_P(
+ DISABLED_C, VP9QuantizeTest,
+ ::testing::Values(
+ make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false),
+ make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8,
+ 32, false),
+ make_tuple(&QuantFPWrapper<vp9_quantize_fp_c>,
+ &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true),
+ make_tuple(&QuantFPWrapper<quantize_fp_nz_c>,
+ &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true),
+ make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_c>,
+ &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32,
+ true)));
} // namespace
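
The tests above compare each optimized quantizer against its C reference over random inputs, and the EOB comparison checks that both agree on the scan-order position just past the last nonzero coefficient. For orientation, the per-coefficient math of the "b" quantizer looks roughly like the sketch below; it is a simplification of the C reference (the tran_low_t widths and exact clamping are elided), not the canonical implementation.

    #include <algorithm>
    #include <cstdint>
    #include <cstdlib>

    // Rough sketch of one coefficient through the "b" quantizer. The helper
    // arrays carry two entries each: index 0 for DC, index 1 for all ACs.
    void QuantizeCoeffSketch(int16_t coeff, int is_dc, const int16_t *zbin,
                             const int16_t *round, const int16_t *quant,
                             const int16_t *quant_shift,
                             const int16_t *dequant, int32_t *qcoeff,
                             int32_t *dqcoeff) {
      const int rc = is_dc ? 0 : 1;
      const int abs_coeff = abs(coeff);
      if (abs_coeff < zbin[rc]) {
        *qcoeff = 0;  // Below the zero bin: the "bypass" path the speed
        *dqcoeff = 0;  // test times separately.
        return;
      }
      // Two-stage fixed-point multiply, then restore the sign.
      int tmp = std::min(abs_coeff + round[rc], 32767);
      tmp = ((((tmp * quant[rc]) >> 16) + tmp) * quant_shift[rc]) >> 16;
      *qcoeff = coeff < 0 ? -tmp : tmp;
      *dqcoeff = *qcoeff * dequant[rc];
    }
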
diff --git a/libvpx/test/vp9_scale_test.cc b/libvpx/test/vp9_scale_test.cc
new file mode 100644
index 000000000..5d7d38e89
--- /dev/null
+++ b/libvpx/test/vp9_scale_test.cc
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_scale_rtcd.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/vpx_scale_test.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_timer.h"
+#include "vpx_scale/yv12config.h"
+
+namespace libvpx_test {
+
+typedef void (*ScaleFrameFunc)(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ INTERP_FILTER filter_type, int phase_scaler);
+
+class ScaleTest : public VpxScaleBase,
+ public ::testing::TestWithParam<ScaleFrameFunc> {
+ public:
+ virtual ~ScaleTest() {}
+
+ protected:
+ virtual void SetUp() { scale_fn_ = GetParam(); }
+
+ void ReferenceScaleFrame(INTERP_FILTER filter_type, int phase_scaler) {
+ vp9_scale_and_extend_frame_c(&img_, &ref_img_, filter_type, phase_scaler);
+ }
+
+ void ScaleFrame(INTERP_FILTER filter_type, int phase_scaler) {
+ ASM_REGISTER_STATE_CHECK(
+ scale_fn_(&img_, &dst_img_, filter_type, phase_scaler));
+ }
+
+ void RunTest() {
+ static const int kNumSizesToTest = 20;
+ static const int kNumScaleFactorsToTest = 4;
+ static const int kSizesToTest[] = {
+ 2, 4, 6, 8, 10, 12, 14, 16, 18, 20,
+ 22, 24, 26, 28, 30, 32, 34, 68, 128, 134
+ };
+ static const int kScaleFactors[] = { 1, 2, 3, 4 };
+ for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) {
+ for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) {
+ for (int h = 0; h < kNumSizesToTest; ++h) {
+ const int src_height = kSizesToTest[h];
+ for (int w = 0; w < kNumSizesToTest; ++w) {
+ const int src_width = kSizesToTest[w];
+ for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest;
+ ++sf_up_idx) {
+ const int sf_up = kScaleFactors[sf_up_idx];
+ for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest;
+ ++sf_down_idx) {
+ const int sf_down = kScaleFactors[sf_down_idx];
+ const int dst_width = src_width * sf_up / sf_down;
+ const int dst_height = src_height * sf_up / sf_down;
+ if (sf_up == sf_down && sf_up != 1) {
+ continue;
+ }
+ // I420 frame width and height must be even.
+ if (!dst_width || !dst_height || dst_width & 1 ||
+ dst_height & 1) {
+ continue;
+ }
+ // vpx_convolve8_c() has a restriction on the step, which cannot
+ // exceed 64 (a ratio of 1 to 4).
+ if (src_width > 4 * dst_width || src_height > 4 * dst_height) {
+ continue;
+ }
+ ASSERT_NO_FATAL_FAILURE(ResetScaleImages(
+ src_width, src_height, dst_width, dst_height));
+ ReferenceScaleFrame(filter_type, phase_scaler);
+ ScaleFrame(filter_type, phase_scaler);
+ if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc,
+ ref_img_.frame_size)) {
+ printf(
+ "filter_type = %d, phase_scaler = %d, src_width = %4d, "
+ "src_height = %4d, dst_width = %4d, dst_height = %4d, "
+ "scale factor = %d:%d\n",
+ filter_type, phase_scaler, src_width, src_height,
+ dst_width, dst_height, sf_down, sf_up);
+ PrintDiff();
+ }
+ CompareImages(dst_img_);
+ DeallocScaleImages();
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ void PrintDiffComponent(const uint8_t *const ref, const uint8_t *const opt,
+ const int stride, const int width, const int height,
+ const int plane_idx) const {
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x++) {
+ if (ref[y * stride + x] != opt[y * stride + x]) {
+ printf("Plane %d pixel[%d][%d] diff:%6d (ref),%6d (opt)\n", plane_idx,
+ y, x, ref[y * stride + x], opt[y * stride + x]);
+ break;
+ }
+ }
+ }
+ }
+
+ void PrintDiff() const {
+ assert(ref_img_.y_stride == dst_img_.y_stride);
+ assert(ref_img_.y_width == dst_img_.y_width);
+ assert(ref_img_.y_height == dst_img_.y_height);
+ assert(ref_img_.uv_stride == dst_img_.uv_stride);
+ assert(ref_img_.uv_width == dst_img_.uv_width);
+ assert(ref_img_.uv_height == dst_img_.uv_height);
+
+ if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc,
+ ref_img_.frame_size)) {
+ PrintDiffComponent(ref_img_.y_buffer, dst_img_.y_buffer,
+ ref_img_.y_stride, ref_img_.y_width, ref_img_.y_height,
+ 0);
+ PrintDiffComponent(ref_img_.u_buffer, dst_img_.u_buffer,
+ ref_img_.uv_stride, ref_img_.uv_width,
+ ref_img_.uv_height, 1);
+ PrintDiffComponent(ref_img_.v_buffer, dst_img_.v_buffer,
+ ref_img_.uv_stride, ref_img_.uv_width,
+ ref_img_.uv_height, 2);
+ }
+ }
+
+ ScaleFrameFunc scale_fn_;
+};
+
+TEST_P(ScaleTest, ScaleFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); }
+
+TEST_P(ScaleTest, DISABLED_Speed) {
+ static const int kCountSpeedTestBlock = 100;
+ static const int kNumScaleFactorsToTest = 4;
+ static const int kScaleFactors[] = { 1, 2, 3, 4 };
+ const int src_width = 1280;
+ const int src_height = 720;
+ for (INTERP_FILTER filter_type = 2; filter_type < 4; ++filter_type) {
+ for (int phase_scaler = 0; phase_scaler < 2; ++phase_scaler) {
+ for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest; ++sf_up_idx) {
+ const int sf_up = kScaleFactors[sf_up_idx];
+ for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest;
+ ++sf_down_idx) {
+ const int sf_down = kScaleFactors[sf_down_idx];
+ const int dst_width = src_width * sf_up / sf_down;
+ const int dst_height = src_height * sf_up / sf_down;
+ if (sf_up == sf_down && sf_up != 1) {
+ continue;
+ }
+ // I420 frame width and height must be even.
+ if (dst_width & 1 || dst_height & 1) {
+ continue;
+ }
+ ASSERT_NO_FATAL_FAILURE(
+ ResetScaleImages(src_width, src_height, dst_width, dst_height));
+ ASM_REGISTER_STATE_CHECK(
+ ReferenceScaleFrame(filter_type, phase_scaler));
+
+ vpx_usec_timer timer;
+ vpx_usec_timer_start(&timer);
+ for (int i = 0; i < kCountSpeedTestBlock; ++i) {
+ ScaleFrame(filter_type, phase_scaler);
+ }
+ libvpx_test::ClearSystemState();
+ vpx_usec_timer_mark(&timer);
+ const int elapsed_time =
+ static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
+ CompareImages(dst_img_);
+ DeallocScaleImages();
+
+ printf(
+ "filter_type = %d, phase_scaler = %d, src_width = %4d, "
+ "src_height = %4d, dst_width = %4d, dst_height = %4d, "
+ "scale factor = %d:%d, scale time: %5d ms\n",
+ filter_type, phase_scaler, src_width, src_height, dst_width,
+ dst_height, sf_down, sf_up, elapsed_time);
+ }
+ }
+ }
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(C, ScaleTest,
+ ::testing::Values(vp9_scale_and_extend_frame_c));
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(SSSE3, ScaleTest,
+ ::testing::Values(vp9_scale_and_extend_frame_ssse3));
+#endif // HAVE_SSSE3
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, ScaleTest,
+ ::testing::Values(vp9_scale_and_extend_frame_neon));
+#endif // HAVE_NEON
+
+} // namespace libvpx_test
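
The skip conditions inside RunTest above compress to three rules. A compact restatement, with an invented helper name for illustration:

    // Mirrors the combinations ScaleTest::RunTest() actually exercises.
    bool IsTestableScale(int src_w, int src_h, int sf_up, int sf_down) {
      const int dst_w = src_w * sf_up / sf_down;
      const int dst_h = src_h * sf_up / sf_down;
      // Skip the redundant 2:2, 3:3 and 4:4 factors; 1:1 is kept.
      if (sf_up == sf_down && sf_up != 1) return false;
      // I420 frame width and height must be even (and nonzero).
      if (!dst_w || !dst_h || (dst_w & 1) || (dst_h & 1)) return false;
      // vpx_convolve8_c() caps the step at 64, i.e. at most 4x downscaling.
      if (src_w > 4 * dst_w || src_h > 4 * dst_h) return false;
      return true;
    }
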
diff --git a/libvpx/test/vp9_skip_loopfilter_test.cc b/libvpx/test/vp9_skip_loopfilter_test.cc
index e847bbddf..d41a784a2 100644
--- a/libvpx/test/vp9_skip_loopfilter_test.cc
+++ b/libvpx/test/vp9_skip_loopfilter_test.cc
@@ -85,8 +85,8 @@ class SkipLoopFilterTest {
// TODO(fgalligan): Move the MD5 testing code into another class.
void OpenMd5File(const std::string &md5_file_name) {
md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name);
- ASSERT_TRUE(md5_file_ != NULL) << "MD5 file open failed. Filename: "
- << md5_file_name;
+ ASSERT_TRUE(md5_file_ != NULL)
+ << "MD5 file open failed. Filename: " << md5_file_name;
}
// Reads the next line of the MD5 file.
diff --git a/libvpx/test/vp9_subtract_test.cc b/libvpx/test/vp9_subtract_test.cc
index 19ed30431..62845ad61 100644
--- a/libvpx/test/vp9_subtract_test.cc
+++ b/libvpx/test/vp9_subtract_test.cc
@@ -101,4 +101,9 @@ INSTANTIATE_TEST_CASE_P(MSA, VP9SubtractBlockTest,
::testing::Values(vpx_subtract_block_msa));
#endif
+#if HAVE_MMI
+INSTANTIATE_TEST_CASE_P(MMI, VP9SubtractBlockTest,
+ ::testing::Values(vpx_subtract_block_mmi));
+#endif
+
} // namespace vp9
diff --git a/libvpx/test/vp9_thread_test.cc b/libvpx/test/vp9_thread_test.cc
index 3e3fd25ac..576f5e906 100644
--- a/libvpx/test/vp9_thread_test.cc
+++ b/libvpx/test/vp9_thread_test.cc
@@ -187,8 +187,8 @@ void DecodeFiles(const FileList files[]) {
for (const FileList *iter = files; iter->name != NULL; ++iter) {
SCOPED_TRACE(iter->name);
for (int t = 1; t <= 8; ++t) {
- EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t)) << "threads = "
- << t;
+ EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t))
+ << "threads = " << t;
}
}
}
diff --git a/libvpx/test/vpx_scale_test.cc b/libvpx/test/vpx_scale_test.cc
index 9701d93da..ac75dceb2 100644
--- a/libvpx/test/vpx_scale_test.cc
+++ b/libvpx/test/vpx_scale_test.cc
@@ -14,149 +14,17 @@
#include "./vpx_scale_rtcd.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
+#include "test/vpx_scale_test.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_timer.h"
#include "vpx_scale/yv12config.h"
-namespace {
+namespace libvpx_test {
typedef void (*ExtendFrameBorderFunc)(YV12_BUFFER_CONFIG *ybf);
typedef void (*CopyFrameFunc)(const YV12_BUFFER_CONFIG *src_ybf,
YV12_BUFFER_CONFIG *dst_ybf);
-class VpxScaleBase {
- public:
- virtual ~VpxScaleBase() { libvpx_test::ClearSystemState(); }
-
- void ResetImage(int width, int height) {
- width_ = width;
- height_ = height;
- memset(&img_, 0, sizeof(img_));
- ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&img_, width_, height_,
- VP8BORDERINPIXELS));
- memset(img_.buffer_alloc, kBufFiller, img_.frame_size);
- FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height,
- img_.y_stride);
- FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height,
- img_.uv_stride);
- FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height,
- img_.uv_stride);
-
- memset(&ref_img_, 0, sizeof(ref_img_));
- ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&ref_img_, width_, height_,
- VP8BORDERINPIXELS));
- memset(ref_img_.buffer_alloc, kBufFiller, ref_img_.frame_size);
-
- memset(&cpy_img_, 0, sizeof(cpy_img_));
- ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&cpy_img_, width_, height_,
- VP8BORDERINPIXELS));
- memset(cpy_img_.buffer_alloc, kBufFiller, cpy_img_.frame_size);
- ReferenceCopyFrame();
- }
-
- void DeallocImage() {
- vp8_yv12_de_alloc_frame_buffer(&img_);
- vp8_yv12_de_alloc_frame_buffer(&ref_img_);
- vp8_yv12_de_alloc_frame_buffer(&cpy_img_);
- }
-
- protected:
- static const int kBufFiller = 123;
- static const int kBufMax = kBufFiller - 1;
-
- static void FillPlane(uint8_t *buf, int width, int height, int stride) {
- for (int y = 0; y < height; ++y) {
- for (int x = 0; x < width; ++x) {
- buf[x + (y * stride)] = (x + (width * y)) % kBufMax;
- }
- }
- }
-
- static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height,
- int width, int height, int stride, int padding) {
- // Copy the outermost visible pixel to a distance of at least 'padding.'
- // The buffers are allocated such that there may be excess space outside the
- // padding. As long as the minimum amount of padding is achieved it is not
- // necessary to fill this space as well.
- uint8_t *left = buf - padding;
- uint8_t *right = buf + crop_width;
- const int right_extend = padding + (width - crop_width);
- const int bottom_extend = padding + (height - crop_height);
-
- // Fill the border pixels from the nearest image pixel.
- for (int y = 0; y < crop_height; ++y) {
- memset(left, left[padding], padding);
- memset(right, right[-1], right_extend);
- left += stride;
- right += stride;
- }
-
- left = buf - padding;
- uint8_t *top = left - (stride * padding);
- // The buffer does not always extend as far as the stride.
- // Equivalent to padding + width + padding.
- const int extend_width = padding + crop_width + right_extend;
-
- // The first row was already extended to the left and right. Copy it up.
- for (int y = 0; y < padding; ++y) {
- memcpy(top, left, extend_width);
- top += stride;
- }
-
- uint8_t *bottom = left + (crop_height * stride);
- for (int y = 0; y < bottom_extend; ++y) {
- memcpy(bottom, left + (crop_height - 1) * stride, extend_width);
- bottom += stride;
- }
- }
-
- void ReferenceExtendBorder() {
- ExtendPlane(ref_img_.y_buffer, ref_img_.y_crop_width,
- ref_img_.y_crop_height, ref_img_.y_width, ref_img_.y_height,
- ref_img_.y_stride, ref_img_.border);
- ExtendPlane(ref_img_.u_buffer, ref_img_.uv_crop_width,
- ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height,
- ref_img_.uv_stride, ref_img_.border / 2);
- ExtendPlane(ref_img_.v_buffer, ref_img_.uv_crop_width,
- ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height,
- ref_img_.uv_stride, ref_img_.border / 2);
- }
-
- void ReferenceCopyFrame() {
- // Copy img_ to ref_img_ and extend frame borders. This will be used for
- // verifying extend_fn_ as well as copy_frame_fn_.
- EXPECT_EQ(ref_img_.frame_size, img_.frame_size);
- for (int y = 0; y < img_.y_crop_height; ++y) {
- for (int x = 0; x < img_.y_crop_width; ++x) {
- ref_img_.y_buffer[x + y * ref_img_.y_stride] =
- img_.y_buffer[x + y * img_.y_stride];
- }
- }
-
- for (int y = 0; y < img_.uv_crop_height; ++y) {
- for (int x = 0; x < img_.uv_crop_width; ++x) {
- ref_img_.u_buffer[x + y * ref_img_.uv_stride] =
- img_.u_buffer[x + y * img_.uv_stride];
- ref_img_.v_buffer[x + y * ref_img_.uv_stride] =
- img_.v_buffer[x + y * img_.uv_stride];
- }
- }
-
- ReferenceExtendBorder();
- }
-
- void CompareImages(const YV12_BUFFER_CONFIG actual) {
- EXPECT_EQ(ref_img_.frame_size, actual.frame_size);
- EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc,
- ref_img_.frame_size));
- }
-
- YV12_BUFFER_CONFIG img_;
- YV12_BUFFER_CONFIG ref_img_;
- YV12_BUFFER_CONFIG cpy_img_;
- int width_;
- int height_;
-};
-
class ExtendBorderTest
: public VpxScaleBase,
public ::testing::TestWithParam<ExtendFrameBorderFunc> {
@@ -178,11 +46,11 @@ class ExtendBorderTest
static const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 16383 };
for (int h = 0; h < kNumSizesToTest; ++h) {
for (int w = 0; w < kNumSizesToTest; ++w) {
- ASSERT_NO_FATAL_FAILURE(ResetImage(kSizesToTest[w], kSizesToTest[h]));
+ ASSERT_NO_FATAL_FAILURE(ResetImages(kSizesToTest[w], kSizesToTest[h]));
+ ReferenceCopyFrame();
ExtendBorder();
- ReferenceExtendBorder();
CompareImages(img_);
- DeallocImage();
+ DeallocImages();
}
}
}
@@ -204,7 +72,7 @@ class CopyFrameTest : public VpxScaleBase,
virtual void SetUp() { copy_frame_fn_ = GetParam(); }
void CopyFrame() {
- ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &cpy_img_));
+ ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &dst_img_));
}
void RunTest() {
@@ -217,11 +85,11 @@ class CopyFrameTest : public VpxScaleBase,
static const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 16383 };
for (int h = 0; h < kNumSizesToTest; ++h) {
for (int w = 0; w < kNumSizesToTest; ++w) {
- ASSERT_NO_FATAL_FAILURE(ResetImage(kSizesToTest[w], kSizesToTest[h]));
+ ASSERT_NO_FATAL_FAILURE(ResetImages(kSizesToTest[w], kSizesToTest[h]));
ReferenceCopyFrame();
CopyFrame();
- CompareImages(cpy_img_);
- DeallocImage();
+ CompareImages(dst_img_);
+ DeallocImages();
}
}
}
@@ -233,4 +101,5 @@ TEST_P(CopyFrameTest, CopyFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); }
INSTANTIATE_TEST_CASE_P(C, CopyFrameTest,
::testing::Values(vp8_yv12_copy_frame_c));
-} // namespace
+
+} // namespace libvpx_test
diff --git a/libvpx/test/vpx_scale_test.h b/libvpx/test/vpx_scale_test.h
new file mode 100644
index 000000000..dcbd02b91
--- /dev/null
+++ b/libvpx/test/vpx_scale_test.h
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_VPX_SCALE_TEST_H_
+#define TEST_VPX_SCALE_TEST_H_
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vpx_scale_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12config.h"
+
+using libvpx_test::ACMRandom;
+
+namespace libvpx_test {
+
+class VpxScaleBase {
+ public:
+ virtual ~VpxScaleBase() { libvpx_test::ClearSystemState(); }
+
+ void ResetImage(YV12_BUFFER_CONFIG *const img, const int width,
+ const int height) {
+ memset(img, 0, sizeof(*img));
+ ASSERT_EQ(
+ 0, vp8_yv12_alloc_frame_buffer(img, width, height, VP8BORDERINPIXELS));
+ memset(img->buffer_alloc, kBufFiller, img->frame_size);
+ }
+
+ void ResetImages(const int width, const int height) {
+ ResetImage(&img_, width, height);
+ ResetImage(&ref_img_, width, height);
+ ResetImage(&dst_img_, width, height);
+
+ FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height,
+ img_.y_stride);
+ FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height,
+ img_.uv_stride);
+ FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height,
+ img_.uv_stride);
+ }
+
+ void ResetScaleImage(YV12_BUFFER_CONFIG *const img, const int width,
+ const int height) {
+ memset(img, 0, sizeof(*img));
+#if CONFIG_VP9_HIGHBITDEPTH
+ ASSERT_EQ(0, vpx_alloc_frame_buffer(img, width, height, 1, 1, 0,
+ VP9_ENC_BORDER_IN_PIXELS, 0));
+#else
+ ASSERT_EQ(0, vpx_alloc_frame_buffer(img, width, height, 1, 1,
+ VP9_ENC_BORDER_IN_PIXELS, 0));
+#endif
+ memset(img->buffer_alloc, kBufFiller, img->frame_size);
+ }
+
+ void ResetScaleImages(const int src_width, const int src_height,
+ const int dst_width, const int dst_height) {
+ ResetScaleImage(&img_, src_width, src_height);
+ ResetScaleImage(&ref_img_, dst_width, dst_height);
+ ResetScaleImage(&dst_img_, dst_width, dst_height);
+ FillPlaneExtreme(img_.y_buffer, img_.y_crop_width, img_.y_crop_height,
+ img_.y_stride);
+ FillPlaneExtreme(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height,
+ img_.uv_stride);
+ FillPlaneExtreme(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height,
+ img_.uv_stride);
+ }
+
+ void DeallocImages() {
+ vp8_yv12_de_alloc_frame_buffer(&img_);
+ vp8_yv12_de_alloc_frame_buffer(&ref_img_);
+ vp8_yv12_de_alloc_frame_buffer(&dst_img_);
+ }
+
+ void DeallocScaleImages() {
+ vpx_free_frame_buffer(&img_);
+ vpx_free_frame_buffer(&ref_img_);
+ vpx_free_frame_buffer(&dst_img_);
+ }
+
+ protected:
+ static const int kBufFiller = 123;
+ static const int kBufMax = kBufFiller - 1;
+
+ static void FillPlane(uint8_t *const buf, const int width, const int height,
+ const int stride) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ buf[x + (y * stride)] = (x + (width * y)) % kBufMax;
+ }
+ }
+ }
+
+ static void FillPlaneExtreme(uint8_t *const buf, const int width,
+ const int height, const int stride) {
+ ACMRandom rnd;
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ buf[x + (y * stride)] = rnd.Rand8() % 2 ? 255 : 0;
+ }
+ }
+ }
+
+ static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height,
+ int width, int height, int stride, int padding) {
+ // Copy the outermost visible pixel to a distance of at least 'padding.'
+ // The buffers are allocated such that there may be excess space outside the
+ // padding. As long as the minimum amount of padding is achieved, it is not
+ // necessary to fill this space as well.
+ uint8_t *left = buf - padding;
+ uint8_t *right = buf + crop_width;
+ const int right_extend = padding + (width - crop_width);
+ const int bottom_extend = padding + (height - crop_height);
+
+ // Fill the border pixels from the nearest image pixel.
+ for (int y = 0; y < crop_height; ++y) {
+ memset(left, left[padding], padding);
+ memset(right, right[-1], right_extend);
+ left += stride;
+ right += stride;
+ }
+
+ left = buf - padding;
+ uint8_t *top = left - (stride * padding);
+ // The buffer does not always extend as far as the stride.
+ // Equivalent to padding + width + padding.
+ const int extend_width = padding + crop_width + right_extend;
+
+ // The first row was already extended to the left and right. Copy it up.
+ for (int y = 0; y < padding; ++y) {
+ memcpy(top, left, extend_width);
+ top += stride;
+ }
+
+ uint8_t *bottom = left + (crop_height * stride);
+ for (int y = 0; y < bottom_extend; ++y) {
+ memcpy(bottom, left + (crop_height - 1) * stride, extend_width);
+ bottom += stride;
+ }
+ }
+
+ void ReferenceExtendBorder() {
+ ExtendPlane(ref_img_.y_buffer, ref_img_.y_crop_width,
+ ref_img_.y_crop_height, ref_img_.y_width, ref_img_.y_height,
+ ref_img_.y_stride, ref_img_.border);
+ ExtendPlane(ref_img_.u_buffer, ref_img_.uv_crop_width,
+ ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height,
+ ref_img_.uv_stride, ref_img_.border / 2);
+ ExtendPlane(ref_img_.v_buffer, ref_img_.uv_crop_width,
+ ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height,
+ ref_img_.uv_stride, ref_img_.border / 2);
+ }
+
+ void ReferenceCopyFrame() {
+ // Copy img_ to ref_img_ and extend frame borders. This will be used for
+ // verifying extend_fn_ as well as copy_frame_fn_.
+ EXPECT_EQ(ref_img_.frame_size, img_.frame_size);
+ for (int y = 0; y < img_.y_crop_height; ++y) {
+ for (int x = 0; x < img_.y_crop_width; ++x) {
+ ref_img_.y_buffer[x + y * ref_img_.y_stride] =
+ img_.y_buffer[x + y * img_.y_stride];
+ }
+ }
+
+ for (int y = 0; y < img_.uv_crop_height; ++y) {
+ for (int x = 0; x < img_.uv_crop_width; ++x) {
+ ref_img_.u_buffer[x + y * ref_img_.uv_stride] =
+ img_.u_buffer[x + y * img_.uv_stride];
+ ref_img_.v_buffer[x + y * ref_img_.uv_stride] =
+ img_.v_buffer[x + y * img_.uv_stride];
+ }
+ }
+
+ ReferenceExtendBorder();
+ }
+
+ void CompareImages(const YV12_BUFFER_CONFIG actual) {
+ EXPECT_EQ(ref_img_.frame_size, actual.frame_size);
+ EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc,
+ ref_img_.frame_size));
+ }
+
+ YV12_BUFFER_CONFIG img_;
+ YV12_BUFFER_CONFIG ref_img_;
+ YV12_BUFFER_CONFIG dst_img_;
+};
+
+} // namespace libvpx_test
+
+#endif // TEST_VPX_SCALE_TEST_H_
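
The border logic in ExtendPlane is easiest to see on a toy plane. The standalone sketch below uses an arbitrary 2x2 image with 2 pixels of padding and ignores the crop-versus-aligned-width distinction the real helper handles; it replicates the edge pixels outward and prints the result.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      const int w = 2, h = 2, pad = 2, stride = w + 2 * pad;
      uint8_t plane[stride * (h + 2 * pad)] = { 0 };
      uint8_t *buf = plane + pad * stride + pad;  // top-left visible pixel
      buf[0] = 1; buf[1] = 2; buf[stride] = 3; buf[stride + 1] = 4;

      // Fill the left/right borders from the nearest image pixel.
      for (int y = 0; y < h; ++y) {
        memset(buf + y * stride - pad, buf[y * stride], pad);
        memset(buf + y * stride + w, buf[y * stride + w - 1], pad);
      }
      // Each image row is now fully extended; copy them up and down.
      for (int y = 1; y <= pad; ++y) {
        memcpy(buf - y * stride - pad, buf - pad, stride);
        memcpy(buf + (h - 1 + y) * stride - pad,
               buf + (h - 1) * stride - pad, stride);
      }
      for (int y = -pad; y < h + pad; ++y) {
        for (int x = -pad; x < w + pad; ++x) printf("%d ", buf[y * stride + x]);
        printf("\n");
      }
      return 0;  // prints a 6x6 grid: 1s/2s in the top half, 3s/4s below
    }
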
diff --git a/libvpx/test/vpx_temporal_svc_encoder.sh b/libvpx/test/vpx_temporal_svc_encoder.sh
index 3d5152ae3..56a7902f4 100755
--- a/libvpx/test/vpx_temporal_svc_encoder.sh
+++ b/libvpx/test/vpx_temporal_svc_encoder.sh
@@ -52,11 +52,19 @@ vpx_tsvc_encoder() {
# TODO(tomfinegan): Verify file output for all thread runs.
for threads in $(seq $max_threads); do
- eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" "${output_file}" \
- "${codec}" "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \
- "${timebase_num}" "${timebase_den}" "${speed}" "${frame_drop_thresh}" \
- "${error_resilient}" "${threads}" "$@" \
- ${devnull}
+ if [ "$(vpx_config_option_enabled CONFIG_VP9_HIGHBITDEPTH)" != "yes" ]; then
+ eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \
+ "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+ "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \
+ "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \
+ "$@" ${devnull}
+ else
+ eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \
+ "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+ "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \
+ "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \
+ "$@" "8" ${devnull}
+ fi
done
}
diff --git a/libvpx/test/vpxenc.sh b/libvpx/test/vpxenc.sh
index e8994992a..0c160dafc 100755
--- a/libvpx/test/vpxenc.sh
+++ b/libvpx/test/vpxenc.sh
@@ -90,6 +90,15 @@ vpxenc_rt_params() {
--undershoot-pct=50"
}
+# Forces --passes to 1 with CONFIG_REALTIME_ONLY.
+vpxenc_passes_param() {
+ if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" = "yes" ]; then
+ echo "--passes=1"
+ else
+ echo "--passes=2"
+ fi
+}
+
# Wrapper function for running vpxenc with pipe input. Requires that
# LIBVPX_BIN_PATH points to the directory containing vpxenc. $1 is used as the
# input file path and shifted away. All remaining parameters are passed through
@@ -218,9 +227,11 @@ vpxenc_vp8_ivf_piped_input() {
vpxenc_vp9_ivf() {
if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf"
+ local readonly passes=$(vpxenc_passes_param)
vpxenc $(yuv_input_hantro_collage) \
--codec=vp9 \
--limit="${TEST_FRAMES}" \
+ "${passes}" \
--ivf \
--output="${output}"
@@ -235,9 +246,11 @@ vpxenc_vp9_webm() {
if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
[ "$(webm_io_available)" = "yes" ]; then
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm"
+ local readonly passes=$(vpxenc_passes_param)
vpxenc $(yuv_input_hantro_collage) \
--codec=vp9 \
--limit="${TEST_FRAMES}" \
+ "${passes}" \
--output="${output}"
if [ ! -e "${output}" ]; then
@@ -339,11 +352,13 @@ vpxenc_vp9_webm_2pass() {
vpxenc_vp9_ivf_lossless() {
if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf"
+ local readonly passes=$(vpxenc_passes_param)
vpxenc $(yuv_input_hantro_collage) \
--codec=vp9 \
--limit="${TEST_FRAMES}" \
--ivf \
--output="${output}" \
+ "${passes}" \
--lossless=1
if [ ! -e "${output}" ]; then
@@ -356,11 +371,13 @@ vpxenc_vp9_ivf_lossless() {
vpxenc_vp9_ivf_minq0_maxq0() {
if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf"
+ local readonly passes=$(vpxenc_passes_param)
vpxenc $(yuv_input_hantro_collage) \
--codec=vp9 \
--limit="${TEST_FRAMES}" \
--ivf \
--output="${output}" \
+ "${passes}" \
--min-q=0 \
--max-q=0
@@ -377,12 +394,13 @@ vpxenc_vp9_webm_lag10_frames20() {
local readonly lag_total_frames=20
local readonly lag_frames=10
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lag10_frames20.webm"
+ local readonly passes=$(vpxenc_passes_param)
vpxenc $(yuv_input_hantro_collage) \
--codec=vp9 \
--limit="${lag_total_frames}" \
--lag-in-frames="${lag_frames}" \
--output="${output}" \
- --passes=2 \
+ "${passes}" \
--auto-alt-ref=1
if [ ! -e "${output}" ]; then
@@ -397,9 +415,11 @@ vpxenc_vp9_webm_non_square_par() {
if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
[ "$(webm_io_available)" = "yes" ]; then
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_non_square_par.webm"
+ local readonly passes=$(vpxenc_passes_param)
vpxenc $(y4m_input_non_square_par) \
--codec=vp9 \
--limit="${TEST_FRAMES}" \
+ "${passes}" \
--output="${output}"
if [ ! -e "${output}" ]; then
@@ -412,18 +432,21 @@ vpxenc_vp9_webm_non_square_par() {
vpxenc_tests="vpxenc_vp8_ivf
vpxenc_vp8_webm
vpxenc_vp8_webm_rt
- vpxenc_vp8_webm_2pass
- vpxenc_vp8_webm_lag10_frames20
vpxenc_vp8_ivf_piped_input
vpxenc_vp9_ivf
vpxenc_vp9_webm
vpxenc_vp9_webm_rt
vpxenc_vp9_webm_rt_multithread_tiled
vpxenc_vp9_webm_rt_multithread_tiled_frameparallel
- vpxenc_vp9_webm_2pass
vpxenc_vp9_ivf_lossless
vpxenc_vp9_ivf_minq0_maxq0
vpxenc_vp9_webm_lag10_frames20
vpxenc_vp9_webm_non_square_par"
+if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" != "yes" ]; then
+ vpxenc_tests="$vpxenc_tests
+ vpxenc_vp8_webm_2pass
+ vpxenc_vp8_webm_lag10_frames20
+ vpxenc_vp9_webm_2pass"
+fi
run_tests vpxenc_verify_environment "${vpxenc_tests}"
diff --git a/libvpx/test/webm_video_source.h b/libvpx/test/webm_video_source.h
index 53713618e..09c007a3f 100644
--- a/libvpx/test/webm_video_source.h
+++ b/libvpx/test/webm_video_source.h
@@ -40,8 +40,8 @@ class WebMVideoSource : public CompressedVideoSource {
virtual void Begin() {
vpx_ctx_->file = OpenTestDataFile(file_name_);
- ASSERT_TRUE(vpx_ctx_->file != NULL) << "Input file open failed. Filename: "
- << file_name_;
+ ASSERT_TRUE(vpx_ctx_->file != NULL)
+ << "Input file open failed. Filename: " << file_name_;
ASSERT_EQ(file_is_webm(webm_ctx_, vpx_ctx_), 1) << "file is not WebM";
diff --git a/libvpx/test/y4m_video_source.h b/libvpx/test/y4m_video_source.h
index 2682ddde3..1301f6970 100644
--- a/libvpx/test/y4m_video_source.h
+++ b/libvpx/test/y4m_video_source.h
@@ -34,8 +34,8 @@ class Y4mVideoSource : public VideoSource {
virtual void OpenSource() {
CloseSource();
input_file_ = OpenTestDataFile(file_name_);
- ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
- << file_name_;
+ ASSERT_TRUE(input_file_ != NULL)
+ << "Input file open failed. Filename: " << file_name_;
}
virtual void ReadSourceToStart() {
diff --git a/libvpx/test/yuv_video_source.h b/libvpx/test/yuv_video_source.h
index 71ad2ab9a..aee6b2ffb 100644
--- a/libvpx/test/yuv_video_source.h
+++ b/libvpx/test/yuv_video_source.h
@@ -43,8 +43,8 @@ class YUVVideoSource : public VideoSource {
virtual void Begin() {
if (input_file_) fclose(input_file_);
input_file_ = OpenTestDataFile(file_name_);
- ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
- << file_name_;
+ ASSERT_TRUE(input_file_ != NULL)
+ << "Input file open failed. Filename: " << file_name_;
if (start_) {
fseek(input_file_, static_cast<unsigned>(raw_size_) * start_, SEEK_SET);
}
diff --git a/libvpx/third_party/googletest/README.libvpx b/libvpx/third_party/googletest/README.libvpx
index 3d9938096..2cd6910b4 100644
--- a/libvpx/third_party/googletest/README.libvpx
+++ b/libvpx/third_party/googletest/README.libvpx
@@ -20,3 +20,5 @@ Local Modifications:
LICENSE
README.md
src
+- Suppress unsigned overflow instrumentation in the LCG
+ https://github.com/google/googletest/pull/1066
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h
index 0094ed507..da57e65d3 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h
@@ -985,6 +985,19 @@ using ::std::tuple_size;
# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
#endif // __clang__
+// A function level attribute to disable UndefinedBehaviorSanitizer's (defined)
+// unsigned integer overflow instrumentation.
+#if defined(__clang__)
+# if defined(__has_attribute) && __has_attribute(no_sanitize)
+# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ \
+ __attribute__((no_sanitize("unsigned-integer-overflow")))
+# else
+# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_
+# endif // defined(__has_attribute) && __has_attribute(no_sanitize)
+#else
+# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_
+#endif // __clang__
+
namespace testing {
class Message;
diff --git a/libvpx/third_party/googletest/src/src/gtest.cc b/libvpx/third_party/googletest/src/src/gtest.cc
index d882ab2e3..5a8932c73 100644
--- a/libvpx/third_party/googletest/src/src/gtest.cc
+++ b/libvpx/third_party/googletest/src/src/gtest.cc
@@ -308,6 +308,7 @@ namespace internal {
// Generates a random number from [0, range), using a Linear
// Congruential Generator (LCG). Crashes if 'range' is 0 or greater
// than kMaxRange.
+GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_
UInt32 Random::Generate(UInt32 range) {
// These constants are the same as are used in glibc's rand(3).
state_ = (1103515245U*state_ + 12345U) % kMaxRange;
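
The attribute added above exists because clang's -fsanitize=unsigned-integer-overflow instruments arithmetic that is well defined but often unintended; here the wrap-around is the point, since the LCG multiply is meant to be reduced mod 2^32 before the % kMaxRange step. A minimal standalone version of the same pattern (only the gtest macro it mirrors is real; the names below are illustrative):

    #include <cstdint>

    #if defined(__clang__) && defined(__has_attribute)
    #if __has_attribute(no_sanitize)
    #define NO_SANITIZE_UNSIGNED_OVERFLOW \
      __attribute__((no_sanitize("unsigned-integer-overflow")))
    #endif
    #endif
    #ifndef NO_SANITIZE_UNSIGNED_OVERFLOW
    #define NO_SANITIZE_UNSIGNED_OVERFLOW
    #endif

    // glibc-style LCG step: the 32-bit multiply wraps mod 2^32 by design.
    NO_SANITIZE_UNSIGNED_OVERFLOW
    uint32_t NextRandom(uint32_t *state) {
      *state = 1103515245u * *state + 12345u;
      return *state;
    }
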
diff --git a/libvpx/third_party/libwebm/README.libvpx b/libvpx/third_party/libwebm/README.libvpx
index 1f8a13d78..ebb5ff2f4 100644
--- a/libvpx/third_party/libwebm/README.libvpx
+++ b/libvpx/third_party/libwebm/README.libvpx
@@ -1,5 +1,5 @@
URL: https://chromium.googlesource.com/webm/libwebm
-Version: 9732ae991efb71aced4267d4794918279e362d99
+Version: 0ae757087f5e6eb01dfea16cc09205b2425cfb74
License: BSD
License File: LICENSE.txt
diff --git a/libvpx/third_party/libwebm/common/hdr_util.h b/libvpx/third_party/libwebm/common/hdr_util.h
index 689fb30a3..3ef5388fd 100644
--- a/libvpx/third_party/libwebm/common/hdr_util.h
+++ b/libvpx/third_party/libwebm/common/hdr_util.h
@@ -47,7 +47,15 @@ struct Vp9CodecFeatures {
int chroma_subsampling;
};
+// disable deprecation warnings for auto_ptr
+#if defined(__GNUC__) && __GNUC__ >= 5
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
typedef std::auto_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr;
+#if defined(__GNUC__) && __GNUC__ >= 5
+#pragma GCC diagnostic pop
+#endif
bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc,
PrimaryChromaticityPtr* muxer_pc);
diff --git a/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc
index 299b45c98..15b9a908d 100644
--- a/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc
+++ b/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc
@@ -24,6 +24,11 @@
#include "mkvmuxer/mkvwriter.h"
#include "mkvparser/mkvparser.h"
+// disable deprecation warnings for auto_ptr
+#if defined(__GNUC__) && __GNUC__ >= 5
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
namespace mkvmuxer {
const float PrimaryChromaticity::kChromaticityMin = 0.0f;
@@ -3053,7 +3058,7 @@ Segment::Segment()
output_cues_(true),
accurate_cluster_duration_(false),
fixed_size_cluster_timecode_(false),
- estimate_file_duration_(true),
+ estimate_file_duration_(false),
payload_pos_(0),
size_position_(0),
doc_type_version_(kDefaultDocTypeVersion),
@@ -3361,7 +3366,10 @@ uint64_t Segment::AddVideoTrack(int32_t width, int32_t height, int32_t number) {
track->set_width(width);
track->set_height(height);
- tracks_.AddTrack(track, number);
+ if (!tracks_.AddTrack(track, number)) {
+ delete track;
+ return 0;
+ }
has_video_ = true;
return track->number();
@@ -3383,8 +3391,10 @@ bool Segment::AddCuePoint(uint64_t timestamp, uint64_t track) {
cue->set_block_number(cluster->blocks_added());
cue->set_cluster_pos(cluster->position_for_cues());
cue->set_track(track);
- if (!cues_.AddCue(cue))
+ if (!cues_.AddCue(cue)) {
+ delete cue;
return false;
+ }
new_cuepoint_ = false;
return true;
@@ -3401,7 +3411,10 @@ uint64_t Segment::AddAudioTrack(int32_t sample_rate, int32_t channels,
track->set_sample_rate(sample_rate);
track->set_channels(channels);
- tracks_.AddTrack(track, number);
+ if (!tracks_.AddTrack(track, number)) {
+ delete track;
+ return 0;
+ }
return track->number();
}
@@ -3490,16 +3503,33 @@ bool Segment::AddGenericFrame(const Frame* frame) {
if (frame->discard_padding() != 0)
doc_type_version_ = 4;
+ if (cluster_list_size_ > 0) {
+ const uint64_t timecode_scale = segment_info_.timecode_scale();
+ const uint64_t frame_timecode = frame->timestamp() / timecode_scale;
+
+ const Cluster* const last_cluster = cluster_list_[cluster_list_size_ - 1];
+ const uint64_t last_cluster_timecode = last_cluster->timecode();
+
+ const uint64_t rel_timecode = frame_timecode - last_cluster_timecode;
+ if (rel_timecode > kMaxBlockTimecode) {
+ force_new_cluster_ = true;
+ }
+ }
+
// If the segment has a video track hold onto audio frames to make sure the
// audio that is associated with the start time of a video key-frame is
// muxed into the same cluster.
if (has_video_ && tracks_.TrackIsAudio(frame->track_number()) &&
!force_new_cluster_) {
Frame* const new_frame = new (std::nothrow) Frame();
- if (!new_frame || !new_frame->CopyFrom(*frame))
+ if (!new_frame || !new_frame->CopyFrom(*frame)) {
+ delete new_frame;
return false;
- if (!QueueFrame(new_frame))
+ }
+ if (!QueueFrame(new_frame)) {
+ delete new_frame;
return false;
+ }
track_frames_written_[frame->track_number() - 1]++;
return true;
}
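
The new cluster check above guards against relative-timecode overflow: a Matroska block stores its timecode as a signed 16-bit offset from the containing cluster's timecode, so once a frame lands more than kMaxBlockTimecode ticks past the cluster start it can no longer be represented and a new cluster must be opened. A sketch of the decision (0x7FFF matches libwebm's kMaxBlockTimecode; the helper itself is illustrative):

    #include <cstdint>

    const int64_t kMaxBlockTimecodeSketch = 0x7FFF;  // signed 16-bit offset

    // True when a frame's timecode can no longer be expressed relative to
    // the cluster it would land in, so a new cluster must be started.
    bool NeedsNewCluster(uint64_t timestamp_ns, uint64_t timecode_scale,
                         uint64_t cluster_timecode) {
      const uint64_t frame_timecode = timestamp_ns / timecode_scale;
      return frame_timecode - cluster_timecode >
             static_cast<uint64_t>(kMaxBlockTimecodeSketch);
    }
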
@@ -3522,8 +3552,10 @@ bool Segment::AddGenericFrame(const Frame* frame) {
if (!frame->CanBeSimpleBlock() && !frame->is_key() &&
!frame->reference_block_timestamp_set()) {
Frame* const new_frame = new (std::nothrow) Frame();
- if (!new_frame->CopyFrom(*frame))
+ if (!new_frame || !new_frame->CopyFrom(*frame)) {
+ delete new_frame;
return false;
+ }
new_frame->set_reference_block_timestamp(
last_track_timestamp_[frame->track_number() - 1]);
frame = new_frame;
diff --git a/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
index 1ba17ac1b..355d4e22b 100644
--- a/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
+++ b/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
@@ -10,6 +10,7 @@
#ifdef __ANDROID__
#include <fcntl.h>
+#include <unistd.h>
#endif
#include <cassert>
@@ -288,7 +289,7 @@ uint64 EbmlElementSize(uint64 type, const char* value) {
ebml_size += strlen(value);
// Size of Datasize
- ebml_size++;
+ ebml_size += GetCodedUIntSize(strlen(value));
return ebml_size;
}
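
The EbmlElementSize fix above corrects a size-bookkeeping bug: the element's data-size field is itself an EBML variable-length integer, so for string payloads of 127 bytes or more it occupies more than the single byte the old ebml_size++ assumed. The coded size grows by roughly one byte per 7 bits of value; a sketch, not libwebm's exact GetCodedUIntSize implementation:

    #include <cstdint>

    // EBML VINTs use the leading bits of the first byte as a length marker,
    // leaving 7 usable payload bits per encoded byte (all-ones is reserved).
    int CodedUIntSizeSketch(uint64_t value) {
      int size = 1;
      while (size < 8 && value >= (1ULL << (7 * size)) - 1) ++size;
      return size;
    }
    // CodedUIntSizeSketch(126) == 1, CodedUIntSizeSketch(127) == 2, so a
    // 300-character string needs a 2-byte data-size field, not 1.
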
diff --git a/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc b/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc
index ec34e4df8..84655d802 100644
--- a/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc
+++ b/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc
@@ -8,6 +8,8 @@
#include "mkvmuxer/mkvwriter.h"
+#include <sys/types.h>
+
#ifdef _MSC_VER
#include <share.h> // for _SH_DENYWR
#endif
diff --git a/libvpx/third_party/libwebm/mkvparser/mkvparser.cc b/libvpx/third_party/libwebm/mkvparser/mkvparser.cc
index e62d6f607..37f230d0a 100644
--- a/libvpx/third_party/libwebm/mkvparser/mkvparser.cc
+++ b/libvpx/third_party/libwebm/mkvparser/mkvparser.cc
@@ -22,6 +22,11 @@
#include "common/webmids.h"
+// disable deprecation warnings for auto_ptr
+#if defined(__GNUC__) && __GNUC__ >= 5
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
namespace mkvparser {
const float MasteringMetadata::kValueNotPresent = FLT_MAX;
const long long Colour::kValueNotPresent = LLONG_MAX;
@@ -1528,15 +1533,19 @@ long SeekHead::Parse() {
if (pos != stop)
return E_FILE_FORMAT_INVALID;
- m_entries = new (std::nothrow) Entry[entry_count];
+ if (entry_count > 0) {
+ m_entries = new (std::nothrow) Entry[entry_count];
- if (m_entries == NULL)
- return -1;
+ if (m_entries == NULL)
+ return -1;
+ }
- m_void_elements = new (std::nothrow) VoidElement[void_element_count];
+ if (void_element_count > 0) {
+ m_void_elements = new (std::nothrow) VoidElement[void_element_count];
- if (m_void_elements == NULL)
- return -1;
+ if (m_void_elements == NULL)
+ return -1;
+ }
// now parse the entries and void elements
@@ -1555,14 +1564,14 @@ long SeekHead::Parse() {
if (status < 0) // error
return status;
- if (id == libwebm::kMkvSeek) {
+ if (id == libwebm::kMkvSeek && entry_count > 0) {
if (ParseEntry(pReader, pos, size, pEntry)) {
Entry& e = *pEntry++;
e.element_start = idpos;
e.element_size = (pos + size) - idpos;
}
- } else if (id == libwebm::kMkvVoid) {
+ } else if (id == libwebm::kMkvVoid && void_element_count > 0) {
VoidElement& e = *pVoidElement++;
e.element_start = idpos;
@@ -2426,7 +2435,9 @@ bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_,
}
const CuePoint::TrackPosition* CuePoint::Find(const Track* pTrack) const {
- assert(pTrack);
+ if (pTrack == NULL) {
+ return NULL;
+ }
const long long n = pTrack->GetNumber();
@@ -4026,7 +4037,7 @@ long SegmentInfo::Parse() {
}
const double rollover_check = m_duration * m_timecodeScale;
- if (rollover_check > LLONG_MAX)
+ if (rollover_check > static_cast<double>(LLONG_MAX))
return E_FILE_FORMAT_INVALID;
if (pos != stop)
@@ -4975,29 +4986,27 @@ bool PrimaryChromaticity::Parse(IMkvReader* reader, long long read_pos,
if (!reader)
return false;
- std::auto_ptr<PrimaryChromaticity> chromaticity_ptr;
-
- if (!*chromaticity) {
- chromaticity_ptr.reset(new PrimaryChromaticity());
- } else {
- chromaticity_ptr.reset(*chromaticity);
- }
+ if (!*chromaticity)
+ *chromaticity = new PrimaryChromaticity();
- if (!chromaticity_ptr.get())
+ if (!*chromaticity)
return false;
- float* value = is_x ? &chromaticity_ptr->x : &chromaticity_ptr->y;
+ PrimaryChromaticity* pc = *chromaticity;
+ float* value = is_x ? &pc->x : &pc->y;
double parser_value = 0;
- const long long value_parse_status =
+ const long long parse_status =
UnserializeFloat(reader, read_pos, value_size, parser_value);
- *value = static_cast<float>(parser_value);
-
- if (value_parse_status < 0 || *value < 0.0 || *value > 1.0)
+ // Valid range is [0, 1]. Make sure the double is representable as a float
+ // before casting.
+ if (parse_status < 0 || parser_value < 0.0 || parser_value > 1.0 ||
+ (parser_value > 0.0 && parser_value < FLT_MIN))
return false;
- *chromaticity = chromaticity_ptr.release();
+ *value = static_cast<float>(parser_value);
+
return true;
}
@@ -5228,7 +5237,9 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size,
double value = 0;
const long long value_parse_status =
UnserializeFloat(reader, read_pos, child_size, value);
- if (value_parse_status < 0) {
+ // Make sure value is representable as a float before casting.
+ if (value_parse_status < 0 || value < -FLT_MAX || value > FLT_MAX ||
+ (value > 0.0 && value < FLT_MIN)) {
return false;
}
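
Both float checks above follow the same idea: validate the parsed double before narrowing it to float. A value just above zero but below FLT_MIN would collapse into the subnormal range, and anything outside +/-FLT_MAX would become infinity, so the parser now rejects such values instead of storing a silently distorted one (the chromaticity variant additionally enforces [0, 1]). A slightly more symmetric helper than either inline check, for illustration only:

    #include <cfloat>

    // True if v survives a double -> float narrowing without overflowing to
    // infinity or falling into the subnormal range.
    bool RepresentableAsFloat(double v) {
      if (v < -FLT_MAX || v > FLT_MAX) return false;
      if (v != 0.0 && v > -FLT_MIN && v < FLT_MIN) return false;
      return true;
    }
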
@@ -7932,7 +7943,6 @@ long Block::Parse(const Cluster* pCluster) {
pf = m_frames;
while (pf != pf_end) {
Frame& f = *pf++;
- assert((pos + f.len) <= stop);
if ((pos + f.len) > stop)
return E_FILE_FORMAT_INVALID;
diff --git a/libvpx/third_party/libwebm/mkvparser/mkvreader.cc b/libvpx/third_party/libwebm/mkvparser/mkvreader.cc
index b8fd00c26..23d68f508 100644
--- a/libvpx/third_party/libwebm/mkvparser/mkvreader.cc
+++ b/libvpx/third_party/libwebm/mkvparser/mkvreader.cc
@@ -7,6 +7,8 @@
// be found in the AUTHORS file in the root of the source tree.
#include "mkvparser/mkvreader.h"
+#include <sys/types.h>
+
#include <cassert>
namespace mkvparser {
diff --git a/libvpx/tools.mk b/libvpx/tools.mk
index 23adcee6e..1d005b2ac 100644
--- a/libvpx/tools.mk
+++ b/libvpx/tools.mk
@@ -13,6 +13,8 @@ TOOLS-yes += tiny_ssim.c
tiny_ssim.SRCS += vpx/vpx_integer.h y4minput.c y4minput.h \
vpx/vpx_codec.h vpx/src/vpx_image.c
tiny_ssim.SRCS += vpx_mem/vpx_mem.c vpx_mem/vpx_mem.h
+tiny_ssim.SRCS += vpx_dsp/ssim.h vpx_scale/yv12config.h
+tiny_ssim.SRCS += vpx_ports/mem.h
tiny_ssim.SRCS += vpx_mem/include/vpx_mem_intrnl.h
tiny_ssim.GUID = 3afa9b05-940b-4d68-b5aa-55157d8ed7b4
tiny_ssim.DESCRIPTION = Generate SSIM/PSNR from raw .yuv files
diff --git a/libvpx/tools/all_builds.py b/libvpx/tools/all_builds.py
deleted file mode 100755
index d1f0c80c0..000000000
--- a/libvpx/tools/all_builds.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/usr/bin/python
-
-import getopt
-import subprocess
-import sys
-
-LONG_OPTIONS = ["shard=", "shards="]
-BASE_COMMAND = "./configure --enable-internal-stats --enable-experimental"
-
-def RunCommand(command):
- run = subprocess.Popen(command, shell=True)
- output = run.communicate()
- if run.returncode:
- print "Non-zero return code: " + str(run.returncode) + " => exiting!"
- sys.exit(1)
-
-def list_of_experiments():
- experiments = []
- configure_file = open("configure")
- list_start = False
- for line in configure_file.read().split("\n"):
- if line == 'EXPERIMENT_LIST="':
- list_start = True
- elif line == '"':
- list_start = False
- elif list_start:
- currently_broken = ["csm"]
- experiment = line[4:]
- if experiment not in currently_broken:
- experiments.append(experiment)
- return experiments
-
-def main(argv):
- # Parse arguments
- options = {"--shard": 0, "--shards": 1}
- if "--" in argv:
- opt_end_index = argv.index("--")
- else:
- opt_end_index = len(argv)
- try:
- o, _ = getopt.getopt(argv[1:opt_end_index], None, LONG_OPTIONS)
- except getopt.GetoptError, err:
- print str(err)
- print "Usage: %s [--shard=<n> --shards=<n>] -- [configure flag ...]"%argv[0]
- sys.exit(2)
-
- options.update(o)
- extra_args = argv[opt_end_index + 1:]
-
- # Shard experiment list
- shard = int(options["--shard"])
- shards = int(options["--shards"])
- experiments = list_of_experiments()
- base_command = " ".join([BASE_COMMAND] + extra_args)
- configs = [base_command]
- configs += ["%s --enable-%s" % (base_command, e) for e in experiments]
- my_configs = zip(configs, range(len(configs)))
- my_configs = filter(lambda x: x[1] % shards == shard, my_configs)
- my_configs = [e[0] for e in my_configs]
-
- # Run configs for this shard
- for config in my_configs:
- test_build(config)
-
-def test_build(configure_command):
- print "\033[34m\033[47mTesting %s\033[0m" % (configure_command)
- RunCommand(configure_command)
- RunCommand("make clean")
- RunCommand("make")
-
-if __name__ == "__main__":
- main(sys.argv)
diff --git a/libvpx/tools/author_first_release.sh b/libvpx/tools/author_first_release.sh
deleted file mode 100755
index 7b0b79721..000000000
--- a/libvpx/tools/author_first_release.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-##
-## List the release each author first contributed to.
-##
-## Usage: author_first_release.sh [TAGS]
-##
-## If the TAGS arguments are unspecified, all tags reported by `git tag`
-## will be considered.
-##
-tags=${@:-$(git tag)}
-for tag in $tags; do
- git shortlog -n -e -s $tag |
- cut -f2- |
- awk "{print \"${tag#v}\t\"\$0}"
-done | sort -k2 | uniq -f2
diff --git a/libvpx/tools/ftfy.sh b/libvpx/tools/ftfy.sh
deleted file mode 100755
index c005918fe..000000000
--- a/libvpx/tools/ftfy.sh
+++ /dev/null
@@ -1,158 +0,0 @@
-#!/bin/sh
-self="$0"
-dirname_self=$(dirname "$self")
-
-usage() {
- cat <<EOF >&2
-Usage: $self [option]
-
-This script applies a whitespace transformation to the commit at HEAD. If no
-options are given, then the modified files are left in the working tree.
-
-Options:
- -h, --help Shows this message
- -n, --dry-run Shows a diff of the changes to be made.
- --amend Squashes the changes into the commit at HEAD
- This option will also reformat the commit message.
- --commit Creates a new commit containing only the whitespace changes
- --msg-only Reformat the commit message only, ignore the patch itself.
-
-EOF
- rm -f ${CLEAN_FILES}
- exit 1
-}
-
-
-log() {
- echo "${self##*/}: $@" >&2
-}
-
-
-vpx_style() {
- for f; do
- case "$f" in
- *.h|*.c|*.cc)
- clang-format -i --style=file "$f"
- ;;
- esac
- done
-}
-
-
-apply() {
- [ $INTERSECT_RESULT -ne 0 ] && patch -p1 < "$1"
-}
-
-
-commit() {
- LAST_CHANGEID=$(git show | awk '/Change-Id:/{print $2}')
- if [ -z "$LAST_CHANGEID" ]; then
- log "HEAD doesn't have a Change-Id, unable to generate a new commit"
- exit 1
- fi
-
- # Build a deterministic Change-Id from the parent's
- NEW_CHANGEID=${LAST_CHANGEID}-styled
- NEW_CHANGEID=I$(echo $NEW_CHANGEID | git hash-object --stdin)
-
- # Commit, preserving authorship from the parent commit.
- git commit -a -C HEAD > /dev/null
- git commit --amend -F- << EOF
-Cosmetic: Fix whitespace in change ${LAST_CHANGEID:0:9}
-
-Change-Id: ${NEW_CHANGEID}
-EOF
-}
-
-
-show_commit_msg_diff() {
- if [ $DIFF_MSG_RESULT -ne 0 ]; then
- log "Modified commit message:"
- diff -u "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG" | tail -n +3
- fi
-}
-
-
-amend() {
- show_commit_msg_diff
- if [ $DIFF_MSG_RESULT -ne 0 ] || [ $INTERSECT_RESULT -ne 0 ]; then
- git commit -a --amend -F "$NEW_COMMIT_MSG"
- fi
-}
-
-
-diff_msg() {
- git log -1 --format=%B > "$ORIG_COMMIT_MSG"
- "${dirname_self}"/wrap-commit-msg.py \
- < "$ORIG_COMMIT_MSG" > "$NEW_COMMIT_MSG"
- cmp -s "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG"
- DIFF_MSG_RESULT=$?
-}
-
-
-# Temporary files
-ORIG_DIFF=orig.diff.$$
-MODIFIED_DIFF=modified.diff.$$
-FINAL_DIFF=final.diff.$$
-ORIG_COMMIT_MSG=orig.commit-msg.$$
-NEW_COMMIT_MSG=new.commit-msg.$$
-CLEAN_FILES="${ORIG_DIFF} ${MODIFIED_DIFF} ${FINAL_DIFF}"
-CLEAN_FILES="${CLEAN_FILES} ${ORIG_COMMIT_MSG} ${NEW_COMMIT_MSG}"
-
-# Preconditions
-[ $# -lt 2 ] || usage
-
-if ! clang-format -version >/dev/null 2>&1; then
- log "clang-format not found"
- exit 1
-fi
-
-if ! git diff --quiet HEAD; then
- log "Working tree is dirty, commit your changes first"
- exit 1
-fi
-
-# Need to be in the root
-cd "$(git rev-parse --show-toplevel)"
-
-# Collect the original diff
-git show > "${ORIG_DIFF}"
-
-# Apply the style guide on new and modified files and collect its diff
-for f in $(git diff HEAD^ --name-only -M90 --diff-filter=AM); do
- case "$f" in
- third_party/*) continue;;
- esac
- vpx_style "$f"
-done
-git diff --no-color --no-ext-diff > "${MODIFIED_DIFF}"
-
-# Intersect the two diffs
-"${dirname_self}"/intersect-diffs.py \
- "${ORIG_DIFF}" "${MODIFIED_DIFF}" > "${FINAL_DIFF}"
-INTERSECT_RESULT=$?
-git reset --hard >/dev/null
-
-# Fixup the commit message
-diff_msg
-
-# Handle options
-if [ -n "$1" ]; then
- case "$1" in
- -h|--help) usage;;
- -n|--dry-run) cat "${FINAL_DIFF}"; show_commit_msg_diff;;
- --commit) apply "${FINAL_DIFF}"; commit;;
- --amend) apply "${FINAL_DIFF}"; amend;;
- --msg-only) amend;;
- *) usage;;
- esac
-else
- apply "${FINAL_DIFF}"
- if ! git diff --quiet; then
- log "Formatting changes applied, verify and commit."
- log "See also: http://www.webmproject.org/code/contribute/conventions/"
- git diff --stat
- fi
-fi
-
-rm -f ${CLEAN_FILES}
diff --git a/libvpx/tools/tiny_ssim.c b/libvpx/tools/tiny_ssim.c
index 1f6a448bc..5e8ca02b4 100644
--- a/libvpx/tools/tiny_ssim.c
+++ b/libvpx/tools/tiny_ssim.c
@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
#include <errno.h>
#include <math.h>
#include <stdio.h>
@@ -16,73 +17,36 @@
#include "vpx/vpx_codec.h"
#include "vpx/vpx_integer.h"
#include "./y4minput.h"
+#include "vpx_dsp/ssim.h"
+#include "vpx_ports/mem.h"
+
+static const int64_t cc1 = 26634;        // (64^2*(.01*255)^2)
+static const int64_t cc2 = 239708;       // (64^2*(.03*255)^2)
+static const int64_t cc1_10 = 428658;    // (64^2*(.01*1023)^2)
+static const int64_t cc2_10 = 3857925;   // (64^2*(.03*1023)^2)
+static const int64_t cc1_12 = 6868593;   // (64^2*(.01*4095)^2)
+static const int64_t cc2_12 = 61817334;  // (64^2*(.03*4095)^2)
+
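For reference, the rounded constants above follow from the usual SSIM k1 = .01 and k2 = .03 with a 64-pixel window; a standalone check (not part of the patch):

/* Verifies the rounded SSIM constants above. */
#include <math.h>
#include <stdio.h>

int main(void) {
  const int peaks[] = { 255, 1023, 4095 }; /* 8-, 10- and 12-bit peaks */
  int i;
  for (i = 0; i < 3; ++i) {
    const double c1 = 4096 * pow(0.01 * peaks[i], 2); /* 64^2*(.01*peak)^2 */
    const double c2 = 4096 * pow(0.03 * peaks[i], 2); /* 64^2*(.03*peak)^2 */
    printf("peak %4d: cc1 ~ %.0f, cc2 ~ %.0f\n", peaks[i], c1, c2);
  }
  return 0;
}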
+#if CONFIG_VP9_HIGHBITDEPTH
+static uint64_t calc_plane_error16(uint16_t *orig, int orig_stride,
+ uint16_t *recon, int recon_stride,
+ unsigned int cols, unsigned int rows) {
+ unsigned int row, col;
+ uint64_t total_sse = 0;
+ int diff;
-void vp8_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp,
- uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
- uint32_t *sum_sq_r, uint32_t *sum_sxr) {
- int i, j;
- for (i = 0; i < 8; i++, s += sp, r += rp) {
- for (j = 0; j < 8; j++) {
- *sum_s += s[j];
- *sum_r += r[j];
- *sum_sq_s += s[j] * s[j];
- *sum_sq_r += r[j] * r[j];
- *sum_sxr += s[j] * r[j];
+ for (row = 0; row < rows; row++) {
+ for (col = 0; col < cols; col++) {
+ diff = orig[col] - recon[col];
+ total_sse += diff * diff;
}
- }
-}
-
-static const int64_t cc1 = 26634; // (64^2*(.01*255)^2
-static const int64_t cc2 = 239708; // (64^2*(.03*255)^2
-static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
- uint32_t sum_sq_r, uint32_t sum_sxr, int count) {
- int64_t ssim_n, ssim_d;
- int64_t c1, c2;
-
- // scale the constants by number of pixels
- c1 = (cc1 * count * count) >> 12;
- c2 = (cc2 * count * count) >> 12;
-
- ssim_n = (2 * sum_s * sum_r + c1) *
- ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2);
-
- ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
- ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
- (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2);
-
- return ssim_n * 1.0 / ssim_d;
-}
-
-static double ssim_8x8(unsigned char *s, int sp, unsigned char *r, int rp) {
- uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
- vp8_ssim_parms_8x8_c(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
- &sum_sxr);
- return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
-}
-
-// We are using a 8x8 moving window with starting location of each 8x8 window
-// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
-// block boundaries to penalize blocking artifacts.
-double vp8_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1,
- int stride_img2, int width, int height) {
- int i, j;
- int samples = 0;
- double ssim_total = 0;
-
- // sample point start with each 4x4 location
- for (i = 0; i <= height - 8;
- i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
- for (j = 0; j <= width - 8; j += 4) {
- double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
- ssim_total += v;
- samples++;
- }
+ orig += orig_stride;
+ recon += recon_stride;
}
- ssim_total /= samples;
- return ssim_total;
+ return total_sse;
}
-
+#endif
static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, uint8_t *recon,
int recon_stride, unsigned int cols,
unsigned int rows) {
@@ -103,7 +67,7 @@ static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, uint8_t *recon,
}
#define MAX_PSNR 100
-double vp9_mse2psnr(double samples, double peak, double mse) {
+static double mse2psnr(double samples, double peak, double mse) {
double psnr;
if (mse > 0.0)
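The hunk only shows the renamed signature; a minimal sketch of the conversion, assuming the body keeps the usual libvpx form where the last argument is the summed squared error over `samples` values:

#include <math.h>

#define MAX_PSNR 100

/* Sketch: PSNR from total squared error, capped at MAX_PSNR. */
static double mse2psnr_sketch(double samples, double peak, double sse) {
  double psnr = MAX_PSNR;
  if (sse > 0.0) psnr = 10.0 * log10(samples * peak * peak / sse);
  return psnr > MAX_PSNR ? MAX_PSNR : psnr;
}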
@@ -126,10 +90,12 @@ typedef struct input_file {
vpx_image_t img;
int w;
int h;
+ int bit_depth;
} input_file_t;
// Open a file and determine if its y4m or raw. If y4m get the header.
-int open_input_file(const char *file_name, input_file_t *input, int w, int h) {
+static int open_input_file(const char *file_name, input_file_t *input, int w,
+ int h, int bit_depth) {
char y4m_buf[4];
size_t r1;
input->type = RAW_YUV;
@@ -144,6 +110,7 @@ int open_input_file(const char *file_name, input_file_t *input, int w, int h) {
y4m_input_open(&input->y4m, input->file, y4m_buf, 4, 0);
input->w = input->y4m.pic_w;
input->h = input->y4m.pic_h;
+ input->bit_depth = input->y4m.bit_depth;
// Y4M alloc's its own buf. Init this to avoid problems if we never
// read frames.
memset(&input->img, 0, sizeof(input->img));
@@ -152,14 +119,17 @@ int open_input_file(const char *file_name, input_file_t *input, int w, int h) {
fseek(input->file, 0, SEEK_SET);
input->w = w;
input->h = h;
- input->buf = malloc(w * h * 3 / 2);
+ if (bit_depth < 9)
+ input->buf = malloc(w * h * 3 / 2);
+ else
+ input->buf = malloc(w * h * 3);
break;
}
}
return 0;
}
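The allocation rule reads directly off the 4:2:0 geometry: w*h*3/2 samples per frame, two bytes per sample above 8 bits. A sketch:

#include <stddef.h>

/* Sketch of the raw-YUV buffer sizing used in open_input_file(). */
static size_t raw_frame_bytes(int w, int h, int bit_depth) {
  const size_t samples = (size_t)w * h * 3 / 2; /* 4:2:0 sample count */
  return bit_depth < 9 ? samples : samples * 2; /* 2 bytes per sample */
}
/* e.g. 1920x1080: 3,110,400 bytes at 8 bits, 6,220,800 above 8 bits. */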
-void close_input_file(input_file_t *in) {
+static void close_input_file(input_file_t *in) {
if (in->file) fclose(in->file);
if (in->type == Y4M) {
vpx_img_free(&in->img);
@@ -168,8 +138,8 @@ void close_input_file(input_file_t *in) {
}
}
-size_t read_input_file(input_file_t *in, unsigned char **y, unsigned char **u,
- unsigned char **v) {
+static size_t read_input_file(input_file_t *in, unsigned char **y,
+ unsigned char **u, unsigned char **v, int bd) {
size_t r1 = 0;
switch (in->type) {
case Y4M:
@@ -179,18 +149,429 @@ size_t read_input_file(input_file_t *in, unsigned char **y, unsigned char **u,
*v = in->img.planes[2];
break;
case RAW_YUV:
- r1 = fread(in->buf, in->w * in->h * 3 / 2, 1, in->file);
- *y = in->buf;
- *u = in->buf + in->w * in->h;
- *v = in->buf + 5 * in->w * in->h / 4;
+ if (bd < 9) {
+ r1 = fread(in->buf, in->w * in->h * 3 / 2, 1, in->file);
+ *y = in->buf;
+ *u = in->buf + in->w * in->h;
+ *v = in->buf + 5 * in->w * in->h / 4;
+ } else {
+ r1 = fread(in->buf, in->w * in->h * 3, 1, in->file);
+ *y = in->buf;
+ *u = in->buf + in->w * in->h / 2;
+ *v = *u + in->w * in->h / 2;
+ }
break;
}
return r1;
}
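For the 8-bit branch the pointers follow the standard packed planar 4:2:0 layout; written out as a sketch:

/* Sketch of the 8-bit planar 4:2:0 offsets computed above. */
static void yuv420_planes(unsigned char *buf, int w, int h,
                          unsigned char **y, unsigned char **u,
                          unsigned char **v) {
  *y = buf;                 /* w*h luma bytes */
  *u = buf + w * h;         /* w*h/4 chroma bytes */
  *v = buf + 5 * w * h / 4; /* = *u + w*h/4 */
}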
+void ssim_parms_16x16(const uint8_t *s, int sp, const uint8_t *r, int rp,
+ uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
+ uint32_t *sum_sq_r, uint32_t *sum_sxr) {
+ int i, j;
+ for (i = 0; i < 16; i++, s += sp, r += rp) {
+ for (j = 0; j < 16; j++) {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+void ssim_parms_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp,
+ uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
+ uint32_t *sum_sq_r, uint32_t *sum_sxr) {
+ int i, j;
+ for (i = 0; i < 8; i++, s += sp, r += rp) {
+ for (j = 0; j < 8; j++) {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+
+void highbd_ssim_parms_8x8(const uint16_t *s, int sp, const uint16_t *r, int rp,
+ uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
+ uint32_t *sum_sq_r, uint32_t *sum_sxr) {
+ int i, j;
+ for (i = 0; i < 8; i++, s += sp, r += rp) {
+ for (j = 0; j < 8; j++) {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+
+static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
+ uint32_t sum_sq_r, uint32_t sum_sxr, int count,
+ uint32_t bd) {
+ int64_t ssim_n, ssim_d;
+ int64_t c1 = 0, c2 = 0;
+ if (bd == 8) {
+ // scale the constants by number of pixels
+ c1 = (cc1 * count * count) >> 12;
+ c2 = (cc2 * count * count) >> 12;
+ } else if (bd == 10) {
+ c1 = (cc1_10 * count * count) >> 12;
+ c2 = (cc2_10 * count * count) >> 12;
+ } else if (bd == 12) {
+ c1 = (cc1_12 * count * count) >> 12;
+ c2 = (cc2_12 * count * count) >> 12;
+ } else {
+ assert(0);
+ }
+
+ ssim_n = (2 * sum_s * sum_r + c1) *
+ ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2);
+
+ ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
+ ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
+ (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2);
+
+ return ssim_n * 1.0 / ssim_d;
+}
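Note that for the 8x8 window count is 64, so count*count equals 4096 = 1 << 12 and the shift cancels the scaling exactly; a one-line check:

#include <assert.h>

int main(void) {
  const long long cc1 = 26634, count = 64;
  assert(((cc1 * count * count) >> 12) == cc1); /* 64*64 == 1 << 12 */
  return 0;
}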
+
+static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
+ uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+ ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+ return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8);
+}
+
+static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
+ int rp, uint32_t bd, uint32_t shift) {
+ uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+ highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+ &sum_sxr);
+ return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift),
+ sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd);
+}
+
+// We are using an 8x8 moving window with the starting location of each 8x8
+// window on the 4x4 pixel grid. Such an arrangement allows the windows to
+// overlap block boundaries to penalize blocking artifacts.
+static double ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1,
+ int stride_img2, int width, int height) {
+ int i, j;
+ int samples = 0;
+ double ssim_total = 0;
+
+  // sample points start at each 4x4 location
+ for (i = 0; i <= height - 8;
+ i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+ for (j = 0; j <= width - 8; j += 4) {
+ double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
+ ssim_total += v;
+ samples++;
+ }
+ }
+ ssim_total /= samples;
+ return ssim_total;
+}
+
+static double highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
+ int stride_img1, int stride_img2, int width,
+ int height, uint32_t bd, uint32_t shift) {
+ int i, j;
+ int samples = 0;
+ double ssim_total = 0;
+
+  // sample points start at each 4x4 location
+ for (i = 0; i <= height - 8;
+ i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+ for (j = 0; j <= width - 8; j += 4) {
+ double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
+ CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd,
+ shift);
+ ssim_total += v;
+ samples++;
+ }
+ }
+ ssim_total /= samples;
+ return ssim_total;
+}
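With the 4-pixel step, the number of windows visited (and hence `samples`) has a closed form; a sketch:

/* Windows visited by the 4x4-stepped 8x8 loops above. */
static int ssim_sample_count(int width, int height) {
  if (width < 8 || height < 8) return 0;
  return ((width - 8) / 4 + 1) * ((height - 8) / 4 + 1);
}
/* e.g. 64x64 gives 15 * 15 = 225 overlapping windows. */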
+
+// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity
+//
+// Reworking the math ->
+//
+// ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) /
+// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2))
+//
+// mean(x) = sum(x) / n
+//
+// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n)
+//
+// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n)
+//
+// ssim(x,y) =
+// (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) /
+// (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) *
+// ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+
+// (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2)))
+//
+// factoring out n*n
+//
+// ssim(x,y) =
+// (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) /
+// (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) *
+// (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2))
+//
+// Replace c1 with n*n * c1 for the final step that leads to this code:
+// The final step scales by 12 bits so we don't lose precision in the constants.
+
+static double ssimv_similarity(const Ssimv *sv, int64_t n) {
+ // Scale the constants by number of pixels.
+ const int64_t c1 = (cc1 * n * n) >> 12;
+ const int64_t c2 = (cc2 * n * n) >> 12;
+
+ const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) /
+ (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1);
+
+ // Since these variables are unsigned sums, convert to double so
+ // math is done in double arithmetic.
+ const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
+ (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+ n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
+
+ return l * v;
+}
+
+// The first term of the ssim metric is a luminance factor.
+//
+// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1)
+//
+// This luminance factor is super sensitive to the dark side of luminance
+// values and completely insensitive on the white side. Check out the two
+// sets (1,3) and (250,252): the term gives 2*1*3/(1+9) = .60 and
+// 2*250*252/(250^2+252^2) => .999968
+//
+// As a result, this tweaked version of the calculation takes the
+// luminance as a percentage off from the peak possible value:
+//
+// 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count
+//
+static double ssimv_similarity2(const Ssimv *sv, int64_t n) {
+ // Scale the constants by number of pixels.
+ const int64_t c1 = (cc1 * n * n) >> 12;
+ const int64_t c2 = (cc2 * n * n) >> 12;
+
+ const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n;
+ const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1);
+
+  // Since these variables are unsigned sums, convert to double so
+ // math is done in double arithmetic.
+ const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
+ (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+ n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
+
+ return l * v;
+}
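A quick numeric illustration of the tweaked term, using a hypothetical mean difference of 2: since it depends only on the difference of the means, dark and bright regions are treated alike:

#include <stdio.h>

int main(void) {
  const double n = 64;
  const double c1 = (26634.0 * n * n) / 4096; /* == cc1 after the >> 12 */
  const double mean_diff = 2.0;               /* hypothetical */
  const double l =
      (255.0 * 255 - mean_diff * mean_diff + c1) / (255.0 * 255 + c1);
  printf("l = %f\n", l); /* ~0.99996: near 1 for small mean differences */
  return 0;
}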
+static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2,
+ int img2_pitch, Ssimv *sv) {
+ ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r,
+ &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr);
+}
+
+double get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
+ int img2_pitch, int width, int height, Ssimv *sv2,
+ Metrics *m, int do_inconsistency) {
+ double dssim_total = 0;
+ double ssim_total = 0;
+ double ssim2_total = 0;
+ double inconsistency_total = 0;
+ int i, j;
+ int c = 0;
+ double norm;
+ double old_ssim_total = 0;
+
+  // We can sample points as frequently as we like; start with 1 per 4x4.
+ for (i = 0; i < height;
+ i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
+ for (j = 0; j < width; j += 4, ++c) {
+ Ssimv sv = { 0, 0, 0, 0, 0, 0 };
+ double ssim;
+ double ssim2;
+ double dssim;
+ uint32_t var_new;
+ uint32_t var_old;
+ uint32_t mean_new;
+ uint32_t mean_old;
+ double ssim_new;
+ double ssim_old;
+
+ // Not sure there's a great way to handle the edge pixels
+      // in ssim when using a window. Seems biased against edge pixels,
+      // however you handle this. This uses only samples that are
+ // fully in the frame.
+ if (j + 8 <= width && i + 8 <= height) {
+ ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv);
+ }
+
+ ssim = ssimv_similarity(&sv, 64);
+ ssim2 = ssimv_similarity2(&sv, 64);
+
+ sv.ssim = ssim2;
+
+ // dssim is calculated to use as an actual error metric and
+ // is scaled up to the same range as sum square error.
+ // Since we are subsampling every 16th point maybe this should be
+ // *16 ?
+ dssim = 255 * 255 * (1 - ssim2) / 2;
+
+ // Here I introduce a new error metric: consistency-weighted
+ // SSIM-inconsistency. This metric isolates frames where the
+ // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much
+ // sharper or blurrier than the others. Higher values indicate a
+ // temporally inconsistent SSIM. There are two ideas at work:
+ //
+ // 1) 'SSIM-inconsistency': the total inconsistency value
+ // reflects how much SSIM values are changing between this
+ // source / reference frame pair and the previous pair.
+ //
+ // 2) 'consistency-weighted': weights de-emphasize areas in the
+ // frame where the scene content has changed. Changes in scene
+ // content are detected via changes in local variance and local
+ // mean.
+ //
+ // Thus the overall measure reflects how inconsistent the SSIM
+ // values are, over consistent regions of the frame.
+ //
+ // The metric has three terms:
+ //
+ // term 1 -> uses change in scene Variance to weight error score
+ // 2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2)
+ // larger changes from one frame to the next mean we care
+ // less about consistency.
+ //
+ // term 2 -> uses change in local scene luminance to weight error
+ // 2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2)
+ // larger changes from one frame to the next mean we care
+ // less about consistency.
+ //
+      //    term 3 -> measures inconsistency in ssim scores between frames
+      //    1 - (2 * ssim(Fi)*ssim(Fi-1)/(ssim(Fi)^2+ssim(Fi-1)^2)).
+ //
+ // This term compares the ssim score for the same location in 2
+ // subsequent frames.
+ var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64;
+ var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64;
+ mean_new = sv.sum_s;
+ mean_old = sv2[c].sum_s;
+ ssim_new = sv.ssim;
+ ssim_old = sv2[c].ssim;
+
+ if (do_inconsistency) {
+        // We do the metric once for every 4x4 block in the image. Since
+        // we are scaling the error to SSE for use in a psnr calculation,
+        // an error of 1.0 maps to 4x4x255x255, the worst we can have.
+ static const double kScaling = 4. * 4 * 255 * 255;
+
+        // The constants have to be non-zero to avoid potential divide-by-
+        // zero issues; beyond that they act as a rough weighting between
+        // the terms. No testing of what the right values should be has
+        // been done.
+ static const double c1 = 1, c2 = 1, c3 = 1;
+
+ // This measures how much consistent variance is in two consecutive
+ // source frames. 1.0 means they have exactly the same variance.
+ const double variance_term =
+ (2.0 * var_old * var_new + c1) /
+ (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1);
+
+        // This measures how consistent the local means are between two
+        // consecutive frames. 1.0 means they have exactly the same mean.
+ const double mean_term =
+ (2.0 * mean_old * mean_new + c2) /
+ (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2);
+
+        // This measures how consistent the ssims of two
+        // consecutive frames are. 1.0 means they are exactly the same.
+ double ssim_term =
+ pow((2.0 * ssim_old * ssim_new + c3) /
+ (ssim_old * ssim_old + ssim_new * ssim_new + c3),
+ 5);
+
+ double this_inconsistency;
+
+ // Floating point math sometimes makes this > 1 by a tiny bit.
+ // We want the metric to scale between 0 and 1.0 so we can convert
+ // it to an snr scaled value.
+ if (ssim_term > 1) ssim_term = 1;
+
+        // This converts the consistency metric to an inconsistency metric
+        // (so we can scale it like psnr to something like sum square
+        // error). The reason for the variance and mean terms is the
+        // assumption that if there are big changes in the source we should
+        // penalize inconsistency in ssim scores less, as it will be less
+        // visible to the user.
+ this_inconsistency = (1 - ssim_term) * variance_term * mean_term;
+
+ this_inconsistency *= kScaling;
+ inconsistency_total += this_inconsistency;
+ }
+ sv2[c] = sv;
+ ssim_total += ssim;
+ ssim2_total += ssim2;
+ dssim_total += dssim;
+
+ old_ssim_total += ssim_old;
+ }
+ old_ssim_total += 0;
+ }
+
+ norm = 1. / (width / 4) / (height / 4);
+ ssim_total *= norm;
+ ssim2_total *= norm;
+ m->ssim2 = ssim2_total;
+ m->ssim = ssim_total;
+ if (old_ssim_total == 0) inconsistency_total = 0;
+
+ m->ssimc = inconsistency_total;
+
+ m->dssim = dssim_total;
+ return inconsistency_total;
+}
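A worked example with hypothetical numbers: when variance and mean are stable across frames, both weight terms are ~1.0 and the inconsistency reduces to kScaling * (1 - ssim_term):

#include <stdio.h>

int main(void) {
  const double kScaling = 4.0 * 4 * 255 * 255;   /* 4x4 block, 8-bit peak */
  const double ssim_old = 0.98, ssim_new = 0.90; /* hypothetical scores */
  const double t = (2 * ssim_old * ssim_new + 1) /
                   (ssim_old * ssim_old + ssim_new * ssim_new + 1);
  const double ssim_term = t * t * t * t * t; /* pow(t, 5) */
  /* stable variance and mean: variance_term and mean_term are ~1.0 */
  printf("inconsistency ~ %.0f\n", kScaling * (1 - ssim_term));
  return 0;
}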
+
+double highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *weight,
+ uint32_t bd, uint32_t in_bd) {
+ double a, b, c;
+ double ssimv;
+ uint32_t shift = 0;
+
+ assert(bd >= in_bd);
+ shift = bd - in_bd;
+
+ a = highbd_ssim2(source->y_buffer, dest->y_buffer, source->y_stride,
+ dest->y_stride, source->y_crop_width, source->y_crop_height,
+ in_bd, shift);
+
+ b = highbd_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height, in_bd, shift);
+
+ c = highbd_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height, in_bd, shift);
+
+ ssimv = a * .8 + .1 * (b + c);
+
+ *weight = 1;
+
+ return ssimv;
+}
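The final combination is a luma-weighted average with weights summing to 1.0; written out as a sketch:

/* Sketch of the plane weighting used above. */
static double weighted_ssim(double y_ssim, double u_ssim, double v_ssim) {
  return 0.8 * y_ssim + 0.1 * (u_ssim + v_ssim);
}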
+
int main(int argc, char *argv[]) {
FILE *framestats = NULL;
+ int bit_depth = 8;
int w = 0, h = 0, tl_skip = 0, tl_skips_remaining = 0;
double ssimavg = 0, ssimyavg = 0, ssimuavg = 0, ssimvavg = 0;
double psnrglb = 0, psnryglb = 0, psnruglb = 0, psnrvglb = 0;
@@ -200,11 +581,12 @@ int main(int argc, char *argv[]) {
size_t i, n_frames = 0, allocated_frames = 0;
int return_value = 0;
input_file_t in[2];
+ double peak = 255.0;
if (argc < 2) {
fprintf(stderr,
"Usage: %s file1.{yuv|y4m} file2.{yuv|y4m}"
- "[WxH tl_skip={0,1,3}]\n",
+            " [WxH tl_skip={0,1,3} frame_stats_file bits]\n",
argv[0]);
return_value = 1;
goto clean_up;
@@ -214,7 +596,11 @@ int main(int argc, char *argv[]) {
sscanf(argv[3], "%dx%d", &w, &h);
}
- if (open_input_file(argv[1], &in[0], w, h) < 0) {
+ if (argc > 6) {
+ sscanf(argv[6], "%d", &bit_depth);
+ }
+
+ if (open_input_file(argv[1], &in[0], w, h, bit_depth) < 0) {
fprintf(stderr, "File %s can't be opened or parsed!\n", argv[2]);
goto clean_up;
}
@@ -223,9 +609,13 @@ int main(int argc, char *argv[]) {
// If a y4m is the first file and w, h is not set grab from first file.
w = in[0].w;
h = in[0].h;
+ bit_depth = in[0].bit_depth;
}
+ if (bit_depth == 10) peak = 1023.0;
- if (open_input_file(argv[2], &in[1], w, h) < 0) {
+  if (bit_depth == 12) peak = 4095.0;
+
+ if (open_input_file(argv[2], &in[1], w, h, bit_depth) < 0) {
fprintf(stderr, "File %s can't be opened or parsed!\n", argv[2]);
goto clean_up;
}
@@ -264,7 +654,7 @@ int main(int argc, char *argv[]) {
size_t r1, r2;
unsigned char *y[2], *u[2], *v[2];
- r1 = read_input_file(&in[0], &y[0], &u[0], &v[0]);
+ r1 = read_input_file(&in[0], &y[0], &u[0], &v[0], bit_depth);
if (r1) {
// Reading parts of file1.yuv that were not used in temporal layer.
@@ -276,7 +666,7 @@ int main(int argc, char *argv[]) {
tl_skips_remaining = tl_skip;
}
- r2 = read_input_file(&in[1], &y[1], &u[1], &v[1]);
+ r2 = read_input_file(&in[1], &y[1], &u[1], &v[1], bit_depth);
if (r1 && r2 && r1 != r2) {
fprintf(stderr, "Failed to read data: %s [%d/%d]\n", strerror(errno),
@@ -286,9 +676,22 @@ int main(int argc, char *argv[]) {
} else if (r1 == 0 || r2 == 0) {
break;
}
+#if CONFIG_VP9_HIGHBITDEPTH
+#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \
+ if (bit_depth < 9) { \
+ ssim = ssim2(buf0, buf1, w, w, w, h); \
+ psnr = calc_plane_error(buf0, w, buf1, w, w, h); \
+ } else { \
+ ssim = highbd_ssim2(CONVERT_TO_BYTEPTR(buf0), CONVERT_TO_BYTEPTR(buf1), w, \
+ w, w, h, bit_depth, bit_depth - 8); \
+ psnr = calc_plane_error16(CAST_TO_SHORTPTR(buf0), w, \
+ CAST_TO_SHORTPTR(buf1), w, w, h); \
+ }
+#else
#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \
- ssim = vp8_ssim2(buf0, buf1, w, w, w, h); \
+ ssim = ssim2(buf0, buf1, w, w, w, h); \
psnr = calc_plane_error(buf0, w, buf1, w, w, h);
+#endif
if (n_frames == allocated_frames) {
allocated_frames = allocated_frames == 0 ? 1024 : allocated_frames * 2;
@@ -321,11 +724,11 @@ int main(int argc, char *argv[]) {
ssimuavg += ssimu[i];
ssimvavg += ssimv[i];
- frame_psnr = vp9_mse2psnr(w * h * 6 / 4, 255.0,
- (double)psnry[i] + psnru[i] + psnrv[i]);
- frame_psnry = vp9_mse2psnr(w * h * 4 / 4, 255.0, (double)psnry[i]);
- frame_psnru = vp9_mse2psnr(w * h * 1 / 4, 255.0, (double)psnru[i]);
- frame_psnrv = vp9_mse2psnr(w * h * 1 / 4, 255.0, (double)psnrv[i]);
+ frame_psnr =
+ mse2psnr(w * h * 6 / 4, peak, (double)psnry[i] + psnru[i] + psnrv[i]);
+ frame_psnry = mse2psnr(w * h * 4 / 4, peak, (double)psnry[i]);
+ frame_psnru = mse2psnr(w * h * 1 / 4, peak, (double)psnru[i]);
+ frame_psnrv = mse2psnr(w * h * 1 / 4, peak, (double)psnrv[i]);
psnravg += frame_psnr;
psnryavg += frame_psnry;
@@ -367,10 +770,10 @@ int main(int argc, char *argv[]) {
puts("");
psnrglb = psnryglb + psnruglb + psnrvglb;
- psnrglb = vp9_mse2psnr((double)n_frames * w * h * 6 / 4, 255.0, psnrglb);
- psnryglb = vp9_mse2psnr((double)n_frames * w * h * 4 / 4, 255.0, psnryglb);
- psnruglb = vp9_mse2psnr((double)n_frames * w * h * 1 / 4, 255.0, psnruglb);
- psnrvglb = vp9_mse2psnr((double)n_frames * w * h * 1 / 4, 255.0, psnrvglb);
+ psnrglb = mse2psnr((double)n_frames * w * h * 6 / 4, peak, psnrglb);
+ psnryglb = mse2psnr((double)n_frames * w * h * 4 / 4, peak, psnryglb);
+ psnruglb = mse2psnr((double)n_frames * w * h * 1 / 4, peak, psnruglb);
+ psnrvglb = mse2psnr((double)n_frames * w * h * 1 / 4, peak, psnrvglb);
printf("GlbPSNR: %lf\n", psnrglb);
printf("GlbPSNR-Y: %lf\n", psnryglb);
diff --git a/libvpx/vp8/common/blockd.h b/libvpx/vp8/common/blockd.h
index 74fc5d6db..1a3aad16a 100644
--- a/libvpx/vp8/common/blockd.h
+++ b/libvpx/vp8/common/blockd.h
@@ -169,6 +169,11 @@ typedef struct {
typedef struct {
FRAME_TYPE frame_type;
int is_frame_dropped;
+  // Set if the frame is dropped due to overshoot after encode_frame. This
+  // triggers a drop and resets rate control with Q forced to max for the
+  // following frame. The check for dropping due to overshoot is only done
+  // on the lowest stream, and if set it will force a drop on all spatial
+  // streams for the current frame.
+ int is_frame_dropped_overshoot_maxqp;
// The frame rate for the lowest resolution.
double low_res_framerate;
/* The frame number of each reference frames */
diff --git a/libvpx/vp8/common/loopfilter_filters.c b/libvpx/vp8/common/loopfilter_filters.c
index 2a7cde878..188e290ca 100644
--- a/libvpx/vp8/common/loopfilter_filters.c
+++ b/libvpx/vp8/common/loopfilter_filters.c
@@ -86,10 +86,12 @@ static void vp8_filter(signed char mask, uc hev, uc *op1, uc *op0, uc *oq0,
u = vp8_signed_char_clamp(ps1 + filter_value);
*op1 = u ^ 0x80;
}
-void vp8_loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */
- const unsigned char *blimit,
- const unsigned char *limit,
- const unsigned char *thresh, int count) {
+
+static void loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count) {
int hev = 0; /* high edge variance */
signed char mask = 0;
int i = 0;
@@ -109,10 +111,11 @@ void vp8_loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */
} while (++i < count * 8);
}
-void vp8_loop_filter_vertical_edge_c(unsigned char *s, int p,
- const unsigned char *blimit,
- const unsigned char *limit,
- const unsigned char *thresh, int count) {
+static void loop_filter_vertical_edge_c(unsigned char *s, int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count) {
int hev = 0; /* high edge variance */
signed char mask = 0;
int i = 0;
@@ -185,11 +188,11 @@ static void vp8_mbfilter(signed char mask, uc hev, uc *op2, uc *op1, uc *op0,
*op2 = s ^ 0x80;
}
-void vp8_mbloop_filter_horizontal_edge_c(unsigned char *s, int p,
- const unsigned char *blimit,
- const unsigned char *limit,
- const unsigned char *thresh,
- int count) {
+static void mbloop_filter_horizontal_edge_c(unsigned char *s, int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count) {
signed char hev = 0; /* high edge variance */
signed char mask = 0;
int i = 0;
@@ -210,10 +213,11 @@ void vp8_mbloop_filter_horizontal_edge_c(unsigned char *s, int p,
} while (++i < count * 8);
}
-void vp8_mbloop_filter_vertical_edge_c(unsigned char *s, int p,
- const unsigned char *blimit,
- const unsigned char *limit,
- const unsigned char *thresh, int count) {
+static void mbloop_filter_vertical_edge_c(unsigned char *s, int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count) {
signed char hev = 0; /* high edge variance */
signed char mask = 0;
int i = 0;
@@ -295,17 +299,17 @@ void vp8_loop_filter_simple_vertical_edge_c(unsigned char *s, int p,
void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
loop_filter_info *lfi) {
- vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim,
- lfi->hev_thr, 2);
+ mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 2);
if (u_ptr) {
- vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
+ mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
}
if (v_ptr) {
- vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
+ mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
}
}
@@ -313,17 +317,17 @@ void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
loop_filter_info *lfi) {
- vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim,
- lfi->hev_thr, 2);
+ mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 2);
if (u_ptr) {
- vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
+ mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
}
if (v_ptr) {
- vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
+ mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
}
}
@@ -331,21 +335,21 @@ void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
loop_filter_info *lfi) {
- vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim,
- lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim,
- lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim,
- lfi->lim, lfi->hev_thr, 2);
+ loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim,
+ lfi->lim, lfi->hev_thr, 2);
+ loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim,
+ lfi->lim, lfi->hev_thr, 2);
+ loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim,
+ lfi->lim, lfi->hev_thr, 2);
if (u_ptr) {
- vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 1);
+ loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim,
+ lfi->lim, lfi->hev_thr, 1);
}
if (v_ptr) {
- vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 1);
+ loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim,
+ lfi->lim, lfi->hev_thr, 1);
}
}
@@ -363,21 +367,21 @@ void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
loop_filter_info *lfi) {
- vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim,
- lfi->hev_thr, 2);
- vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim,
- lfi->hev_thr, 2);
- vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim,
- lfi->hev_thr, 2);
+ loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim,
+ lfi->hev_thr, 2);
+ loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim,
+ lfi->hev_thr, 2);
+ loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim,
+ lfi->hev_thr, 2);
if (u_ptr) {
- vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim,
- lfi->hev_thr, 1);
+ loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim,
+ lfi->hev_thr, 1);
}
if (v_ptr) {
- vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim,
- lfi->hev_thr, 1);
+ loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim,
+ lfi->hev_thr, 1);
}
}
diff --git a/libvpx/vp8/common/mfqe.c b/libvpx/vp8/common/mfqe.c
index 5aace8c99..b6f8146b8 100644
--- a/libvpx/vp8/common/mfqe.c
+++ b/libvpx/vp8/common/mfqe.c
@@ -74,8 +74,7 @@ static void apply_ifactor(unsigned char *y_src, int y_src_stride,
src_weight);
vp8_filter_by_weight8x8(v_src, uv_src_stride, v_dst, uv_dst_stride,
src_weight);
- } else /* if (block_size == 8) */
- {
+ } else {
vp8_filter_by_weight8x8(y_src, y_src_stride, y_dst, y_dst_stride,
src_weight);
vp8_filter_by_weight4x4(u_src, uv_src_stride, u_dst, uv_dst_stride,
@@ -136,8 +135,7 @@ static void multiframe_quality_enhance_block(
usad = (vpx_sad8x8(u, uv_stride, ud, uvd_stride) + 32) >> 6;
vsad = (vpx_sad8x8(v, uv_stride, vd, uvd_stride) + 32) >> 6;
#endif
- } else /* if (blksize == 8) */
- {
+ } else {
actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse) + 32) >> 6;
act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse) + 32) >> 6;
#ifdef USE_SSD
@@ -186,14 +184,12 @@ static void multiframe_quality_enhance_block(
apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd,
uvd_stride, blksize, ifactor);
}
- } else /* else implicitly copy from previous frame */
- {
+ } else { /* else implicitly copy from previous frame */
if (blksize == 16) {
vp8_copy_mem16x16(y, y_stride, yd, yd_stride);
vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride);
vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride);
- } else /* if (blksize == 8) */
- {
+ } else {
vp8_copy_mem8x8(y, y_stride, yd, yd_stride);
for (up = u, udp = ud, i = 0; i < uvblksize;
++i, up += uv_stride, udp += uvd_stride) {
@@ -297,8 +293,7 @@ void vp8_multiframe_quality_enhance(VP8_COMMON *cm) {
}
}
}
- } else /* totmap = 4 */
- {
+ } else { /* totmap = 4 */
multiframe_quality_enhance_block(
16, qcurr, qprev, y_ptr, u_ptr, v_ptr, show->y_stride,
show->uv_stride, yd_ptr, ud_ptr, vd_ptr, dest->y_stride,
diff --git a/libvpx/vp8/common/mips/dspr2/filter_dspr2.c b/libvpx/vp8/common/mips/dspr2/filter_dspr2.c
index 2de343419..e46827b0e 100644
--- a/libvpx/vp8/common/mips/dspr2/filter_dspr2.c
+++ b/libvpx/vp8/common/mips/dspr2/filter_dspr2.c
@@ -673,9 +673,9 @@ void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr,
: [tn1] "=&r"(tn1), [tp2] "=&r"(tp2), [n2] "=&r"(n2), [p4] "=&r"(p4),
[n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
- [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4)
+ [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p1] "+r"(p1)
: [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp1] "r"(tp1),
- [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a), [p2] "r"(p2),
+ [n1] "r"(n1), [vector4a] "r"(vector4a), [p2] "r"(p2),
[vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3),
[src_ptr] "r"(src_ptr));
@@ -724,9 +724,9 @@ void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr,
: [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [n1] "=&r"(n1), [p3] "=&r"(p3),
[n3] "=&r"(n3), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
- [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4)
+ [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p4] "+r"(p4)
: [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp2] "r"(tp2),
- [p2] "r"(p2), [n2] "r"(n2), [p4] "r"(p4), [n4] "r"(n4), [p1] "r"(p1),
+ [p2] "r"(p2), [n2] "r"(n2), [n4] "r"(n4), [p1] "r"(p1),
[src_ptr] "r"(src_ptr), [vector4a] "r"(vector4a),
[vector3b] "r"(vector3b));
@@ -781,9 +781,9 @@ void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr,
: [tn1] "=&r"(tn1), [p2] "=&r"(p2), [n2] "=&r"(n2), [n4] "=&r"(n4),
[Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
- [Temp4] "=r"(Temp4)
- : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp1] "r"(tp1),
- [p4] "r"(p4), [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a),
+ [Temp4] "=r"(Temp4), [tp1] "+r"(tp1)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [p4] "r"(p4),
+ [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a),
[vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3),
[src_ptr] "r"(src_ptr), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
diff --git a/libvpx/vp8/common/mips/mmi/copymem_mmi.c b/libvpx/vp8/common/mips/mmi/copymem_mmi.c
new file mode 100644
index 000000000..86a32aa9e
--- /dev/null
+++ b/libvpx/vp8/common/mips/mmi/copymem_mmi.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+#define COPY_MEM_16X2 \
+ "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" \
+ "ldl %[tmp0], 0x0f(%[src]) \n\t" \
+ "ldr %[tmp0], 0x08(%[src]) \n\t" \
+ MMI_ADDU(%[src], %[src], %[src_stride]) \
+ "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" \
+ "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" \
+ "sdl %[tmp0], 0x0f(%[dst]) \n\t" \
+ "sdr %[tmp0], 0x08(%[dst]) \n\t" \
+ MMI_ADDU(%[dst], %[dst], %[dst_stride]) \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "ldl %[tmp1], 0x0f(%[src]) \n\t" \
+ "ldr %[tmp1], 0x08(%[src]) \n\t" \
+ MMI_ADDU(%[src], %[src], %[src_stride]) \
+ "gssdlc1 %[ftmp1], 0x07(%[dst]) \n\t" \
+ "gssdrc1 %[ftmp1], 0x00(%[dst]) \n\t" \
+ "sdl %[tmp1], 0x0f(%[dst]) \n\t" \
+ "sdr %[tmp1], 0x08(%[dst]) \n\t" \
+ MMI_ADDU(%[dst], %[dst], %[dst_stride])
+
+#define COPY_MEM_8X2 \
+ "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" \
+ MMI_ADDU(%[src], %[src], %[src_stride]) \
+ "ldl %[tmp0], 0x07(%[src]) \n\t" \
+ "ldr %[tmp0], 0x00(%[src]) \n\t" \
+ MMI_ADDU(%[src], %[src], %[src_stride]) \
+ \
+ "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" \
+ "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" \
+ MMI_ADDU(%[dst], %[dst], %[dst_stride]) \
+ "sdl %[tmp0], 0x07(%[dst]) \n\t" \
+ "sdr %[tmp0], 0x00(%[dst]) \n\t" \
+ MMI_ADDU(%[dst], %[dst], %[dst_stride])
+
+void vp8_copy_mem16x16_mmi(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride) {
+ double ftmp[2];
+ uint64_t tmp[2];
+ uint8_t loop_count = 4;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "1: \n\t"
+ COPY_MEM_16X2
+ COPY_MEM_16X2
+ MMI_ADDIU(%[loop_count], %[loop_count], -0x01)
+ "bnez %[loop_count], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
+ [loop_count]"+&r"(loop_count),
+ [dst]"+&r"(dst), [src]"+&r"(src)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [dst_stride]"r"((mips_reg)dst_stride)
+ : "memory"
+ );
+ /* clang-format on */
+}
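Each COPY_MEM_16X2 expansion moves two 16-byte rows (one 8-byte load into a float register plus an 8-byte ldl/ldr pair), and the loop runs the pair of expansions four times, covering all 16 rows. A scalar C equivalent, as a reference sketch:

#include <string.h>

/* Scalar equivalent of vp8_copy_mem16x16_mmi (sketch). */
static void copy_mem16x16_c(const unsigned char *src, int src_stride,
                            unsigned char *dst, int dst_stride) {
  int r;
  for (r = 0; r < 16; ++r) {
    memcpy(dst, src, 16);
    src += src_stride;
    dst += dst_stride;
  }
}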
+
+void vp8_copy_mem8x8_mmi(unsigned char *src, int src_stride, unsigned char *dst,
+ int dst_stride) {
+ double ftmp[2];
+ uint64_t tmp[1];
+ uint8_t loop_count = 4;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "1: \n\t"
+ COPY_MEM_8X2
+ MMI_ADDIU(%[loop_count], %[loop_count], -0x01)
+ "bnez %[loop_count], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [tmp0]"=&r"(tmp[0]), [loop_count]"+&r"(loop_count),
+ [dst]"+&r"(dst), [src]"+&r"(src)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [dst_stride]"r"((mips_reg)dst_stride)
+ : "memory"
+ );
+ /* clang-format on */
+}
+
+void vp8_copy_mem8x4_mmi(unsigned char *src, int src_stride, unsigned char *dst,
+ int dst_stride) {
+ double ftmp[2];
+ uint64_t tmp[1];
+
+ /* clang-format off */
+ __asm__ volatile (
+ COPY_MEM_8X2
+ COPY_MEM_8X2
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [tmp0]"=&r"(tmp[0]),
+ [dst]"+&r"(dst), [src]"+&r"(src)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [dst_stride]"r"((mips_reg)dst_stride)
+ : "memory"
+ );
+ /* clang-format on */
+}
diff --git a/libvpx/vp8/common/mips/mmi/dequantize_mmi.c b/libvpx/vp8/common/mips/mmi/dequantize_mmi.c
new file mode 100644
index 000000000..b3f8084ae
--- /dev/null
+++ b/libvpx/vp8/common/mips/mmi/dequantize_mmi.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+void vp8_dequantize_b_mmi(BLOCKD *d, int16_t *DQC) {
+ double ftmp[8];
+
+ __asm__ volatile(
+ "gsldlc1 %[ftmp0], 0x07(%[qcoeff]) \n\t"
+ "gsldrc1 %[ftmp0], 0x00(%[qcoeff]) \n\t"
+ "gsldlc1 %[ftmp1], 0x0f(%[qcoeff]) \n\t"
+ "gsldrc1 %[ftmp1], 0x08(%[qcoeff]) \n\t"
+ "gsldlc1 %[ftmp2], 0x17(%[qcoeff]) \n\t"
+ "gsldrc1 %[ftmp2], 0x10(%[qcoeff]) \n\t"
+ "gsldlc1 %[ftmp3], 0x1f(%[qcoeff]) \n\t"
+ "gsldrc1 %[ftmp3], 0x18(%[qcoeff]) \n\t"
+
+ "gsldlc1 %[ftmp4], 0x07(%[DQC]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[DQC]) \n\t"
+ "gsldlc1 %[ftmp5], 0x0f(%[DQC]) \n\t"
+ "gsldrc1 %[ftmp5], 0x08(%[DQC]) \n\t"
+ "gsldlc1 %[ftmp6], 0x17(%[DQC]) \n\t"
+ "gsldrc1 %[ftmp6], 0x10(%[DQC]) \n\t"
+ "gsldlc1 %[ftmp7], 0x1f(%[DQC]) \n\t"
+ "gsldrc1 %[ftmp7], 0x18(%[DQC]) \n\t"
+
+ "pmullh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
+ "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
+ "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
+ "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
+
+ "gssdlc1 %[ftmp0], 0x07(%[dqcoeff]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[dqcoeff]) \n\t"
+ "gssdlc1 %[ftmp1], 0x0f(%[dqcoeff]) \n\t"
+ "gssdrc1 %[ftmp1], 0x08(%[dqcoeff]) \n\t"
+ "gssdlc1 %[ftmp2], 0x17(%[dqcoeff]) \n\t"
+ "gssdrc1 %[ftmp2], 0x10(%[dqcoeff]) \n\t"
+ "gssdlc1 %[ftmp3], 0x1f(%[dqcoeff]) \n\t"
+ "gssdrc1 %[ftmp3], 0x18(%[dqcoeff]) \n\t"
+ : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+ [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+ [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7])
+ : [dqcoeff] "r"(d->dqcoeff), [qcoeff] "r"(d->qcoeff), [DQC] "r"(DQC)
+ : "memory");
+}
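The asm above is an elementwise multiply: four registers of four 16-bit lanes per operand cover all 16 coefficients of the 4x4 block. In scalar form, as a sketch:

#include <stdint.h>

/* Scalar equivalent of vp8_dequantize_b_mmi (sketch). */
static void dequantize_b_c(int16_t *dqcoeff, const int16_t *qcoeff,
                           const int16_t *DQC) {
  int i;
  for (i = 0; i < 16; ++i) dqcoeff[i] = qcoeff[i] * DQC[i];
}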
+
+void vp8_dequant_idct_add_mmi(int16_t *input, int16_t *dq, unsigned char *dest,
+ int stride) {
+ double ftmp[8];
+
+ __asm__ volatile(
+ "gsldlc1 %[ftmp0], 0x07(%[dq]) \n\t"
+ "gsldrc1 %[ftmp0], 0x00(%[dq]) \n\t"
+ "gsldlc1 %[ftmp1], 0x0f(%[dq]) \n\t"
+ "gsldrc1 %[ftmp1], 0x08(%[dq]) \n\t"
+ "gsldlc1 %[ftmp2], 0x17(%[dq]) \n\t"
+ "gsldrc1 %[ftmp2], 0x10(%[dq]) \n\t"
+ "gsldlc1 %[ftmp3], 0x1f(%[dq]) \n\t"
+ "gsldrc1 %[ftmp3], 0x18(%[dq]) \n\t"
+
+ "gsldlc1 %[ftmp4], 0x07(%[input]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[input]) \n\t"
+ "gsldlc1 %[ftmp5], 0x0f(%[input]) \n\t"
+ "gsldrc1 %[ftmp5], 0x08(%[input]) \n\t"
+ "gsldlc1 %[ftmp6], 0x17(%[input]) \n\t"
+ "gsldrc1 %[ftmp6], 0x10(%[input]) \n\t"
+ "gsldlc1 %[ftmp7], 0x1f(%[input]) \n\t"
+ "gsldrc1 %[ftmp7], 0x18(%[input]) \n\t"
+
+ "pmullh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
+ "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
+ "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
+ "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
+
+ "gssdlc1 %[ftmp0], 0x07(%[input]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[input]) \n\t"
+ "gssdlc1 %[ftmp1], 0x0f(%[input]) \n\t"
+ "gssdrc1 %[ftmp1], 0x08(%[input]) \n\t"
+ "gssdlc1 %[ftmp2], 0x17(%[input]) \n\t"
+ "gssdrc1 %[ftmp2], 0x10(%[input]) \n\t"
+ "gssdlc1 %[ftmp3], 0x1f(%[input]) \n\t"
+ "gssdrc1 %[ftmp3], 0x18(%[input]) \n\t"
+ : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+ [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+ [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7])
+ : [dq] "r"(dq), [input] "r"(input)
+ : "memory");
+
+ vp8_short_idct4x4llm_mmi(input, dest, stride, dest, stride);
+
+ __asm__ volatile(
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "gssdlc1 %[ftmp0], 0x07(%[input]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[input]) \n\t"
+ "sdl $0, 0x0f(%[input]) \n\t"
+ "sdr $0, 0x08(%[input]) \n\t"
+ "gssdlc1 %[ftmp0], 0x17(%[input]) \n\t"
+ "gssdrc1 %[ftmp0], 0x10(%[input]) \n\t"
+ "sdl $0, 0x1f(%[input]) \n\t"
+ "sdr $0, 0x18(%[input]) \n\t"
+ : [ftmp0] "=&f"(ftmp[0])
+ : [input] "r"(input)
+ : "memory");
+}
diff --git a/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c b/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c
new file mode 100644
index 000000000..f6020ab46
--- /dev/null
+++ b/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst,
+ int stride, int8_t *eobs) {
+ int i, j;
+
+ for (i = 0; i < 4; i++) {
+ for (j = 0; j < 4; j++) {
+ if (*eobs++ > 1) {
+ vp8_dequant_idct_add_mmi(q, dq, dst, stride);
+ } else {
+ vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst, stride, dst, stride);
+ memset(q, 0, 2 * sizeof(q[0]));
+ }
+
+ q += 16;
+ dst += 4;
+ }
+
+ dst += 4 * stride - 16;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dstu,
+ uint8_t *dstv, int stride,
+ int8_t *eobs) {
+ int i, j;
+
+ for (i = 0; i < 2; i++) {
+ for (j = 0; j < 2; j++) {
+ if (*eobs++ > 1) {
+ vp8_dequant_idct_add_mmi(q, dq, dstu, stride);
+ } else {
+ vp8_dc_only_idct_add_mmi(q[0] * dq[0], dstu, stride, dstu, stride);
+ memset(q, 0, 2 * sizeof(q[0]));
+ }
+
+ q += 16;
+ dstu += 4;
+ }
+
+ dstu += 4 * stride - 8;
+ }
+
+ for (i = 0; i < 2; i++) {
+ for (j = 0; j < 2; j++) {
+ if (*eobs++ > 1) {
+ vp8_dequant_idct_add_mmi(q, dq, dstv, stride);
+ } else {
+ vp8_dc_only_idct_add_mmi(q[0] * dq[0], dstv, stride, dstv, stride);
+ memset(q, 0, 2 * sizeof(q[0]));
+ }
+
+ q += 16;
+ dstv += 4;
+ }
+
+ dstv += 4 * stride - 8;
+ }
+}
diff --git a/libvpx/vp8/common/mips/mmi/idctllm_mmi.c b/libvpx/vp8/common/mips/mmi/idctllm_mmi.c
new file mode 100644
index 000000000..5e48f5916
--- /dev/null
+++ b/libvpx/vp8/common/mips/mmi/idctllm_mmi.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+#define TRANSPOSE_4H \
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
+ MMI_LI(%[tmp0], 0x93) \
+ "mtc1 %[tmp0], %[ftmp10] \n\t" \
+ "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
+ "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \
+ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+ "or %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \
+ "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \
+ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+ "or %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \
+ "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
+ "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \
+ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+ "or %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \
+ "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \
+ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+ "or %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
+ "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \
+ "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \
+ "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \
+ "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
+
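TRANSPOSE_4H transposes a 4x4 matrix of 16-bit values held as four packed 64-bit registers; the scalar equivalent is the usual in-place swap, sketched here:

#include <stdint.h>

/* Scalar equivalent of the TRANSPOSE_4H macro (sketch). */
static void transpose4x4_s16(int16_t m[4][4]) {
  int i, j;
  for (i = 0; i < 4; ++i) {
    for (j = i + 1; j < 4; ++j) {
      const int16_t t = m[i][j];
      m[i][j] = m[j][i];
      m[j][i] = t;
    }
  }
}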
+void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride) {
+ double ftmp[12];
+ uint32_t tmp[0];
+  uint32_t tmp[1];
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = { 0x4e7b4e7b4e7b4e7bULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = { 0x22a322a322a322a3ULL };
+
+ __asm__ volatile (
+ MMI_LI(%[tmp0], 0x02)
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+
+ "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
+ "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t"
+ "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t"
+ "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t"
+ "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t"
+ "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t"
+ "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t"
+
+ // ip[0...3] + ip[8...11]
+ "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
+ // ip[0...3] - ip[8...11]
+ "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
+ // (ip[12...15] * sinpi8sqrt2) >> 16
+ "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
+ "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t"
+ // (ip[ 4... 7] * sinpi8sqrt2) >> 16
+ "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
+ "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t"
+ // ip[ 4... 7] + ((ip[ 4... 7] * cospi8sqrt2minus1) >> 16)
+ "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t"
+ "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t"
+ // ip[12...15] + ((ip[12...15] * cospi8sqrt2minus1) >> 16)
+ "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
+ "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t"
+
+ "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
+ "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
+ "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t"
+ "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t"
+ "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t"
+ "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
+
+ TRANSPOSE_4H
+ // a
+ "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
+ // b
+ "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
+ // c
+ "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
+ "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
+ "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t"
+ "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
+ "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
+ // d
+ "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
+ "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
+ "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t"
+ "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t"
+ "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
+
+ MMI_LI(%[tmp0], 0x03)
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ // a + d
+ "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_ph_04] \n\t"
+ "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
+ // b + c
+ "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ff_ph_04] \n\t"
+ "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
+ // b - c
+ "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t"
+ "paddh %[ftmp3], %[ftmp3], %[ff_ph_04] \n\t"
+ "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
+ // a - d
+ "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t"
+ "paddh %[ftmp4], %[ftmp4], %[ff_ph_04] \n\t"
+ "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
+
+ TRANSPOSE_4H
+#if _MIPS_SIM == _ABIO32
+    "ulw        %[tmp0],      0x00(%[pred_ptr])              \n\t"
+ "mtc1 %[tmp0], %[ftmp5] \n\t"
+#else
+ "gslwlc1 %[ftmp5], 0x03(%[pred_ptr]) \n\t"
+ "gslwrc1 %[ftmp5], 0x00(%[pred_ptr]) \n\t"
+#endif
+ "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
+ MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
+ MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
+
+#if _MIPS_SIM == _ABIO32
+    "ulw        %[tmp0],      0x00(%[pred_ptr])              \n\t"
+ "mtc1 %[tmp0], %[ftmp6] \n\t"
+#else
+ "gslwlc1 %[ftmp6], 0x03(%[pred_ptr]) \n\t"
+ "gslwrc1 %[ftmp6], 0x00(%[pred_ptr]) \n\t"
+#endif
+ "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
+ "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
+ "gsswlc1 %[ftmp2], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[ftmp2], 0x00(%[dst_ptr]) \n\t"
+ MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
+ MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
+
+#if _MIPS_SIM == _ABIO32
+    "ulw        %[tmp0],      0x00(%[pred_ptr])              \n\t"
+ "mtc1 %[tmp0], %[ftmp7] \n\t"
+#else
+ "gslwlc1 %[ftmp7], 0x03(%[pred_ptr]) \n\t"
+ "gslwrc1 %[ftmp7], 0x00(%[pred_ptr]) \n\t"
+#endif
+ "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
+ "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
+ "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
+ "gsswlc1 %[ftmp3], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[ftmp3], 0x00(%[dst_ptr]) \n\t"
+ MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
+ MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
+
+#if _MIPS_SIM == _ABIO32
+    "ulw        %[tmp0],      0x00(%[pred_ptr])              \n\t"
+ "mtc1 %[tmp0], %[ftmp8] \n\t"
+#else
+ "gslwlc1 %[ftmp8], 0x03(%[pred_ptr]) \n\t"
+ "gslwrc1 %[ftmp8], 0x00(%[pred_ptr]) \n\t"
+#endif
+ "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
+ "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
+ "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
+ "gsswlc1 %[ftmp4], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[ftmp4], 0x00(%[dst_ptr]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
+ [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
+ [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
+ [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]),
+ [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr)
+ : [ip]"r"(input), [ff_ph_22a3]"f"(ff_ph_22a3),
+ [ff_ph_4e7b]"f"(ff_ph_4e7b), [ff_ph_04]"f"(ff_ph_04),
+ [pred_stride]"r"((mips_reg)pred_stride),
+ [dst_stride]"r"((mips_reg)dst_stride)
+ : "memory"
+ );
+}
+
+void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride) {
+ int a1 = ((input_dc + 4) >> 3);
+ double ftmp[5];
+ int low32;
+
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pshufh %[a1], %[a1], %[ftmp0] \n\t"
+ "ulw %[low32], 0x00(%[pred_ptr]) \n\t"
+ "mtc1 %[low32], %[ftmp1] \n\t"
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t"
+ "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t"
+ "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
+
+ MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
+ MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
+ "ulw %[low32], 0x00(%[pred_ptr]) \n\t"
+ "mtc1 %[low32], %[ftmp1] \n\t"
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t"
+ "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t"
+ "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
+
+ MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
+ MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
+ "ulw %[low32], 0x00(%[pred_ptr]) \n\t"
+ "mtc1 %[low32], %[ftmp1] \n\t"
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t"
+ "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t"
+ "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
+
+ MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
+ MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
+ "ulw %[low32], 0x00(%[pred_ptr]) \n\t"
+ "mtc1 %[low32], %[ftmp1] \n\t"
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t"
+ "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t"
+ "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
+ [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [low32]"=&r"(low32),
+ [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr)
+ : [dst_stride]"r"((mips_reg)dst_stride),
+ [pred_stride]"r"((mips_reg)pred_stride), [a1]"f"(a1)
+ : "memory"
+ );
+}
+
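+/* Inverse 4x4 Walsh-Hadamard transform of the second-order (DC) block.
+   Each pass is the usual butterfly; as a sketch, one pass of the C
+   reference computes
+
+     a1 = ip[0] + ip[12];  b1 = ip[4] + ip[8];
+     c1 = ip[4] - ip[8];   d1 = ip[0] - ip[12];
+     op[0] = a1 + b1;  op[4] = d1 + c1;
+     op[8] = a1 - b1;  op[12] = d1 - c1;
+
+   (the MMI version orders its butterflies differently but computes the
+   same transform). The second pass rounds with (x + 3) >> 3, which is what
+   the ff_ph_03 add plus the psrah-by-3 implements, and the results are
+   scattered to every 16th entry of mb_dqcoeff, i.e. the DC slot of each of
+   the 16 subblocks. */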
+void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) {
+ int i;
+ int16_t output[16];
+ double ftmp[12];
+ uint32_t tmp[1];
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_03) = { 0x0003000300030003ULL };
+
+ __asm__ volatile (
+ MMI_LI(%[tmp0], 0x03)
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
+ "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t"
+ "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t"
+ "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t"
+ "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t"
+ "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t"
+ "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t"
+ "paddh %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
+ "psubh %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
+ "paddh %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
+ "psubh %[ftmp8], %[ftmp3], %[ftmp4] \n\t"
+
+ "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
+ "psubh %[ftmp2], %[ftmp5], %[ftmp7] \n\t"
+ "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t"
+ "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
+
+ TRANSPOSE_4H
+ // a
+ "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t"
+ // d
+ "psubh %[ftmp6], %[ftmp1], %[ftmp4] \n\t"
+ // b
+ "paddh %[ftmp7], %[ftmp2], %[ftmp3] \n\t"
+ // c
+ "psubh %[ftmp8], %[ftmp2], %[ftmp3] \n\t"
+
+ "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
+ "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
+ "psubh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
+ "psubh %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
+
+ "paddh %[ftmp1], %[ftmp1], %[ff_ph_03] \n\t"
+ "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ff_ph_03] \n\t"
+ "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
+ "paddh %[ftmp3], %[ftmp3], %[ff_ph_03] \n\t"
+ "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
+ "paddh %[ftmp4], %[ftmp4], %[ff_ph_03] \n\t"
+ "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
+
+ TRANSPOSE_4H
+ "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t"
+ "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t"
+ "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t"
+ "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t"
+ "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t"
+ "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t"
+ "gssdrc1 %[ftmp4], 0x18(%[op]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
+ [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
+ [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
+ [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0])
+ : [ip]"r"(input), [op]"r"(output), [ff_ph_03]"f"(ff_ph_03)
+ : "memory"
+ );
+
+ for (i = 0; i < 16; i++) {
+ mb_dqcoeff[i * 16] = output[i];
+ }
+}
diff --git a/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c b/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c
new file mode 100644
index 000000000..f2182f95c
--- /dev/null
+++ b/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c
@@ -0,0 +1,1337 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/onyxc_int.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+DECLARE_ALIGNED(8, static const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
+DECLARE_ALIGNED(8, static const uint64_t,
+ ff_ph_003f) = { 0x003f003f003f003fULL };
+DECLARE_ALIGNED(8, static const uint64_t,
+ ff_ph_0900) = { 0x0900090009000900ULL };
+DECLARE_ALIGNED(8, static const uint64_t,
+ ff_ph_1200) = { 0x1200120012001200ULL };
+DECLARE_ALIGNED(8, static const uint64_t,
+ ff_ph_1b00) = { 0x1b001b001b001b00ULL };
+DECLARE_ALIGNED(8, static const uint64_t, ff_pb_fe) = { 0xfefefefefefefefeULL };
+DECLARE_ALIGNED(8, static const uint64_t, ff_pb_80) = { 0x8080808080808080ULL };
+DECLARE_ALIGNED(8, static const uint64_t, ff_pb_04) = { 0x0404040404040404ULL };
+DECLARE_ALIGNED(8, static const uint64_t, ff_pb_03) = { 0x0303030303030303ULL };
+DECLARE_ALIGNED(8, static const uint64_t, ff_pb_01) = { 0x0101010101010101ULL };
+
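+/* These constants mirror the literals of the C loop filter: ff_pb_80
+   toggles bytes between unsigned pixels and signed values (x ^ 0x80);
+   ff_pb_03/ff_pb_04 are the +3/+4 filter rounders, and ff_pb_01 turns the
+   +4 rounder into +3 in the simple filter; ff_ph_0900, ff_ph_1200 and
+   ff_ph_1b00 are 9, 18 and 27 scaled by 256 for use with pmulhh (see the
+   notes at the VP8_MBLOOP_* macros below); ff_ph_003f is the +63 rounder
+   of the mb filter and ff_ph_01 the +1 of the (Filter1 + 1) >> 1 average;
+   ff_pb_fe clears each byte's lsb so a halfword shift can stand in for a
+   per-byte >> 1. */
+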
+void vp8_loop_filter_horizontal_edge_mmi(
+ unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
+ const unsigned char *limit, const unsigned char *thresh, int count) {
+ uint32_t tmp[1];
+ mips_reg addr[2];
+ double ftmp[12];
+ __asm__ volatile (
+ "1: \n\t"
+ "gsldlc1 %[ftmp10], 0x07(%[limit]) \n\t"
+ "gsldrc1 %[ftmp10], 0x00(%[limit]) \n\t"
+
+ MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4])
+ "gsldlc1 %[ftmp1], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[addr1]) \n\t"
+
+ MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4])
+ "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t"
+ "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t"
+ "psubusb %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "gsldlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
+ "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp5], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
+ "pasubub %[ftmp9], %[ftmp4], %[ftmp5] \n\t"
+ "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+
+ "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
+ "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t"
+ "psubusb %[ftmp1], %[ftmp11], %[ftmp10] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "gsldlc1 %[ftmp8], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[addr1]) \n\t"
+ "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2])
+ "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t"
+ "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
+ "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
+ "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
+ "and %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t"
+ "li %[tmp0], 0x01 \n\t"
+ "mtc1 %[tmp0], %[ftmp10] \n\t"
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
+ "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "gsldlc1 %[ftmp10], 0x07(%[blimit]) \n\t"
+ "gsldrc1 %[ftmp10], 0x00(%[blimit]) \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
+
+ "gsldlc1 %[ftmp10], 0x07(%[thresh]) \n\t"
+ "gsldrc1 %[ftmp10], 0x00(%[thresh]) \n\t"
+ "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
+ "psubusb %[ftmp2], %[ftmp11], %[ftmp10] \n\t"
+ "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
+ "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
+ "xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+
+ "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
+ "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
+ "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
+
+ "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
+ "and %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
+ "psubsb %[ftmp3], %[ftmp6], %[ftmp5] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
+ "and %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
+
+ "paddsb %[ftmp8], %[ftmp2], %[ff_pb_03] \n\t"
+ "paddsb %[ftmp9], %[ftmp2], %[ff_pb_04] \n\t"
+
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "xor %[ftmp11], %[ftmp11], %[ftmp11] \n\t"
+ "punpcklbh %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
+ "punpckhbh %[ftmp11], %[ftmp11], %[ftmp8] \n\t"
+
+ "li %[tmp0], 0x0b \n\t"
+ "mtc1 %[tmp0], %[ftmp10] \n\t"
+ "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
+ "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t"
+ "packsshb %[ftmp8], %[ftmp0], %[ftmp11] \n\t"
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
+ "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
+ "xor %[ftmp11], %[ftmp11], %[ftmp11] \n\t"
+ "punpckhbh %[ftmp9], %[ftmp11], %[ftmp9] \n\t"
+ "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t"
+ "paddsh %[ftmp11], %[ftmp0], %[ff_ph_01] \n\t"
+ "packsshb %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
+ "paddsh %[ftmp9], %[ftmp9], %[ff_ph_01] \n\t"
+
+ "li %[tmp0], 0x01 \n\t"
+ "mtc1 %[tmp0], %[ftmp10] \n\t"
+ "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t"
+ "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t"
+ "packsshb %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
+ "pandn %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
+ "paddsb %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
+ "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp5], 0x07(%[addr1]) \n\t"
+ "gssdrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
+ "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
+ "gssdlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
+ "gssdrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
+
+ "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+ "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+
+ "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
+ "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
+ "gssdlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
+ "gssdrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
+
+ "addiu %[count], %[count], -0x01 \n\t"
+ MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
+ "bnez %[count], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [tmp0]"=&r"(tmp[0]),
+ [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
+ : [limit]"r"(limit), [blimit]"r"(blimit),
+ [thresh]"r"(thresh),
+ [src_pixel_step]"r"((mips_reg)src_pixel_step),
+ [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
+ [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)),
+ [ff_ph_01]"f"(ff_ph_01), [ff_pb_fe]"f"(ff_pb_fe),
+ [ff_pb_80]"f"(ff_pb_80), [ff_pb_04]"f"(ff_pb_04),
+ [ff_pb_03]"f"(ff_pb_03)
+ : "memory"
+ );
+}
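+
+/* The loop above is vp8_filter() applied across a horizontal edge, eight
+   pixels per iteration (count == 2 covers a 16-pixel macroblock edge).
+   As a scalar sketch, using the clamps of the C reference and with
+   ps*/qs* denoting the pixels biased by 0x80:
+
+     mask = |p3-p2| <= limit && |p2-p1| <= limit && |p1-p0| <= limit &&
+            |q1-q0| <= limit && |q2-q1| <= limit && |q3-q2| <= limit &&
+            2 * |p0-q0| + |p1-q1| / 2 <= blimit;
+     hev  = |p1-p0| > thresh || |q1-q0| > thresh;
+     f  = clamp(ps1 - qs1) & hev;
+     f  = clamp(f + 3 * (qs0 - ps0)) & mask;
+     Filter1 = clamp(f + 4) >> 3;  qs0 -= Filter1;
+     Filter2 = clamp(f + 3) >> 3;  ps0 += Filter2;
+     u = (Filter1 + 1) >> 1;  if (!hev) { qs1 -= u; ps1 += u; }
+*/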
+
+void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
+ int src_pixel_step,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh, int count) {
+ uint32_t tmp[1];
+ mips_reg addr[2];
+ double ftmp[13];
+
+ __asm__ volatile (
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
+ MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
+
+ "1: \n\t"
+ MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
+
+ MMI_SLL (%[tmp0], %[src_pixel_step], 0x01)
+ MMI_ADDU(%[addr1], %[src_ptr], %[tmp0])
+ "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t"
+ MMI_ADDU(%[addr1], %[addr0], %[tmp0])
+ "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t"
+ "punpcklbh %[ftmp1], %[ftmp11], %[ftmp12] \n\t"
+ "punpckhbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t"
+
+ "gsldlc1 %[ftmp11], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp11], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t"
+ "gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t"
+ "punpcklbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t"
+ "punpckhbh %[ftmp4], %[ftmp11], %[ftmp12] \n\t"
+
+ "punpcklhw %[ftmp5], %[ftmp4], %[ftmp2] \n\t"
+ "punpckhhw %[ftmp6], %[ftmp4], %[ftmp2] \n\t"
+ "punpcklhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
+ "punpckhhw %[ftmp8], %[ftmp3], %[ftmp1] \n\t"
+
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
+ MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
+ "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t"
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t"
+ "punpcklbh %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
+
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
+ "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t"
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_SUBU(%[addr1], %[addr0], %[tmp0])
+ "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t"
+ "punpcklbh %[ftmp0], %[ftmp11], %[ftmp12] \n\t"
+ "punpckhbh %[ftmp11], %[ftmp11], %[ftmp12] \n\t"
+
+ "punpcklhw %[ftmp1], %[ftmp11], %[ftmp10] \n\t"
+ "punpckhhw %[ftmp2], %[ftmp11], %[ftmp10] \n\t"
+ "punpcklhw %[ftmp3], %[ftmp0], %[ftmp9] \n\t"
+ "punpckhhw %[ftmp4], %[ftmp0], %[ftmp9] \n\t"
+
+ /* ftmp9:q0 ftmp10:q1 */
+ "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t"
+ "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t"
+ /* ftmp11:q2 ftmp12:q3 */
+ "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t"
+ "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t"
+ /* ftmp1:p3 ftmp2:p2 */
+ "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t"
+ "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t"
+ /* ftmp5:p1 ftmp6:p0 */
+ "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t"
+ "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t"
+
+ "gsldlc1 %[ftmp8], 0x07(%[limit]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[limit]) \n\t"
+
+ /* abs (q3-q2) */
+ "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t"
+ "psubusb %[ftmp0], %[ftmp7], %[ftmp8] \n\t"
+ /* abs (q2-q1) */
+ "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t"
+ "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* ftmp3: abs(q1-q0) */
+ "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t"
+ "psubusb %[ftmp7], %[ftmp3], %[ftmp8] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* ftmp4: abs(p1-p0) */
+ "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
+ "psubusb %[ftmp7], %[ftmp4], %[ftmp8] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* abs (p2-p1) */
+ "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t"
+ "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* abs (p3-p2) */
+ "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
+ "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+
+ "gsldlc1 %[ftmp8], 0x07(%[blimit]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[blimit]) \n\t"
+
+ /* abs (p0-q0) */
+ "pasubub %[ftmp11], %[ftmp9], %[ftmp6] \n\t"
+ "paddusb %[ftmp11], %[ftmp11], %[ftmp11] \n\t"
+ /* abs (p1-q1) */
+ "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t"
+ "and %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t"
+ "li %[tmp0], 0x01 \n\t"
+ "mtc1 %[tmp0], %[ftmp1] \n\t"
+ "psrlh %[ftmp12], %[ftmp12], %[ftmp1] \n\t"
+ "paddusb %[ftmp1], %[ftmp11], %[ftmp12] \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "xor %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
+ /* ftmp0:mask */
+ "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ "gsldlc1 %[ftmp8], 0x07(%[thresh]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[thresh]) \n\t"
+
+ /* ftmp3: abs(q1-q0) ftmp4: abs(p1-p0) */
+ "psubusb %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
+ "psubusb %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
+ "or %[ftmp2], %[ftmp4], %[ftmp3] \n\t"
+ "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
+ "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
+ /* ftmp1:hev */
+ "xor %[ftmp1], %[ftmp2], %[ftmp1] \n\t"
+
+ "xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t"
+ "xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t"
+ "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
+
+ "psubsb %[ftmp2], %[ftmp5], %[ftmp10] \n\t"
+ "and %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
+ "psubsb %[ftmp3], %[ftmp9], %[ftmp6] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
+ /* ftmp2:filter_value */
+ "and %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
+
+ "paddsb %[ftmp11], %[ftmp2], %[ff_pb_04] \n\t"
+ "paddsb %[ftmp12], %[ftmp2], %[ff_pb_03] \n\t"
+
+ "li %[tmp0], 0x0b \n\t"
+ "mtc1 %[tmp0], %[ftmp7] \n\t"
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "punpcklbh %[ftmp0], %[ftmp0], %[ftmp12] \n\t"
+ "punpckhbh %[ftmp8], %[ftmp8], %[ftmp12] \n\t"
+ "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+ "packsshb %[ftmp12], %[ftmp0], %[ftmp8] \n\t"
+
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "punpcklbh %[ftmp0], %[ftmp0], %[ftmp11] \n\t"
+ "punpckhbh %[ftmp8], %[ftmp8], %[ftmp11] \n\t"
+ "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+ "packsshb %[ftmp11], %[ftmp0], %[ftmp8] \n\t"
+
+ "psubsb %[ftmp9], %[ftmp9], %[ftmp11] \n\t"
+ "xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t"
+ "paddsb %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
+ "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "paddsh %[ftmp0], %[ftmp0], %[ff_ph_01] \n\t"
+ "paddsh %[ftmp8], %[ftmp8], %[ff_ph_01] \n\t"
+
+ "li %[tmp0], 0x01 \n\t"
+ "mtc1 %[tmp0], %[ftmp7] \n\t"
+ "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+ "packsshb %[ftmp2], %[ftmp0], %[ftmp8] \n\t"
+ "pandn %[ftmp2], %[ftmp1], %[ftmp2] \n\t"
+ "psubsb %[ftmp10], %[ftmp10], %[ftmp2] \n\t"
+ "xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t"
+ "paddsb %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+ "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
+
+ /* ftmp5: *op1 ; ftmp6: *op0 */
+ "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
+ "punpckhbh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
+ /* ftmp9: *oq0 ; ftmp10: *oq1 */
+ "punpcklbh %[ftmp4], %[ftmp9], %[ftmp10] \n\t"
+ "punpckhbh %[ftmp3], %[ftmp9], %[ftmp10] \n\t"
+ "punpckhhw %[ftmp6], %[ftmp2], %[ftmp4] \n\t"
+ "punpcklhw %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+ "punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
+ "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
+ "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t"
+
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dsrl %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_SUBU(%[addr1], %[addr0], %[tmp0])
+ "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t"
+
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
+ MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
+ "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t"
+
+ "dsrl %[ftmp6], %[ftmp6], %[ftmp9] \n\t"
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t"
+ "gsswlc1 %[ftmp1], 0x05(%[src_ptr]) \n\t"
+ "gsswrc1 %[ftmp1], 0x02(%[src_ptr]) \n\t"
+
+ "dsrl %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "gsswlc1 %[ftmp1], 0x05(%[addr0]) \n\t"
+ "gsswrc1 %[ftmp1], 0x02(%[addr0]) \n\t"
+ MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step])
+ "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t"
+
+ "dsrl %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
+ MMI_ADDU(%[addr1], %[addr0], %[tmp0])
+ "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t"
+
+ MMI_ADDIU(%[count], %[count], -0x01)
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x03)
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
+ "bnez %[count], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
+ [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
+ : [limit]"r"(limit), [blimit]"r"(blimit),
+ [thresh]"r"(thresh),
+ [src_pixel_step]"r"((mips_reg)src_pixel_step),
+ [ff_ph_01]"f"(ff_ph_01), [ff_pb_03]"f"(ff_pb_03),
+ [ff_pb_04]"f"(ff_pb_04), [ff_pb_80]"f"(ff_pb_80),
+ [ff_pb_fe]"f"(ff_pb_fe)
+ : "memory"
+ );
+}
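+
+/* Vertical-edge variant: eight rows of eight pixels are gathered and
+   transposed with the punpck* sequences so that p3..q3 each occupy one
+   register, the same filter arithmetic as the horizontal case is run,
+   and the updated p1/p0/q0/q1 are transposed back. Only those middle
+   four bytes of each row are stored (offsets 0x02..0x05), with a dsrl
+   by 32 exposing the second row held in the upper word. */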
+
+/* clang-format off */
+#define VP8_MBLOOP_HPSRAB \
+ "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" \
+ "psrah %[ftmp10], %[ftmp10], %[ftmp9] \n\t" \
+ "psrah %[ftmp11], %[ftmp11], %[ftmp9] \n\t" \
+ "packsshb %[ftmp0], %[ftmp10], %[ftmp11] \n\t"
+
+#define VP8_MBLOOP_HPSRAB_ADD(reg) \
+ "punpcklbh %[ftmp1], %[ftmp0], %[ftmp12] \n\t" \
+ "punpckhbh %[ftmp2], %[ftmp0], %[ftmp12] \n\t" \
+ "pmulhh %[ftmp1], %[ftmp1], " #reg " \n\t" \
+ "pmulhh %[ftmp2], %[ftmp2], " #reg " \n\t" \
+ "paddh %[ftmp1], %[ftmp1], %[ff_ph_003f] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ff_ph_003f] \n\t" \
+ "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \
+ "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" \
+ "packsshb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+/* clang-format on */
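+
+/* VP8_MBLOOP_HPSRAB widens each signed byte into the high half of a 16-bit
+   lane (value * 256) and, as used here, shifts right by 11, i.e. a per-byte
+   arithmetic >> 3; the stale low bytes contribute less than one lsb and
+   cannot carry across the shift. VP8_MBLOOP_HPSRAB_ADD uses the same
+   widening so that pmulhh against 27*256, 18*256 or 9*256 leaves
+   value * 27, value * 18 or value * 9 in the lane; adding ff_ph_003f and
+   shifting right by 7 then yields the reference (27*f + 63) >> 7,
+   (18*f + 63) >> 7 and (9*f + 63) >> 7. */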
+
+void vp8_mbloop_filter_horizontal_edge_mmi(
+ unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
+ const unsigned char *limit, const unsigned char *thresh, int count) {
+ uint32_t tmp[1];
+ double ftmp[13];
+
+ __asm__ volatile (
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+ "1: \n\t"
+ "gsldlc1 %[ftmp9], 0x07(%[limit]) \n\t"
+ "gsldrc1 %[ftmp9], 0x00(%[limit]) \n\t"
+ /* ftmp1: p3 */
+ "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+ /* ftmp3: p2 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t"
+ /* ftmp4: p1 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t"
+ /* ftmp5: p0 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
+ /* ftmp6: q0 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+ /* ftmp7: q1 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
+ /* ftmp8: q2 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
+ /* ftmp2: q3 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp2], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[src_ptr]) \n\t"
+
+ "gsldlc1 %[ftmp12], 0x07(%[blimit]) \n\t"
+ "gsldrc1 %[ftmp12], 0x00(%[blimit]) \n\t"
+
+ "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t"
+ "psubusb %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
+ "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "pasubub %[ftmp10], %[ftmp4], %[ftmp5] \n\t"
+ "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t"
+ "psubusb %[ftmp1], %[ftmp11], %[ftmp9] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
+ "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
+ "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
+ "and %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t"
+ "li %[tmp0], 0x01 \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp12] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+ /* ftmp0: mask */
+ "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
+
+ "gsldlc1 %[ftmp9], 0x07(%[thresh]) \n\t"
+ "gsldrc1 %[ftmp9], 0x00(%[thresh]) \n\t"
+ "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t"
+ "psubusb %[ftmp2], %[ftmp11], %[ftmp9] \n\t"
+ "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
+ "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
+ /* ftmp1: hev */
+ "xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+
+ "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
+ "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
+ "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
+ "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
+ "psubsb %[ftmp9], %[ftmp6], %[ftmp5] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ "and %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
+ "pandn %[ftmp12], %[ftmp1], %[ftmp2] \n\t"
+ "and %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
+
+ "li %[tmp0], 0x0b \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "paddsb %[ftmp0], %[ftmp2], %[ff_pb_03] \n\t"
+ VP8_MBLOOP_HPSRAB
+ "paddsb %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
+ "paddsb %[ftmp0], %[ftmp2], %[ff_pb_04] \n\t"
+ VP8_MBLOOP_HPSRAB
+ "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+
+ "li %[tmp0], 0x07 \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+
+ VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00])
+ "psubsb %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
+ "paddsb %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
+ "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+ "gssdlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+
+ VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1200])
+ "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
+ "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
+ "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
+ "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t"
+
+ VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_0900])
+ "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
+ "xor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t"
+ "paddsb %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
+ "psubsb %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
+ "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
+ "xor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
+ "gssdlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t"
+
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
+ "addiu %[count], %[count], -0x01 \n\t"
+ "bnez %[count], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
+ : [limit]"r"(limit), [blimit]"r"(blimit),
+ [thresh]"r"(thresh),
+ [src_pixel_step]"r"((mips_reg)src_pixel_step),
+ [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80),
+ [ff_pb_04]"f"(ff_pb_04), [ff_pb_03]"f"(ff_pb_03),
+ [ff_ph_0900]"f"(ff_ph_0900), [ff_ph_1b00]"f"(ff_ph_1b00),
+ [ff_ph_1200]"f"(ff_ph_1200), [ff_ph_003f]"f"(ff_ph_003f)
+ : "memory"
+ );
+}
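+
+/* Unlike the inner-edge filter, the mb-edge filter adjusts six pixels.
+   Filter2 = f & hev drives the usual +4/+3 taps on p0/q0; f & ~hev then
+   feeds the 27/18/9 chain, whose rounded products are applied to p0/q0,
+   p1/q1 and p2/q2 respectively, matching vp8_mbfilter() in the C code. */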
+
+#define VP8_MBLOOP_VPSRAB_ADDH \
+ "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \
+ "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \
+ "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
+
+#define VP8_MBLOOP_VPSRAB_ADDT \
+ "paddh %[ftmp7], %[ftmp7], %[ff_ph_003f] \n\t" \
+ "paddh %[ftmp8], %[ftmp8], %[ff_ph_003f] \n\t" \
+ "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" \
+ "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" \
+ "packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t"
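+
+/* Same widening scheme as the horizontal macros; here the multiplier is
+   supplied by the caller, which synthesizes 18*256 and 27*256 at run time
+   by adding ff_ph_0900 to itself once or twice. */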
+
+void vp8_mbloop_filter_vertical_edge_mmi(
+ unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
+ const unsigned char *limit, const unsigned char *thresh, int count) {
+ mips_reg tmp[1];
+ DECLARE_ALIGNED(8, const uint64_t, srct[2]); /* asm stores two doublewords, at 0x00 and 0x08 */
+ double ftmp[14];
+
+ __asm__ volatile (
+ MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
+
+ "1: \n\t"
+ "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t"
+ "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
+
+ "punpcklhw %[ftmp1], %[ftmp12], %[ftmp10] \n\t"
+ "punpckhhw %[ftmp2], %[ftmp12], %[ftmp10] \n\t"
+ "punpcklhw %[ftmp3], %[ftmp11], %[ftmp9] \n\t"
+ "punpckhhw %[ftmp4], %[ftmp11], %[ftmp9] \n\t"
+
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t"
+ "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
+
+ "punpcklhw %[ftmp5], %[ftmp12], %[ftmp10] \n\t"
+ "punpckhhw %[ftmp6], %[ftmp12], %[ftmp10] \n\t"
+ "punpcklhw %[ftmp7], %[ftmp11], %[ftmp9] \n\t"
+ "punpckhhw %[ftmp8], %[ftmp11], %[ftmp9] \n\t"
+
+ "gsldlc1 %[ftmp13], 0x07(%[limit]) \n\t"
+ "gsldrc1 %[ftmp13], 0x00(%[limit]) \n\t"
+ /* ftmp9:q0 ftmp10:q1 */
+ "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t"
+ "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t"
+ /* ftmp11:q2 ftmp12:q3 */
+ "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t"
+ "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t"
+ /* srct[0x00]: q3 */
+ "sdc1 %[ftmp12], 0x00(%[srct]) \n\t"
+ /* ftmp1:p3 ftmp2:p2 */
+ "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t"
+ "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t"
+ /* srct[0x08]: p3 */
+ "sdc1 %[ftmp1], 0x08(%[srct]) \n\t"
+ /* ftmp5:p1 ftmp6:p0 */
+ "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t"
+ "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t"
+
+ /* abs (q3-q2) */
+ "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t"
+ "psubusb %[ftmp0], %[ftmp7], %[ftmp13] \n\t"
+ /* abs (q2-q1) */
+ "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t"
+ "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* ftmp3: abs(q1-q0) */
+ "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t"
+ "psubusb %[ftmp7], %[ftmp3], %[ftmp13] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* ftmp4: abs(p1-p0) */
+ "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
+ "psubusb %[ftmp7], %[ftmp4], %[ftmp13] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* abs (p2-p1) */
+ "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t"
+ "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* abs (p3-p2) */
+ "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
+ "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+
+ "gsldlc1 %[ftmp13], 0x07(%[blimit]) \n\t"
+ "gsldrc1 %[ftmp13], 0x00(%[blimit]) \n\t"
+ "gsldlc1 %[ftmp7], 0x07(%[thresh]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[thresh]) \n\t"
+ /* abs (p0-q0) * 2 */
+ "pasubub %[ftmp1], %[ftmp9], %[ftmp6] \n\t"
+ "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
+ /* abs (p1-q1) / 2 */
+ "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t"
+ "and %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t"
+ "li %[tmp0], 0x01 \n\t"
+ "mtc1 %[tmp0], %[ftmp8] \n\t"
+ "psrlh %[ftmp12], %[ftmp12], %[ftmp8] \n\t"
+ "paddusb %[ftmp12], %[ftmp1], %[ftmp12] \n\t"
+ "psubusb %[ftmp12], %[ftmp12], %[ftmp13] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp12] \n\t"
+ "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
+ /* ftmp0: mask */
+ "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp12] \n\t"
+
+ /* abs(p1-p0) - thresh */
+ "psubusb %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
+ /* abs(q1-q0) - thresh */
+ "psubusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
+ "or %[ftmp3], %[ftmp4], %[ftmp3] \n\t"
+ "pcmpeqb %[ftmp3], %[ftmp3], %[ftmp12] \n\t"
+ "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
+ /* ftmp1: hev */
+ "xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t"
+
+ /* ftmp2:ps2, ftmp5:ps1, ftmp6:ps0, ftmp9:qs0, ftmp10:qs1, ftmp11:qs2 */
+ "xor %[ftmp11], %[ftmp11], %[ff_pb_80] \n\t"
+ "xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t"
+ "xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t"
+ "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
+ "xor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t"
+
+ "psubsb %[ftmp3], %[ftmp5], %[ftmp10] \n\t"
+ "psubsb %[ftmp4], %[ftmp9], %[ftmp6] \n\t"
+ "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+ "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+ "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+ /* filter_value &= mask */
+ "and %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
+ /* Filter2 = filter_value & hev */
+ "and %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ /* filter_value &= ~hev */
+ "pandn %[ftmp0], %[ftmp1], %[ftmp0] \n\t"
+
+ "paddsb %[ftmp4], %[ftmp3], %[ff_pb_04] \n\t"
+ "li %[tmp0], 0x0b \n\t"
+ "mtc1 %[tmp0], %[ftmp12] \n\t"
+ "punpcklbh %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
+ "punpckhbh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
+ "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
+ "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t"
+ "packsshb %[ftmp4], %[ftmp7], %[ftmp8] \n\t"
+ /* ftmp9: qs0 */
+ "psubsb %[ftmp9], %[ftmp9], %[ftmp4] \n\t"
+ "paddsb %[ftmp3], %[ftmp3], %[ff_pb_03] \n\t"
+ "punpcklbh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
+ "punpckhbh %[ftmp8], %[ftmp8], %[ftmp3] \n\t"
+ "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
+ "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t"
+ "packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t"
+ /* ftmp6: ps0 */
+ "paddsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
+
+ "li %[tmp0], 0x07 \n\t"
+ "mtc1 %[tmp0], %[ftmp12] \n\t"
+ VP8_MBLOOP_VPSRAB_ADDH
+ "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_ph_0900] \n\t"
+ "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
+ "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
+ VP8_MBLOOP_VPSRAB_ADDT
+ "psubsb %[ftmp4], %[ftmp9], %[ftmp3] \n\t"
+ /* ftmp9: oq0 */
+ "xor %[ftmp9], %[ftmp4], %[ff_pb_80] \n\t"
+ "paddsb %[ftmp4], %[ftmp6], %[ftmp3] \n\t"
+ /* ftmp6: op0 */
+ "xor %[ftmp6], %[ftmp4], %[ff_pb_80] \n\t"
+
+ VP8_MBLOOP_VPSRAB_ADDH
+ "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t"
+ "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
+ "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
+ VP8_MBLOOP_VPSRAB_ADDT
+ "psubsb %[ftmp4], %[ftmp10], %[ftmp3] \n\t"
+ /* ftmp10: oq1 */
+ "xor %[ftmp10], %[ftmp4], %[ff_pb_80] \n\t"
+ "paddsb %[ftmp4], %[ftmp5], %[ftmp3] \n\t"
+ /* ftmp5: op1 */
+ "xor %[ftmp5], %[ftmp4], %[ff_pb_80] \n\t"
+
+ VP8_MBLOOP_VPSRAB_ADDH
+ "pmulhh %[ftmp7], %[ftmp7], %[ff_ph_0900] \n\t"
+ "pmulhh %[ftmp8], %[ftmp8], %[ff_ph_0900] \n\t"
+ VP8_MBLOOP_VPSRAB_ADDT
+ "psubsb %[ftmp4], %[ftmp11], %[ftmp3] \n\t"
+ /* ftmp11: oq2 */
+ "xor %[ftmp11], %[ftmp4], %[ff_pb_80] \n\t"
+ "paddsb %[ftmp4], %[ftmp2], %[ftmp3] \n\t"
+ /* ftmp2: op2 */
+ "xor %[ftmp2], %[ftmp4], %[ff_pb_80] \n\t"
+
+ "ldc1 %[ftmp12], 0x00(%[srct]) \n\t"
+ "ldc1 %[ftmp8], 0x08(%[srct]) \n\t"
+
+ "punpcklbh %[ftmp0], %[ftmp8], %[ftmp2] \n\t"
+ "punpckhbh %[ftmp1], %[ftmp8], %[ftmp2] \n\t"
+ "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
+ "punpckhbh %[ftmp3], %[ftmp5], %[ftmp6] \n\t"
+ "punpcklhw %[ftmp4], %[ftmp0], %[ftmp2] \n\t"
+ "punpckhhw %[ftmp5], %[ftmp0], %[ftmp2] \n\t"
+ "punpcklhw %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
+ "punpckhhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
+
+ "punpcklbh %[ftmp0], %[ftmp9], %[ftmp10] \n\t"
+ "punpckhbh %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
+ "punpcklbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t"
+ "punpckhbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t"
+ "punpcklhw %[ftmp8], %[ftmp0], %[ftmp2] \n\t"
+ "punpckhhw %[ftmp9], %[ftmp0], %[ftmp2] \n\t"
+ "punpcklhw %[ftmp10], %[ftmp1], %[ftmp3] \n\t"
+ "punpckhhw %[ftmp11], %[ftmp1], %[ftmp3] \n\t"
+
+ "punpcklwd %[ftmp0], %[ftmp7], %[ftmp11] \n\t"
+ "punpckhwd %[ftmp1], %[ftmp7], %[ftmp11] \n\t"
+ "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklwd %[ftmp0], %[ftmp6], %[ftmp10] \n\t"
+ "punpckhwd %[ftmp1], %[ftmp6], %[ftmp10] \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklwd %[ftmp1], %[ftmp5], %[ftmp9] \n\t"
+ "punpckhwd %[ftmp0], %[ftmp5], %[ftmp9] \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklwd %[ftmp1], %[ftmp4], %[ftmp8] \n\t"
+ "punpckhwd %[ftmp0], %[ftmp4], %[ftmp8] \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+ "addiu %[count], %[count], -0x01 \n\t"
+
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x03)
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
+ "bnez %[count], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
+ [tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr),
+ [count]"+&r"(count)
+ : [limit]"r"(limit), [blimit]"r"(blimit),
+ [srct]"r"(srct), [thresh]"r"(thresh),
+ [src_pixel_step]"r"((mips_reg)src_pixel_step),
+ [ff_ph_003f]"f"(ff_ph_003f), [ff_ph_0900]"f"(ff_ph_0900),
+ [ff_pb_03]"f"(ff_pb_03), [ff_pb_04]"f"(ff_pb_04),
+ [ff_pb_80]"f"(ff_pb_80), [ff_pb_fe]"f"(ff_pb_fe)
+ : "memory"
+ );
+}
+
+#define VP8_SIMPLE_HPSRAB \
+ "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" \
+ "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" \
+ "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" \
+ "psrah %[ftmp1], %[ftmp5], %[ftmp10] \n\t" \
+ "psllh %[ftmp1], %[ftmp1], %[ftmp8] \n\t" \
+ "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
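+
+/* Per-byte arithmetic >> 3 built from halfword shifts (ftmp8 = 8,
+   ftmp9 = 3, ftmp10 = 11): the low byte of each halfword is moved to the
+   high half, shifted arithmetically by 3 and moved back with a logical
+   shift; the high byte needs only a psrah by 11 followed by a psllh by 8;
+   or-ing the two halves recombines the lanes. */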
+
+void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr,
+ int src_pixel_step,
+ const unsigned char *blimit) {
+ uint32_t tmp[1], count = 2;
+ mips_reg addr[2];
+ double ftmp[12];
+
+ __asm__ volatile (
+ "li %[tmp0], 0x08 \n\t"
+ "mtc1 %[tmp0], %[ftmp8] \n\t"
+ "li %[tmp0], 0x03 \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "li %[tmp0], 0x0b \n\t"
+ "mtc1 %[tmp0], %[ftmp10] \n\t"
+ "li %[tmp0], 0x01 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+
+ "1: \n\t"
+ "gsldlc1 %[ftmp3], 0x07(%[blimit]) \n\t"
+ "gsldrc1 %[ftmp3], 0x00(%[blimit]) \n\t"
+
+ MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t"
+ "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
+ "pasubub %[ftmp1], %[ftmp7], %[ftmp2] \n\t"
+ "and %[ftmp1], %[ftmp1], %[ff_pb_fe] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp6], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
+ "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+ "pasubub %[ftmp5], %[ftmp6], %[ftmp0] \n\t"
+ "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "paddusb %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
+ "psubusb %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
+ "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
+ "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
+
+ "xor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t"
+ "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
+ "psubsb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
+ "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "xor %[ftmp3], %[ftmp0], %[ff_pb_80] \n\t"
+ "psubsb %[ftmp0], %[ftmp3], %[ftmp6] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
+ "and %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+
+ "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t"
+ VP8_SIMPLE_HPSRAB
+ "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
+ "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
+ "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t"
+
+ "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t"
+ VP8_SIMPLE_HPSRAB
+ "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+ "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp6], 0x07(%[addr1]) \n\t"
+ "gssdrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
+
+ "addiu %[count], %[count], -0x01 \n\t"
+ MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
+ "bnez %[count], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [tmp0]"=&r"(tmp[0]),
+ [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
+ : [blimit]"r"(blimit),
+ [src_pixel_step]"r"((mips_reg)src_pixel_step),
+ [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
+ [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80),
+ [ff_pb_04]"f"(ff_pb_04), [ff_pb_01]"f"(ff_pb_01)
+ : "memory"
+ );
+}
+
+void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
+ int src_pixel_step,
+ const unsigned char *blimit) {
+ uint32_t tmp[1], count = 2;
+ mips_reg addr[2];
+ DECLARE_ALIGNED(8, const uint64_t, srct[2]); /* asm stores two doublewords, at 0x00 and 0x08 */
+ double ftmp[12];
+
+ __asm__ volatile (
+ "li %[tmp0], 0x08 \n\t"
+ "mtc1 %[tmp0], %[ftmp8] \n\t"
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp10] \n\t"
+
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x4])
+ MMI_SUBU(%[src_ptr], %[src_ptr], 0x02)
+
+ "1: \n\t"
+ MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
+ MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2])
+ "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t"
+ "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
+ MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t"
+ "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
+ "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+
+ MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t"
+ "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
+ "gslwlc1 %[ftmp4], 0x03(%[src_ptr]) \n\t"
+ "gslwrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t"
+ "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gslwlc1 %[ftmp7], 0x03(%[addr1]) \n\t"
+ "gslwrc1 %[ftmp7], 0x00(%[addr1]) \n\t"
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t"
+ "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
+ "punpcklbh %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
+
+ MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4])
+ "gslwlc1 %[ftmp1], 0x03(%[addr1]) \n\t"
+ "gslwrc1 %[ftmp1], 0x00(%[addr1]) \n\t"
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4])
+ "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t"
+ "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
+ "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ "punpckhhw %[ftmp2], %[ftmp0], %[ftmp6] \n\t"
+ "punpcklhw %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
+ "punpckhwd %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
+ "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
+ "punpckhwd %[ftmp3], %[ftmp2], %[ftmp5] \n\t"
+ "punpcklwd %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
+
+ "li %[tmp0], 0x01 \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "pasubub %[ftmp6], %[ftmp3], %[ftmp0] \n\t"
+ "and %[ftmp6], %[ftmp6], %[ff_pb_fe] \n\t"
+ "psrlh %[ftmp6], %[ftmp6], %[ftmp9] \n\t"
+ "pasubub %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
+ "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "paddusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
+
+ "gsldlc1 %[ftmp7], 0x07(%[blimit]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[blimit]) \n\t"
+ "psubusb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
+ "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
+ "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
+
+ "sdc1 %[ftmp0], 0x00(%[srct]) \n\t"
+ "sdc1 %[ftmp3], 0x08(%[srct]) \n\t"
+
+ "xor %[ftmp0], %[ftmp0], %[ff_pb_80] \n\t"
+ "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
+ "psubsb %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
+
+ "xor %[ftmp6], %[ftmp1], %[ff_pb_80] \n\t"
+ "xor %[ftmp3], %[ftmp2], %[ff_pb_80] \n\t"
+ "psubsb %[ftmp7], %[ftmp3], %[ftmp6] \n\t"
+ "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ "and %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
+ "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t"
+
+ "li %[tmp0], 0x03 \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t"
+ "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
+ "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
+
+ "li %[tmp0], 0x0b \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "psrah %[ftmp7], %[ftmp5], %[ftmp9] \n\t"
+ "psllh %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
+ "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
+ "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t"
+
+ "li %[tmp0], 0x03 \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t"
+ "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
+ "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
+
+ "li %[tmp0], 0x0b \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
+ "psllh %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
+ "or %[ftmp0], %[ftmp0], %[ftmp5] \n\t"
+ "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+ "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+
+ "ldc1 %[ftmp0], 0x00(%[srct]) \n\t"
+ "ldc1 %[ftmp4], 0x08(%[srct]) \n\t"
+
+ "punpckhbh %[ftmp1], %[ftmp0], %[ftmp6] \n\t"
+ "punpcklbh %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
+ "punpcklbh %[ftmp2], %[ftmp3], %[ftmp4] \n\t"
+ "punpckhbh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+
+ "punpckhhw %[ftmp6], %[ftmp0], %[ftmp2] \n\t"
+ "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4])
+ "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
+ "punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
+ "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+
+ "dsrl %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
+ MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4])
+ "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
+
+ "dsrl %[ftmp6], %[ftmp6], %[ftmp10] \n\t"
+ "gsswlc1 %[ftmp1], 0x03(%[src_ptr]) \n\t"
+ "gsswrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
+
+ MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
+
+ "dsrl %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
+ "gsswlc1 %[ftmp1], 0x03(%[addr0]) \n\t"
+ "gsswrc1 %[ftmp1], 0x00(%[addr0]) \n\t"
+
+ "dsrl %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
+ MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2])
+ "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
+
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x8])
+ "addiu %[count], %[count], -0x01 \n\t"
+ "bnez %[count], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [tmp0]"=&r"(tmp[0]),
+ [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
+ : [blimit]"r"(blimit), [srct]"r"(srct),
+ [src_pixel_step]"r"((mips_reg)src_pixel_step),
+ [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
+ [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)),
+ [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3)),
+ [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80),
+ [ff_pb_04]"f"(ff_pb_04), [ff_pb_01]"f"(ff_pb_01)
+ : "memory"
+ );
+}
+
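+/* The wrappers below are the standard vp8 loop-filter entry points. The
+   count argument is in groups of eight pixels (or rows), so the 16-wide
+   y edges pass 2 and the 8-wide u/v edges pass 1; the B (subblock)
+   variants start 4 pixels or 4 rows into the macroblock. */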
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_mmi(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi) {
+ vp8_mbloop_filter_horizontal_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_horizontal_edge_mmi(u_ptr, uv_stride, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_horizontal_edge_mmi(v_ptr, uv_stride, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_mmi(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi) {
+ vp8_mbloop_filter_vertical_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_vertical_edge_mmi(u_ptr, uv_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_vertical_edge_mmi(v_ptr, uv_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
+}
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_mmi(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi) {
+ vp8_loop_filter_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride, lfi->blim,
+ lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride, lfi->blim,
+ lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_horizontal_edge_mmi(u_ptr + 4 * uv_stride, uv_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_horizontal_edge_mmi(v_ptr + 4 * uv_stride, uv_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_mmi(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi) {
+ vp8_loop_filter_vertical_edge_mmi(y_ptr + 4, y_stride, lfi->blim, lfi->lim,
+ lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_mmi(y_ptr + 8, y_stride, lfi->blim, lfi->lim,
+ lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_mmi(y_ptr + 12, y_stride, lfi->blim, lfi->lim,
+ lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_vertical_edge_mmi(u_ptr + 4, uv_stride, lfi->blim, lfi->lim,
+ lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_vertical_edge_mmi(v_ptr + 4, uv_stride, lfi->blim, lfi->lim,
+ lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bhs_mmi(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit) {
+ vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride,
+ blimit);
+ vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride,
+ blimit);
+ vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride,
+ blimit);
+}
+
+void vp8_loop_filter_bvs_mmi(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit) {
+ vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 4, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 8, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 12, y_stride, blimit);
+}
diff --git a/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c b/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c
new file mode 100644
index 000000000..77d665d45
--- /dev/null
+++ b/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c
@@ -0,0 +1,416 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8/common/filter.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+DECLARE_ALIGNED(8, static const int16_t, vp8_six_tap_mmi[8][6 * 8]) = {
+ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
+ 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b,
+ 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+ { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002,
+ 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5,
+ 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c,
+ 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024,
+ 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8,
+ 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 },
+ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7,
+ 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d,
+ 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032,
+ 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+ { 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003,
+ 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0,
+ 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d,
+ 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d,
+ 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0,
+ 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003 },
+ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
+ 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032,
+ 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d,
+ 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+ { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001,
+ 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8,
+ 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024,
+ 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c,
+ 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5,
+ 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 },
+ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c,
+ 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b,
+ 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }
+};
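The table above holds the eight VP8 six-tap kernels with each tap coefficient
replicated across eight 16-bit lanes, so a single pmullh can apply one tap to
eight pixels at once. A minimal layout sketch (hypothetical helper, not part
of the patch; the scalar 8x6 tap table is assumed to match the kernels behind
"vp8/common/filter.h" included above):

/* entry [f][t * 8 + lane] holds tap t of kernel f, splatted across all
 * eight lanes. */
static void splat_six_taps(const short scalar[8][6], int16_t out[8][6 * 8]) {
  int f, t, lane;
  for (f = 0; f < 8; ++f) {
    for (t = 0; t < 6; ++t) {
      for (lane = 0; lane < 8; ++lane) out[f][t * 8 + lane] = scalar[f][t];
    }
  }
}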
+
+/* Horizontal filter: pixel_step is 1; output_height and output_width are
+   the dimensions of the horizontal filtering output, where output_height is
+   always H + 5 */
+static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
+ uint16_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_height,
+ unsigned int output_width,
+ const int16_t *vp8_filter) {
+ uint32_t tmp[1];
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
+
+#if _MIPS_SIM == _ABIO32
+ register double fzero asm("$f0");
+ register double ftmp0 asm("$f2");
+ register double ftmp1 asm("$f4");
+ register double ftmp2 asm("$f6");
+ register double ftmp3 asm("$f8");
+ register double ftmp4 asm("$f10");
+ register double ftmp5 asm("$f12");
+ register double ftmp6 asm("$f14");
+ register double ftmp7 asm("$f16");
+ register double ftmp8 asm("$f18");
+ register double ftmp9 asm("$f20");
+ register double ftmp10 asm("$f22");
+ register double ftmp11 asm("$f24");
+#else
+ register double fzero asm("$f0");
+ register double ftmp0 asm("$f1");
+ register double ftmp1 asm("$f2");
+ register double ftmp2 asm("$f3");
+ register double ftmp3 asm("$f4");
+ register double ftmp4 asm("$f5");
+ register double ftmp5 asm("$f6");
+ register double ftmp6 asm("$f7");
+ register double ftmp7 asm("$f8");
+ register double ftmp8 asm("$f9");
+ register double ftmp9 asm("$f10");
+ register double ftmp10 asm("$f11");
+ register double ftmp11 asm("$f12");
+#endif // _MIPS_SIM == _ABIO32
+
+ __asm__ volatile (
+ "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t"
+ "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t"
+ "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t"
+ "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t"
+ "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t"
+ "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t"
+ "xor %[fzero], %[fzero], %[fzero] \n\t"
+ "li %[tmp0], 0x07 \n\t"
+ "mtc1 %[tmp0], %[ftmp7] \n\t"
+ "li %[tmp0], 0x08 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+
+ "1: \n\t"
+ "gsldlc1 %[ftmp9], 0x05(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp9], -0x02(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp10], 0x06(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp10], -0x01(%[src_ptr]) \n\t"
+
+ "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t"
+ "pmullh %[ftmp8], %[ftmp6], %[ftmp0] \n\t"
+
+ "punpckhbh %[ftmp6], %[ftmp9], %[fzero] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
+ "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
+
+ "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
+ "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
+
+ "punpckhbh %[ftmp6], %[ftmp10], %[fzero] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
+ "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
+
+ "dsrl %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
+ "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
+ "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
+
+ "dsrl %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
+ "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
+ "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
+
+ "paddsh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t"
+ "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+ "packushb %[ftmp8], %[ftmp8], %[fzero] \n\t"
+ "punpcklbh %[ftmp8], %[ftmp8], %[fzero] \n\t"
+ "gssdlc1 %[ftmp8], 0x07(%[output_ptr]) \n\t"
+ "gssdrc1 %[ftmp8], 0x00(%[output_ptr]) \n\t"
+
+ "addiu %[output_height], %[output_height], -0x01 \n\t"
+ MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width])
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line])
+ "bnez %[output_height], 1b \n\t"
+ : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0),
+ [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2),
+ [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4),
+ [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6),
+ [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8),
+ [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10),
+ [ftmp11]"=&f"(ftmp11), [tmp0]"=&r"(tmp[0]),
+ [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height),
+ [src_ptr]"+&r"(src_ptr)
+ : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line),
+ [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width),
+ [ff_ph_40]"f"(ff_ph_40)
+ : "memory"
+ );
+}
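A hedged scalar sketch of what the asm loop above computes: each iteration
produces four output pixels of one row from src[-2 .. +5], rounds by 64,
shifts right by VP8_FILTER_SHIFT (7), saturates to [0, 255] via packushb, and
stores the result widened to 16 bits for the vertical pass. Strides are in
elements here for clarity (the asm advances by byte strides), and the 16-bit
saturating adds (paddsh) of the SIMD code are ignored:

/* taps[] are the six scalar coefficients; the MMI table above stores each
 * tap splatted eight times. */
static void h6_reference(const unsigned char *src, uint16_t *dst,
                         unsigned int src_stride, unsigned int height,
                         unsigned int dst_stride, const int16_t *taps) {
  unsigned int r;
  int c, t, sum;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < 4; ++c) {
      sum = 0;
      for (t = 0; t < 6; ++t) sum += src[c + t - 2] * taps[t];
      sum = (sum + 64) >> 7; /* round and scale */
      dst[c] = (uint16_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
    src += src_stride;
    dst += dst_stride;
  }
}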
+
+/* Vertical filter: pixel_step is always W */
+static INLINE void vp8_filter_block1dc_v6_mmi(
+ uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height,
+ int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) {
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
+ uint32_t tmp[1];
+ mips_reg addr[1];
+#if _MIPS_SIM == _ABIO32
+ register double fzero asm("$f0");
+ register double ftmp0 asm("$f2");
+ register double ftmp1 asm("$f4");
+ register double ftmp2 asm("$f6");
+ register double ftmp3 asm("$f8");
+ register double ftmp4 asm("$f10");
+ register double ftmp5 asm("$f12");
+ register double ftmp6 asm("$f14");
+ register double ftmp7 asm("$f16");
+ register double ftmp8 asm("$f18");
+ register double ftmp9 asm("$f20");
+ register double ftmp10 asm("$f22");
+ register double ftmp11 asm("$f24");
+ register double ftmp12 asm("$f26");
+ register double ftmp13 asm("$f28");
+#else
+ register double fzero asm("$f0");
+ register double ftmp0 asm("$f1");
+ register double ftmp1 asm("$f2");
+ register double ftmp2 asm("$f3");
+ register double ftmp3 asm("$f4");
+ register double ftmp4 asm("$f5");
+ register double ftmp5 asm("$f6");
+ register double ftmp6 asm("$f7");
+ register double ftmp7 asm("$f8");
+ register double ftmp8 asm("$f9");
+ register double ftmp9 asm("$f10");
+ register double ftmp10 asm("$f11");
+ register double ftmp11 asm("$f12");
+ register double ftmp12 asm("$f13");
+ register double ftmp13 asm("$f14");
+#endif // _MIPS_SIM == _ABIO32
+
+ __asm__ volatile (
+ "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t"
+ "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t"
+ "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t"
+ "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t"
+ "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t"
+ "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t"
+ "xor %[fzero], %[fzero], %[fzero] \n\t"
+ "li %[tmp0], 0x07 \n\t"
+ "mtc1 %[tmp0], %[ftmp13] \n\t"
+
+    /* To make full use of the memory load delay slots, the memory
+     * loads and the arithmetic have been interleaved.
+     */
+ "1: \n\t"
+ "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line])
+ "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
+ MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2])
+ "gsldlc1 %[ftmp8], 0x07(%[addr0]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[addr0]) \n\t"
+
+ MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4])
+ "gsldlc1 %[ftmp9], 0x07(%[addr0]) \n\t"
+ "gsldrc1 %[ftmp9], 0x00(%[addr0]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line])
+ MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2])
+ "gsldlc1 %[ftmp10], 0x07(%[addr0]) \n\t"
+ "gsldrc1 %[ftmp10], 0x00(%[addr0]) \n\t"
+ MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4])
+ "gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t"
+ "gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t"
+
+ "pmullh %[ftmp12], %[ftmp6], %[ftmp0] \n\t"
+
+ "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
+ "paddsh %[ftmp12], %[ftmp12], %[ftmp7] \n\t"
+
+ "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t"
+ "paddsh %[ftmp12], %[ftmp12], %[ftmp8] \n\t"
+
+ "pmullh %[ftmp9], %[ftmp9], %[ftmp4] \n\t"
+ "paddsh %[ftmp12], %[ftmp12], %[ftmp9] \n\t"
+
+ "pmullh %[ftmp10], %[ftmp10], %[ftmp3] \n\t"
+ "paddsh %[ftmp12], %[ftmp12], %[ftmp10] \n\t"
+
+ "pmullh %[ftmp11], %[ftmp11], %[ftmp5] \n\t"
+ "paddsh %[ftmp12], %[ftmp12], %[ftmp11] \n\t"
+
+ "paddsh %[ftmp12], %[ftmp12], %[ff_ph_40] \n\t"
+ "psrah %[ftmp12], %[ftmp12], %[ftmp13] \n\t"
+ "packushb %[ftmp12], %[ftmp12], %[fzero] \n\t"
+ "gsswlc1 %[ftmp12], 0x03(%[output_ptr]) \n\t"
+ "gsswrc1 %[ftmp12], 0x00(%[output_ptr]) \n\t"
+
+ MMI_ADDIU(%[output_height], %[output_height], -0x01)
+ MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch])
+ "bnez %[output_height], 1b \n\t"
+ : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0),
+ [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2),
+ [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4),
+ [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6),
+ [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8),
+ [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10),
+ [ftmp11]"=&f"(ftmp11), [ftmp12]"=&f"(ftmp12),
+ [ftmp13]"=&f"(ftmp13), [tmp0]"=&r"(tmp[0]),
+ [addr0]"=&r"(addr[0]), [src_ptr]"+&r"(src_ptr),
+ [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height)
+ : [pixels_per_line]"r"((mips_reg)pixels_per_line),
+ [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)),
+ [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)),
+ [vp8_filter]"r"(vp8_filter),
+ [output_pitch]"r"((mips_reg)output_pitch),
+ [ff_ph_40]"f"(ff_ph_40)
+ : "memory"
+ );
+}
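The matching sketch for the vertical pass above: it reads the 16-bit rows
produced by the horizontal filter, applies the six taps down each column, and
packs the rounded result back to bytes. Again a hedged reference, with
src_stride in uint16_t elements (the asm receives a byte stride):

static void v6_reference(const uint16_t *src, unsigned char *dst,
                         unsigned int height, int dst_pitch,
                         unsigned int src_stride, const int16_t *taps) {
  unsigned int r;
  int c, t, sum;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < 4; ++c) {
      sum = 0;
      for (t = 0; t < 6; ++t) sum += (int16_t)src[c + t * src_stride] * taps[t];
      sum = (sum + 64) >> 7;
      dst[c] = (unsigned char)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
    src += src_stride;
    dst += dst_pitch;
  }
}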
+
+/* When xoffset == 0 (or yoffset == 0), vp8_filter = {0, 0, 128, 0, 0, 0};
+   since (128 * src + 64) >> 7 == src, the functions
+   vp8_filter_block1d_h6_mmi and vp8_filter_block1dc_v6_mmi reduce to the
+   plain copy/pack variants below. */
+static INLINE void vp8_filter_block1d_h6_filter0_mmi(
+ unsigned char *src_ptr, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, unsigned int output_height,
+ unsigned int output_width) {
+#if _MIPS_SIM == _ABIO32
+ register double fzero asm("$f0");
+ register double ftmp0 asm("$f2");
+ register double ftmp1 asm("$f4");
+#else
+ register double fzero asm("$f0");
+ register double ftmp0 asm("$f1");
+ register double ftmp1 asm("$f2");
+#endif // _MIPS_SIM == _ABIO32
+
+ __asm__ volatile (
+ "xor %[fzero], %[fzero], %[fzero] \n\t"
+
+ "1: \n\t"
+ "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line])
+
+ "punpcklbh %[ftmp1], %[ftmp0], %[fzero] \n\t"
+ "gssdlc1 %[ftmp1], 0x07(%[output_ptr]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t"
+
+ "addiu %[output_height], %[output_height], -0x01 \n\t"
+ MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width])
+ "bnez %[output_height], 1b \n\t"
+ : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0),
+ [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr),
+ [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height)
+ : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line),
+ [output_width]"r"(output_width)
+ : "memory"
+ );
+}
+
+static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
+ uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height,
+ int output_pitch, unsigned int pixels_per_line) {
+#if _MIPS_SIM == _ABIO32
+ register double fzero asm("$f0");
+ register double ftmp0 asm("$f2");
+ register double ftmp1 asm("$f4");
+#else
+ register double fzero asm("$f0");
+ register double ftmp0 asm("$f1");
+ register double ftmp1 asm("$f2");
+#endif // _MIPS_SIM == _ABIO32
+
+ __asm__ volatile (
+ "xor %[fzero], %[fzero], %[fzero] \n\t"
+
+ "1: \n\t"
+ "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line])
+ MMI_ADDIU(%[output_height], %[output_height], -0x01)
+ "packushb %[ftmp1], %[ftmp0], %[fzero] \n\t"
+ "gsswlc1 %[ftmp1], 0x03(%[output_ptr]) \n\t"
+ "gsswrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t"
+
+ MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch])
+ "bnez %[output_height], 1b \n\t"
+ : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0),
+ [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr),
+ [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height)
+ : [pixels_per_line]"r"((mips_reg)pixels_per_line),
+ [output_pitch]"r"((mips_reg)output_pitch)
+ : "memory"
+ );
+}
+
+#define sixtapNxM(n, m) \
+ void vp8_sixtap_predict##n##x##m##_mmi( \
+ unsigned char *src_ptr, int src_pixels_per_line, int xoffset, \
+ int yoffset, unsigned char *dst_ptr, int dst_pitch) { \
+ DECLARE_ALIGNED(16, uint16_t, \
+ FData2[(n + 5) * (n == 16 ? 24 : (n == 8 ? 16 : n))]); \
+ const int16_t *HFilter, *VFilter; \
+ int i, loop = n / 4; \
+ HFilter = vp8_six_tap_mmi[xoffset]; \
+ VFilter = vp8_six_tap_mmi[yoffset]; \
+ \
+ if (xoffset == 0) { \
+ for (i = 0; i < loop; ++i) { \
+ vp8_filter_block1d_h6_filter0_mmi( \
+ src_ptr - (2 * src_pixels_per_line) + i * 4, FData2 + i * 4, \
+ src_pixels_per_line, m + 5, n * 2); \
+ } \
+ } else { \
+ for (i = 0; i < loop; ++i) { \
+ vp8_filter_block1d_h6_mmi(src_ptr - (2 * src_pixels_per_line) + i * 4, \
+ FData2 + i * 4, src_pixels_per_line, m + 5, \
+ n * 2, HFilter); \
+ } \
+ } \
+ if (yoffset == 0) { \
+ for (i = 0; i < loop; ++i) { \
+ vp8_filter_block1dc_v6_filter0_mmi( \
+ FData2 + n * 2 + i * 4, dst_ptr + i * 4, m, dst_pitch, n * 2); \
+ } \
+ } else { \
+ for (i = 0; i < loop; ++i) { \
+ vp8_filter_block1dc_v6_mmi(FData2 + i * 4, dst_ptr + i * 4, m, \
+ dst_pitch, n * 2, VFilter); \
+ } \
+ } \
+ }
+
+sixtapNxM(4, 4);
+sixtapNxM(8, 8);
+sixtapNxM(8, 4);
+sixtapNxM(16, 16);
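The instantiations above expand to the four entry points wired into the rtcd
table later in this patch. A usage sketch (buffer names are placeholders):
xoffset and yoffset are eighth-pel phases in [0, 7], and phase 0 takes the
filter0 copy paths.

/* Hypothetical call: 16x16 six-tap prediction at sub-pel phase (5, 2). */
vp8_sixtap_predict16x16_mmi(src, src_stride, 5, 2, dst, dst_stride);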
diff --git a/libvpx/vp8/common/onyxd.h b/libvpx/vp8/common/onyxd.h
index cc2cb8089..d3c1b0e97 100644
--- a/libvpx/vp8/common/onyxd.h
+++ b/libvpx/vp8/common/onyxd.h
@@ -22,6 +22,7 @@ extern "C" {
#include "vpx/vp8.h"
struct VP8D_COMP;
+struct VP8Common;
typedef struct {
int Width;
@@ -45,6 +46,7 @@ int vp8dx_receive_compressed_data(struct VP8D_COMP *comp, size_t size,
int vp8dx_get_raw_frame(struct VP8D_COMP *comp, YV12_BUFFER_CONFIG *sd,
int64_t *time_stamp, int64_t *time_end_stamp,
vp8_ppflags_t *flags);
+int vp8dx_references_buffer(struct VP8Common *oci, int ref_frame);
vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP *comp,
enum vpx_ref_frame_type ref_frame_flag,
diff --git a/libvpx/vp8/common/reconintra.c b/libvpx/vp8/common/reconintra.c
index 986074ec7..8e2094da8 100644
--- a/libvpx/vp8/common/reconintra.c
+++ b/libvpx/vp8/common/reconintra.c
@@ -71,8 +71,16 @@ void vp8_build_intra_predictors_mbuv_s(
unsigned char *uleft, unsigned char *vleft, int left_stride,
unsigned char *upred_ptr, unsigned char *vpred_ptr, int pred_stride) {
MB_PREDICTION_MODE uvmode = x->mode_info_context->mbmi.uv_mode;
+#if HAVE_VSX
+  /* The PowerPC implementation uses "vec_vsx_ld" to read 16 bytes from
+     uleft_col and vleft_col. Play it safe by reserving enough stack
+     space here. */
+ unsigned char uleft_col[16];
+ unsigned char vleft_col[16];
+#else
unsigned char uleft_col[8];
unsigned char vleft_col[8];
+#endif
int i;
intra_pred_fn fn;
diff --git a/libvpx/vp8/common/reconintra4x4.c b/libvpx/vp8/common/reconintra4x4.c
index 7852cf9da..64d33a287 100644
--- a/libvpx/vp8/common/reconintra4x4.c
+++ b/libvpx/vp8/common/reconintra4x4.c
@@ -40,7 +40,15 @@ void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft,
int left_stride, B_PREDICTION_MODE b_mode,
unsigned char *dst, int dst_stride,
unsigned char top_left) {
- unsigned char Aboveb[12], *Above = Aboveb + 4;
+/* The PowerPC implementation uses "vec_vsx_ld" to read 16 bytes from
+   Above (aka, Aboveb + 4). Play it safe by reserving enough stack
+   space here. Similarly for "Left". */
+#if HAVE_VSX
+ unsigned char Aboveb[20];
+#else
+ unsigned char Aboveb[12];
+#endif
+ unsigned char *Above = Aboveb + 4;
#if HAVE_NEON
// Neon intrinsics are unable to load 32 bits, or 4 8 bit values. Instead, it
// over reads but does not use the extra 4 values.
@@ -50,6 +58,8 @@ void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft,
// indeed read, they are not used.
vp8_zero_array(Left, 8);
#endif // VPX_WITH_ASAN
+#elif HAVE_VSX
+ unsigned char Left[16];
#else
unsigned char Left[4];
#endif // HAVE_NEON
diff --git a/libvpx/vp8/common/rtcd_defs.pl b/libvpx/vp8/common/rtcd_defs.pl
index bc5e05799..3df745f75 100644
--- a/libvpx/vp8/common/rtcd_defs.pl
+++ b/libvpx/vp8/common/rtcd_defs.pl
@@ -1,3 +1,13 @@
+##
+## Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
sub vp8_common_forward_decls() {
print <<EOF
/*
@@ -22,67 +32,71 @@ forward_decls qw/vp8_common_forward_decls/;
# Dequant
#
add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *dqc";
-specialize qw/vp8_dequantize_b mmx neon msa/;
+specialize qw/vp8_dequantize_b mmx neon msa mmi/;
add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *output, int stride";
-specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa/;
+specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa mmi/;
add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa/;
+specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa mmi/;
add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa/;
+specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa mmi/;
#
# Loopfilter
#
add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa/;
+specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa mmi/;
add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa/;
+specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa mmi/;
add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa/;
+specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa mmi/;
add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa/;
+specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa mmi/;
add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_mbv sse2 neon msa/;
+specialize qw/vp8_loop_filter_simple_mbv sse2 neon msa mmi/;
$vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c;
$vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2;
$vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon;
$vp8_loop_filter_simple_mbv_msa=vp8_loop_filter_simple_vertical_edge_msa;
+$vp8_loop_filter_simple_mbv_mmi=vp8_loop_filter_simple_vertical_edge_mmi;
add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_mbh sse2 neon msa/;
+specialize qw/vp8_loop_filter_simple_mbh sse2 neon msa mmi/;
$vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c;
$vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2;
$vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon;
$vp8_loop_filter_simple_mbh_msa=vp8_loop_filter_simple_horizontal_edge_msa;
+$vp8_loop_filter_simple_mbh_mmi=vp8_loop_filter_simple_horizontal_edge_mmi;
add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_bv sse2 neon msa/;
+specialize qw/vp8_loop_filter_simple_bv sse2 neon msa mmi/;
$vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c;
$vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2;
$vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon;
$vp8_loop_filter_simple_bv_msa=vp8_loop_filter_bvs_msa;
+$vp8_loop_filter_simple_bv_mmi=vp8_loop_filter_bvs_mmi;
add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_bh sse2 neon msa/;
+specialize qw/vp8_loop_filter_simple_bh sse2 neon msa mmi/;
$vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c;
$vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2;
$vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon;
$vp8_loop_filter_simple_bh_msa=vp8_loop_filter_bhs_msa;
+$vp8_loop_filter_simple_bh_mmi=vp8_loop_filter_bhs_mmi;
#
# IDCT
#
#idct16
add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride";
-specialize qw/vp8_short_idct4x4llm mmx neon dspr2 msa/;
+specialize qw/vp8_short_idct4x4llm mmx neon dspr2 msa mmi/;
#iwalsh1
add_proto qw/void vp8_short_inv_walsh4x4_1/, "short *input, short *output";
@@ -90,23 +104,23 @@ specialize qw/vp8_short_inv_walsh4x4_1 dspr2/;
#iwalsh16
add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *output";
-specialize qw/vp8_short_inv_walsh4x4 sse2 neon dspr2 msa/;
+specialize qw/vp8_short_inv_walsh4x4 sse2 neon dspr2 msa mmi/;
#idct1_scalar_add
add_proto qw/void vp8_dc_only_idct_add/, "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride";
-specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa/;
+specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa mmi/;
#
# RECON
#
add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_copy_mem16x16 sse2 neon dspr2 msa/;
+specialize qw/vp8_copy_mem16x16 sse2 neon dspr2 msa mmi/;
add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa/;
+specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa mmi/;
add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa/;
+specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa mmi/;
#
# Postproc
@@ -132,16 +146,16 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes") {
# Subpixel
#
add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa/;
+specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa mmi/;
add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa/;
+specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa mmi/;
add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa/;
+specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa mmi/;
add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa/;
+specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa mmi/;
add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
specialize qw/vp8_bilinear_predict16x16 sse2 ssse3 neon msa/;
@@ -172,22 +186,22 @@ if ($opts{arch} =~ /x86/) {
# Forward DCT
#
add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct4x4 sse2 neon msa/;
+specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi/;
add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct8x4 sse2 neon msa/;
+specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi/;
add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_walsh4x4 sse2 neon msa/;
+specialize qw/vp8_short_walsh4x4 sse2 neon msa mmi/;
#
# Quantizer
#
add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa/;
+specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa mmi/;
add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa/;
+specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa mmi/;
#
# Block subtraction
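The add_proto/specialize pairs above feed the rtcd generator; the per-target
headers under config/ in this tree are its output. An illustrative sketch of
the generated shape for one newly specialized function, assuming a static
MMI-enabled target where dispatch collapses to a #define (runtime-detect
builds use function pointers instead):

void vp8_dequantize_b_c(struct blockd *, short *dqc);
void vp8_dequantize_b_mmi(struct blockd *, short *dqc);
#define vp8_dequantize_b vp8_dequantize_b_mmi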
diff --git a/libvpx/vp8/common/threading.h b/libvpx/vp8/common/threading.h
index ece64f3fb..b082bf109 100644
--- a/libvpx/vp8/common/threading.h
+++ b/libvpx/vp8/common/threading.h
@@ -191,47 +191,18 @@ static inline int sem_destroy(sem_t *sem) {
#define x86_pause_hint()
#endif
-#if defined(__has_feature)
-#if __has_feature(thread_sanitizer)
-#define USE_MUTEX_LOCK 1
-#endif
-#endif
-
#include "vpx_util/vpx_thread.h"
+#include "vpx_util/vpx_atomics.h"
-static INLINE int protected_read(pthread_mutex_t *const mutex, const int *p) {
- (void)mutex;
-#if defined(USE_MUTEX_LOCK)
- int ret;
- pthread_mutex_lock(mutex);
- ret = *p;
- pthread_mutex_unlock(mutex);
- return ret;
-#endif
- return *p;
-}
-
-static INLINE void sync_read(pthread_mutex_t *const mutex, int mb_col,
- const int *last_row_current_mb_col,
- const int nsync) {
- while (mb_col > (protected_read(mutex, last_row_current_mb_col) - nsync)) {
+static INLINE void vp8_atomic_spin_wait(
+ int mb_col, const vpx_atomic_int *last_row_current_mb_col,
+ const int nsync) {
+ while (mb_col > (vpx_atomic_load_acquire(last_row_current_mb_col) - nsync)) {
x86_pause_hint();
thread_sleep(0);
}
}
-static INLINE void protected_write(pthread_mutex_t *mutex, int *p, int v) {
- (void)mutex;
-#if defined(USE_MUTEX_LOCK)
- pthread_mutex_lock(mutex);
- *p = v;
- pthread_mutex_unlock(mutex);
- return;
-#endif
- *p = v;
-}
-
-#undef USE_MUTEX_LOCK
#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
#ifdef __cplusplus
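The removed protected_read/protected_write pair is replaced by the
release/acquire primitives from vpx_util/vpx_atomics.h. A minimal sketch of
the resulting row-synchronization protocol, using only names visible in this
patch:

/* One shared counter per macroblock row. */
vpx_atomic_int progress = VPX_ATOMIC_INIT(-1);

/* Producer (row N), after finishing macroblock column mb_col: */
vpx_atomic_store_release(&progress, mb_col);

/* Consumer (row N + 1), before starting column mb_col: spins until the
 * row above is at least nsync columns ahead. */
vp8_atomic_spin_wait(mb_col, &progress, nsync);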
diff --git a/libvpx/vp8/common/vp8_loopfilter.c b/libvpx/vp8/common/vp8_loopfilter.c
index c6430be46..9fb125065 100644
--- a/libvpx/vp8/common/vp8_loopfilter.c
+++ b/libvpx/vp8/common/vp8_loopfilter.c
@@ -111,11 +111,9 @@ void vp8_loop_filter_frame_init(VP8_COMMON *cm, MACROBLOCKD *mbd,
/* Note the baseline filter values for each segment */
if (mbd->segmentation_enabled) {
- /* Abs value */
if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) {
lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
- } else /* Delta Value */
- {
+ } else { /* Delta Value */
lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
}
lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63 : lvl_seg) : 0;
@@ -344,8 +342,7 @@ void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int frame_type) {
mode_info_context++; /* Skip border mb */
}
- } else /* SIMPLE_LOOPFILTER */
- {
+ } else { /* SIMPLE_LOOPFILTER */
for (mb_row = 0; mb_row < mb_rows; ++mb_row) {
for (mb_col = 0; mb_col < mb_cols; ++mb_col) {
int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
diff --git a/libvpx/vp8/common/vp8_skin_detection.c b/libvpx/vp8/common/vp8_skin_detection.c
new file mode 100644
index 000000000..6739efa5f
--- /dev/null
+++ b/libvpx/vp8/common/vp8_skin_detection.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8/common/alloccommon.h"
+#include "vp8/common/vp8_skin_detection.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_util/vpx_write_yuv_frame.h"
+
+static int avg_2x2(const uint8_t *s, int p) {
+ int i, j;
+ int sum = 0;
+ for (i = 0; i < 2; ++i, s += p) {
+ for (j = 0; j < 2; ++j) {
+ sum += s[j];
+ }
+ }
+ return (sum + 2) >> 2;
+}
+
+int vp8_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+ int stride, int strideuv,
+ SKIN_DETECTION_BLOCK_SIZE bsize, int consec_zeromv,
+ int curr_motion_magn) {
+  // No skin if the block has had zero/small motion for a long consecutive run.
+ if (consec_zeromv > 60 && curr_motion_magn == 0) {
+ return 0;
+ } else {
+ int motion = 1;
+ if (consec_zeromv > 25 && curr_motion_magn == 0) motion = 0;
+ if (bsize == SKIN_16X16) {
+      // Take the average of the center 2x2 pixels.
+ const int ysource = avg_2x2(y + 7 * stride + 7, stride);
+ const int usource = avg_2x2(u + 3 * strideuv + 3, strideuv);
+ const int vsource = avg_2x2(v + 3 * strideuv + 3, strideuv);
+ return vpx_skin_pixel(ysource, usource, vsource, motion);
+ } else {
+ int num_skin = 0;
+ int i, j;
+ for (i = 0; i < 2; i++) {
+ for (j = 0; j < 2; j++) {
+          // Take the average of the center 2x2 pixels.
+ const int ysource = avg_2x2(y + 3 * stride + 3, stride);
+ const int usource = avg_2x2(u + strideuv + 1, strideuv);
+ const int vsource = avg_2x2(v + strideuv + 1, strideuv);
+ num_skin += vpx_skin_pixel(ysource, usource, vsource, motion);
+ if (num_skin >= 2) return 1;
+ y += 8;
+ u += 4;
+ v += 4;
+ }
+ y += (stride << 3) - 16;
+ u += (strideuv << 2) - 8;
+ v += (strideuv << 2) - 8;
+ }
+
+ return 0;
+ }
+ }
+}
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void vp8_compute_skin_map(VP8_COMP *const cpi, FILE *yuv_skinmap_file) {
+ int i, j, mb_row, mb_col, num_bl;
+ VP8_COMMON *const cm = &cpi->common;
+ uint8_t *y;
+ const uint8_t *src_y = cpi->Source->y_buffer;
+ const int src_ystride = cpi->Source->y_stride;
+ int offset = 0;
+
+ YV12_BUFFER_CONFIG skinmap;
+ memset(&skinmap, 0, sizeof(skinmap));
+ if (vp8_yv12_alloc_frame_buffer(&skinmap, cm->Width, cm->Height,
+ VP8BORDERINPIXELS) < 0) {
+ vpx_free_frame_buffer(&skinmap);
+ return;
+ }
+ memset(skinmap.buffer_alloc, 128, skinmap.frame_size);
+ y = skinmap.y_buffer;
+  // Loop through the blocks and set the skin map based on the center pixel
+  // of each block. Set y to white for skin blocks; otherwise copy the source
+  // luma, giving a grayscale view.
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 1) {
+ num_bl = 0;
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 1) {
+ const int is_skin = cpi->skin_map[offset++];
+ for (i = 0; i < 16; i++) {
+ for (j = 0; j < 16; j++) {
+ y[i * src_ystride + j] = is_skin ? 255 : src_y[i * src_ystride + j];
+ }
+ }
+ num_bl++;
+ y += 16;
+ src_y += 16;
+ }
+ y += (src_ystride << 4) - (num_bl << 4);
+ src_y += (src_ystride << 4) - (num_bl << 4);
+ }
+ vpx_write_yuv_frame(yuv_skinmap_file, &skinmap);
+ vpx_free_frame_buffer(&skinmap);
+}
+#endif // OUTPUT_YUV_SKINMAP
diff --git a/libvpx/vp8/common/vp8_skin_detection.h b/libvpx/vp8/common/vp8_skin_detection.h
new file mode 100644
index 000000000..4d27f5eb2
--- /dev/null
+++ b/libvpx/vp8/common/vp8_skin_detection.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_COMMON_SKIN_DETECTION_H_
+#define VP8_COMMON_SKIN_DETECTION_H_
+
+#include "vp8/encoder/onyx_int.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/skin_detection.h"
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8_COMP;
+
+typedef enum {
+  // Skin detection based on 8x8 blocks. If at least two of the four 8x8
+  // sub-blocks are identified as skin, the macroblock is marked as skin.
+ SKIN_8X8,
+ // Skin detection based on 16x16 block.
+ SKIN_16X16
+} SKIN_DETECTION_BLOCK_SIZE;
+
+int vp8_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+ int stride, int strideuv,
+ SKIN_DETECTION_BLOCK_SIZE bsize, int consec_zeromv,
+ int curr_motion_magn);
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void vp8_compute_skin_map(struct VP8_COMP *const cpi, FILE *yuv_skinmap_file);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_SKIN_DETECTION_H_
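A usage sketch for the classifier declared above (pointer and stride names
are placeholders): given a macroblock's luma/chroma pointers, it returns
nonzero when the block is classified as skin.

const int is_skin =
    vp8_compute_skin_block(y, u, v, y_stride, uv_stride, SKIN_16X16,
                           consec_zeromv, motion_magnitude);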
diff --git a/libvpx/vp8/common/x86/copy_sse2.asm b/libvpx/vp8/common/x86/copy_sse2.asm
index 86fae2695..480faa255 100644
--- a/libvpx/vp8/common/x86/copy_sse2.asm
+++ b/libvpx/vp8/common/x86/copy_sse2.asm
@@ -11,6 +11,7 @@
%include "vpx_ports/x86_abi_support.asm"
+SECTION .text
;void vp8_copy32xn_sse2(
; unsigned char *src_ptr,
diff --git a/libvpx/vp8/common/x86/copy_sse3.asm b/libvpx/vp8/common/x86/copy_sse3.asm
index d789a40cc..31ea898a3 100644
--- a/libvpx/vp8/common/x86/copy_sse3.asm
+++ b/libvpx/vp8/common/x86/copy_sse3.asm
@@ -83,6 +83,7 @@
ret
%endmacro
+SECTION .text
;void vp8_copy32xn_sse3(
; unsigned char *src_ptr,
diff --git a/libvpx/vp8/common/x86/dequantize_mmx.asm b/libvpx/vp8/common/x86/dequantize_mmx.asm
index 4e551f00a..bfdd99778 100644
--- a/libvpx/vp8/common/x86/dequantize_mmx.asm
+++ b/libvpx/vp8/common/x86/dequantize_mmx.asm
@@ -11,6 +11,7 @@
%include "vpx_ports/x86_abi_support.asm"
+SECTION .text
;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
global sym(vp8_dequantize_b_impl_mmx) PRIVATE
diff --git a/libvpx/vp8/common/x86/idctllm_mmx.asm b/libvpx/vp8/common/x86/idctllm_mmx.asm
index 96fa2c60d..5773d9d84 100644
--- a/libvpx/vp8/common/x86/idctllm_mmx.asm
+++ b/libvpx/vp8/common/x86/idctllm_mmx.asm
@@ -31,6 +31,7 @@
; *
; **************************************************************************/
+SECTION .text
;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
;int pitch, unsigned char *dest,int stride)
diff --git a/libvpx/vp8/common/x86/idctllm_sse2.asm b/libvpx/vp8/common/x86/idctllm_sse2.asm
index bf8e2c402..560faba00 100644
--- a/libvpx/vp8/common/x86/idctllm_sse2.asm
+++ b/libvpx/vp8/common/x86/idctllm_sse2.asm
@@ -19,6 +19,8 @@
; int dst_stride - 3
; )
+SECTION .text
+
global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
sym(vp8_idct_dequant_0_2x_sse2):
push rbp
diff --git a/libvpx/vp8/common/x86/iwalsh_sse2.asm b/libvpx/vp8/common/x86/iwalsh_sse2.asm
index 06e86a80b..82d7bf91a 100644
--- a/libvpx/vp8/common/x86/iwalsh_sse2.asm
+++ b/libvpx/vp8/common/x86/iwalsh_sse2.asm
@@ -11,6 +11,8 @@
%include "vpx_ports/x86_abi_support.asm"
+SECTION .text
+
;void vp8_short_inv_walsh4x4_sse2(short *input, short *output)
global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE
sym(vp8_short_inv_walsh4x4_sse2):
diff --git a/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm b/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm
index 6d5aaa19d..6a3d05290 100644
--- a/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm
+++ b/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm
@@ -125,6 +125,8 @@
pxor %1, [GLOBAL(t80)]
%endmacro
+SECTION .text
+
;void vp8_loop_filter_bh_y_sse2
;(
; unsigned char *src_ptr,
diff --git a/libvpx/vp8/common/x86/loopfilter_sse2.asm b/libvpx/vp8/common/x86/loopfilter_sse2.asm
index 1913abc69..2ae028fea 100644
--- a/libvpx/vp8/common/x86/loopfilter_sse2.asm
+++ b/libvpx/vp8/common/x86/loopfilter_sse2.asm
@@ -276,6 +276,8 @@
%endmacro
+SECTION .text
+
%if ABI_IS_32BIT
;void vp8_loop_filter_horizontal_edge_sse2
diff --git a/libvpx/vp8/common/x86/mfqe_sse2.asm b/libvpx/vp8/common/x86/mfqe_sse2.asm
index 8177b7922..3fde973ad 100644
--- a/libvpx/vp8/common/x86/mfqe_sse2.asm
+++ b/libvpx/vp8/common/x86/mfqe_sse2.asm
@@ -11,6 +11,8 @@
%include "vpx_ports/x86_abi_support.asm"
+SECTION .text
+
;void vp8_filter_by_weight16x16_sse2
;(
; unsigned char *src,
diff --git a/libvpx/vp8/common/x86/recon_mmx.asm b/libvpx/vp8/common/x86/recon_mmx.asm
index 43f2dc6c6..e6a48f6b0 100644
--- a/libvpx/vp8/common/x86/recon_mmx.asm
+++ b/libvpx/vp8/common/x86/recon_mmx.asm
@@ -11,6 +11,7 @@
%include "vpx_ports/x86_abi_support.asm"
+SECTION .text
;void copy_mem8x8_mmx(
; unsigned char *src,
diff --git a/libvpx/vp8/common/x86/recon_sse2.asm b/libvpx/vp8/common/x86/recon_sse2.asm
index cb89537f7..57f8899c7 100644
--- a/libvpx/vp8/common/x86/recon_sse2.asm
+++ b/libvpx/vp8/common/x86/recon_sse2.asm
@@ -11,6 +11,8 @@
%include "vpx_ports/x86_abi_support.asm"
+SECTION .text
+
;void copy_mem16x16_sse2(
; unsigned char *src,
; int src_stride,
diff --git a/libvpx/vp8/common/x86/subpixel_mmx.asm b/libvpx/vp8/common/x86/subpixel_mmx.asm
index 6ab7f1fdc..1f3a2baca 100644
--- a/libvpx/vp8/common/x86/subpixel_mmx.asm
+++ b/libvpx/vp8/common/x86/subpixel_mmx.asm
@@ -17,6 +17,7 @@ extern sym(vp8_bilinear_filters_x86_8)
%define vp8_filter_weight 128
%define VP8_FILTER_SHIFT 7
+SECTION .text
;void vp8_filter_block1d_h6_mmx
;(
diff --git a/libvpx/vp8/common/x86/subpixel_sse2.asm b/libvpx/vp8/common/x86/subpixel_sse2.asm
index ca00583ca..6e70f6d2e 100644
--- a/libvpx/vp8/common/x86/subpixel_sse2.asm
+++ b/libvpx/vp8/common/x86/subpixel_sse2.asm
@@ -16,6 +16,7 @@ extern sym(vp8_bilinear_filters_x86_8)
%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT 7
+SECTION .text
;/************************************************************************************
; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
diff --git a/libvpx/vp8/common/x86/subpixel_ssse3.asm b/libvpx/vp8/common/x86/subpixel_ssse3.asm
index 1f6cbd1d1..8d55c9320 100644
--- a/libvpx/vp8/common/x86/subpixel_ssse3.asm
+++ b/libvpx/vp8/common/x86/subpixel_ssse3.asm
@@ -15,6 +15,7 @@
%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT 7
+SECTION .text
;/************************************************************************************
; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
diff --git a/libvpx/vp8/decoder/decodeframe.c b/libvpx/vp8/decoder/decodeframe.c
index 0aec2a01b..077bd3da2 100644
--- a/libvpx/vp8/decoder/decodeframe.c
+++ b/libvpx/vp8/decoder/decodeframe.c
@@ -930,7 +930,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) {
/* When error concealment is enabled we should only check the sync
* code if we have enough bits available
*/
- if (!pbi->ec_active || data + 3 < data_end) {
+ if (data + 3 < data_end) {
if (clear[0] != 0x9d || clear[1] != 0x01 || clear[2] != 0x2a) {
vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
"Invalid frame sync code");
@@ -941,13 +941,19 @@ int vp8_decode_frame(VP8D_COMP *pbi) {
* if we have enough data. Otherwise we will end up with the wrong
* size.
*/
- if (!pbi->ec_active || data + 6 < data_end) {
+ if (data + 6 < data_end) {
pc->Width = (clear[3] | (clear[4] << 8)) & 0x3fff;
pc->horiz_scale = clear[4] >> 6;
pc->Height = (clear[5] | (clear[6] << 8)) & 0x3fff;
pc->vert_scale = clear[6] >> 6;
+ data += 7;
+ } else if (!pbi->ec_active) {
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated key frame header");
+ } else {
+        /* Error concealment is active; clear the frame. */
+ data = data_end;
}
- data += 7;
} else {
memcpy(&xd->pre, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
memcpy(&xd->dst, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
@@ -1199,7 +1205,8 @@ int vp8_decode_frame(VP8D_COMP *pbi) {
pbi->frame_corrupt_residual = 0;
#if CONFIG_MULTITHREAD
- if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION) {
+ if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) &&
+ pc->multi_token_partition != ONE_PARTITION) {
unsigned int thread;
vp8mt_decode_mb_rows(pbi, xd);
vp8_yv12_extend_frame_borders(yv12_fb_new);
diff --git a/libvpx/vp8/decoder/decodemv.c b/libvpx/vp8/decoder/decodemv.c
index b946ab73d..8e9600c6d 100644
--- a/libvpx/vp8/decoder/decodemv.c
+++ b/libvpx/vp8/decoder/decodemv.c
@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include "decodemv.h"
#include "treereader.h"
#include "vp8/common/entropymv.h"
#include "vp8/common/entropymode.h"
@@ -64,8 +65,7 @@ static int read_mvcomponent(vp8_reader *r, const MV_CONTEXT *mvc) {
const vp8_prob *const p = (const vp8_prob *)mvc;
int x = 0;
- if (vp8_read(r, p[mvpis_short])) /* Large */
- {
+ if (vp8_read(r, p[mvpis_short])) { /* Large */
int i = 0;
do {
@@ -284,8 +284,7 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi,
MB_MODE_INFO *mbmi) {
vp8_reader *const bc = &pbi->mbc[8];
mbmi->ref_frame = (MV_REFERENCE_FRAME)vp8_read(bc, pbi->prob_intra);
- if (mbmi->ref_frame) /* inter MB */
- {
+ if (mbmi->ref_frame) { /* inter MB */
enum { CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
int cnt[4];
int *cntx = cnt;
diff --git a/libvpx/vp8/decoder/onyxd_if.c b/libvpx/vp8/decoder/onyxd_if.c
index 789c2eeff..f516eb0c7 100644
--- a/libvpx/vp8/decoder/onyxd_if.c
+++ b/libvpx/vp8/decoder/onyxd_if.c
@@ -41,7 +41,6 @@
#endif
extern void vp8_init_loop_filter(VP8_COMMON *cm);
-extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
static int get_free_fb(VP8_COMMON *cm);
static void ref_cnt_fb(int *buf, int *idx, int new_idx);
diff --git a/libvpx/vp8/decoder/onyxd_int.h b/libvpx/vp8/decoder/onyxd_int.h
index 88b1ff16b..5ecacdbb9 100644
--- a/libvpx/vp8/decoder/onyxd_int.h
+++ b/libvpx/vp8/decoder/onyxd_int.h
@@ -68,7 +68,7 @@ typedef struct VP8D_COMP {
#if CONFIG_MULTITHREAD
/* variable for threading */
- int b_multithreaded_rd;
+ vpx_atomic_int b_multithreaded_rd;
int max_threads;
int current_mb_col_main;
unsigned int decoding_thread_count;
@@ -76,9 +76,8 @@ typedef struct VP8D_COMP {
int mt_baseline_filter_level[MAX_MB_SEGMENTS];
int sync_range;
- int *mt_current_mb_col; /* Each row remembers its already decoded column. */
- pthread_mutex_t *pmutex;
- pthread_mutex_t mt_mutex; /* mutex for b_multithreaded_rd */
+ /* Each row remembers its already decoded column. */
+ vpx_atomic_int *mt_current_mb_col;
unsigned char **mt_yabove_row; /* mb_rows x width */
unsigned char **mt_uabove_row;
@@ -119,6 +118,8 @@ typedef struct VP8D_COMP {
void *decrypt_state;
} VP8D_COMP;
+void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
+void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
int vp8_decode_frame(VP8D_COMP *cpi);
int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf);
diff --git a/libvpx/vp8/decoder/threading.c b/libvpx/vp8/decoder/threading.c
index 9f7751988..d0213f75c 100644
--- a/libvpx/vp8/decoder/threading.c
+++ b/libvpx/vp8/decoder/threading.c
@@ -20,6 +20,7 @@
#include "vp8/common/loopfilter.h"
#include "vp8/common/extend.h"
#include "vpx_ports/vpx_timer.h"
+#include "decoderthreading.h"
#include "detokenize.h"
#include "vp8/common/reconintra4x4.h"
#include "vp8/common/reconinter.h"
@@ -36,8 +37,6 @@
memset((p), 0, (n) * sizeof(*(p))); \
} while (0)
-void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
-
static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd,
MB_ROW_DEC *mbrd, int count) {
VP8_COMMON *const pc = &pbi->common;
@@ -80,7 +79,8 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd,
if (pc->full_pixel) mbd->fullpixel_mask = 0xfffffff8;
}
- for (i = 0; i < pc->mb_rows; ++i) pbi->mt_current_mb_col[i] = -1;
+ for (i = 0; i < pc->mb_rows; ++i)
+ vpx_atomic_store_release(&pbi->mt_current_mb_col[i], -1);
}
static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
@@ -248,12 +248,13 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
int start_mb_row) {
- const int *last_row_current_mb_col;
- int *current_mb_col;
+ const vpx_atomic_int *last_row_current_mb_col;
+ vpx_atomic_int *current_mb_col;
int mb_row;
VP8_COMMON *pc = &pbi->common;
const int nsync = pbi->sync_range;
- const int first_row_no_sync_above = pc->mb_cols + nsync;
+ const vpx_atomic_int first_row_no_sync_above =
+ VPX_ATOMIC_INIT(pc->mb_cols + nsync);
int num_part = 1 << pbi->common.multi_token_partition;
int last_mb_row = start_mb_row;
@@ -357,13 +358,11 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
for (mb_col = 0; mb_col < pc->mb_cols; ++mb_col) {
if (((mb_col - 1) % nsync) == 0) {
- pthread_mutex_t *mutex = &pbi->pmutex[mb_row];
- protected_write(mutex, current_mb_col, mb_col - 1);
+ vpx_atomic_store_release(current_mb_col, mb_col - 1);
}
if (mb_row && !(mb_col & (nsync - 1))) {
- pthread_mutex_t *mutex = &pbi->pmutex[mb_row - 1];
- sync_read(mutex, mb_col, last_row_current_mb_col, nsync);
+ vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync);
}
/* Distance of MB to the various image edges.
@@ -549,7 +548,7 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
}
/* last MB of row is ready just after extension is done */
- protected_write(&pbi->pmutex[mb_row], current_mb_col, mb_col + nsync);
+ vpx_atomic_store_release(current_mb_col, mb_col + nsync);
++xd->mode_info_context; /* skip prediction column */
xd->up_available = 1;
@@ -569,10 +568,10 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) {
ENTROPY_CONTEXT_PLANES mb_row_left_context;
while (1) {
- if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0) break;
+ if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) break;
if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) {
- if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0) {
+ if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) {
break;
} else {
MACROBLOCKD *xd = &mbrd->mbd;
@@ -590,9 +589,8 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
int core_count = 0;
unsigned int ithread;
- pbi->b_multithreaded_rd = 0;
+ vpx_atomic_init(&pbi->b_multithreaded_rd, 0);
pbi->allocated_decoding_thread_count = 0;
- pthread_mutex_init(&pbi->mt_mutex, NULL);
/* limit decoding threads to the max number of token partitions */
core_count = (pbi->max_threads > 8) ? 8 : pbi->max_threads;
@@ -603,7 +601,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
}
if (core_count > 1) {
- pbi->b_multithreaded_rd = 1;
+ vpx_atomic_init(&pbi->b_multithreaded_rd, 1);
pbi->decoding_thread_count = core_count - 1;
CALLOC_ARRAY(pbi->h_decoding_thread, pbi->decoding_thread_count);
@@ -649,16 +647,6 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows) {
int i;
- /* De-allocate mutex */
- if (pbi->pmutex != NULL) {
- for (i = 0; i < mb_rows; ++i) {
- pthread_mutex_destroy(&pbi->pmutex[i]);
- }
-
- vpx_free(pbi->pmutex);
- pbi->pmutex = NULL;
- }
-
vpx_free(pbi->mt_current_mb_col);
pbi->mt_current_mb_col = NULL;
@@ -724,7 +712,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) {
int i;
int uv_width;
- if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd)) {
+ if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) {
vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows);
/* our internal buffers are always multiples of 16 */
@@ -742,36 +730,33 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) {
uv_width = width >> 1;
- /* Allocate mutex */
- CHECK_MEM_ERROR(pbi->pmutex,
- vpx_malloc(sizeof(*pbi->pmutex) * pc->mb_rows));
- if (pbi->pmutex) {
- for (i = 0; i < pc->mb_rows; ++i) {
- pthread_mutex_init(&pbi->pmutex[i], NULL);
- }
- }
-
- /* Allocate an int for each mb row. */
- CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows);
+ /* Allocate a vpx_atomic_int for each mb row. */
+ CHECK_MEM_ERROR(pbi->mt_current_mb_col,
+ vpx_malloc(sizeof(*pbi->mt_current_mb_col) * pc->mb_rows));
+ for (i = 0; i < pc->mb_rows; ++i)
+ vpx_atomic_init(&pbi->mt_current_mb_col[i], 0);
/* Allocate memory for above_row buffers. */
CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows);
for (i = 0; i < pc->mb_rows; ++i)
- CHECK_MEM_ERROR(pbi->mt_yabove_row[i],
- vpx_memalign(16, sizeof(unsigned char) *
- (width + (VP8BORDERINPIXELS << 1))));
+ CHECK_MEM_ERROR(
+ pbi->mt_yabove_row[i],
+ vpx_memalign(
+ 16, sizeof(unsigned char) * (width + (VP8BORDERINPIXELS << 1))));
CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows);
for (i = 0; i < pc->mb_rows; ++i)
- CHECK_MEM_ERROR(pbi->mt_uabove_row[i],
- vpx_memalign(16, sizeof(unsigned char) *
- (uv_width + VP8BORDERINPIXELS)));
+ CHECK_MEM_ERROR(
+ pbi->mt_uabove_row[i],
+ vpx_memalign(16,
+ sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows);
for (i = 0; i < pc->mb_rows; ++i)
- CHECK_MEM_ERROR(pbi->mt_vabove_row[i],
- vpx_memalign(16, sizeof(unsigned char) *
- (uv_width + VP8BORDERINPIXELS)));
+ CHECK_MEM_ERROR(
+ pbi->mt_vabove_row[i],
+ vpx_memalign(16,
+ sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
/* Allocate memory for left_col buffers. */
CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows);
@@ -793,9 +778,9 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) {
void vp8_decoder_remove_threads(VP8D_COMP *pbi) {
/* shutdown MB Decoding thread; */
- if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd)) {
+ if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) {
int i;
- protected_write(&pbi->mt_mutex, &pbi->b_multithreaded_rd, 0);
+ vpx_atomic_store_release(&pbi->b_multithreaded_rd, 0);
/* allow all threads to exit */
for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) {
@@ -825,7 +810,6 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi) {
vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows);
}
- pthread_mutex_destroy(&pbi->mt_mutex);
}
void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) {
diff --git a/libvpx/vp8/encoder/bitstream.c b/libvpx/vp8/encoder/bitstream.c
index 7086faae9..8cacb6450 100644
--- a/libvpx/vp8/encoder/bitstream.c
+++ b/libvpx/vp8/encoder/bitstream.c
@@ -500,8 +500,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) {
}
write_uv_mode(w, mi->uv_mode, pc->fc.uv_mode_prob);
- } else /* inter coded */
- {
+ } else { /* inter coded */
int_mv best_mv;
vp8_prob mv_ref_p[VP8_MVREFS - 1];
@@ -1416,7 +1415,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
vp8_start_encode(&cpi->bc[1], cx_data, cx_data_end);
#if CONFIG_MULTITHREAD
- if (cpi->b_multi_threaded) {
+ if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
pack_mb_row_tokens(cpi, &cpi->bc[1]);
} else {
vp8_pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count);
diff --git a/libvpx/vp8/encoder/bitstream.h b/libvpx/vp8/encoder/bitstream.h
index 2b196dcd2..ed45bff9e 100644
--- a/libvpx/vp8/encoder/bitstream.h
+++ b/libvpx/vp8/encoder/bitstream.h
@@ -15,7 +15,15 @@
extern "C" {
#endif
+#include "vp8/encoder/treewriter.h"
+#include "vp8/encoder/tokenize.h"
+
void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount);
+void vp8_convert_rfct_to_prob(struct VP8_COMP *const cpi);
+void vp8_calc_ref_frame_costs(int *ref_frame_cost, int prob_intra,
+ int prob_last, int prob_garf);
+int vp8_estimate_entropy_savings(struct VP8_COMP *cpi);
+void vp8_update_coef_probs(struct VP8_COMP *cpi);
#ifdef __cplusplus
} // extern "C"
diff --git a/libvpx/vp8/encoder/encodeframe.c b/libvpx/vp8/encoder/encodeframe.c
index c7ad3bfe2..9bb0df72d 100644
--- a/libvpx/vp8/encoder/encodeframe.c
+++ b/libvpx/vp8/encoder/encodeframe.c
@@ -11,8 +11,12 @@
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
+#include "bitstream.h"
#include "encodemb.h"
#include "encodemv.h"
+#if CONFIG_MULTITHREAD
+#include "ethreading.h"
+#endif
#include "vp8/common/common.h"
#include "onyx_int.h"
#include "vp8/common/extend.h"
@@ -35,13 +39,6 @@
#include "encodeframe.h"
extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
-extern void vp8_calc_ref_frame_costs(int *ref_frame_cost, int prob_intra,
- int prob_last, int prob_garf);
-extern void vp8_convert_rfct_to_prob(VP8_COMP *const cpi);
-extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex);
-extern void vp8_auto_select_speed(VP8_COMP *cpi);
-extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x,
- MB_ROW_COMP *mbr_ei, int count);
static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x);
#ifdef MODE_STATS
@@ -344,11 +341,11 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row,
#if CONFIG_MULTITHREAD
const int nsync = cpi->mt_sync_range;
- const int rightmost_col = cm->mb_cols + nsync;
- const int *last_row_current_mb_col;
- int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
+ vpx_atomic_int rightmost_col = VPX_ATOMIC_INIT(cm->mb_cols + nsync);
+ const vpx_atomic_int *last_row_current_mb_col;
+ vpx_atomic_int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
- if ((cpi->b_multi_threaded != 0) && (mb_row != 0)) {
+ if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0 && mb_row != 0) {
last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];
} else {
last_row_current_mb_col = &rightmost_col;
@@ -418,15 +415,13 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row,
vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
#if CONFIG_MULTITHREAD
- if (cpi->b_multi_threaded != 0) {
+ if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) {
if (((mb_col - 1) % nsync) == 0) {
- pthread_mutex_t *mutex = &cpi->pmutex[mb_row];
- protected_write(mutex, current_mb_col, mb_col - 1);
+ vpx_atomic_store_release(current_mb_col, mb_col - 1);
}
if (mb_row && !(mb_col & (nsync - 1))) {
- pthread_mutex_t *mutex = &cpi->pmutex[mb_row - 1];
- sync_read(mutex, mb_col, last_row_current_mb_col, nsync);
+ vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync);
}
}
#endif
@@ -566,8 +561,9 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row,
xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
#if CONFIG_MULTITHREAD
- if (cpi->b_multi_threaded != 0) {
- protected_write(&cpi->pmutex[mb_row], current_mb_col, rightmost_col);
+ if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) {
+ vpx_atomic_store_release(current_mb_col,
+ vpx_atomic_load_acquire(&rightmost_col));
}
#endif
@@ -752,13 +748,14 @@ void vp8_encode_frame(VP8_COMP *cpi) {
vpx_usec_timer_start(&emr_timer);
#if CONFIG_MULTITHREAD
- if (cpi->b_multi_threaded) {
+ if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
int i;
vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei,
cpi->encoding_thread_count);
- for (i = 0; i < cm->mb_rows; ++i) cpi->mt_current_mb_col[i] = -1;
+ for (i = 0; i < cm->mb_rows; ++i)
+ vpx_atomic_store_release(&cpi->mt_current_mb_col[i], -1);
for (i = 0; i < cpi->encoding_thread_count; ++i) {
sem_post(&cpi->h_event_start_encoding[i]);
diff --git a/libvpx/vp8/encoder/encodeframe.h b/libvpx/vp8/encoder/encodeframe.h
index c1d863492..5274aba41 100644
--- a/libvpx/vp8/encoder/encodeframe.h
+++ b/libvpx/vp8/encoder/encodeframe.h
@@ -10,24 +10,29 @@
#ifndef VP8_ENCODER_ENCODEFRAME_H_
#define VP8_ENCODER_ENCODEFRAME_H_
+#include "vp8/encoder/tokenize.h"
+
#ifdef __cplusplus
extern "C" {
#endif
-extern void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x);
-extern void vp8_build_block_offsets(MACROBLOCK *x);
+struct VP8_COMP;
+struct macroblock;
+
+void vp8_activity_masking(struct VP8_COMP *cpi, MACROBLOCK *x);
+
+void vp8_build_block_offsets(struct macroblock *x);
-extern void vp8_setup_block_ptrs(MACROBLOCK *x);
+void vp8_setup_block_ptrs(struct macroblock *x);
-extern void vp8_encode_frame(VP8_COMP *cpi);
+void vp8_encode_frame(struct VP8_COMP *cpi);
-extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
- TOKENEXTRA **t, int recon_yoffset,
- int recon_uvoffset, int mb_row,
- int mb_col);
+int vp8cx_encode_inter_macroblock(struct VP8_COMP *cpi, struct macroblock *x,
+ TOKENEXTRA **t, int recon_yoffset,
+ int recon_uvoffset, int mb_row, int mb_col);
-extern int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
- TOKENEXTRA **t);
+int vp8cx_encode_intra_macroblock(struct VP8_COMP *cpi, struct macroblock *x,
+ TOKENEXTRA **t);
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/libvpx/vp8/encoder/encodemv.c b/libvpx/vp8/encoder/encodemv.c
index 36e9a9078..ea93ccd71 100644
--- a/libvpx/vp8/encoder/encodemv.c
+++ b/libvpx/vp8/encoder/encodemv.c
@@ -25,14 +25,12 @@ static void encode_mvcomponent(vp8_writer *const w, const int v,
const vp8_prob *p = mvc->prob;
const int x = v < 0 ? -v : v;
- if (x < mvnum_short) /* Small */
- {
+ if (x < mvnum_short) { /* Small */
vp8_write(w, 0, p[mvpis_short]);
vp8_treed_write(w, vp8_small_mvtree, p + MVPshort, x, 3);
if (!x) return; /* no sign bit */
- } else /* Large */
- {
+ } else { /* Large */
int i = 0;
vp8_write(w, 1, p[mvpis_short]);
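For context, encode_mvcomponent() splits each component into a short range (small magnitudes, tree-coded) and a large range (flag plus raw magnitude bits). A condensed sketch of that split; put_bit() is a hypothetical stand-in for vp8_write(), and the short-range bound of 8 is assumed from the VP8 MV coding scheme:

  enum { MVNUM_SHORT = 8 }; /* assumed: short range covers |v| in [0, 7] */

  static void put_bit(int b) { (void)b; /* stand-in for vp8_write() */ }

  static void encode_mv_component_sketch(int v) {
    const int x = v < 0 ? -v : v;
    if (x < MVNUM_SHORT) {
      put_bit(0);              /* "small" flag */
      /* tree-code x with the small-mv tree */
    } else {
      put_bit(1);              /* "large" flag */
      /* emit the magnitude bits of x */
    }
    if (x) put_bit(v < 0);     /* sign bit, skipped when x == 0 */
  }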
diff --git a/libvpx/vp8/encoder/ethreading.c b/libvpx/vp8/encoder/ethreading.c
index df34997ac..55a1528b1 100644
--- a/libvpx/vp8/encoder/ethreading.c
+++ b/libvpx/vp8/encoder/ethreading.c
@@ -14,6 +14,7 @@
#include "vp8/common/extend.h"
#include "bitstream.h"
#include "encodeframe.h"
+#include "ethreading.h"
#if CONFIG_MULTITHREAD
@@ -25,11 +26,11 @@ static THREAD_FUNCTION thread_loopfilter(void *p_data) {
VP8_COMMON *cm = &cpi->common;
while (1) {
- if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break;
+ if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
if (sem_wait(&cpi->h_event_start_lpf) == 0) {
/* we're shutting down */
- if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break;
+ if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
vp8_loopfilter_frame(cpi, cm);
@@ -47,7 +48,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
ENTROPY_CONTEXT_PLANES mb_row_left_context;
while (1) {
- if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break;
+ if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) {
const int nsync = cpi->mt_sync_range;
@@ -65,7 +66,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
int *totalrate = &mbri->totalrate;
/* we're shutting down */
- if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break;
+ if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
xd->mode_info_context = cm->mi + cm->mode_info_stride * (ithread + 1);
xd->mode_info_stride = cm->mode_info_stride;
@@ -79,8 +80,8 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
int map_index = (mb_row * cm->mb_cols);
- const int *last_row_current_mb_col;
- int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
+ const vpx_atomic_int *last_row_current_mb_col;
+ vpx_atomic_int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
vp8_writer *w = &cpi->bc[1 + (mb_row % num_part)];
@@ -107,13 +108,11 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
/* for each macroblock col in image */
for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
if (((mb_col - 1) % nsync) == 0) {
- pthread_mutex_t *mutex = &cpi->pmutex[mb_row];
- protected_write(mutex, current_mb_col, mb_col - 1);
+ vpx_atomic_store_release(current_mb_col, mb_col - 1);
}
if (mb_row && !(mb_col & (nsync - 1))) {
- pthread_mutex_t *mutex = &cpi->pmutex[mb_row - 1];
- sync_read(mutex, mb_col, last_row_current_mb_col, nsync);
+ vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync);
}
#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
@@ -285,7 +284,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
vp8_extend_mb_row(&cm->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16,
xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
- protected_write(&cpi->pmutex[mb_row], current_mb_col, mb_col + nsync);
+ vpx_atomic_store_release(current_mb_col, mb_col + nsync);
/* this is to account for the border */
xd->mode_info_context++;
@@ -489,12 +488,10 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x,
int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
const VP8_COMMON *cm = &cpi->common;
- cpi->b_multi_threaded = 0;
+ vpx_atomic_init(&cpi->b_multi_threaded, 0);
cpi->encoding_thread_count = 0;
cpi->b_lpf_running = 0;
- pthread_mutex_init(&cpi->mt_mutex, NULL);
-
if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1) {
int ithread;
int th_count = cpi->oxcf.multi_threaded - 1;
@@ -525,7 +522,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
CHECK_MEM_ERROR(cpi->en_thread_data,
vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count));
- cpi->b_multi_threaded = 1;
+ vpx_atomic_store_release(&cpi->b_multi_threaded, 1);
cpi->encoding_thread_count = th_count;
/*
@@ -554,7 +551,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
if (rc) {
/* shutdown other threads */
- protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0);
+ vpx_atomic_store_release(&cpi->b_multi_threaded, 0);
for (--ithread; ithread >= 0; ithread--) {
pthread_join(cpi->h_encoding_thread[ithread], 0);
sem_destroy(&cpi->h_event_start_encoding[ithread]);
@@ -568,8 +565,6 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
vpx_free(cpi->mb_row_ei);
vpx_free(cpi->en_thread_data);
- pthread_mutex_destroy(&cpi->mt_mutex);
-
return -1;
}
@@ -584,7 +579,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
if (rc) {
/* shutdown other threads */
- protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0);
+ vpx_atomic_store_release(&cpi->b_multi_threaded, 0);
for (--ithread; ithread >= 0; ithread--) {
sem_post(&cpi->h_event_start_encoding[ithread]);
sem_post(&cpi->h_event_end_encoding[ithread]);
@@ -602,8 +597,6 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
vpx_free(cpi->mb_row_ei);
vpx_free(cpi->en_thread_data);
- pthread_mutex_destroy(&cpi->mt_mutex);
-
return -2;
}
}
@@ -612,9 +605,9 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
}
void vp8cx_remove_encoder_threads(VP8_COMP *cpi) {
- if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded)) {
+ if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
/* shutdown other threads */
- protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0);
+ vpx_atomic_store_release(&cpi->b_multi_threaded, 0);
{
int i;
@@ -642,6 +635,5 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) {
vpx_free(cpi->mb_row_ei);
vpx_free(cpi->en_thread_data);
}
- pthread_mutex_destroy(&cpi->mt_mutex);
}
#endif
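With mt_mutex gone, shutdown reduces to one release store on b_multi_threaded followed by waking every worker; workers re-check the flag with an acquire load after each semaphore wait, as the hunks above show. A condensed sketch of the owner side, assuming POSIX semaphores as used here:

  #include <pthread.h>
  #include <semaphore.h>

  /* Clear the run flag, unblock the workers' sem_wait(), then join. */
  static void shutdown_workers(vpx_atomic_int *running, sem_t *start,
                               pthread_t *threads, int n) {
    int i;
    vpx_atomic_store_release(running, 0);
    for (i = 0; i < n; ++i) sem_post(&start[i]);
    for (i = 0; i < n; ++i) pthread_join(threads[i], NULL);
  }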
diff --git a/libvpx/vp8/encoder/ethreading.h b/libvpx/vp8/encoder/ethreading.h
new file mode 100644
index 000000000..95bf73d18
--- /dev/null
+++ b/libvpx/vp8/encoder/ethreading.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_ENCODER_ETHREADING_H_
+#define VP8_ENCODER_ETHREADING_H_
+
+#include "vp8/encoder/onyx_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8_COMP;
+struct macroblock;
+
+void vp8cx_init_mbrthread_data(struct VP8_COMP *cpi, struct macroblock *x,
+ MB_ROW_COMP *mbr_ei, int count);
+int vp8cx_create_encoder_threads(struct VP8_COMP *cpi);
+void vp8cx_remove_encoder_threads(struct VP8_COMP *cpi);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // VP8_ENCODER_ETHREADING_H_
diff --git a/libvpx/vp8/encoder/firstpass.c b/libvpx/vp8/encoder/firstpass.c
index caf19059e..70f924341 100644
--- a/libvpx/vp8/encoder/firstpass.c
+++ b/libvpx/vp8/encoder/firstpass.c
@@ -1273,8 +1273,9 @@ void vp8_init_second_pass(VP8_COMP *cpi) {
* sum duration is not. It's calculated based on the actual durations of
* all frames from the first pass.
*/
- vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count /
- cpi->twopass.total_stats.duration);
+ vp8_new_framerate(cpi,
+ 10000000.0 * cpi->twopass.total_stats.count /
+ cpi->twopass.total_stats.duration);
cpi->output_framerate = cpi->framerate;
cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration *
diff --git a/libvpx/vp8/encoder/mcomp.c b/libvpx/vp8/encoder/mcomp.c
index b4a49a3b1..970120f3b 100644
--- a/libvpx/vp8/encoder/mcomp.c
+++ b/libvpx/vp8/encoder/mcomp.c
@@ -34,22 +34,19 @@ int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight) {
* NEAREST for subsequent blocks. The "Weight" parameter allows, to a
* limited extent, for some account to be taken of these factors.
*/
- const int mv_idx_row =
- clamp((mv->as_mv.row - ref->as_mv.row) >> 1, 0, MVvals);
- const int mv_idx_col =
- clamp((mv->as_mv.col - ref->as_mv.col) >> 1, 0, MVvals);
- return ((mvcost[0][mv_idx_row] + mvcost[1][mv_idx_col]) * Weight) >> 7;
+ return ((mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1] +
+ mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1]) *
+ Weight) >>
+ 7;
}
static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvcost[2],
int error_per_bit) {
/* Ignore mv costing if mvcost is NULL */
if (mvcost) {
- const int mv_idx_row =
- clamp((mv->as_mv.row - ref->as_mv.row) >> 1, 0, MVvals);
- const int mv_idx_col =
- clamp((mv->as_mv.col - ref->as_mv.col) >> 1, 0, MVvals);
- return ((mvcost[0][mv_idx_row] + mvcost[1][mv_idx_col]) * error_per_bit +
+ return ((mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1] +
+ mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1]) *
+ error_per_bit +
128) >>
8;
}
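The rewritten helpers fold the row and column table lookups into a single expression (the clamping of the >> 1 indices is dropped in the same change). A small worked sketch of the Weight form, with hypothetical table entries:

  /* Each mvcost table is indexed by the half-pel component difference. */
  static int mv_component_cost(const int *cost_row, const int *cost_col,
                               int row_diff, int col_diff, int weight) {
    return ((cost_row[row_diff >> 1] + cost_col[col_diff >> 1]) * weight) >> 7;
  }
  /* With hypothetical entries 40 and 12 and weight 128:
   * ((40 + 12) * 128) >> 7 == 52. */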
diff --git a/libvpx/vp8/encoder/mips/mmi/dct_mmi.c b/libvpx/vp8/encoder/mips/mmi/dct_mmi.c
new file mode 100644
index 000000000..1f60a692d
--- /dev/null
+++ b/libvpx/vp8/encoder/mips/mmi/dct_mmi.c
@@ -0,0 +1,425 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+/* clang-format off */
+/* TRANSPOSE_4H: transpose 4x4 matrix.
+ Input: ftmp1,ftmp2,ftmp3,ftmp4
+ Output: ftmp1,ftmp2,ftmp3,ftmp4
+   Note: ftmp0 must always be 0; ftmp5~ftmp10 and tmp0 are used as temporaries.
+ */
+#define TRANSPOSE_4H \
+ MMI_LI(%[tmp0], 0x93) \
+ "mtc1 %[tmp0], %[ftmp10] \n\t" \
+ "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
+ "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \
+ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+ "or %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \
+ "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \
+ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+ "or %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \
+ "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
+ "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \
+ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+ "or %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \
+ "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \
+ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+ "or %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
+ "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \
+ "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \
+ "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \
+ "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
+/* clang-format on */
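TRANSPOSE_4H builds a 4x4 transpose of packed int16 lanes out of punpck interleaves plus a pshufh lane rotation (control 0x93) OR-ed back in. The scalar operation it implements is simply:

  #include <stdint.h>

  /* Scalar equivalent of TRANSPOSE_4H: swap m[r][c] with m[c][r]. */
  static void transpose_4x4_h(int16_t m[4][4]) {
    int r, c;
    for (r = 0; r < 4; ++r) {
      for (c = r + 1; c < 4; ++c) {
        const int16_t t = m[r][c];
        m[r][c] = m[c][r];
        m[c][r] = t;
      }
    }
  }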
+
+void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
+ uint64_t tmp[1];
+ int16_t *ip = input;
+
+#if _MIPS_SIM == _ABIO32
+ register double ftmp0 asm("$f0");
+ register double ftmp1 asm("$f2");
+ register double ftmp2 asm("$f4");
+ register double ftmp3 asm("$f6");
+ register double ftmp4 asm("$f8");
+ register double ftmp5 asm("$f10");
+ register double ftmp6 asm("$f12");
+ register double ftmp7 asm("$f14");
+ register double ftmp8 asm("$f16");
+ register double ftmp9 asm("$f18");
+ register double ftmp10 asm("$f20");
+ register double ftmp11 asm("$f22");
+ register double ftmp12 asm("$f24");
+#else
+ register double ftmp0 asm("$f0");
+ register double ftmp1 asm("$f1");
+ register double ftmp2 asm("$f2");
+ register double ftmp3 asm("$f3");
+ register double ftmp4 asm("$f4");
+ register double ftmp5 asm("$f5");
+ register double ftmp6 asm("$f6");
+ register double ftmp7 asm("$f7");
+ register double ftmp8 asm("$f8");
+ register double ftmp9 asm("$f9");
+ register double ftmp10 asm("$f10");
+ register double ftmp11 asm("$f11");
+ register double ftmp12 asm("$f12");
+#endif // _MIPS_SIM == _ABIO32
+
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_07) = { 0x0007000700070007ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_12000) = { 0x00002ee000002ee0ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_op1) = { 0x14e808a914e808a9ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_op3) = { 0xeb1808a9eb1808a9ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL };
+
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
+ MMI_ADDU(%[ip], %[ip], %[pitch])
+ "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t"
+ MMI_ADDU(%[ip], %[ip], %[pitch])
+ "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t"
+ MMI_ADDU(%[ip], %[ip], %[pitch])
+ "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t"
+ MMI_ADDU(%[ip], %[ip], %[pitch])
+ TRANSPOSE_4H
+
+ "ldc1 %[ftmp11], %[ff_ph_8] \n\t"
+ // f1 + f4
+ "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t"
+ // a1
+ "pmullh %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
+ // f2 + f3
+ "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
+ // b1
+ "pmullh %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
+ // f2 - f3
+ "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t"
+ // c1
+ "pmullh %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
+ // f1 - f4
+ "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t"
+ // d1
+ "pmullh %[ftmp8], %[ftmp8], %[ftmp11] \n\t"
+ // op[0] = a1 + b1
+ "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
+ // op[2] = a1 - b1
+ "psubh %[ftmp3], %[ftmp5], %[ftmp6] \n\t"
+
+ // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12
+ MMI_LI(%[tmp0], 0x0c)
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "ldc1 %[ftmp12], %[ff_pw_14500] \n\t"
+ "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
+ "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op1] \n\t"
+ "punpckhhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
+ "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op1] \n\t"
+ "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t"
+ "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
+ "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
+ "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
+ "packsswh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
+
+ // op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12
+ "ldc1 %[ftmp12], %[ff_pw_7500] \n\t"
+ "punpcklhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t"
+ "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op3] \n\t"
+ "punpckhhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t"
+ "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op3] \n\t"
+ "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t"
+ "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
+ "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
+ "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
+ "packsswh %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
+ TRANSPOSE_4H
+
+ "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t"
+ "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
+ "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t"
+ "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t"
+
+ "pcmpeqh %[ftmp0], %[ftmp8], %[ftmp0] \n\t"
+ "ldc1 %[ftmp9], %[ff_ph_01] \n\t"
+ "paddh %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
+
+ "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
+ "psubh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
+ "ldc1 %[ftmp9], %[ff_ph_07] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ MMI_LI(%[tmp0], 0x04)
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+
+ MMI_LI(%[tmp0], 0x10)
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "ldc1 %[ftmp12], %[ff_pw_12000] \n\t"
+ "punpcklhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t"
+ "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op1] \n\t"
+ "punpckhhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t"
+ "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op1] \n\t"
+ "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t"
+ "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t"
+ "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
+ "packsswh %[ftmp3], %[ftmp10], %[ftmp11] \n\t"
+ "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
+
+ "ldc1 %[ftmp12], %[ff_pw_51000] \n\t"
+ "punpcklhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t"
+ "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op3] \n\t"
+ "punpckhhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t"
+ "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op3] \n\t"
+ "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t"
+ "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t"
+ "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
+ "packsswh %[ftmp4], %[ftmp10], %[ftmp11] \n\t"
+
+ "gssdlc1 %[ftmp1], 0x07(%[output]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[output]) \n\t"
+ "gssdlc1 %[ftmp3], 0x0f(%[output]) \n\t"
+ "gssdrc1 %[ftmp3], 0x08(%[output]) \n\t"
+ "gssdlc1 %[ftmp2], 0x17(%[output]) \n\t"
+ "gssdrc1 %[ftmp2], 0x10(%[output]) \n\t"
+ "gssdlc1 %[ftmp4], 0x1f(%[output]) \n\t"
+ "gssdrc1 %[ftmp4], 0x18(%[output]) \n\t"
+
+ : [ftmp0] "=&f"(ftmp0), [ftmp1] "=&f"(ftmp1), [ftmp2] "=&f"(ftmp2),
+ [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5),
+ [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8),
+ [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11),
+ [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip)
+ : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07),
+ [ff_ph_op1] "f"(ff_ph_op1), [ff_ph_op3] "f"(ff_ph_op3),
+ [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500),
+ [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000),
+ [ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217),
+ [ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output)
+ : "memory"
+ );
+}
+
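The inline comments map the assembly onto the scalar 4x4 forward DCT: a1/b1/c1/d1 are the x8-scaled butterflies, op[0]/op[2] are their sum and difference, and op[1]/op[3] rotate c1/d1 by the 2217/5352 constant pair. A scalar sketch of the first (horizontal) pass, following those comments:

  #include <stdint.h>

  /* One row of fdct4x4 pass 1, per the asm comments above. */
  static void fdct4_row_pass1(const int16_t *in, int16_t *op) {
    const int a1 = (in[0] + in[3]) * 8;
    const int b1 = (in[1] + in[2]) * 8;
    const int c1 = (in[1] - in[2]) * 8;
    const int d1 = (in[0] - in[3]) * 8;
    op[0] = (int16_t)(a1 + b1);
    op[2] = (int16_t)(a1 - b1);
    op[1] = (int16_t)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
    op[3] = (int16_t)((d1 * 2217 - c1 * 5352 + 7500) >> 12);
  }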
+void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) {
+ vp8_short_fdct4x4_mmi(input, output, pitch);
+ vp8_short_fdct4x4_mmi(input + 4, output + 16, pitch);
+}
+
+void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
+ double ftmp[13];
+ uint32_t tmp[1];
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_01) = { 0x0000000100000001ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_03) = { 0x0000000300000003ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_mask) = { 0x0001000000010000ULL };
+
+ __asm__ volatile (
+ MMI_LI(%[tmp0], 0x02)
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+
+ "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
+ MMI_ADDU(%[ip], %[ip], %[pitch])
+ "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t"
+ MMI_ADDU(%[ip], %[ip], %[pitch])
+ "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t"
+ MMI_ADDU(%[ip], %[ip], %[pitch])
+ "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t"
+ TRANSPOSE_4H
+
+ "psllh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
+ "psllh %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
+ "psllh %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
+ "psllh %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
+ // a
+ "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
+ // d
+ "paddh %[ftmp6], %[ftmp2], %[ftmp4] \n\t"
+ // c
+ "psubh %[ftmp7], %[ftmp2], %[ftmp4] \n\t"
+ // b
+ "psubh %[ftmp8], %[ftmp1], %[ftmp3] \n\t"
+
+ // a + d
+ "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
+ // b + c
+ "paddh %[ftmp2], %[ftmp8], %[ftmp7] \n\t"
+ // b - c
+ "psubh %[ftmp3], %[ftmp8], %[ftmp7] \n\t"
+ // a - d
+ "psubh %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
+
+ "pcmpeqh %[ftmp6], %[ftmp5], %[ftmp0] \n\t"
+ "paddh %[ftmp6], %[ftmp6], %[ff_ph_01] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
+ TRANSPOSE_4H
+
+ // op[2], op[0]
+ "pmaddhw %[ftmp5], %[ftmp1], %[ff_pw_01] \n\t"
+ // op[3], op[1]
+ "pmaddhw %[ftmp1], %[ftmp1], %[ff_pw_mask] \n\t"
+
+ // op[6], op[4]
+ "pmaddhw %[ftmp6], %[ftmp2], %[ff_pw_01] \n\t"
+ // op[7], op[5]
+ "pmaddhw %[ftmp2], %[ftmp2], %[ff_pw_mask] \n\t"
+
+ // op[10], op[8]
+ "pmaddhw %[ftmp7], %[ftmp3], %[ff_pw_01] \n\t"
+ // op[11], op[9]
+ "pmaddhw %[ftmp3], %[ftmp3], %[ff_pw_mask] \n\t"
+
+ // op[14], op[12]
+ "pmaddhw %[ftmp8], %[ftmp4], %[ff_pw_01] \n\t"
+ // op[15], op[13]
+ "pmaddhw %[ftmp4], %[ftmp4], %[ff_pw_mask] \n\t"
+
+ // a1, a3
+ "paddw %[ftmp9], %[ftmp5], %[ftmp7] \n\t"
+ // d1, d3
+ "paddw %[ftmp10], %[ftmp6], %[ftmp8] \n\t"
+ // c1, c3
+ "psubw %[ftmp11], %[ftmp6], %[ftmp8] \n\t"
+ // b1, b3
+ "psubw %[ftmp12], %[ftmp5], %[ftmp7] \n\t"
+
+ // a1 + d1, a3 + d3
+ "paddw %[ftmp5], %[ftmp9], %[ftmp10] \n\t"
+ // b1 + c1, b3 + c3
+ "paddw %[ftmp6], %[ftmp12], %[ftmp11] \n\t"
+ // b1 - c1, b3 - c3
+ "psubw %[ftmp7], %[ftmp12], %[ftmp11] \n\t"
+ // a1 - d1, a3 - d3
+ "psubw %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
+
+ // a2, a4
+ "paddw %[ftmp9], %[ftmp1], %[ftmp3] \n\t"
+ // d2, d4
+ "paddw %[ftmp10], %[ftmp2], %[ftmp4] \n\t"
+ // c2, c4
+ "psubw %[ftmp11], %[ftmp2], %[ftmp4] \n\t"
+ // b2, b4
+ "psubw %[ftmp12], %[ftmp1], %[ftmp3] \n\t"
+
+ // a2 + d2, a4 + d4
+ "paddw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
+ // b2 + c2, b4 + c4
+ "paddw %[ftmp2], %[ftmp12], %[ftmp11] \n\t"
+ // b2 - c2, b4 - c4
+ "psubw %[ftmp3], %[ftmp12], %[ftmp11] \n\t"
+ // a2 - d2, a4 - d4
+ "psubw %[ftmp4], %[ftmp9], %[ftmp10] \n\t"
+
+ MMI_LI(%[tmp0], 0x03)
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp1] \n\t"
+ "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "paddw %[ftmp1], %[ftmp1], %[ff_pw_03] \n\t"
+ "psraw %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp2] \n\t"
+ "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ "paddw %[ftmp2], %[ftmp2], %[ff_pw_03] \n\t"
+ "psraw %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp3] \n\t"
+ "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
+ "paddw %[ftmp3], %[ftmp3], %[ff_pw_03] \n\t"
+ "psraw %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp4] \n\t"
+ "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
+ "paddw %[ftmp4], %[ftmp4], %[ff_pw_03] \n\t"
+ "psraw %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp5] \n\t"
+ "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
+ "paddw %[ftmp5], %[ftmp5], %[ff_pw_03] \n\t"
+ "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp6] \n\t"
+ "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp6], %[ftmp6], %[ftmp9] \n\t"
+ "paddw %[ftmp6], %[ftmp6], %[ff_pw_03] \n\t"
+ "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp7] \n\t"
+ "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
+ "paddw %[ftmp7], %[ftmp7], %[ff_pw_03] \n\t"
+ "psraw %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp8] \n\t"
+ "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t"
+ "paddw %[ftmp8], %[ftmp8], %[ff_pw_03] \n\t"
+ "psraw %[ftmp8], %[ftmp8], %[ftmp11] \n\t"
+
+ "packsswh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
+ "packsswh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
+ "packsswh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
+ "packsswh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
+
+ MMI_LI(%[tmp0], 0x72)
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "pshufh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
+ "pshufh %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
+ "pshufh %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
+ "pshufh %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
+
+ "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t"
+ "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t"
+ "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t"
+ "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t"
+ "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t"
+ "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t"
+ "gssdrc1 %[ftmp4], 0x18(%[op]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]),
+ [tmp0]"=&r"(tmp[0]),
+ [ip]"+&r"(input)
+ : [op]"r"(output),
+ [ff_pw_01]"f"(ff_pw_01), [pitch]"r"((mips_reg)pitch),
+ [ff_pw_03]"f"(ff_pw_03), [ff_pw_mask]"f"(ff_pw_mask),
+ [ff_ph_01]"f"(ff_ph_01)
+ : "memory"
+ );
+}
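The Walsh-Hadamard path is the same two-pass butterfly shape without rotation constants; the initial psllh is the <<2 prescale, and the pcmpeqh/paddh pair adds (a1 != 0) to the DC term. A sketch of pass 1 on one row, assuming it mirrors the scalar reference vp8_short_walsh4x4_c:

  #include <stdint.h>

  static void walsh4_row_pass1(const int16_t *ip, int32_t *op) {
    const int a1 = (ip[0] + ip[2]) << 2;
    const int d1 = (ip[1] + ip[3]) << 2;
    const int c1 = (ip[1] - ip[3]) << 2;
    const int b1 = (ip[0] - ip[2]) << 2;
    op[0] = a1 + d1 + (a1 != 0); /* the pcmpeqh/paddh correction */
    op[1] = b1 + c1;
    op[2] = b1 - c1;
    op[3] = a1 - d1;
  }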
diff --git a/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c b/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c
new file mode 100644
index 000000000..3ccb196ff
--- /dev/null
+++ b/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/asmdefs_mmi.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vp8/encoder/quantize.h"
+#include "vp8/common/quant_common.h"
+
+#define REGULAR_SELECT_EOB(i, rc) \
+ z = coeff_ptr[rc]; \
+ sz = (z >> 31); \
+ x = (z ^ sz) - sz; \
+ zbin = zbin_ptr[rc] + *(zbin_boost_ptr++) + zbin_oq_value; \
+ if (x >= zbin) { \
+ x += round_ptr[rc]; \
+ y = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16; \
+ if (y) { \
+ x = (y ^ sz) - sz; \
+ qcoeff_ptr[rc] = x; \
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc]; \
+ eob = i; \
+ zbin_boost_ptr = b->zrun_zbin_boost; \
+ } \
+ }
+
+void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
+ const int16_t *coeff_ptr = b->coeff;
+ const int16_t *round_ptr = b->round;
+ const int16_t *quant_ptr = b->quant_fast;
+ int16_t *qcoeff_ptr = d->qcoeff;
+ int16_t *dqcoeff_ptr = d->dqcoeff;
+ const int16_t *dequant_ptr = d->dequant;
+ const int16_t *inv_zig_zag = vp8_default_inv_zig_zag;
+
+ double ftmp[13];
+ uint64_t tmp[1];
+ DECLARE_ALIGNED(8, const uint64_t, ones) = { 0xffffffffffffffffULL };
+ int eob = 0;
+
+ __asm__ volatile(
+ // loop 0 ~ 7
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[coeff_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[coeff_ptr]) \n\t"
+ "li %[tmp0], 0x0f \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "gsldlc1 %[ftmp2], 0x0f(%[coeff_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x08(%[coeff_ptr]) \n\t"
+
+ "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t"
+ "xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t"
+ "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+ "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t"
+ "xor %[ftmp2], %[ftmp4], %[ftmp2] \n\t"
+ "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+
+ "gsldlc1 %[ftmp5], 0x07(%[round_ptr]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[round_ptr]) \n\t"
+ "gsldlc1 %[ftmp6], 0x0f(%[round_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x08(%[round_ptr]) \n\t"
+ "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
+ "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
+ "gsldlc1 %[ftmp7], 0x07(%[quant_ptr]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[quant_ptr]) \n\t"
+ "gsldlc1 %[ftmp8], 0x0f(%[quant_ptr]) \n\t"
+ "gsldrc1 %[ftmp8], 0x08(%[quant_ptr]) \n\t"
+ "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
+ "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
+
+ "xor %[ftmp7], %[ftmp5], %[ftmp3] \n\t"
+ "xor %[ftmp8], %[ftmp6], %[ftmp4] \n\t"
+ "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
+ "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
+ "gssdlc1 %[ftmp7], 0x07(%[qcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp7], 0x00(%[qcoeff_ptr]) \n\t"
+ "gssdlc1 %[ftmp8], 0x0f(%[qcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp8], 0x08(%[qcoeff_ptr]) \n\t"
+
+ "gsldlc1 %[ftmp1], 0x07(%[inv_zig_zag]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[inv_zig_zag]) \n\t"
+ "gsldlc1 %[ftmp2], 0x0f(%[inv_zig_zag]) \n\t"
+ "gsldrc1 %[ftmp2], 0x08(%[inv_zig_zag]) \n\t"
+ "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
+ "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+ "xor %[ftmp5], %[ftmp5], %[ones] \n\t"
+ "xor %[ftmp6], %[ftmp6], %[ones] \n\t"
+ "and %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
+ "and %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
+ "pmaxsh %[ftmp10], %[ftmp5], %[ftmp6] \n\t"
+
+ "gsldlc1 %[ftmp5], 0x07(%[dequant_ptr]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[dequant_ptr]) \n\t"
+ "gsldlc1 %[ftmp6], 0x0f(%[dequant_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x08(%[dequant_ptr]) \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
+ "gssdlc1 %[ftmp5], 0x07(%[dqcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp5], 0x00(%[dqcoeff_ptr]) \n\t"
+ "gssdlc1 %[ftmp6], 0x0f(%[dqcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp6], 0x08(%[dqcoeff_ptr]) \n\t"
+
+ // loop 8 ~ 15
+ "gsldlc1 %[ftmp1], 0x17(%[coeff_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x10(%[coeff_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x1f(%[coeff_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x18(%[coeff_ptr]) \n\t"
+
+ "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t"
+ "xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t"
+ "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+ "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t"
+ "xor %[ftmp2], %[ftmp4], %[ftmp2] \n\t"
+ "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+
+ "gsldlc1 %[ftmp5], 0x17(%[round_ptr]) \n\t"
+ "gsldrc1 %[ftmp5], 0x10(%[round_ptr]) \n\t"
+ "gsldlc1 %[ftmp6], 0x1f(%[round_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x18(%[round_ptr]) \n\t"
+ "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
+ "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
+ "gsldlc1 %[ftmp7], 0x17(%[quant_ptr]) \n\t"
+ "gsldrc1 %[ftmp7], 0x10(%[quant_ptr]) \n\t"
+ "gsldlc1 %[ftmp8], 0x1f(%[quant_ptr]) \n\t"
+ "gsldrc1 %[ftmp8], 0x18(%[quant_ptr]) \n\t"
+ "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
+ "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
+
+ "xor %[ftmp7], %[ftmp5], %[ftmp3] \n\t"
+ "xor %[ftmp8], %[ftmp6], %[ftmp4] \n\t"
+ "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
+ "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
+ "gssdlc1 %[ftmp7], 0x17(%[qcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp7], 0x10(%[qcoeff_ptr]) \n\t"
+ "gssdlc1 %[ftmp8], 0x1f(%[qcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp8], 0x18(%[qcoeff_ptr]) \n\t"
+
+ "gsldlc1 %[ftmp1], 0x17(%[inv_zig_zag]) \n\t"
+ "gsldrc1 %[ftmp1], 0x10(%[inv_zig_zag]) \n\t"
+ "gsldlc1 %[ftmp2], 0x1f(%[inv_zig_zag]) \n\t"
+ "gsldrc1 %[ftmp2], 0x18(%[inv_zig_zag]) \n\t"
+ "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
+ "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+ "xor %[ftmp5], %[ftmp5], %[ones] \n\t"
+ "xor %[ftmp6], %[ftmp6], %[ones] \n\t"
+ "and %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
+ "and %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
+ "pmaxsh %[ftmp11], %[ftmp5], %[ftmp6] \n\t"
+
+ "gsldlc1 %[ftmp5], 0x17(%[dequant_ptr]) \n\t"
+ "gsldrc1 %[ftmp5], 0x10(%[dequant_ptr]) \n\t"
+ "gsldlc1 %[ftmp6], 0x1f(%[dequant_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x18(%[dequant_ptr]) \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
+ "gssdlc1 %[ftmp5], 0x17(%[dqcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp5], 0x10(%[dqcoeff_ptr]) \n\t"
+ "gssdlc1 %[ftmp6], 0x1f(%[dqcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp6], 0x18(%[dqcoeff_ptr]) \n\t"
+
+ "li %[tmp0], 0x10 \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+
+ "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
+ "psrlw %[ftmp11], %[ftmp10], %[ftmp9] \n\t"
+ "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
+ "li %[tmp0], 0xaa \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "pshufh %[ftmp11], %[ftmp10], %[ftmp9] \n\t"
+ "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
+ "li %[tmp0], 0xffff \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "and %[ftmp10], %[ftmp10], %[ftmp9] \n\t"
+ "gssdlc1 %[ftmp10], 0x07(%[eob]) \n\t"
+ "gssdrc1 %[ftmp10], 0x00(%[eob]) \n\t"
+ : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+ [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+ [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
+ [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
+ [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+ : [coeff_ptr] "r"((mips_reg)coeff_ptr),
+ [qcoeff_ptr] "r"((mips_reg)qcoeff_ptr),
+ [dequant_ptr] "r"((mips_reg)dequant_ptr),
+ [round_ptr] "r"((mips_reg)round_ptr),
+ [quant_ptr] "r"((mips_reg)quant_ptr),
+ [dqcoeff_ptr] "r"((mips_reg)dqcoeff_ptr),
+ [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob),
+ [ones] "f"(ones)
+ : "memory");
+
+ *d->eob = eob;
+}
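The tail of the asm above is a horizontal max: every nonzero quantized coefficient contributes its inverse-zig-zag position, and the largest survivor becomes the eob. A scalar sketch of that reduction, assuming inv_zig_zag is 1-based (as vp8_default_inv_zig_zag is):

  #include <stdint.h>

  /* eob = highest 1-based scan position holding a nonzero coefficient. */
  static int select_eob(const int16_t *qcoeff, const int16_t *inv_zig_zag) {
    int i, eob = 0;
    for (i = 0; i < 16; ++i) {
      if (qcoeff[i] && inv_zig_zag[i] > eob) eob = inv_zig_zag[i];
    }
    return eob;
  }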
+
+void vp8_regular_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
+ int eob = 0;
+ int x, y, z, sz, zbin;
+ const int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
+ const int16_t *coeff_ptr = b->coeff;
+ const int16_t *zbin_ptr = b->zbin;
+ const int16_t *round_ptr = b->round;
+ const int16_t *quant_ptr = b->quant;
+ const int16_t *quant_shift_ptr = b->quant_shift;
+ int16_t *qcoeff_ptr = d->qcoeff;
+ int16_t *dqcoeff_ptr = d->dqcoeff;
+ const int16_t *dequant_ptr = d->dequant;
+ const int16_t zbin_oq_value = b->zbin_extra;
+ register double ftmp0 asm("$f0");
+
+  // The asm block below zeroes both coefficient buffers, equivalent to:
+  //   memset(qcoeff_ptr, 0, 32);
+  //   memset(dqcoeff_ptr, 0, 32);
+ /* clang-format off */
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "gssdlc1 %[ftmp0], 0x07(%[qcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[qcoeff_ptr]) \n\t"
+ "gssdlc1 %[ftmp0], 0x0f(%[qcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x08(%[qcoeff_ptr]) \n\t"
+ "gssdlc1 %[ftmp0], 0x17(%[qcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x10(%[qcoeff_ptr]) \n\t"
+ "gssdlc1 %[ftmp0], 0x1f(%[qcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x18(%[qcoeff_ptr]) \n\t"
+
+ "gssdlc1 %[ftmp0], 0x07(%[dqcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[dqcoeff_ptr]) \n\t"
+ "gssdlc1 %[ftmp0], 0x0f(%[dqcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x08(%[dqcoeff_ptr]) \n\t"
+ "gssdlc1 %[ftmp0], 0x17(%[dqcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x10(%[dqcoeff_ptr]) \n\t"
+ "gssdlc1 %[ftmp0], 0x1f(%[dqcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x18(%[dqcoeff_ptr]) \n\t"
+ : [ftmp0]"=&f"(ftmp0)
+ : [qcoeff_ptr]"r"(qcoeff_ptr), [dqcoeff_ptr]"r"(dqcoeff_ptr)
+ : "memory"
+ );
+ /* clang-format on */
+
+ REGULAR_SELECT_EOB(1, 0);
+ REGULAR_SELECT_EOB(2, 1);
+ REGULAR_SELECT_EOB(3, 4);
+ REGULAR_SELECT_EOB(4, 8);
+ REGULAR_SELECT_EOB(5, 5);
+ REGULAR_SELECT_EOB(6, 2);
+ REGULAR_SELECT_EOB(7, 3);
+ REGULAR_SELECT_EOB(8, 6);
+ REGULAR_SELECT_EOB(9, 9);
+ REGULAR_SELECT_EOB(10, 12);
+ REGULAR_SELECT_EOB(11, 13);
+ REGULAR_SELECT_EOB(12, 10);
+ REGULAR_SELECT_EOB(13, 7);
+ REGULAR_SELECT_EOB(14, 11);
+ REGULAR_SELECT_EOB(15, 14);
+ REGULAR_SELECT_EOB(16, 15);
+
+ *d->eob = (char)eob;
+}
diff --git a/libvpx/vp8/encoder/onyx_if.c b/libvpx/vp8/encoder/onyx_if.c
index b571d29d9..224318242 100644
--- a/libvpx/vp8/encoder/onyx_if.c
+++ b/libvpx/vp8/encoder/onyx_if.c
@@ -12,10 +12,12 @@
#include "./vpx_scale_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vp8_rtcd.h"
+#include "bitstream.h"
#include "vp8/common/onyxc_int.h"
#include "vp8/common/blockd.h"
#include "onyx_int.h"
#include "vp8/common/systemdependent.h"
+#include "vp8/common/vp8_skin_detection.h"
#include "vp8/encoder/quantize.h"
#include "vp8/common/alloccommon.h"
#include "mcomp.h"
@@ -35,6 +37,7 @@
#include "vp8/common/threading.h"
#include "vpx_ports/system_state.h"
#include "vpx_ports/vpx_timer.h"
+#include "vpx_util/vpx_write_yuv_frame.h"
#if ARCH_ARM
#include "vpx_ports/arm.h"
#endif
@@ -42,6 +45,13 @@
#include "mr_dissim.h"
#endif
#include "encodeframe.h"
+#if CONFIG_MULTITHREAD
+#include "ethreading.h"
+#endif
+#include "picklpf.h"
+#if !CONFIG_REALTIME_ONLY
+#include "temporal_filter.h"
+#endif
#include <assert.h>
#include <math.h>
@@ -50,28 +60,17 @@
#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
extern int vp8_update_coef_context(VP8_COMP *cpi);
-extern void vp8_update_coef_probs(VP8_COMP *cpi);
#endif
-extern void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
-extern void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val);
-extern void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
-
extern void vp8_deblock_frame(YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *post, int filt_lvl,
int low_var_thresh, int flag);
extern void print_parms(VP8_CONFIG *ocf, char *filenam);
extern unsigned int vp8_get_processor_freq();
extern void print_tree_update_probs();
-extern int vp8cx_create_encoder_threads(VP8_COMP *cpi);
-extern void vp8cx_remove_encoder_threads(VP8_COMP *cpi);
-
-int vp8_estimate_entropy_savings(VP8_COMP *cpi);
int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
-extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi, int distance);
-
static void set_default_lf_deltas(VP8_COMP *cpi);
extern const int vp8_gf_interval_table[101];
@@ -87,6 +86,9 @@ FILE *yuv_file;
#ifdef OUTPUT_YUV_DENOISED
FILE *yuv_denoised_file;
#endif
+#ifdef OUTPUT_YUV_SKINMAP
+static FILE *yuv_skinmap_file = NULL;
+#endif
#if 0
FILE *framepsnr;
@@ -219,7 +221,8 @@ static void save_layer_context(VP8_COMP *cpi) {
lc->inter_frame_target = cpi->inter_frame_target;
lc->total_byte_count = cpi->total_byte_count;
lc->filter_level = cpi->common.filter_level;
-
+ lc->frames_since_last_drop_overshoot = cpi->frames_since_last_drop_overshoot;
+ lc->force_maxqp = cpi->force_maxqp;
lc->last_frame_percent_intra = cpi->last_frame_percent_intra;
memcpy(lc->count_mb_ref_frame_usage, cpi->mb.count_mb_ref_frame_usage,
@@ -255,7 +258,8 @@ static void restore_layer_context(VP8_COMP *cpi, const int layer) {
cpi->inter_frame_target = lc->inter_frame_target;
cpi->total_byte_count = lc->total_byte_count;
cpi->common.filter_level = lc->filter_level;
-
+ cpi->frames_since_last_drop_overshoot = lc->frames_since_last_drop_overshoot;
+ cpi->force_maxqp = lc->force_maxqp;
cpi->last_frame_percent_intra = lc->last_frame_percent_intra;
memcpy(cpi->mb.count_mb_ref_frame_usage, lc->count_mb_ref_frame_usage,
@@ -447,18 +451,6 @@ static void dealloc_compressor_data(VP8_COMP *cpi) {
cpi->mb.pip = 0;
#if CONFIG_MULTITHREAD
- /* De-allocate mutex */
- if (cpi->pmutex != NULL) {
- VP8_COMMON *const pc = &cpi->common;
- int i;
-
- for (i = 0; i < pc->mb_rows; ++i) {
- pthread_mutex_destroy(&cpi->pmutex[i]);
- }
- vpx_free(cpi->pmutex);
- cpi->pmutex = NULL;
- }
-
vpx_free(cpi->mt_current_mb_col);
cpi->mt_current_mb_col = NULL;
#endif
@@ -616,6 +608,59 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment) {
set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA);
}
+static void compute_skin_map(VP8_COMP *cpi) {
+ int mb_row, mb_col, num_bl;
+ VP8_COMMON *cm = &cpi->common;
+ const uint8_t *src_y = cpi->Source->y_buffer;
+ const uint8_t *src_u = cpi->Source->u_buffer;
+ const uint8_t *src_v = cpi->Source->v_buffer;
+ const int src_ystride = cpi->Source->y_stride;
+ const int src_uvstride = cpi->Source->uv_stride;
+
+ const SKIN_DETECTION_BLOCK_SIZE bsize =
+ (cm->Width * cm->Height <= 352 * 288) ? SKIN_8X8 : SKIN_16X16;
+
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+ num_bl = 0;
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+ const int bl_index = mb_row * cm->mb_cols + mb_col;
+ cpi->skin_map[bl_index] =
+ vp8_compute_skin_block(src_y, src_u, src_v, src_ystride, src_uvstride,
+ bsize, cpi->consec_zero_last[bl_index], 0);
+ num_bl++;
+ src_y += 16;
+ src_u += 8;
+ src_v += 8;
+ }
+ src_y += (src_ystride << 4) - (num_bl << 4);
+ src_u += (src_uvstride << 3) - (num_bl << 3);
+ src_v += (src_uvstride << 3) - (num_bl << 3);
+ }
+
+  // Remove isolated skin blocks (those with no skin neighbors) and isolated
+  // non-skin blocks (those whose neighbors are all skin). Skip the frame
+  // boundary.
+ for (mb_row = 1; mb_row < cm->mb_rows - 1; mb_row++) {
+ for (mb_col = 1; mb_col < cm->mb_cols - 1; mb_col++) {
+ const int bl_index = mb_row * cm->mb_cols + mb_col;
+ int num_neighbor = 0;
+ int mi, mj;
+ int non_skin_threshold = 8;
+
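+      // The 3x3 scan below includes the center block itself, so a non-skin
+      // block whose eight true neighbors are all skin reaches
+      // num_neighbor == non_skin_threshold (8).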
+ for (mi = -1; mi <= 1; mi += 1) {
+ for (mj = -1; mj <= 1; mj += 1) {
+ int bl_neighbor_index = (mb_row + mi) * cm->mb_cols + mb_col + mj;
+ if (cpi->skin_map[bl_neighbor_index]) num_neighbor++;
+ }
+ }
+
+ if (cpi->skin_map[bl_index] && num_neighbor < 2)
+ cpi->skin_map[bl_index] = 0;
+ if (!cpi->skin_map[bl_index] && num_neighbor == non_skin_threshold)
+ cpi->skin_map[bl_index] = 1;
+ }
+ }
+}
+
static void set_default_lf_deltas(VP8_COMP *cpi) {
cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
@@ -1096,9 +1141,6 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) {
int width = cm->Width;
int height = cm->Height;
-#if CONFIG_MULTITHREAD
- int prev_mb_rows = cm->mb_rows;
-#endif
if (vp8_alloc_frame_buffers(cm, width, height)) {
vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
@@ -1190,26 +1232,11 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) {
if (cpi->oxcf.multi_threaded > 1) {
int i;
- /* De-allocate and re-allocate mutex */
- if (cpi->pmutex != NULL) {
- for (i = 0; i < prev_mb_rows; ++i) {
- pthread_mutex_destroy(&cpi->pmutex[i]);
- }
- vpx_free(cpi->pmutex);
- cpi->pmutex = NULL;
- }
-
- CHECK_MEM_ERROR(cpi->pmutex,
- vpx_malloc(sizeof(*cpi->pmutex) * cm->mb_rows));
- if (cpi->pmutex) {
- for (i = 0; i < cm->mb_rows; ++i) {
- pthread_mutex_init(&cpi->pmutex[i], NULL);
- }
- }
-
vpx_free(cpi->mt_current_mb_col);
CHECK_MEM_ERROR(cpi->mt_current_mb_col,
vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows));
+ for (i = 0; i < cm->mb_rows; ++i)
+ vpx_atomic_init(&cpi->mt_current_mb_col[i], 0);
}
#endif
@@ -1526,9 +1553,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
setup_features(cpi);
- {
+ if (!cpi->use_roi_static_threshold) {
int i;
-
for (i = 0; i < MAX_MB_SEGMENTS; ++i) {
cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
}
@@ -1788,6 +1814,8 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
cpi->active_map_enabled = 0;
+ cpi->use_roi_static_threshold = 0;
+
#if 0
/* Experimental code for lagged and one pass */
/* Initialise one_pass GF frames stats */
@@ -1857,6 +1885,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
cpi->cyclic_refresh_map = (signed char *)NULL;
}
+ CHECK_MEM_ERROR(cpi->skin_map, vpx_calloc(cm->mb_rows * cm->mb_cols,
+ sizeof(cpi->skin_map[0])));
+
CHECK_MEM_ERROR(cpi->consec_zero_last,
vpx_calloc(cm->mb_rows * cm->mb_cols, 1));
CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias,
@@ -1880,6 +1911,7 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
cpi->common.refresh_alt_ref_frame = 0;
cpi->force_maxqp = 0;
+ cpi->frames_since_last_drop_overshoot = 0;
cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
#if CONFIG_INTERNAL_STATS
@@ -1933,6 +1965,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
#ifdef OUTPUT_YUV_DENOISED
yuv_denoised_file = fopen("denoised.yuv", "ab");
#endif
+#ifdef OUTPUT_YUV_SKINMAP
+ yuv_skinmap_file = fopen("skinmap.yuv", "wb");
+#endif
#if 0
framepsnr = fopen("framepsnr.stt", "a");
@@ -2284,6 +2319,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) {
dealloc_compressor_data(cpi);
vpx_free(cpi->mb.ss);
vpx_free(cpi->tok);
+ vpx_free(cpi->skin_map);
vpx_free(cpi->cyclic_refresh_map);
vpx_free(cpi->consec_zero_last);
vpx_free(cpi->consec_zero_last_mvbias);
@@ -2298,6 +2334,9 @@ void vp8_remove_compressor(VP8_COMP **ptr) {
#ifdef OUTPUT_YUV_DENOISED
fclose(yuv_denoised_file);
#endif
+#ifdef OUTPUT_YUV_SKINMAP
+ fclose(yuv_skinmap_file);
+#endif
#if 0
@@ -2474,34 +2513,6 @@ int vp8_update_entropy(VP8_COMP *cpi, int update) {
return 0;
}
-#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED)
-void vp8_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) {
- unsigned char *src = s->y_buffer;
- int h = s->y_height;
-
- do {
- fwrite(src, s->y_width, 1, yuv_file);
- src += s->y_stride;
- } while (--h);
-
- src = s->u_buffer;
- h = s->uv_height;
-
- do {
- fwrite(src, s->uv_width, 1, yuv_file);
- src += s->uv_stride;
- } while (--h);
-
- src = s->v_buffer;
- h = s->uv_height;
-
- do {
- fwrite(src, s->uv_width, 1, yuv_file);
- src += s->uv_stride;
- } while (--h);
-}
-#endif
-
static void scale_and_extend_source(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) {
VP8_COMMON *cm = &cpi->common;
@@ -2914,8 +2925,7 @@ static void update_reference_frames(VP8_COMP *cpi) {
cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame;
cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame;
- } else /* For non key frames */
- {
+ } else {
if (cm->refresh_alt_ref_frame) {
assert(!cm->copy_buffer_to_arf);
@@ -2936,8 +2946,7 @@ static void update_reference_frames(VP8_COMP *cpi) {
cpi->current_ref_frames[ALTREF_FRAME] =
cpi->current_ref_frames[LAST_FRAME];
}
- } else /* if (cm->copy_buffer_to_arf == 2) */
- {
+ } else {
if (cm->alt_fb_idx != cm->gld_fb_idx) {
yv12_fb[cm->gld_fb_idx].flags |= VP8_ALTR_FRAME;
yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
@@ -2969,8 +2978,7 @@ static void update_reference_frames(VP8_COMP *cpi) {
cpi->current_ref_frames[GOLDEN_FRAME] =
cpi->current_ref_frames[LAST_FRAME];
}
- } else /* if (cm->copy_buffer_to_gf == 2) */
- {
+ } else {
if (cm->alt_fb_idx != cm->gld_fb_idx) {
yv12_fb[cm->alt_fb_idx].flags |= VP8_GOLD_FRAME;
yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
@@ -3001,8 +3009,7 @@ static void update_reference_frames(VP8_COMP *cpi) {
int i;
for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i)
vp8_yv12_copy_frame(cpi->Source, &cpi->denoiser.yv12_running_avg[i]);
- } else /* For non key frames */
- {
+ } else {
vp8_yv12_extend_frame_borders(
&cpi->denoiser.yv12_running_avg[INTRA_FRAME]);
@@ -3234,7 +3241,7 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) {
}
#if CONFIG_MULTITHREAD
- if (cpi->b_multi_threaded) {
+ if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */
}
#endif
@@ -3788,6 +3795,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
}
#endif
+ compute_skin_map(cpi);
+
/* Setup background Q adjustment for error resilient mode.
* For multi-layer encodes only enable this for the base layer.
*/
@@ -3861,7 +3870,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
#endif
#ifdef OUTPUT_YUV_SRC
- vp8_write_yuv_frame(yuv_file, cpi->Source);
+ vpx_write_yuv_frame(yuv_file, cpi->Source);
#endif
do {
@@ -3989,7 +3998,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
#else
/* transform / motion compensation build reconstruction frame */
vp8_encode_frame(cpi);
- if (cpi->oxcf.screen_content_mode == 2) {
+
+ if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
if (vp8_drop_encodedframe_overshoot(cpi, Q)) return;
}
@@ -4421,11 +4431,20 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
}
#endif
+#ifdef OUTPUT_YUV_SKINMAP
+ if (cpi->common.current_video_frame > 1) {
+ vp8_compute_skin_map(cpi, yuv_skinmap_file);
+ }
+#endif
+
#if CONFIG_MULTITHREAD
- if (cpi->b_multi_threaded) {
+ if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
/* start loopfilter in separate thread */
sem_post(&cpi->h_event_start_lpf);
cpi->b_lpf_running = 1;
+ /* wait for the filter_level to be picked so that we can continue with
+ * stream packing */
+ sem_wait(&cpi->h_event_end_lpf);
} else
#endif
{
@@ -4435,7 +4454,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
update_reference_frames(cpi);
#ifdef OUTPUT_YUV_DENOISED
- vp8_write_yuv_frame(yuv_denoised_file,
+ vpx_write_yuv_frame(yuv_denoised_file,
&cpi->denoiser.yv12_running_avg[INTRA_FRAME]);
#endif
@@ -4445,12 +4464,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
}
#endif
-#if CONFIG_MULTITHREAD
- /* wait that filter_level is picked so that we can continue with stream
- * packing */
- if (cpi->b_multi_threaded) sem_wait(&cpi->h_event_end_lpf);
-#endif
-
/* build the bitstream */
vp8_pack_bitstream(cpi, dest, dest_end, size);
@@ -4784,7 +4797,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
#endif
/* DEBUG */
- /* vp8_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */
+ /* vpx_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */
}
#if !CONFIG_REALTIME_ONLY
static void Pass2Encode(VP8_COMP *cpi, size_t *size, unsigned char *dest,
@@ -5292,7 +5305,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
#if CONFIG_MULTITHREAD
/* wait for the lpf thread done */
- if (cpi->b_multi_threaded && cpi->b_lpf_running) {
+ if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) && cpi->b_lpf_running) {
sem_wait(&cpi->h_event_end_lpf);
cpi->b_lpf_running = 0;
}
@@ -5338,9 +5351,6 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows,
const int range = 63;
int i;
- // This method is currently incompatible with the cyclic refresh method
- if (cpi->cyclic_refresh_mode_enabled) return -1;
-
// Check number of rows and columns match
if (cpi->common.mb_rows != (int)rows || cpi->common.mb_cols != (int)cols) {
return -1;
@@ -5359,7 +5369,11 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows,
return -1;
}
- if (!map) {
+ // Also disable segmentation if no deltas are specified.
+ if (!map || (delta_q[0] == 0 && delta_q[1] == 0 && delta_q[2] == 0 &&
+ delta_q[3] == 0 && delta_lf[0] == 0 && delta_lf[1] == 0 &&
+ delta_lf[2] == 0 && delta_lf[3] == 0 && threshold[0] == 0 &&
+ threshold[1] == 0 && threshold[2] == 0 && threshold[3] == 0)) {
disable_segmentation(cpi);
return 0;
}
@@ -5396,6 +5410,11 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows,
/* Initialise the feature data structure */
set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA);
+ if (threshold[0] != 0 || threshold[1] != 0 || threshold[2] != 0 ||
+ threshold[3] != 0)
+ cpi->use_roi_static_threshold = 1;
+ cpi->cyclic_refresh_mode_enabled = 0;
+
return 0;
}
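With the new early-out above, an ROI request whose deltas and thresholds are all zero now disables segmentation outright rather than programming empty segment data. A hedged usage sketch (array sizes and the commented call are illustrative):

  #include <string.h>

  /* Clear any previous ROI: an all-zero map with all-zero deltas and
   * thresholds makes vp8_set_roimap() call disable_segmentation(). */
  static void clear_roi_sketch(unsigned char *roi_map, unsigned int rows,
                               unsigned int cols) {
    int delta_q[4] = { 0, 0, 0, 0 };
    int delta_lf[4] = { 0, 0, 0, 0 };
    unsigned int threshold[4] = { 0, 0, 0, 0 };
    memset(roi_map, 0, rows * cols);
    /* vp8_set_roimap(cpi, roi_map, rows, cols, delta_q, delta_lf,
     *                threshold);  -- returns 0 after disabling */
    (void)delta_q; (void)delta_lf; (void)threshold;
  }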
diff --git a/libvpx/vp8/encoder/onyx_int.h b/libvpx/vp8/encoder/onyx_int.h
index fe775064a..c489b46c2 100644
--- a/libvpx/vp8/encoder/onyx_int.h
+++ b/libvpx/vp8/encoder/onyx_int.h
@@ -249,6 +249,10 @@ typedef struct {
int filter_level;
+ int frames_since_last_drop_overshoot;
+
+ int force_maxqp;
+
int last_frame_percent_intra;
int count_mb_ref_frame_usage[MAX_REF_FRAMES];
@@ -471,6 +475,8 @@ typedef struct VP8_COMP {
int zeromv_count;
int lf_zeromv_pct;
+ unsigned char *skin_map;
+
unsigned char *segmentation_map;
signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
int segment_encode_breakout[MAX_MB_SEGMENTS];
@@ -503,6 +509,7 @@ typedef struct VP8_COMP {
int mse_source_denoised;
int force_maxqp;
+ int frames_since_last_drop_overshoot;
// GF update for 1 pass cbr.
int gf_update_onepass_cbr;
@@ -511,11 +518,9 @@ typedef struct VP8_COMP {
#if CONFIG_MULTITHREAD
/* multithread data */
- pthread_mutex_t *pmutex;
- pthread_mutex_t mt_mutex; /* mutex for b_multi_threaded */
- int *mt_current_mb_col;
+ vpx_atomic_int *mt_current_mb_col;
int mt_sync_range;
- int b_multi_threaded;
+ vpx_atomic_int b_multi_threaded;
int encoding_thread_count;
int b_lpf_running;
@@ -687,6 +692,9 @@ typedef struct VP8_COMP {
int token_costs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
[MAX_ENTROPY_TOKENS];
} rd_costs;
+
+ // Use the static threshold from ROI settings.
+ int use_roi_static_threshold;
} VP8_COMP;
void vp8_initialize_enc(void);
diff --git a/libvpx/vp8/encoder/pickinter.c b/libvpx/vp8/encoder/pickinter.c
index eb713f11c..a9943eb6a 100644
--- a/libvpx/vp8/encoder/pickinter.c
+++ b/libvpx/vp8/encoder/pickinter.c
@@ -25,6 +25,7 @@
#include "vp8/common/reconintra4x4.h"
#include "vpx_dsp/variance.h"
#include "mcomp.h"
+#include "vp8/common/vp8_skin_detection.h"
#include "rdopt.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
@@ -36,82 +37,9 @@
extern unsigned int cnt_pm;
#endif
-#define MODEL_MODE 1
-
extern const int vp8_ref_frame_order[MAX_MODES];
extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
-// Fixed point implementation of a skin color classifier. Skin color
-// is model by a Gaussian distribution in the CbCr color space.
-// See ../../test/skin_color_detector_test.cc where the reference
-// skin color classifier is defined.
-
-// Fixed-point skin color model parameters.
-static const int skin_mean[5][2] = { { 7463, 9614 },
- { 6400, 10240 },
- { 7040, 10240 },
- { 8320, 9280 },
- { 6800, 9614 } };
-static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16
-static const int skin_threshold[6] = { 1570636, 1400000, 800000,
- 800000, 800000, 800000 }; // q18
-
-// Evaluates the Mahalanobis distance measure for the input CbCr values.
-static int evaluate_skin_color_difference(int cb, int cr, int idx) {
- const int cb_q6 = cb << 6;
- const int cr_q6 = cr << 6;
- const int cb_diff_q12 =
- (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]);
- const int cbcr_diff_q12 =
- (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]);
- const int cr_diff_q12 =
- (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]);
- const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
- const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
- const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
- const int skin_diff =
- skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 +
- skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2;
- return skin_diff;
-}
-
-// Checks if the input yCbCr values corresponds to skin color.
-static int is_skin_color(int y, int cb, int cr, int consec_zeromv) {
- if (y < 40 || y > 220) {
- return 0;
- } else {
- if (MODEL_MODE == 0) {
- return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]);
- } else {
- int i = 0;
- // No skin if block has been zero motion for long consecutive time.
- if (consec_zeromv > 60) return 0;
- // Exit on grey.
- if (cb == 128 && cr == 128) return 0;
- // Exit on very strong cb.
- if (cb > 150 && cr < 110) return 0;
- for (; i < 5; ++i) {
- int skin_color_diff = evaluate_skin_color_difference(cb, cr, i);
- if (skin_color_diff < skin_threshold[i + 1]) {
- if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) {
- return 0;
- } else if (consec_zeromv > 25 &&
- skin_color_diff > (skin_threshold[i + 1] >> 1)) {
- return 0;
- } else {
- return 1;
- }
- }
- // Exit if difference is much large than the threshold.
- if (skin_color_diff > (skin_threshold[i + 1] << 3)) {
- return 0;
- }
- }
- return 0;
- }
- }
-}
-
static int macroblock_corner_grad(unsigned char *signal, int stride,
int offsetx, int offsety, int sgnx,
int sgny) {
@@ -760,27 +688,10 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
#endif
// Check if current macroblock is in skin area.
- {
- const int y = (x->src.y_buffer[7 * x->src.y_stride + 7] +
- x->src.y_buffer[7 * x->src.y_stride + 8] +
- x->src.y_buffer[8 * x->src.y_stride + 7] +
- x->src.y_buffer[8 * x->src.y_stride + 8]) >>
- 2;
- const int cb = (x->src.u_buffer[3 * x->src.uv_stride + 3] +
- x->src.u_buffer[3 * x->src.uv_stride + 4] +
- x->src.u_buffer[4 * x->src.uv_stride + 3] +
- x->src.u_buffer[4 * x->src.uv_stride + 4]) >>
- 2;
- const int cr = (x->src.v_buffer[3 * x->src.uv_stride + 3] +
- x->src.v_buffer[3 * x->src.uv_stride + 4] +
- x->src.v_buffer[4 * x->src.uv_stride + 3] +
- x->src.v_buffer[4 * x->src.uv_stride + 4]) >>
- 2;
- x->is_skin = 0;
- if (!cpi->oxcf.screen_content_mode) {
- int block_index = mb_row * cpi->common.mb_cols + mb_col;
- x->is_skin = is_skin_color(y, cb, cr, cpi->consec_zero_last[block_index]);
- }
+ x->is_skin = 0;
+ if (!cpi->oxcf.screen_content_mode) {
+ int block_index = mb_row * cpi->common.mb_cols + mb_col;
+ x->is_skin = cpi->skin_map[block_index];
}
#if CONFIG_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity) {
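The classifier deleted here (skin_mean, skin_inv_cov, evaluate_skin_color_difference) moves into the shared vp8/common/vp8_skin_detection code included at the top of this file. The quantity it evaluates is a fixed-point Mahalanobis distance in CbCr,

  d_i(x) = (x - \mu_i)^T \Sigma^{-1} (x - \mu_i),  x = (Cb, Cr),

computed in Q-format against five Gaussian centers; a block is treated as skin when some d_i falls below skin_threshold[i + 1], subject to the luma-range and zero-motion guards visible in the removed lines.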
diff --git a/libvpx/vp8/encoder/picklpf.c b/libvpx/vp8/encoder/picklpf.c
index 6f287322e..b1b712db9 100644
--- a/libvpx/vp8/encoder/picklpf.c
+++ b/libvpx/vp8/encoder/picklpf.c
@@ -12,6 +12,7 @@
#include "./vpx_scale_rtcd.h"
#include "vp8/common/onyxc_int.h"
#include "onyx_int.h"
+#include "vp8/encoder/picklpf.h"
#include "vp8/encoder/quantize.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_scale/vpx_scale.h"
diff --git a/libvpx/vp8/encoder/picklpf.h b/libvpx/vp8/encoder/picklpf.h
new file mode 100644
index 000000000..e6ad0dbf2
--- /dev/null
+++ b/libvpx/vp8/encoder/picklpf.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_ENCODER_PICKLPF_H_
+#define VP8_ENCODER_PICKLPF_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8_COMP;
+struct yv12_buffer_config;
+
+void vp8cx_pick_filter_level_fast(struct yv12_buffer_config *sd,
+ struct VP8_COMP *cpi);
+void vp8cx_set_alt_lf_level(struct VP8_COMP *cpi, int filt_val);
+void vp8cx_pick_filter_level(struct yv12_buffer_config *sd, VP8_COMP *cpi);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // VP8_ENCODER_PICKLPF_H_
diff --git a/libvpx/vp8/encoder/ratectrl.c b/libvpx/vp8/encoder/ratectrl.c
index e89247ae4..e58c31098 100644
--- a/libvpx/vp8/encoder/ratectrl.c
+++ b/libvpx/vp8/encoder/ratectrl.c
@@ -498,11 +498,9 @@ static void calc_gf_params(VP8_COMP *cpi) {
* This is updated once the real frame size/boost is known.
*/
if (cpi->oxcf.fixed_q == -1) {
- if (cpi->pass == 2) /* 2 Pass */
- {
+ if (cpi->pass == 2) { /* 2 Pass */
cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
- } else /* 1 Pass */
- {
+ } else { /* 1 Pass */
cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
if (cpi->last_boost > 750) cpi->frames_till_gf_update_due++;
@@ -1442,12 +1440,33 @@ int vp8_pick_frame_size(VP8_COMP *cpi) {
// If this just encoded frame (mcomp/transform/quant, but before loopfilter and
// pack_bitstream) has large overshoot, and was not being encoded close to the
// max QP, then drop this frame and force next frame to be encoded at max QP.
-// Condition this on 1 pass CBR with screen content mode and frame dropper off.
+// Allow this for screen_content_mode = 2, or if frame dropping is allowed.
// TODO(marpan): Should do this exit condition during the encode_frame
// (i.e., halfway during the encoding of the frame) to save cycles.
int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) {
- if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER &&
- cpi->drop_frames_allowed == 0 && cpi->common.frame_type != KEY_FRAME) {
+ int force_drop_overshoot = 0;
+#if CONFIG_MULTI_RES_ENCODING
+ // Only check for dropping due to overshoot on the lowest stream.
+ // If the lowest stream of the multi-res encoding was dropped due to
+ // overshoot, then force dropping on all upper layer streams
+ // (mr_encoder_id > 0).
+ LOWER_RES_FRAME_INFO *low_res_frame_info =
+ (LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info;
+ if (cpi->oxcf.mr_total_resolutions > 1 && cpi->oxcf.mr_encoder_id > 0) {
+ force_drop_overshoot = low_res_frame_info->is_frame_dropped_overshoot_maxqp;
+ if (!force_drop_overshoot) {
+ cpi->force_maxqp = 0;
+ cpi->frames_since_last_drop_overshoot++;
+ return 0;
+ }
+ }
+#endif
+ if (cpi->common.frame_type != KEY_FRAME &&
+ (cpi->oxcf.screen_content_mode == 2 ||
+ (cpi->drop_frames_allowed &&
+ (force_drop_overshoot ||
+ (cpi->rate_correction_factor < (4.0f * MIN_BPB_FACTOR) &&
+ cpi->frames_since_last_drop_overshoot > (int)cpi->framerate))))) {
// Note: the "projected_frame_size" from encode_frame() only gives estimate
// of mode/motion vector rate (in non-rd mode): so below we only require
// that projected_frame_size is somewhat greater than per-frame-bandwidth,
@@ -1458,17 +1477,20 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) {
// Rate threshold, in bytes.
int thresh_rate = 2 * (cpi->av_per_frame_bandwidth >> 3);
// Threshold for the average (over all macroblocks) of the pixel-sum
- // residual error over 16x16 block. Should add QP dependence on threshold?
- int thresh_pred_err_mb = (256 << 4);
+ // residual error over 16x16 block.
+ int thresh_pred_err_mb = (200 << 4);
int pred_err_mb = (int)(cpi->mb.prediction_error / cpi->common.MBs);
- if (Q < thresh_qp && cpi->projected_frame_size > thresh_rate &&
- pred_err_mb > thresh_pred_err_mb) {
+ // Reduce/ignore thresh_rate if pred_err_mb is much larger than its
+ // threshold, to give more weight to the pred_err metric for overshoot
+ // detection.
+ if (cpi->drop_frames_allowed && pred_err_mb > (thresh_pred_err_mb << 4))
+ thresh_rate = thresh_rate >> 3;
+ if ((Q < thresh_qp && cpi->projected_frame_size > thresh_rate &&
+ pred_err_mb > thresh_pred_err_mb) ||
+ force_drop_overshoot) {
+ unsigned int i;
double new_correction_factor;
- const int target_size = cpi->av_per_frame_bandwidth;
int target_bits_per_mb;
- // Drop this frame: advance frame counters, and set force_maxqp flag.
- cpi->common.current_video_frame++;
- cpi->frames_since_key++;
+ const int target_size = cpi->av_per_frame_bandwidth;
// Flag to indicate we will force next frame to be encoded at max QP.
cpi->force_maxqp = 1;
// Reset the buffer levels.
@@ -1499,14 +1521,40 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) {
if (cpi->rate_correction_factor > MAX_BPB_FACTOR) {
cpi->rate_correction_factor = MAX_BPB_FACTOR;
}
+ // Drop this frame: update frame counters.
+ cpi->common.current_video_frame++;
+ cpi->frames_since_key++;
+ cpi->temporal_pattern_counter++;
+ cpi->frames_since_last_drop_overshoot = 0;
+ if (cpi->oxcf.number_of_layers > 1) {
+ // Set max_qp and rate correction for all temporal layers if overshoot
+ // is detected.
+ for (i = 0; i < cpi->oxcf.number_of_layers; ++i) {
+ LAYER_CONTEXT *lc = &cpi->layer_context[i];
+ lc->force_maxqp = 1;
+ lc->frames_since_last_drop_overshoot = 0;
+ lc->rate_correction_factor = cpi->rate_correction_factor;
+ }
+ }
+#if CONFIG_MULTI_RES_ENCODING
+ if (cpi->oxcf.mr_total_resolutions > 1)
+ low_res_frame_info->is_frame_dropped_overshoot_maxqp = 1;
+#endif
return 1;
- } else {
- cpi->force_maxqp = 0;
- return 0;
}
cpi->force_maxqp = 0;
+ cpi->frames_since_last_drop_overshoot++;
+#if CONFIG_MULTI_RES_ENCODING
+ if (cpi->oxcf.mr_total_resolutions > 1)
+ low_res_frame_info->is_frame_dropped_overshoot_maxqp = 0;
+#endif
return 0;
}
cpi->force_maxqp = 0;
+ cpi->frames_since_last_drop_overshoot++;
+#if CONFIG_MULTI_RES_ENCODING
+ if (cpi->oxcf.mr_total_resolutions > 1)
+ low_res_frame_info->is_frame_dropped_overshoot_maxqp = 0;
+#endif
return 0;
}
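
The rewritten overshoot test above keys on three things: a non-key frame, an encoder mode that permits the drop (screen_content_mode == 2, or frame dropping enabled with rate-correction and spacing guards), and the detection itself: Q below a threshold, projected frame size well above the per-frame bandwidth, and high average per-MB prediction error. A condensed sketch of just the detection step, with the VP8_COMP fields replaced by plain parameters; the names here are illustrative, not the libvpx API:

    /* Condensed sketch of the overshoot detection above; all inputs are
       plain parameters instead of VP8_COMP fields. */
    #include <stdio.h>

    static int detect_overshoot(int q, int thresh_qp, int projected_size,
                                int av_per_frame_bandwidth,
                                long prediction_error, int num_mbs,
                                int drop_frames_allowed) {
      int thresh_rate = 2 * (av_per_frame_bandwidth >> 3); /* bytes */
      const int thresh_pred_err_mb = 200 << 4;
      const int pred_err_mb = (int)(prediction_error / num_mbs);
      /* If the per-MB prediction error dwarfs its threshold, mostly ignore
         the rate test, as the patch does when frame dropping is allowed. */
      if (drop_frames_allowed && pred_err_mb > (thresh_pred_err_mb << 4))
        thresh_rate >>= 3;
      return q < thresh_qp && projected_size > thresh_rate &&
             pred_err_mb > thresh_pred_err_mb;
    }

    int main(void) {
      /* e.g. ~2083 bytes/frame (500 kbps at 30 fps) gives thresh_rate ~520. */
      printf("%d\n",
             detect_overshoot(20, 25, 4000, 2083, 16000L * 1200, 1200, 1));
      return 0;
    }

When the test fires, the patch also propagates force_maxqp and the rate-correction factor to every temporal layer, and (under CONFIG_MULTI_RES_ENCODING) flags the drop so upper-resolution streams follow suit.
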
diff --git a/libvpx/vp8/encoder/rdopt.c b/libvpx/vp8/encoder/rdopt.c
index 3792b10f8..e210b4410 100644
--- a/libvpx/vp8/encoder/rdopt.c
+++ b/libvpx/vp8/encoder/rdopt.c
@@ -16,12 +16,14 @@
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
+#include "encodeframe.h"
#include "tokenize.h"
#include "treewriter.h"
#include "onyx_int.h"
#include "modecosts.h"
#include "encodeintra.h"
#include "pickinter.h"
+#include "vp8/common/common.h"
#include "vp8/common/entropymode.h"
#include "vp8/common/reconinter.h"
#include "vp8/common/reconintra.h"
@@ -852,8 +854,7 @@ static int labels2mode(MACROBLOCK *x, int const *labelings, int which_label,
default: break;
}
- if (m == ABOVE4X4) /* replace above with left if same */
- {
+ if (m == ABOVE4X4) { /* replace above with left if same */
int_mv left_mv;
left_mv.as_int = col ? d[-1].bmi.mv.as_int : left_block_mv(mic, i);
@@ -959,19 +960,13 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
vp8_variance_fn_ptr_t *v_fn_ptr;
ENTROPY_CONTEXT_PLANES t_above, t_left;
- ENTROPY_CONTEXT *ta;
- ENTROPY_CONTEXT *tl;
ENTROPY_CONTEXT_PLANES t_above_b, t_left_b;
- ENTROPY_CONTEXT *ta_b;
- ENTROPY_CONTEXT *tl_b;
memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
- ta = (ENTROPY_CONTEXT *)&t_above;
- tl = (ENTROPY_CONTEXT *)&t_left;
- ta_b = (ENTROPY_CONTEXT *)&t_above_b;
- tl_b = (ENTROPY_CONTEXT *)&t_left_b;
+ vp8_zero(t_above_b);
+ vp8_zero(t_left_b);
br = 0;
bd = 0;
@@ -1151,13 +1146,13 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
mode_selected = this_mode;
best_label_rd = this_rd;
- memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
- memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_above_b, &t_above_s, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_left_b, &t_left_s, sizeof(ENTROPY_CONTEXT_PLANES));
}
} /*for each 4x4 mode*/
- memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
- memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_above, &t_above_b, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_left, &t_left_b, sizeof(ENTROPY_CONTEXT_PLANES));
labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
bsi->ref_mv, x->mvcost);
diff --git a/libvpx/vp8/encoder/rdopt.h b/libvpx/vp8/encoder/rdopt.h
index 8186ff105..960bd8f1c 100644
--- a/libvpx/vp8/encoder/rdopt.h
+++ b/libvpx/vp8/encoder/rdopt.h
@@ -19,6 +19,9 @@ extern "C" {
#define RDCOST(RM, DM, R, D) (((128 + (R) * (RM)) >> 8) + (DM) * (D))
+void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex);
+void vp8_auto_select_speed(VP8_COMP *cpi);
+
static INLINE void insertsortmv(int arr[], int len) {
int i, j, k;
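
The RDCOST macro above is the usual Lagrangian cost: the rate R is scaled by the rate multiplier RM, which acts as a Q8 factor (128 is the rounding offset for the >> 8), then added to the distortion D scaled by DM. A quick numeric check with arbitrary values:

    #include <stdio.h>

    #define RDCOST(RM, DM, R, D) (((128 + (R) * (RM)) >> 8) + (DM) * (D))

    int main(void) {
      /* RM = 256 weights the rate by exactly 1.0 in Q8 terms. */
      printf("%d\n", RDCOST(256, 1, 100, 50)); /* (128 + 25600)>>8 + 50 = 150 */
      return 0;
    }
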
diff --git a/libvpx/vp8/encoder/temporal_filter.c b/libvpx/vp8/encoder/temporal_filter.c
index 1b2f46bb6..0a7d25fb0 100644
--- a/libvpx/vp8/encoder/temporal_filter.c
+++ b/libvpx/vp8/encoder/temporal_filter.c
@@ -20,6 +20,7 @@
#include "ratectrl.h"
#include "vp8/common/quant_common.h"
#include "segmentation.h"
+#include "temporal_filter.h"
#include "vpx_mem/vpx_mem.h"
#include "vp8/common/swapyv12buffer.h"
#include "vp8/common/threading.h"
diff --git a/libvpx/vp8/encoder/temporal_filter.h b/libvpx/vp8/encoder/temporal_filter.h
new file mode 100644
index 000000000..865d909fb
--- /dev/null
+++ b/libvpx/vp8/encoder/temporal_filter.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_ENCODER_TEMPORAL_FILTER_H_
+#define VP8_ENCODER_TEMPORAL_FILTER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8_COMP;
+
+void vp8_temporal_filter_prepare_c(struct VP8_COMP *cpi, int distance);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // VP8_ENCODER_TEMPORAL_FILTER_H_
diff --git a/libvpx/vp8/encoder/x86/dct_sse2.asm b/libvpx/vp8/encoder/x86/dct_sse2.asm
index d06bca592..4d92f0341 100644
--- a/libvpx/vp8/encoder/x86/dct_sse2.asm
+++ b/libvpx/vp8/encoder/x86/dct_sse2.asm
@@ -60,6 +60,8 @@
ret
%endmacro
+SECTION .text
+
;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
global sym(vp8_short_fdct4x4_sse2) PRIVATE
sym(vp8_short_fdct4x4_sse2):
diff --git a/libvpx/vp8/encoder/x86/encodeopt.asm b/libvpx/vp8/encoder/x86/encodeopt.asm
index 0297220ee..f6c6aeae7 100644
--- a/libvpx/vp8/encoder/x86/encodeopt.asm
+++ b/libvpx/vp8/encoder/x86/encodeopt.asm
@@ -11,6 +11,8 @@
%include "vpx_ports/x86_abi_support.asm"
+SECTION .text
+
;int vp8_block_error_sse2(short *coeff_ptr, short *dcoef_ptr)
global sym(vp8_block_error_sse2) PRIVATE
sym(vp8_block_error_sse2):
diff --git a/libvpx/vp8/encoder/x86/fwalsh_sse2.asm b/libvpx/vp8/encoder/x86/fwalsh_sse2.asm
index f4989279f..b5d5de4a5 100644
--- a/libvpx/vp8/encoder/x86/fwalsh_sse2.asm
+++ b/libvpx/vp8/encoder/x86/fwalsh_sse2.asm
@@ -11,6 +11,8 @@
%include "vpx_ports/x86_abi_support.asm"
+SECTION .text
+
;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
global sym(vp8_short_walsh4x4_sse2) PRIVATE
sym(vp8_short_walsh4x4_sse2):
diff --git a/libvpx/vp8/encoder/x86/quantize_mmx.asm b/libvpx/vp8/encoder/x86/quantize_mmx.asm
deleted file mode 100644
index 2864ce16d..000000000
--- a/libvpx/vp8/encoder/x86/quantize_mmx.asm
+++ /dev/null
@@ -1,286 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
-; short *qcoeff_ptr,short *dequant_ptr,
-; short *scan_mask, short *round_ptr,
-; short *quant_ptr, short *dqcoeff_ptr);
-global sym(vp8_fast_quantize_b_impl_mmx) PRIVATE
-sym(vp8_fast_quantize_b_impl_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- push rsi
- push rdi
- ; end prolog
-
-
- mov rsi, arg(0) ;coeff_ptr
- movq mm0, [rsi]
-
- mov rax, arg(1) ;zbin_ptr
- movq mm1, [rax]
-
- movq mm3, mm0
- psraw mm0, 15
-
- pxor mm3, mm0
- psubw mm3, mm0 ; abs
-
- movq mm2, mm3
- pcmpgtw mm1, mm2
-
- pandn mm1, mm2
- movq mm3, mm1
-
- mov rdx, arg(6) ;quant_ptr
- movq mm1, [rdx]
-
- mov rcx, arg(5) ;round_ptr
- movq mm2, [rcx]
-
- paddw mm3, mm2
- pmulhuw mm3, mm1
-
- pxor mm3, mm0
- psubw mm3, mm0 ;gain the sign back
-
- mov rdi, arg(2) ;qcoeff_ptr
- movq mm0, mm3
-
- movq [rdi], mm3
-
- mov rax, arg(3) ;dequant_ptr
- movq mm2, [rax]
-
- pmullw mm3, mm2
- mov rax, arg(7) ;dqcoeff_ptr
-
- movq [rax], mm3
-
- ; next 8
- movq mm4, [rsi+8]
-
- mov rax, arg(1) ;zbin_ptr
- movq mm5, [rax+8]
-
- movq mm7, mm4
- psraw mm4, 15
-
- pxor mm7, mm4
- psubw mm7, mm4 ; abs
-
- movq mm6, mm7
- pcmpgtw mm5, mm6
-
- pandn mm5, mm6
- movq mm7, mm5
-
- movq mm5, [rdx+8]
- movq mm6, [rcx+8]
-
- paddw mm7, mm6
- pmulhuw mm7, mm5
-
- pxor mm7, mm4
- psubw mm7, mm4;gain the sign back
-
- mov rdi, arg(2) ;qcoeff_ptr
-
- movq mm1, mm7
- movq [rdi+8], mm7
-
- mov rax, arg(3) ;dequant_ptr
- movq mm6, [rax+8]
-
- pmullw mm7, mm6
- mov rax, arg(7) ;dqcoeff_ptr
-
- movq [rax+8], mm7
-
-
- ; next 8
- movq mm4, [rsi+16]
-
- mov rax, arg(1) ;zbin_ptr
- movq mm5, [rax+16]
-
- movq mm7, mm4
- psraw mm4, 15
-
- pxor mm7, mm4
- psubw mm7, mm4 ; abs
-
- movq mm6, mm7
- pcmpgtw mm5, mm6
-
- pandn mm5, mm6
- movq mm7, mm5
-
- movq mm5, [rdx+16]
- movq mm6, [rcx+16]
-
- paddw mm7, mm6
- pmulhuw mm7, mm5
-
- pxor mm7, mm4
- psubw mm7, mm4;gain the sign back
-
- mov rdi, arg(2) ;qcoeff_ptr
-
- movq mm1, mm7
- movq [rdi+16], mm7
-
- mov rax, arg(3) ;dequant_ptr
- movq mm6, [rax+16]
-
- pmullw mm7, mm6
- mov rax, arg(7) ;dqcoeff_ptr
-
- movq [rax+16], mm7
-
-
- ; next 8
- movq mm4, [rsi+24]
-
- mov rax, arg(1) ;zbin_ptr
- movq mm5, [rax+24]
-
- movq mm7, mm4
- psraw mm4, 15
-
- pxor mm7, mm4
- psubw mm7, mm4 ; abs
-
- movq mm6, mm7
- pcmpgtw mm5, mm6
-
- pandn mm5, mm6
- movq mm7, mm5
-
- movq mm5, [rdx+24]
- movq mm6, [rcx+24]
-
- paddw mm7, mm6
- pmulhuw mm7, mm5
-
- pxor mm7, mm4
- psubw mm7, mm4;gain the sign back
-
- mov rdi, arg(2) ;qcoeff_ptr
-
- movq mm1, mm7
- movq [rdi+24], mm7
-
- mov rax, arg(3) ;dequant_ptr
- movq mm6, [rax+24]
-
- pmullw mm7, mm6
- mov rax, arg(7) ;dqcoeff_ptr
-
- movq [rax+24], mm7
-
-
-
- mov rdi, arg(4) ;scan_mask
- mov rsi, arg(2) ;qcoeff_ptr
-
- pxor mm5, mm5
- pxor mm7, mm7
-
- movq mm0, [rsi]
- movq mm1, [rsi+8]
-
- movq mm2, [rdi]
- movq mm3, [rdi+8];
-
- pcmpeqw mm0, mm7
- pcmpeqw mm1, mm7
-
- pcmpeqw mm6, mm6
- pxor mm0, mm6
-
- pxor mm1, mm6
- psrlw mm0, 15
-
- psrlw mm1, 15
- pmaddwd mm0, mm2
-
- pmaddwd mm1, mm3
- movq mm5, mm0
-
- paddd mm5, mm1
-
- movq mm0, [rsi+16]
- movq mm1, [rsi+24]
-
- movq mm2, [rdi+16]
- movq mm3, [rdi+24];
-
- pcmpeqw mm0, mm7
- pcmpeqw mm1, mm7
-
- pcmpeqw mm6, mm6
- pxor mm0, mm6
-
- pxor mm1, mm6
- psrlw mm0, 15
-
- psrlw mm1, 15
- pmaddwd mm0, mm2
-
- pmaddwd mm1, mm3
- paddd mm5, mm0
-
- paddd mm5, mm1
- movq mm0, mm5
-
- psrlq mm5, 32
- paddd mm0, mm5
-
- ; eob adjustment begins here
- movq rcx, mm0
- and rcx, 0xffff
-
- xor rdx, rdx
- sub rdx, rcx ; rdx=-rcx
-
- bsr rax, rcx
- inc rax
-
- sar rdx, 31
- and rax, rdx
- ; Substitute the sse assembly for the old mmx mixed assembly/C. The
- ; following is kept as reference
- ; movq rcx, mm0
- ; bsr rax, rcx
- ;
- ; mov eob, rax
- ; mov eee, rcx
- ;
- ;if(eee==0)
- ;{
- ; eob=-1;
- ;}
- ;else if(eee<0)
- ;{
- ; eob=15;
- ;}
- ;d->eob = eob+1;
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm
index bd92b398a..d2b4711b8 100644
--- a/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -11,6 +11,8 @@
%include "vpx_ports/x86_abi_support.asm"
+SECTION .text
+
; void vp8_temporal_filter_apply_sse2 | arg
; (unsigned char *frame1, | 0
; unsigned int stride, | 1
@@ -203,5 +205,5 @@ align 16
_const_top_bit:
times 8 dw 1<<15
align 16
-_const_16w
+_const_16w:
times 8 dw 16
diff --git a/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c b/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c
deleted file mode 100644
index 4406dd0cc..000000000
--- a/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include "vpx_ports/x86.h"
-#include "vp8/encoder/block.h"
-
-int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
- short *qcoeff_ptr, short *dequant_ptr,
- const short *scan_mask, short *round_ptr,
- short *quant_ptr, short *dqcoeff_ptr);
-void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d) {
- const short *scan_mask = vp8_default_zig_zag_mask;
- short *coeff_ptr = b->coeff;
- short *zbin_ptr = b->zbin;
- short *round_ptr = b->round;
- short *quant_ptr = b->quant_fast;
- short *qcoeff_ptr = d->qcoeff;
- short *dqcoeff_ptr = d->dqcoeff;
- short *dequant_ptr = d->dequant;
-
- *d->eob = (char)vp8_fast_quantize_b_impl_mmx(
- coeff_ptr, zbin_ptr, qcoeff_ptr, dequant_ptr, scan_mask,
-
- round_ptr, quant_ptr, dqcoeff_ptr);
-}
diff --git a/libvpx/vp8/encoder/x86/quantize_ssse3.c b/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c
index 322f0a151..d54745015 100644
--- a/libvpx/vp8/encoder/x86/quantize_ssse3.c
+++ b/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c
@@ -10,6 +10,7 @@
#include <tmmintrin.h> /* SSSE3 */
+#include "./vp8_rtcd.h"
#include "vp8/encoder/block.h"
/* bitscan reverse (bsr) */
diff --git a/libvpx/vp8/vp8_common.mk b/libvpx/vp8/vp8_common.mk
index 137f5bb62..246fe6a67 100644
--- a/libvpx/vp8/vp8_common.mk
+++ b/libvpx/vp8/vp8_common.mk
@@ -116,6 +116,14 @@ VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/loopfilter_filters_msa.c
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/sixtap_filter_msa.c
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h
+# common (c)
+VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/sixtap_filter_mmi.c
+VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/loopfilter_filters_mmi.c
+VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idctllm_mmi.c
+VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/dequantize_mmi.c
+VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/copymem_mmi.c
+VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idct_blk_mmi.c
+
ifeq ($(CONFIG_POSTPROC),yes)
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c
endif
diff --git a/libvpx/vp8/vp8_cx_iface.c b/libvpx/vp8/vp8_cx_iface.c
index f8475ed61..af6689fd9 100644
--- a/libvpx/vp8/vp8_cx_iface.c
+++ b/libvpx/vp8/vp8_cx_iface.c
@@ -1216,6 +1216,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = {
50, /* rc_two_pass_vbrbias */
0, /* rc_two_pass_vbrmin_section */
400, /* rc_two_pass_vbrmax_section */
+ 0, // rc_2pass_vbr_corpus_complexity (only meaningful for VP9)
/* keyframing settings (kf) */
VPX_KF_AUTO, /* g_kfmode*/
diff --git a/libvpx/vp8/vp8_dx_iface.c b/libvpx/vp8/vp8_dx_iface.c
index 9ea9c7f04..f20283c1e 100644
--- a/libvpx/vp8/vp8_dx_iface.c
+++ b/libvpx/vp8/vp8_dx_iface.c
@@ -144,8 +144,7 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data,
}
si->is_kf = 0;
- if (data_sz >= 10 && !(clear[0] & 0x01)) /* I-Frame */
- {
+ if (data_sz >= 10 && !(clear[0] & 0x01)) { /* I-Frame */
si->is_kf = 1;
/* vet via sync code */
@@ -228,7 +227,8 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12,
}
static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data,
- unsigned int data_sz, vpx_codec_err_t *res) {
+ unsigned int data_sz,
+ volatile vpx_codec_err_t *res) {
*res = VPX_CODEC_OK;
if (ctx->fragments.count == 0) {
@@ -267,7 +267,7 @@ static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data,
static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
const uint8_t *data, unsigned int data_sz,
void *user_priv, long deadline) {
- vpx_codec_err_t res = VPX_CODEC_OK;
+ volatile vpx_codec_err_t res;
unsigned int resolution_change = 0;
unsigned int w, h;
@@ -414,7 +414,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
#endif
#if CONFIG_MULTITHREAD
- if (pbi->b_multithreaded_rd) {
+ if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) {
vp8mt_alloc_temp_buffers(pbi, pc->Width, prev_mb_rows);
}
#else
@@ -580,7 +580,6 @@ static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
}
}
-extern int vp8dx_references_buffer(VP8_COMMON *oci, int ref_frame);
static vpx_codec_err_t vp8_get_last_ref_frame(vpx_codec_alg_priv_t *ctx,
va_list args) {
int *ref_info = va_arg(args, int *);
diff --git a/libvpx/vp8/vp8cx.mk b/libvpx/vp8/vp8cx.mk
index 7bd41a3fb..0dac0169d 100644
--- a/libvpx/vp8/vp8cx.mk
+++ b/libvpx/vp8/vp8cx.mk
@@ -30,6 +30,7 @@ VP8_CX_SRCS-yes += encoder/encodeintra.c
VP8_CX_SRCS-yes += encoder/encodemb.c
VP8_CX_SRCS-yes += encoder/encodemv.c
VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.c
+VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.h
VP8_CX_SRCS-yes += encoder/firstpass.c
VP8_CX_SRCS-yes += encoder/block.h
VP8_CX_SRCS-yes += encoder/boolhuff.h
@@ -56,11 +57,14 @@ VP8_CX_SRCS-yes += encoder/modecosts.c
VP8_CX_SRCS-yes += encoder/onyx_if.c
VP8_CX_SRCS-yes += encoder/pickinter.c
VP8_CX_SRCS-yes += encoder/picklpf.c
+VP8_CX_SRCS-yes += encoder/picklpf.h
VP8_CX_SRCS-yes += encoder/vp8_quantize.c
VP8_CX_SRCS-yes += encoder/ratectrl.c
VP8_CX_SRCS-yes += encoder/rdopt.c
VP8_CX_SRCS-yes += encoder/segmentation.c
VP8_CX_SRCS-yes += encoder/segmentation.h
+VP8_CX_SRCS-yes += common/vp8_skin_detection.c
+VP8_CX_SRCS-yes += common/vp8_skin_detection.h
VP8_CX_SRCS-yes += encoder/tokenize.c
VP8_CX_SRCS-yes += encoder/dct_value_cost.h
VP8_CX_SRCS-yes += encoder/dct_value_tokens.h
@@ -68,19 +72,20 @@ VP8_CX_SRCS-yes += encoder/treewriter.c
VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h
VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c
VP8_CX_SRCS-yes += encoder/temporal_filter.c
+VP8_CX_SRCS-yes += encoder/temporal_filter.h
VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c
VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h
ifeq ($(CONFIG_REALTIME_ONLY),yes)
VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c
VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c
+VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.h
endif
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_quantize_sse2.c
-VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.c
+VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp8_quantize_ssse3.c
VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.c
ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
@@ -89,7 +94,6 @@ endif
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
ifeq ($(CONFIG_REALTIME_ONLY),yes)
@@ -106,6 +110,9 @@ VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
+VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/vp8_quantize_mmi.c
+VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/dct_mmi.c
+
ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/denoising_msa.c
endif
diff --git a/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
index dd1ea03b6..025254c3f 100644
--- a/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
+++ b/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
@@ -14,14 +14,7 @@
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
-
-static int16_t sinpi_1_9 = 0x14a3;
-static int16_t sinpi_2_9 = 0x26c9;
-static int16_t sinpi_3_9 = 0x3441;
-static int16_t sinpi_4_9 = 0x3b6c;
-static int16_t cospi_8_64 = 0x3b21;
-static int16_t cospi_16_64 = 0x2d41;
-static int16_t cospi_24_64 = 0x187e;
+#include "vpx_dsp/txfm_common.h"
static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) {
int32x4_t q8s32, q9s32;
diff --git a/libvpx/vp9/common/vp9_alloccommon.c b/libvpx/vp9/common/vp9_alloccommon.c
index 66aa733b9..7345e259b 100644
--- a/libvpx/vp9/common/vp9_alloccommon.c
+++ b/libvpx/vp9/common/vp9_alloccommon.c
@@ -17,24 +17,6 @@
#include "vp9/common/vp9_entropymv.h"
#include "vp9/common/vp9_onyxc_int.h"
-// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
-// frame reference count.
-void lock_buffer_pool(BufferPool *const pool) {
-#if CONFIG_MULTITHREAD
- pthread_mutex_lock(&pool->pool_mutex);
-#else
- (void)pool;
-#endif
-}
-
-void unlock_buffer_pool(BufferPool *const pool) {
-#if CONFIG_MULTITHREAD
- pthread_mutex_unlock(&pool->pool_mutex);
-#else
- (void)pool;
-#endif
-}
-
void vp9_set_mb_mi(VP9_COMMON *cm, int width, int height) {
const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
@@ -62,8 +44,7 @@ static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) {
cm->prev_seg_map_idx = 1;
cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
- if (!cm->frame_parallel_decode)
- cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
+ cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
return 0;
}
@@ -77,20 +58,18 @@ static void free_seg_map(VP9_COMMON *cm) {
}
cm->current_frame_seg_map = NULL;
-
- if (!cm->frame_parallel_decode) {
- cm->last_frame_seg_map = NULL;
- }
+ cm->last_frame_seg_map = NULL;
}
void vp9_free_ref_frame_buffers(BufferPool *pool) {
int i;
for (i = 0; i < FRAME_BUFFERS; ++i) {
- if (pool->frame_bufs[i].ref_count > 0 &&
+ if (!pool->frame_bufs[i].released &&
pool->frame_bufs[i].raw_frame_buffer.data != NULL) {
pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer);
pool->frame_bufs[i].ref_count = 0;
+ pool->frame_bufs[i].released = 1;
}
vpx_free(pool->frame_bufs[i].mvs);
pool->frame_bufs[i].mvs = NULL;
@@ -176,6 +155,9 @@ fail:
}
void vp9_remove_common(VP9_COMMON *cm) {
+#if CONFIG_VP9_POSTPROC
+ vp9_free_postproc_buffers(cm);
+#endif
vp9_free_context_buffers(cm);
vpx_free(cm->fc);
@@ -186,7 +168,7 @@ void vp9_remove_common(VP9_COMMON *cm) {
void vp9_init_context_buffers(VP9_COMMON *cm) {
cm->setup_mi(cm);
- if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
+ if (cm->last_frame_seg_map)
memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
}
diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c
index bcb9e8f29..47cd63e94 100644
--- a/libvpx/vp9/common/vp9_entropymode.c
+++ b/libvpx/vp9/common/vp9_entropymode.c
@@ -428,7 +428,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
vp9_clearall_segfeatures(&cm->seg);
cm->seg.abs_delta = SEGMENT_DELTADATA;
- if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
+ if (cm->last_frame_seg_map)
memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
if (cm->current_frame_seg_map)
@@ -457,7 +457,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
}
// prev_mip will only be allocated in encoder.
- if (frame_is_intra_only(cm) && cm->prev_mip && !cm->frame_parallel_decode)
+ if (frame_is_intra_only(cm) && cm->prev_mip)
memset(cm->prev_mip, 0,
cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip));
diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c
index ef0297dd5..c7c343aed 100644
--- a/libvpx/vp9/common/vp9_loopfilter.c
+++ b/libvpx/vp9/common/vp9_loopfilter.c
@@ -1612,12 +1612,14 @@ void vp9_loop_filter_data_reset(
void vp9_reset_lfm(VP9_COMMON *const cm) {
if (cm->lf.filter_level) {
- memset(cm->lf.lfm, 0, ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) *
- cm->lf.lfm_stride * sizeof(*cm->lf.lfm));
+ memset(cm->lf.lfm, 0,
+ ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride *
+ sizeof(*cm->lf.lfm));
}
}
-int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
+int vp9_loop_filter_worker(void *arg1, void *unused) {
+ LFWorkerData *const lf_data = (LFWorkerData *)arg1;
(void)unused;
loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
lf_data->start, lf_data->stop, lf_data->y_only);
diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h
index da37a6ebd..481a6cdc6 100644
--- a/libvpx/vp9/common/vp9_loopfilter.h
+++ b/libvpx/vp9/common/vp9_loopfilter.h
@@ -151,8 +151,8 @@ void vp9_loop_filter_data_reset(
LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]);
-// Operates on the rows described by 'lf_data'.
-int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused);
+// Operates on the rows described by 'arg1' (cast to LFWorkerData *).
+int vp9_loop_filter_worker(void *arg1, void *unused);
#ifdef __cplusplus
} // extern "C"
#endif
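
This signature change (mirrored in vp9_thread_common.c below) lets workers store hooks without the function-pointer cast at the assignment site, which is undefined behavior when the signatures differ; the cast to the concrete data type now happens inside the hook. A reduced standalone sketch of the pattern, where Worker and RowJob are stand-ins for the libvpx types:

    /* Sketch of the void*-based hook pattern the patch moves to. */
    #include <stdio.h>

    typedef int (*WorkerHook)(void *arg1, void *arg2);

    typedef struct { WorkerHook hook; void *data1; void *data2; } Worker;

    typedef struct { int start, stop; } RowJob;

    static int row_worker(void *arg1, void *arg2) {
      RowJob *const job = (RowJob *)arg1; /* cast inside the hook, not at */
      (void)arg2;                         /* the assignment site          */
      printf("filtering rows %d..%d\n", job->start, job->stop);
      return 1;
    }

    int main(void) {
      RowJob job = { 0, 15 };
      Worker worker = { row_worker, &job, NULL }; /* no cast needed here */
      return worker.hook(worker.data1, worker.data2) ? 0 : 1;
    }
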
diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h
index 32db7b7aa..1d96d92c2 100644
--- a/libvpx/vp9/common/vp9_onyxc_int.h
+++ b/libvpx/vp9/common/vp9_onyxc_int.h
@@ -37,13 +37,10 @@ extern "C" {
#define REF_FRAMES_LOG2 3
#define REF_FRAMES (1 << REF_FRAMES_LOG2)
-// 4 scratch frames for the new frames to support a maximum of 4 cores decoding
-// in parallel, 3 for scaled references on the encoder.
-// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number
-// of framebuffers.
+// 1 scratch frame for the new frame, 3 for scaled references on the encoder.
// TODO(jkoleszar): These 3 extra references could probably come from the
// normal reference pool.
-#define FRAME_BUFFERS (REF_FRAMES + 7)
+#define FRAME_BUFFERS (REF_FRAMES + 4)
#define FRAME_CONTEXTS_LOG2 2
#define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2)
@@ -72,30 +69,12 @@ typedef struct {
MV_REF *mvs;
int mi_rows;
int mi_cols;
+ uint8_t released;
vpx_codec_frame_buffer_t raw_frame_buffer;
YV12_BUFFER_CONFIG buf;
-
- // The Following variables will only be used in frame parallel decode.
-
- // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means
- // that no FrameWorker owns, or is decoding, this buffer.
- VPxWorker *frame_worker_owner;
-
- // row and col indicate which position frame has been decoded to in real
- // pixel unit. They are reset to -1 when decoding begins and set to INT_MAX
- // when the frame is fully decoded.
- int row;
- int col;
} RefCntBuffer;
typedef struct BufferPool {
-// Protect BufferPool from being accessed by several FrameWorkers at
-// the same time during frame parallel decode.
-// TODO(hkuang): Try to use atomic variable instead of locking the whole pool.
-#if CONFIG_MULTITHREAD
- pthread_mutex_t pool_mutex;
-#endif
-
// Private data associated with the frame buffer callbacks.
void *cb_priv;
@@ -235,10 +214,6 @@ typedef struct VP9Common {
struct loopfilter lf;
struct segmentation seg;
- // TODO(hkuang): Remove this as it is the same as frame_parallel_decode
- // in pbi.
- int frame_parallel_decode; // frame-based threading.
-
// Context probabilities for reference frame prediction
MV_REFERENCE_FRAME comp_fixed_ref;
MV_REFERENCE_FRAME comp_var_ref[2];
@@ -283,11 +258,6 @@ typedef struct VP9Common {
int above_context_alloc_cols;
} VP9_COMMON;
-// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
-// frame reference count.
-void lock_buffer_pool(BufferPool *const pool);
-void unlock_buffer_pool(BufferPool *const pool);
-
static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) {
if (index < 0 || index >= REF_FRAMES) return NULL;
if (cm->ref_frame_map[index] < 0) return NULL;
@@ -303,7 +273,6 @@ static INLINE int get_free_fb(VP9_COMMON *cm) {
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
int i;
- lock_buffer_pool(cm->buffer_pool);
for (i = 0; i < FRAME_BUFFERS; ++i)
if (frame_bufs[i].ref_count == 0) break;
@@ -314,7 +283,6 @@ static INLINE int get_free_fb(VP9_COMMON *cm) {
i = INVALID_IDX;
}
- unlock_buffer_pool(cm->buffer_pool);
return i;
}
@@ -342,7 +310,7 @@ static INLINE void set_partition_probs(const VP9_COMMON *const cm,
xd->partition_probs =
frame_is_intra_only(cm)
? &vp9_kf_partition_probs[0]
- : (const vpx_prob(*)[PARTITION_TYPES - 1])cm->fc->partition_prob;
+ : (const vpx_prob(*)[PARTITION_TYPES - 1]) cm->fc->partition_prob;
}
static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd,
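
With frame-parallel decode gone, the buffer pool loses its mutex and the extra scratch frames: FRAME_BUFFERS falls from REF_FRAMES + 7 = 15 to REF_FRAMES + 4 = 12 (REF_FRAMES is 1 << 3 = 8), and get_free_fb becomes a plain linear scan with no lock/unlock pair. A standalone sketch of the simplified scan, with FrameBuf standing in for RefCntBuffer:

    /* Standalone sketch of the lock-free buffer scan after this patch. */
    #include <stdio.h>

    #define REF_FRAMES_LOG2 3
    #define REF_FRAMES (1 << REF_FRAMES_LOG2)
    #define FRAME_BUFFERS (REF_FRAMES + 4) /* 12 after the patch, was 15 */
    #define INVALID_IDX (-1)

    typedef struct { int ref_count; } FrameBuf;

    static int get_free_fb(FrameBuf *bufs) {
      int i;
      for (i = 0; i < FRAME_BUFFERS; ++i)
        if (bufs[i].ref_count == 0) break;
      if (i != FRAME_BUFFERS) {
        bufs[i].ref_count = 1;
        return i;
      }
      return INVALID_IDX; /* pool exhausted */
    }

    int main(void) {
      FrameBuf pool[FRAME_BUFFERS] = { { 1 }, { 1 }, { 0 } };
      printf("free index: %d\n", get_free_fb(pool)); /* prints 2 */
      return 0;
    }
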
diff --git a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c
index b105e5d45..dfc315eea 100644
--- a/libvpx/vp9/common/vp9_postproc.c
+++ b/libvpx/vp9/common/vp9_postproc.c
@@ -380,7 +380,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest,
// if mfqe is enabled. Need to take both the quality and the speed
// into consideration.
if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
- vp8_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int);
+ vpx_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int);
}
if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) {
deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf,
@@ -390,7 +390,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest,
vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q,
cm->postproc_state.limits);
} else {
- vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
+ vpx_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
}
} else if (flags & VP9D_DEMACROBLOCK) {
deblock_and_de_macro_block(cm->frame_to_show, ppbuf,
@@ -399,7 +399,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest,
} else if (flags & VP9D_DEBLOCK) {
vp9_deblock(cm->frame_to_show, ppbuf, q, cm->postproc_state.limits);
} else {
- vp8_yv12_copy_frame(cm->frame_to_show, ppbuf);
+ vpx_yv12_copy_frame(cm->frame_to_show, ppbuf);
}
ppstate->last_base_qindex = cm->base_qindex;
diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h
index 1b09b380d..bb9291a26 100644
--- a/libvpx/vp9/common/vp9_reconinter.h
+++ b/libvpx/vp9/common/vp9_reconinter.h
@@ -26,9 +26,9 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride,
const struct scale_factors *sf, int w, int h,
int ref, const InterpKernel *kernel, int xs,
int ys) {
- sf->predict[subpel_x != 0][subpel_y != 0][ref](
- src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y],
- ys, w, h);
+ sf->predict[subpel_x != 0][subpel_y != 0][ref](src, src_stride, dst,
+ dst_stride, kernel, subpel_x,
+ xs, subpel_y, ys, w, h);
}
#if CONFIG_VP9_HIGHBITDEPTH
@@ -37,8 +37,8 @@ static INLINE void highbd_inter_predictor(
const int subpel_x, const int subpel_y, const struct scale_factors *sf,
int w, int h, int ref, const InterpKernel *kernel, int xs, int ys, int bd) {
sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
- src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y],
- ys, w, h, bd);
+ src, src_stride, dst, dst_stride, kernel, subpel_x, xs, subpel_y, ys, w,
+ h, bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libvpx/vp9/common/vp9_rtcd_defs.pl b/libvpx/vp9/common/vp9_rtcd_defs.pl
index baf63e97f..22b67ecac 100644
--- a/libvpx/vp9/common/vp9_rtcd_defs.pl
+++ b/libvpx/vp9/common/vp9_rtcd_defs.pl
@@ -1,3 +1,13 @@
+##
+## Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
sub vp9_common_forward_decls() {
print <<EOF
/*
@@ -30,6 +40,7 @@ if ($opts{arch} eq "x86_64") {
$ssse3_x86_64 = 'ssse3';
$avx_x86_64 = 'avx';
$avx2_x86_64 = 'avx2';
+ $avx512_x86_64 = 'avx512';
}
#
@@ -46,41 +57,24 @@ specialize qw/vp9_filter_by_weight8x8 sse2 msa/;
#
# dct
#
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
- # Force C versions if CONFIG_EMULATE_HARDWARE is 1
- if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
-
- add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
-
- add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
- } else {
- add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
- specialize qw/vp9_iht4x4_16_add sse2/;
-
- add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
- specialize qw/vp9_iht8x8_64_add sse2/;
-
- add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
- specialize qw/vp9_iht16x16_256_add sse2/;
- }
-} else {
- # Force C versions if CONFIG_EMULATE_HARDWARE is 1
- if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
-
- add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
-
- add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
- } else {
- add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
- specialize qw/vp9_iht4x4_16_add sse2 neon dspr2 msa/;
-
- add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
- specialize qw/vp9_iht8x8_64_add sse2 neon dspr2 msa/;
-
- add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
- specialize qw/vp9_iht16x16_256_add sse2 dspr2 msa/;
+# Force C versions if CONFIG_EMULATE_HARDWARE is 1
+add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
+
+add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
+
+add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+
+if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
+ # Note that there are more specializations appended when
+ # CONFIG_VP9_HIGHBITDEPTH is off.
+ specialize qw/vp9_iht4x4_16_add sse2/;
+ specialize qw/vp9_iht8x8_64_add sse2/;
+ specialize qw/vp9_iht16x16_256_add sse2/;
+ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
+ # Note that these specializations are appended to the above ones.
+ specialize qw/vp9_iht4x4_16_add neon dspr2 msa/;
+ specialize qw/vp9_iht8x8_64_add neon dspr2 msa/;
+ specialize qw/vp9_iht16x16_256_add dspr2 msa/;
}
}
@@ -124,82 +118,69 @@ if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") {
specialize qw/vp9_denoiser_filter neon sse2/;
}
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
- add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
- specialize qw/vp9_block_error avx2 sse2/;
+add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
- add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
- specialize qw/vp9_highbd_block_error sse2/;
+add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
- add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
- specialize qw/vp9_block_error_fp sse2/;
+add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
- add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
+add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+specialize qw/vp9_quantize_fp_32x32 neon/, "$ssse3_x86_64";
- add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
+add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ specialize qw/vp9_block_error avx2 sse2/;
+
+ specialize qw/vp9_block_error_fp avx2 sse2/;
- add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_fdct8x8_quant neon ssse3/;
+
+ add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
+ specialize qw/vp9_highbd_block_error sse2/;
} else {
- add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/vp9_block_error avx2 msa sse2/;
- add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
- specialize qw/vp9_block_error_fp neon sse2/;
-
- add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
+ specialize qw/vp9_block_error_fp neon avx2 sse2/;
- add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
-
- add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/;
}
# fdct functions
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
- add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_fht4x4 sse2/;
-
- add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_fht8x8 sse2/;
+add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_fht16x16 sse2/;
+add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fwht4x4 sse2/;
-} else {
- add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_fht4x4 sse2 msa/;
+add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_fht8x8 sse2 msa/;
+add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
- add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_fht16x16 sse2 msa/;
-
- add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fwht4x4 msa sse2/;
+# Note that there are more specializations appended when CONFIG_VP9_HIGHBITDEPTH
+# is off.
+specialize qw/vp9_fht4x4 sse2/;
+specialize qw/vp9_fht8x8 sse2/;
+specialize qw/vp9_fht16x16 sse2/;
+specialize qw/vp9_fwht4x4 sse2/;
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
+ # Note that these specializations are appended to the above ones.
+ specialize qw/vp9_fht4x4 msa/;
+ specialize qw/vp9_fht8x8 msa/;
+ specialize qw/vp9_fht16x16 msa/;
+ specialize qw/vp9_fwht4x4 msa/;
}
#
# Motion search
#
-add_proto qw/int vp9_full_search_sad/, "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv";
-specialize qw/vp9_full_search_sad sse3 sse4_1/;
-$vp9_full_search_sad_sse3=vp9_full_search_sadx3;
-$vp9_full_search_sad_sse4_1=vp9_full_search_sadx8;
-
add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
specialize qw/vp9_diamond_search_sad avx/;
+if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") {
add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count";
specialize qw/vp9_temporal_filter_apply sse4_1/;
+}
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
@@ -227,7 +208,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# frame based scale
#
add_proto qw/void vp9_scale_and_extend_frame/, "const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler";
-specialize qw/vp9_scale_and_extend_frame ssse3/;
+specialize qw/vp9_scale_and_extend_frame neon ssse3/;
}
# end encoder functions
diff --git a/libvpx/vp9/common/vp9_thread_common.c b/libvpx/vp9/common/vp9_thread_common.c
index 07e659d23..8d44e91f2 100644
--- a/libvpx/vp9/common/vp9_thread_common.c
+++ b/libvpx/vp9/common/vp9_thread_common.c
@@ -140,8 +140,9 @@ static INLINE void thread_loop_filter_rows(
}
// Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(VP9LfSync *const lf_sync,
- LFWorkerData *const lf_data) {
+static int loop_filter_row_worker(void *arg1, void *arg2) {
+ VP9LfSync *const lf_sync = (VP9LfSync *)arg1;
+ LFWorkerData *const lf_data = (LFWorkerData *)arg2;
thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
lf_data->start, lf_data->stop, lf_data->y_only,
lf_sync);
@@ -183,7 +184,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm,
VPxWorker *const worker = &workers[i];
LFWorkerData *const lf_data = &lf_sync->lfdata[i];
- worker->hook = (VPxWorkerHook)loop_filter_row_worker;
+ worker->hook = loop_filter_row_worker;
worker->data1 = lf_sync;
worker->data2 = lf_data;
diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
index bb2dcf52b..6996260e2 100644
--- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -18,8 +18,8 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
__m128i in[2];
const __m128i eight = _mm_set1_epi16(8);
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8);
+ in[0] = load_input_data8(input);
+ in[1] = load_input_data8(input + 8);
switch (tx_type) {
case 0: // DCT_DCT
@@ -54,18 +54,17 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
__m128i in[8];
- const __m128i zero = _mm_setzero_si128();
const __m128i final_rounding = _mm_set1_epi16(1 << 4);
// load input data
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8 * 1);
- in[2] = load_input_data(input + 8 * 2);
- in[3] = load_input_data(input + 8 * 3);
- in[4] = load_input_data(input + 8 * 4);
- in[5] = load_input_data(input + 8 * 5);
- in[6] = load_input_data(input + 8 * 6);
- in[7] = load_input_data(input + 8 * 7);
+ in[0] = load_input_data8(input);
+ in[1] = load_input_data8(input + 8 * 1);
+ in[2] = load_input_data8(input + 8 * 2);
+ in[3] = load_input_data8(input + 8 * 3);
+ in[4] = load_input_data8(input + 8 * 4);
+ in[5] = load_input_data8(input + 8 * 5);
+ in[6] = load_input_data8(input + 8 * 6);
+ in[7] = load_input_data8(input + 8 * 7);
switch (tx_type) {
case 0: // DCT_DCT
@@ -106,14 +105,91 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
in[6] = _mm_srai_epi16(in[6], 5);
in[7] = _mm_srai_epi16(in[7], 5);
- RECON_AND_STORE(dest + 0 * stride, in[0]);
- RECON_AND_STORE(dest + 1 * stride, in[1]);
- RECON_AND_STORE(dest + 2 * stride, in[2]);
- RECON_AND_STORE(dest + 3 * stride, in[3]);
- RECON_AND_STORE(dest + 4 * stride, in[4]);
- RECON_AND_STORE(dest + 5 * stride, in[5]);
- RECON_AND_STORE(dest + 6 * stride, in[6]);
- RECON_AND_STORE(dest + 7 * stride, in[7]);
+ recon_and_store(dest + 0 * stride, in[0]);
+ recon_and_store(dest + 1 * stride, in[1]);
+ recon_and_store(dest + 2 * stride, in[2]);
+ recon_and_store(dest + 3 * stride, in[3]);
+ recon_and_store(dest + 4 * stride, in[4]);
+ recon_and_store(dest + 5 * stride, in[5]);
+ recon_and_store(dest + 6 * stride, in[6]);
+ recon_and_store(dest + 7 * stride, in[7]);
+}
+
+static INLINE void load_buffer_8x16(const tran_low_t *const input,
+ __m128i *const in) {
+ in[0] = load_input_data8(input + 0 * 16);
+ in[1] = load_input_data8(input + 1 * 16);
+ in[2] = load_input_data8(input + 2 * 16);
+ in[3] = load_input_data8(input + 3 * 16);
+ in[4] = load_input_data8(input + 4 * 16);
+ in[5] = load_input_data8(input + 5 * 16);
+ in[6] = load_input_data8(input + 6 * 16);
+ in[7] = load_input_data8(input + 7 * 16);
+
+ in[8] = load_input_data8(input + 8 * 16);
+ in[9] = load_input_data8(input + 9 * 16);
+ in[10] = load_input_data8(input + 10 * 16);
+ in[11] = load_input_data8(input + 11 * 16);
+ in[12] = load_input_data8(input + 12 * 16);
+ in[13] = load_input_data8(input + 13 * 16);
+ in[14] = load_input_data8(input + 14 * 16);
+ in[15] = load_input_data8(input + 15 * 16);
+}
+
+static INLINE void write_buffer_8x16(uint8_t *const dest, __m128i *const in,
+ const int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ // Final rounding and shift
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+ in[8] = _mm_adds_epi16(in[8], final_rounding);
+ in[9] = _mm_adds_epi16(in[9], final_rounding);
+ in[10] = _mm_adds_epi16(in[10], final_rounding);
+ in[11] = _mm_adds_epi16(in[11], final_rounding);
+ in[12] = _mm_adds_epi16(in[12], final_rounding);
+ in[13] = _mm_adds_epi16(in[13], final_rounding);
+ in[14] = _mm_adds_epi16(in[14], final_rounding);
+ in[15] = _mm_adds_epi16(in[15], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 6);
+ in[1] = _mm_srai_epi16(in[1], 6);
+ in[2] = _mm_srai_epi16(in[2], 6);
+ in[3] = _mm_srai_epi16(in[3], 6);
+ in[4] = _mm_srai_epi16(in[4], 6);
+ in[5] = _mm_srai_epi16(in[5], 6);
+ in[6] = _mm_srai_epi16(in[6], 6);
+ in[7] = _mm_srai_epi16(in[7], 6);
+ in[8] = _mm_srai_epi16(in[8], 6);
+ in[9] = _mm_srai_epi16(in[9], 6);
+ in[10] = _mm_srai_epi16(in[10], 6);
+ in[11] = _mm_srai_epi16(in[11], 6);
+ in[12] = _mm_srai_epi16(in[12], 6);
+ in[13] = _mm_srai_epi16(in[13], 6);
+ in[14] = _mm_srai_epi16(in[14], 6);
+ in[15] = _mm_srai_epi16(in[15], 6);
+
+ recon_and_store(dest + 0 * stride, in[0]);
+ recon_and_store(dest + 1 * stride, in[1]);
+ recon_and_store(dest + 2 * stride, in[2]);
+ recon_and_store(dest + 3 * stride, in[3]);
+ recon_and_store(dest + 4 * stride, in[4]);
+ recon_and_store(dest + 5 * stride, in[5]);
+ recon_and_store(dest + 6 * stride, in[6]);
+ recon_and_store(dest + 7 * stride, in[7]);
+ recon_and_store(dest + 8 * stride, in[8]);
+ recon_and_store(dest + 9 * stride, in[9]);
+ recon_and_store(dest + 10 * stride, in[10]);
+ recon_and_store(dest + 11 * stride, in[11]);
+ recon_and_store(dest + 12 * stride, in[12]);
+ recon_and_store(dest + 13 * stride, in[13]);
+ recon_and_store(dest + 14 * stride, in[14]);
+ recon_and_store(dest + 15 * stride, in[15]);
}
void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
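
The new write_buffer_8x16 above applies the standard round-then-shift: add 1 << 5, then arithmetic-shift right by 6, i.e. divide by 64 with rounding before the result is clamped into the destination. A scalar check of the same operation:

    /* The round-then-shift used in write_buffer_8x16: (x + 32) >> 6.
       An arithmetic right shift is assumed, matching _mm_srai_epi16. */
    #include <stdio.h>

    static int round_shift6(int x) { return (x + (1 << 5)) >> 6; }

    int main(void) {
      printf("%d %d %d\n", round_shift6(95), round_shift6(96),
             round_shift6(-95));
      /* 95/64 = 1.48 -> 1 ; 96/64 = 1.5 -> 2 ; -95/64 = -1.48 -> -1 */
      return 0;
    }
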
diff --git a/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm b/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
index 30852049b..ca0897ab9 100644
--- a/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
+++ b/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
@@ -12,6 +12,8 @@
; TODO(jackychen): Find a way to fix the duplicate.
%include "vpx_ports/x86_abi_support.asm"
+SECTION .text
+
;void vp9_filter_by_weight16x16_sse2
;(
; unsigned char *src,
diff --git a/libvpx/vp9/decoder/vp9_decodeframe.c b/libvpx/vp9/decoder/vp9_decodeframe.c
index 0760f8c23..d0e896c13 100644
--- a/libvpx/vp9/decoder/vp9_decodeframe.c
+++ b/libvpx/vp9/decoder/vp9_decodeframe.c
@@ -490,8 +490,8 @@ static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
#endif // CONFIG_VP9_HIGHBITDEPTH
static void dec_build_inter_predictors(
- VPxWorker *const worker, MACROBLOCKD *xd, int plane, int bw, int bh, int x,
- int y, int w, int h, int mi_x, int mi_y, const InterpKernel *kernel,
+ MACROBLOCKD *xd, int plane, int bw, int bh, int x, int y, int w, int h,
+ int mi_x, int mi_y, const InterpKernel *kernel,
const struct scale_factors *sf, struct buf_2d *pre_buf,
struct buf_2d *dst_buf, const MV *mv, RefCntBuffer *ref_frame_buf,
int is_scaled, int ref) {
@@ -593,12 +593,6 @@ static void dec_build_inter_predictors(
y_pad = 1;
}
- // Wait until reference block is ready. Pad 7 more pixels as last 7
- // pixels of each superblock row can be changed by next superblock row.
- if (worker != NULL)
- vp9_frameworker_wait(worker, ref_frame_buf, VPXMAX(0, (y1 + 7))
- << (plane == 0 ? 0 : 1));
-
// Skip border extension if block is inside the frame.
if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
@@ -617,14 +611,6 @@ static void dec_build_inter_predictors(
w, h, ref, xs, ys);
return;
}
- } else {
- // Wait until reference block is ready. Pad 7 more pixels as last 7
- // pixels of each superblock row can be changed by next superblock row.
- if (worker != NULL) {
- const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
- vp9_frameworker_wait(worker, ref_frame_buf, VPXMAX(0, (y1 + 7))
- << (plane == 0 ? 0 : 1));
- }
}
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -653,8 +639,6 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi,
const int is_compound = has_second_ref(mi);
int ref;
int is_scaled;
- VPxWorker *const fwo =
- pbi->frame_parallel_decode ? pbi->frame_worker_owner : NULL;
for (ref = 0; ref < 1 + is_compound; ++ref) {
const MV_REFERENCE_FRAME frame = mi->ref_frame[ref];
@@ -686,10 +670,10 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi,
for (y = 0; y < num_4x4_h; ++y) {
for (x = 0; x < num_4x4_w; ++x) {
const MV mv = average_split_mvs(pd, mi, ref, i++);
- dec_build_inter_predictors(fwo, xd, plane, n4w_x4, n4h_x4, 4 * x,
- 4 * y, 4, 4, mi_x, mi_y, kernel, sf,
- pre_buf, dst_buf, &mv, ref_frame_buf,
- is_scaled, ref);
+ dec_build_inter_predictors(xd, plane, n4w_x4, n4h_x4, 4 * x, 4 * y,
+ 4, 4, mi_x, mi_y, kernel, sf, pre_buf,
+ dst_buf, &mv, ref_frame_buf, is_scaled,
+ ref);
}
}
}
@@ -703,7 +687,7 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi,
const int n4w_x4 = 4 * num_4x4_w;
const int n4h_x4 = 4 * num_4x4_h;
struct buf_2d *const pre_buf = &pd->pre[ref];
- dec_build_inter_predictors(fwo, xd, plane, n4w_x4, n4h_x4, 0, 0, n4w_x4,
+ dec_build_inter_predictors(xd, plane, n4w_x4, n4h_x4, 0, 0, n4w_x4,
n4h_x4, mi_x, mi_y, kernel, sf, pre_buf,
dst_buf, &mv, ref_frame_buf, is_scaled, ref);
}
@@ -1187,7 +1171,6 @@ static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
resize_context_buffers(cm, width, height);
setup_render_size(cm, rb);
- lock_buffer_pool(pool);
if (vpx_realloc_frame_buffer(
get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x,
cm->subsampling_y,
@@ -1197,12 +1180,11 @@ static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment,
&pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
pool->cb_priv)) {
- unlock_buffer_pool(pool);
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
}
- unlock_buffer_pool(pool);
+ pool->frame_bufs[cm->new_fb_idx].released = 0;
pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
@@ -1273,7 +1255,6 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
resize_context_buffers(cm, width, height);
setup_render_size(cm, rb);
- lock_buffer_pool(pool);
if (vpx_realloc_frame_buffer(
get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x,
cm->subsampling_y,
@@ -1283,12 +1264,11 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment,
&pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
pool->cb_priv)) {
- unlock_buffer_pool(pool);
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
}
- unlock_buffer_pool(pool);
+ pool->frame_bufs[cm->new_fb_idx].released = 0;
pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
@@ -1384,7 +1364,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data,
pbi->lf_worker.data1 == NULL) {
CHECK_MEM_ERROR(cm, pbi->lf_worker.data1,
vpx_memalign(32, sizeof(LFWorkerData)));
- pbi->lf_worker.hook = (VPxWorkerHook)vp9_loop_filter_worker;
+ pbi->lf_worker.hook = vp9_loop_filter_worker;
if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) {
vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
"Loop filter thread creation failed");
@@ -1473,11 +1453,6 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data,
winterface->execute(&pbi->lf_worker);
}
}
- // After loopfiltering, the last 7 row pixels in each superblock row may
- // still be changed by the longest loopfilter of the next superblock
- // row.
- if (pbi->frame_parallel_decode)
- vp9_frameworker_broadcast(pbi->cur_buf, mi_row << MI_BLOCK_SIZE_LOG2);
}
}
@@ -1493,16 +1468,16 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data,
// Get last tile data.
tile_data = pbi->tile_worker_data + tile_cols * tile_rows - 1;
- if (pbi->frame_parallel_decode)
- vp9_frameworker_broadcast(pbi->cur_buf, INT_MAX);
return vpx_reader_find_end(&tile_data->bit_reader);
}
// On entry 'tile_data->data_end' points to the end of the input frame, on exit
// it is updated to reflect the bitreader position of the final tile column if
// present in the tile buffer group or NULL otherwise.
-static int tile_worker_hook(TileWorkerData *const tile_data,
- VP9Decoder *const pbi) {
+static int tile_worker_hook(void *arg1, void *arg2) {
+ TileWorkerData *const tile_data = (TileWorkerData *)arg1;
+ VP9Decoder *const pbi = (VP9Decoder *)arg2;
+
TileInfo *volatile tile = &tile_data->xd.tile;
const int final_col = (1 << pbi->common.log2_tile_cols) - 1;
const uint8_t *volatile bit_reader_end = NULL;
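The hook change above is a correctness fix as much as a cleanup: VPxWorkerHook is a generic function type taking two void pointers, so defining tile_worker_hook with exactly that signature and casting inside avoids calling through an incompatible function-pointer cast, which is undefined behavior in C. A sketch of the pattern with illustrative types:

typedef int (*generic_hook)(void *arg1, void *arg2);

typedef struct { int tile_id; } tile_ctx;
typedef struct { int n_frames; } dec_ctx;

static int my_hook(void *arg1, void *arg2) {
  tile_ctx *tile = (tile_ctx *)arg1;  /* casts live inside the hook... */
  dec_ctx *dec = (dec_ctx *)arg2;
  return tile->tile_id <= dec->n_frames;
}

/* ...so the assignment needs no function-pointer cast:
 *   worker->hook = my_hook; */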
@@ -1596,7 +1571,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data,
tile_data->xd = pbi->mb;
tile_data->xd.counts =
cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts;
- worker->hook = (VPxWorkerHook)tile_worker_hook;
+ worker->hook = tile_worker_hook;
worker->data1 = tile_data;
worker->data2 = pbi;
}
@@ -1779,24 +1754,17 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
if (cm->show_existing_frame) {
// Show an existing frame directly.
const int frame_to_show = cm->ref_frame_map[vpx_rb_read_literal(rb, 3)];
- lock_buffer_pool(pool);
if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
- unlock_buffer_pool(pool);
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Buffer %d does not contain a decoded frame",
frame_to_show);
}
ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
- unlock_buffer_pool(pool);
pbi->refresh_frame_flags = 0;
cm->lf.filter_level = 0;
cm->show_frame = 1;
- if (pbi->frame_parallel_decode) {
- for (i = 0; i < REF_FRAMES; ++i)
- cm->next_ref_frame_map[i] = cm->ref_frame_map[i];
- }
return 0;
}
@@ -1913,7 +1881,6 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
cm->frame_context_idx = vpx_rb_read_literal(rb, FRAME_CONTEXTS_LOG2);
// Generate next_ref_frame_map.
- lock_buffer_pool(pool);
for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
if (mask & 1) {
cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
@@ -1933,7 +1900,6 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
if (cm->ref_frame_map[ref_index] >= 0)
++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
}
- unlock_buffer_pool(pool);
pbi->hold_ref_buf = 1;
if (frame_is_intra_only(cm) || cm->error_resilient_mode)
@@ -2090,24 +2056,6 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data,
vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
}
- // If encoded in frame parallel mode, frame context is ready after decoding
- // the frame header.
- if (pbi->frame_parallel_decode && cm->frame_parallel_decoding_mode) {
- VPxWorker *const worker = pbi->frame_worker_owner;
- FrameWorkerData *const frame_worker_data = worker->data1;
- if (cm->refresh_frame_context) {
- context_updated = 1;
- cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
- }
- vp9_frameworker_lock_stats(worker);
- pbi->cur_buf->row = -1;
- pbi->cur_buf->col = -1;
- frame_worker_data->frame_context_ready = 1;
- // Signal the main thread that context is ready.
- vp9_frameworker_signal_stats(worker);
- vp9_frameworker_unlock_stats(worker);
- }
-
if (pbi->tile_worker_data == NULL ||
(tile_cols * tile_rows) != pbi->total_tiles) {
const int num_tile_workers =
diff --git a/libvpx/vp9/decoder/vp9_decodemv.c b/libvpx/vp9/decoder/vp9_decodemv.c
index 1a4152436..0a781413b 100644
--- a/libvpx/vp9/decoder/vp9_decodemv.c
+++ b/libvpx/vp9/decoder/vp9_decodemv.c
@@ -455,12 +455,6 @@ static void dec_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *best_mv,
}
}
-static void fpm_sync(void *const data, int mi_row) {
- VP9Decoder *const pbi = (VP9Decoder *)data;
- vp9_frameworker_wait(pbi->frame_worker_owner, pbi->common.prev_frame,
- mi_row << MI_BLOCK_SIZE_LOG2);
-}
-
// This macro is used to add a motion vector mv_ref list if it isn't
// already in the list. If it's the second motion vector or early_break
// it will also skip all additional processing and jump to Done!
@@ -500,8 +494,7 @@ static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame,
const POSITION *const mv_ref_search,
int_mv *mv_ref_list, int mi_row, int mi_col,
- int block, int is_sub8x8, find_mv_refs_sync sync,
- void *const data) {
+ int block, int is_sub8x8) {
const int *ref_sign_bias = cm->ref_frame_sign_bias;
int i, refmv_count = 0;
int different_ref_found = 0;
@@ -557,23 +550,8 @@ static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
}
}
-// TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast
-// on windows platform. The sync here is unnecessary if use_prev_frame_mvs
-// is 0. But after removing it, there will be hang in the unit test on windows
-// due to several threads waiting for a thread's signal.
-#if defined(_WIN32) && !HAVE_PTHREAD_H
- if (cm->frame_parallel_decode && sync != NULL) {
- sync(data, mi_row);
- }
-#endif
-
// Check the last frame's mode and mv info.
if (prev_frame_mvs) {
- // Synchronize here for frame parallel decode if sync function is provided.
- if (cm->frame_parallel_decode && sync != NULL) {
- sync(data, mi_row);
- }
-
if (prev_frame_mvs->ref_frame[0] == ref_frame) {
ADD_MV_REF_LIST_EB(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done);
} else if (prev_frame_mvs->ref_frame[1] == ref_frame) {
@@ -652,7 +630,7 @@ static void append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
refmv_count =
dec_find_mv_refs(cm, xd, b_mode, mi->ref_frame[ref], mv_ref_search,
- mv_list, mi_row, mi_col, block, 1, NULL, NULL);
+ mv_list, mi_row, mi_col, block, 1);
switch (block) {
case 0: best_sub8x8->as_int = mv_list[refmv_count - 1].as_int; break;
@@ -750,9 +728,8 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi,
const MV_REFERENCE_FRAME frame = mi->ref_frame[ref];
int refmv_count;
- refmv_count =
- dec_find_mv_refs(cm, xd, mi->mode, frame, mv_ref_search, tmp_mvs,
- mi_row, mi_col, -1, 0, fpm_sync, (void *)pbi);
+ refmv_count = dec_find_mv_refs(cm, xd, mi->mode, frame, mv_ref_search,
+ tmp_mvs, mi_row, mi_col, -1, 0);
dec_find_best_ref_mvs(allow_hp, tmp_mvs, &best_ref_mvs[ref],
refmv_count);
diff --git a/libvpx/vp9/decoder/vp9_decoder.c b/libvpx/vp9/decoder/vp9_decoder.c
index 37693f094..a913fa560 100644
--- a/libvpx/vp9/decoder/vp9_decoder.c
+++ b/libvpx/vp9/decoder/vp9_decoder.c
@@ -139,6 +139,7 @@ void vp9_decoder_remove(VP9Decoder *pbi) {
vp9_loop_filter_dealloc(&pbi->lf_row_sync);
}
+ vp9_remove_common(&pbi->common);
vpx_free(pbi);
}
@@ -169,7 +170,7 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9Decoder *pbi,
vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
"Incorrect buffer dimensions");
else
- vp8_yv12_copy_frame(cfg, sd);
+ vpx_yv12_copy_frame(cfg, sd);
} else {
vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Invalid reference frame");
}
@@ -217,7 +218,7 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
"Incorrect buffer dimensions");
} else {
// Overwrite the reference frame buffer.
- vp8_yv12_copy_frame(sd, ref_buf);
+ vpx_yv12_copy_frame(sd, ref_buf);
}
return cm->error.error_code;
@@ -230,7 +231,6 @@ static void swap_frame_buffers(VP9Decoder *pbi) {
BufferPool *const pool = cm->buffer_pool;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
- lock_buffer_pool(pool);
for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
const int old_idx = cm->ref_frame_map[ref_index];
// Current thread releases the holding of reference frame.
@@ -250,15 +250,10 @@ static void swap_frame_buffers(VP9Decoder *pbi) {
decrease_ref_count(old_idx, frame_bufs, pool);
cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
}
- unlock_buffer_pool(pool);
pbi->hold_ref_buf = 0;
cm->frame_to_show = get_frame_new_buffer(cm);
- if (!pbi->frame_parallel_decode || !cm->show_frame) {
- lock_buffer_pool(pool);
- --frame_bufs[cm->new_fb_idx].ref_count;
- unlock_buffer_pool(pool);
- }
+ --frame_bufs[cm->new_fb_idx].ref_count;
// Invalidate these references until the next frame starts.
for (ref_index = 0; ref_index < 3; ref_index++)
@@ -292,11 +287,13 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size,
pbi->ready_for_new_data = 0;
// Check if the previous frame was a frame without any references to it.
- // Release frame buffer if not decoding in frame parallel mode.
- if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0 &&
- frame_bufs[cm->new_fb_idx].ref_count == 0)
+ if (cm->new_fb_idx >= 0 && frame_bufs[cm->new_fb_idx].ref_count == 0 &&
+ !frame_bufs[cm->new_fb_idx].released) {
pool->release_fb_cb(pool->cb_priv,
&frame_bufs[cm->new_fb_idx].raw_frame_buffer);
+ frame_bufs[cm->new_fb_idx].released = 1;
+ }
+
  // Find a free frame buffer. Return an error if none can be found.
cm->new_fb_idx = get_free_fb(cm);
if (cm->new_fb_idx == INVALID_IDX) {
@@ -309,18 +306,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size,
cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
pbi->hold_ref_buf = 0;
- if (pbi->frame_parallel_decode) {
- VPxWorker *const worker = pbi->frame_worker_owner;
- vp9_frameworker_lock_stats(worker);
- frame_bufs[cm->new_fb_idx].frame_worker_owner = worker;
- // Reset decoding progress.
- pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
- pbi->cur_buf->row = -1;
- pbi->cur_buf->col = -1;
- vp9_frameworker_unlock_stats(worker);
- } else {
- pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
- }
+ pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
if (setjmp(cm->error.jmp)) {
const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
@@ -336,7 +322,6 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size,
winterface->sync(&pbi->tile_workers[i]);
}
- lock_buffer_pool(pool);
// Release all the reference buffers if worker thread is holding them.
if (pbi->hold_ref_buf == 1) {
int ref_index = 0, mask;
@@ -361,7 +346,6 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size,
}
// Release current frame.
decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
- unlock_buffer_pool(pool);
vpx_clear_system_state();
return -1;
@@ -377,31 +361,14 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size,
if (!cm->show_existing_frame) {
cm->last_show_frame = cm->show_frame;
cm->prev_frame = cm->cur_frame;
- if (cm->seg.enabled && !pbi->frame_parallel_decode)
- vp9_swap_current_and_last_seg_map(cm);
+ if (cm->seg.enabled) vp9_swap_current_and_last_seg_map(cm);
}
// Update progress in frame parallel decode.
- if (pbi->frame_parallel_decode) {
- // Need to lock the mutex here as another thread may
- // be accessing this buffer.
- VPxWorker *const worker = pbi->frame_worker_owner;
- FrameWorkerData *const frame_worker_data = worker->data1;
- vp9_frameworker_lock_stats(worker);
-
- if (cm->show_frame) {
- cm->current_video_frame++;
- }
- frame_worker_data->frame_decoded = 1;
- frame_worker_data->frame_context_ready = 1;
- vp9_frameworker_signal_stats(worker);
- vp9_frameworker_unlock_stats(worker);
- } else {
- cm->last_width = cm->width;
- cm->last_height = cm->height;
- if (cm->show_frame) {
- cm->current_video_frame++;
- }
+ cm->last_width = cm->width;
+ cm->last_height = cm->height;
+ if (cm->show_frame) {
+ cm->current_video_frame++;
}
cm->error.setjmp = 0;
diff --git a/libvpx/vp9/decoder/vp9_decoder.h b/libvpx/vp9/decoder/vp9_decoder.h
index 427baf1e0..4b26c314d 100644
--- a/libvpx/vp9/decoder/vp9_decoder.h
+++ b/libvpx/vp9/decoder/vp9_decoder.h
@@ -21,7 +21,6 @@
#include "vp9/common/vp9_thread_common.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_ppflags.h"
-#include "vp9/decoder/vp9_dthread.h"
#ifdef __cplusplus
extern "C" {
@@ -53,13 +52,10 @@ typedef struct VP9Decoder {
int refresh_frame_flags;
- int frame_parallel_decode; // frame-based threading.
-
// TODO(hkuang): Combine this with cur_buf in macroblockd as they are
// the same.
RefCntBuffer *cur_buf; // Current decoding frame buffer.
- VPxWorker *frame_worker_owner; // frame_worker that owns this pbi.
VPxWorker lf_worker;
VPxWorker *tile_workers;
TileWorkerData *tile_worker_data;
@@ -121,9 +117,10 @@ static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
// But the private buffer is not set up until finish decoding header.
// So any error happens during decoding header, the frame_bufs will not
// have valid priv buffer.
- if (frame_bufs[idx].ref_count == 0 &&
+ if (!frame_bufs[idx].released && frame_bufs[idx].ref_count == 0 &&
frame_bufs[idx].raw_frame_buffer.priv) {
pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer);
+ frame_bufs[idx].released = 1;
}
}
}
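With the buffer-pool mutex gone, the new `released` flag is what keeps a raw frame buffer from being handed back to the application twice. A condensed sketch of the release-once idiom used in both decrease_ref_count and vp9_receive_compressed_data, with the surrounding types simplified:

typedef struct {
  int ref_count;
  int released;
  void *raw_frame_buffer;  /* stands in for the real frame-buffer struct */
} ref_cnt_buffer;

static void release_if_unused(ref_cnt_buffer *buf,
                              void (*release_cb)(void *priv, void *raw),
                              void *priv) {
  if (buf->ref_count == 0 && !buf->released && buf->raw_frame_buffer) {
    release_cb(priv, buf->raw_frame_buffer);
    buf->released = 1;  /* never release the same buffer twice */
  }
}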
diff --git a/libvpx/vp9/decoder/vp9_dthread.c b/libvpx/vp9/decoder/vp9_dthread.c
deleted file mode 100644
index 52bc2a0f6..000000000
--- a/libvpx/vp9/decoder/vp9_dthread.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_config.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_reconinter.h"
-#include "vp9/decoder/vp9_dthread.h"
-#include "vp9/decoder/vp9_decoder.h"
-
-// #define DEBUG_THREAD
-
-// TODO(hkuang): Clean up all the #ifdef in this file.
-void vp9_frameworker_lock_stats(VPxWorker *const worker) {
-#if CONFIG_MULTITHREAD
- FrameWorkerData *const worker_data = worker->data1;
- pthread_mutex_lock(&worker_data->stats_mutex);
-#else
- (void)worker;
-#endif
-}
-
-void vp9_frameworker_unlock_stats(VPxWorker *const worker) {
-#if CONFIG_MULTITHREAD
- FrameWorkerData *const worker_data = worker->data1;
- pthread_mutex_unlock(&worker_data->stats_mutex);
-#else
- (void)worker;
-#endif
-}
-
-void vp9_frameworker_signal_stats(VPxWorker *const worker) {
-#if CONFIG_MULTITHREAD
- FrameWorkerData *const worker_data = worker->data1;
-
-// TODO(hkuang): Fix the pthread_cond_broadcast in windows wrapper.
-#if defined(_WIN32) && !HAVE_PTHREAD_H
- pthread_cond_signal(&worker_data->stats_cond);
-#else
- pthread_cond_broadcast(&worker_data->stats_cond);
-#endif
-
-#else
- (void)worker;
-#endif
-}
-
-// This macro prevents thread_sanitizer from reporting known concurrent writes.
-#if defined(__has_feature)
-#if __has_feature(thread_sanitizer)
-#define BUILDING_WITH_TSAN
-#endif
-#endif
-
-// TODO(hkuang): Remove worker parameter as it is only used in debug code.
-void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf,
- int row) {
-#if CONFIG_MULTITHREAD
- if (!ref_buf) return;
-
-#ifndef BUILDING_WITH_TSAN
- // The following line of code will get harmless tsan error but it is the key
- // to get best performance.
- if (ref_buf->row >= row && ref_buf->buf.corrupted != 1) return;
-#endif
-
- {
- // Find the worker thread that owns the reference frame. If the reference
- // frame has been fully decoded, it may not have owner.
- VPxWorker *const ref_worker = ref_buf->frame_worker_owner;
- FrameWorkerData *const ref_worker_data =
- (FrameWorkerData *)ref_worker->data1;
- const VP9Decoder *const pbi = ref_worker_data->pbi;
-
-#ifdef DEBUG_THREAD
- {
- FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
- printf("%d %p worker is waiting for %d %p worker (%d) ref %d \r\n",
- worker_data->worker_id, worker, ref_worker_data->worker_id,
- ref_buf->frame_worker_owner, row, ref_buf->row);
- }
-#endif
-
- vp9_frameworker_lock_stats(ref_worker);
- while (ref_buf->row < row && pbi->cur_buf == ref_buf &&
- ref_buf->buf.corrupted != 1) {
- pthread_cond_wait(&ref_worker_data->stats_cond,
- &ref_worker_data->stats_mutex);
- }
-
- if (ref_buf->buf.corrupted == 1) {
- FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
- vp9_frameworker_unlock_stats(ref_worker);
- vpx_internal_error(&worker_data->pbi->common.error,
- VPX_CODEC_CORRUPT_FRAME,
- "Worker %p failed to decode frame", worker);
- }
- vp9_frameworker_unlock_stats(ref_worker);
- }
-#else
- (void)worker;
- (void)ref_buf;
- (void)row;
- (void)ref_buf;
-#endif // CONFIG_MULTITHREAD
-}
-
-void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row) {
-#if CONFIG_MULTITHREAD
- VPxWorker *worker = buf->frame_worker_owner;
-
-#ifdef DEBUG_THREAD
- {
- FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
- printf("%d %p worker decode to (%d) \r\n", worker_data->worker_id,
- buf->frame_worker_owner, row);
- }
-#endif
-
- vp9_frameworker_lock_stats(worker);
- buf->row = row;
- vp9_frameworker_signal_stats(worker);
- vp9_frameworker_unlock_stats(worker);
-#else
- (void)buf;
- (void)row;
-#endif // CONFIG_MULTITHREAD
-}
-
-void vp9_frameworker_copy_context(VPxWorker *const dst_worker,
- VPxWorker *const src_worker) {
-#if CONFIG_MULTITHREAD
- FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1;
- FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1;
- VP9_COMMON *const src_cm = &src_worker_data->pbi->common;
- VP9_COMMON *const dst_cm = &dst_worker_data->pbi->common;
- int i;
-
- // Wait until source frame's context is ready.
- vp9_frameworker_lock_stats(src_worker);
- while (!src_worker_data->frame_context_ready) {
- pthread_cond_wait(&src_worker_data->stats_cond,
- &src_worker_data->stats_mutex);
- }
-
- dst_cm->last_frame_seg_map = src_cm->seg.enabled
- ? src_cm->current_frame_seg_map
- : src_cm->last_frame_seg_map;
- dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync;
- vp9_frameworker_unlock_stats(src_worker);
-
- dst_cm->bit_depth = src_cm->bit_depth;
-#if CONFIG_VP9_HIGHBITDEPTH
- dst_cm->use_highbitdepth = src_cm->use_highbitdepth;
-#endif
- dst_cm->prev_frame =
- src_cm->show_existing_frame ? src_cm->prev_frame : src_cm->cur_frame;
- dst_cm->last_width =
- !src_cm->show_existing_frame ? src_cm->width : src_cm->last_width;
- dst_cm->last_height =
- !src_cm->show_existing_frame ? src_cm->height : src_cm->last_height;
- dst_cm->subsampling_x = src_cm->subsampling_x;
- dst_cm->subsampling_y = src_cm->subsampling_y;
- dst_cm->frame_type = src_cm->frame_type;
- dst_cm->last_show_frame = !src_cm->show_existing_frame
- ? src_cm->show_frame
- : src_cm->last_show_frame;
- for (i = 0; i < REF_FRAMES; ++i)
- dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i];
-
- memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr,
- (MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh));
- dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level;
- dst_cm->lf.filter_level = src_cm->lf.filter_level;
- memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, MAX_REF_LF_DELTAS);
- memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
- dst_cm->seg = src_cm->seg;
- memcpy(dst_cm->frame_contexts, src_cm->frame_contexts,
- FRAME_CONTEXTS * sizeof(dst_cm->frame_contexts[0]));
-#else
- (void)dst_worker;
- (void)src_worker;
-#endif // CONFIG_MULTITHREAD
-}
diff --git a/libvpx/vp9/decoder/vp9_dthread.h b/libvpx/vp9/decoder/vp9_dthread.h
deleted file mode 100644
index fce0fe7fe..000000000
--- a/libvpx/vp9/decoder/vp9_dthread.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_DECODER_VP9_DTHREAD_H_
-#define VP9_DECODER_VP9_DTHREAD_H_
-
-#include "./vpx_config.h"
-#include "vpx_util/vpx_thread.h"
-#include "vpx/internal/vpx_codec_internal.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct VP9Common;
-struct VP9Decoder;
-
-// WorkerData for the FrameWorker thread. It contains all the information of
-// the worker and decode structures for decoding a frame.
-typedef struct FrameWorkerData {
- struct VP9Decoder *pbi;
- const uint8_t *data;
- const uint8_t *data_end;
- size_t data_size;
- void *user_priv;
- int result;
- int worker_id;
- int received_frame;
-
- // scratch_buffer is used in frame parallel mode only.
- // It is used to make a copy of the compressed data.
- uint8_t *scratch_buffer;
- size_t scratch_buffer_size;
-
-#if CONFIG_MULTITHREAD
- pthread_mutex_t stats_mutex;
- pthread_cond_t stats_cond;
-#endif
-
- int frame_context_ready; // Current frame's context is ready to read.
- int frame_decoded; // Finished decoding current frame.
-} FrameWorkerData;
-
-void vp9_frameworker_lock_stats(VPxWorker *const worker);
-void vp9_frameworker_unlock_stats(VPxWorker *const worker);
-void vp9_frameworker_signal_stats(VPxWorker *const worker);
-
-// Wait until ref_buf has been decoded to row in real pixel unit.
-// Note: worker may already finish decoding ref_buf and release it in order to
-// start decoding next frame. So need to check whether worker is still decoding
-// ref_buf.
-void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf,
- int row);
-
-// FrameWorker broadcasts its decoding progress so other workers that are
-// waiting on it can resume decoding.
-void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row);
-
-// Copy necessary decoding context from src worker to dst worker.
-void vp9_frameworker_copy_context(VPxWorker *const dst_worker,
- VPxWorker *const src_worker);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // VP9_DECODER_VP9_DTHREAD_H_
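The deleted vp9_dthread.{c,h} pair also held the producer half of the row-progress protocol sketched earlier: vp9_frameworker_broadcast published how far decode had gotten and woke any frame workers waiting on that buffer. A plain-pthreads sketch of that side, again with illustrative names:

#include <pthread.h>

typedef struct {
  pthread_mutex_t mutex;
  pthread_cond_t cond;
  int rows_done;
} row_progress;

static void publish_rows(row_progress *p, int row) {
  pthread_mutex_lock(&p->mutex);
  p->rows_done = row;
  pthread_cond_broadcast(&p->cond);  /* wake every waiting consumer */
  pthread_mutex_unlock(&p->mutex);
}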
diff --git a/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c
new file mode 100644
index 000000000..e46f789ba
--- /dev/null
+++ b/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c
@@ -0,0 +1,843 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_scale/yv12config.h"
+
+// Note: The scaling functions could write extra rows and columns in dst, which
+// exceed the right and bottom boundaries of the destination frame. We rely on
+// the following frame extension function to fix these rows and columns.
+
+static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src,
+ const int src_stride,
+ uint8_t *dst,
+ const int dst_stride, const int w,
+ const int h) {
+ const int max_width = (w + 15) & ~15;
+ int y = h;
+
+ assert(w && h);
+
+ do {
+ int x = max_width;
+ do {
+ const uint8x16x2_t s = vld2q_u8(src);
+ vst1q_u8(dst, s.val[0]);
+ src += 32;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src += 2 * (src_stride - max_width);
+ dst += dst_stride - max_width;
+ } while (--y);
+}
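With phase_scaler == 0 no filtering is needed at these ratios; the output is simply every second source pixel in each direction. vld2q_u8 above de-interleaves 32 source bytes so that val[0] holds exactly the 16 even-indexed pixels. The scalar equivalent, as a sketch:

#include <stdint.h>

static void scale_2_to_1_phase_0_c(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride,
                                   int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y)
    for (x = 0; x < w; ++x)
      dst[y * dst_stride + x] = src[2 * y * src_stride + 2 * x];
}

The 4-to-1 variant below is the same decimation with a step of 4, which is why it uses vld4q_u8.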
+
+static INLINE void scale_plane_4_to_1_phase_0(const uint8_t *src,
+ const int src_stride,
+ uint8_t *dst,
+ const int dst_stride, const int w,
+ const int h) {
+ const int max_width = (w + 15) & ~15;
+ int y = h;
+
+ assert(w && h);
+
+ do {
+ int x = max_width;
+ do {
+ const uint8x16x4_t s = vld4q_u8(src);
+ vst1q_u8(dst, s.val[0]);
+ src += 64;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src += 4 * (src_stride - max_width);
+ dst += dst_stride - max_width;
+ } while (--y);
+}
+
+static INLINE void scale_plane_bilinear_kernel(
+ const uint8x16_t in0, const uint8x16_t in1, const uint8x16_t in2,
+ const uint8x16_t in3, const uint8x8_t coef0, const uint8x8_t coef1,
+ uint8_t *const dst) {
+ const uint16x8_t h0 = vmull_u8(vget_low_u8(in0), coef0);
+ const uint16x8_t h1 = vmull_u8(vget_high_u8(in0), coef0);
+ const uint16x8_t h2 = vmull_u8(vget_low_u8(in2), coef0);
+ const uint16x8_t h3 = vmull_u8(vget_high_u8(in2), coef0);
+ const uint16x8_t h4 = vmlal_u8(h0, vget_low_u8(in1), coef1);
+ const uint16x8_t h5 = vmlal_u8(h1, vget_high_u8(in1), coef1);
+ const uint16x8_t h6 = vmlal_u8(h2, vget_low_u8(in3), coef1);
+ const uint16x8_t h7 = vmlal_u8(h3, vget_high_u8(in3), coef1);
+
+ const uint8x8_t hor0 = vrshrn_n_u16(h4, 7); // temp: 00 01 02 03 04 05 06 07
+ const uint8x8_t hor1 = vrshrn_n_u16(h5, 7); // temp: 08 09 0A 0B 0C 0D 0E 0F
+ const uint8x8_t hor2 = vrshrn_n_u16(h6, 7); // temp: 10 11 12 13 14 15 16 17
+ const uint8x8_t hor3 = vrshrn_n_u16(h7, 7); // temp: 18 19 1A 1B 1C 1D 1E 1F
+ const uint16x8_t v0 = vmull_u8(hor0, coef0);
+ const uint16x8_t v1 = vmull_u8(hor1, coef0);
+ const uint16x8_t v2 = vmlal_u8(v0, hor2, coef1);
+ const uint16x8_t v3 = vmlal_u8(v1, hor3, coef1);
+ // dst: 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ const uint8x16_t d = vcombine_u8(vrshrn_n_u16(v2, 7), vrshrn_n_u16(v3, 7));
+ vst1q_u8(dst, d);
+}
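In scalar terms the kernel above is two 2-tap blends, each computed as (a * coef0 + b * coef1 + 64) >> 7, matching the vrshrn_n_u16(..., 7) rounding narrows and assuming coef0 + coef1 == 128 as in the 7-bit bilinear kernels. A sketch:

static uint8_t blend_7bit(uint8_t a, uint8_t b, int c0, int c1) {
  return (uint8_t)((a * c0 + b * c1 + 64) >> 7);
}

/* Horizontal blend of each input row, then a vertical blend of the two
 * intermediate rows with the same coefficients. */
static uint8_t bilinear_sample(uint8_t tl, uint8_t tr, uint8_t bl,
                               uint8_t br, int c0, int c1) {
  const uint8_t top = blend_7bit(tl, tr, c0, c1);
  const uint8_t bot = blend_7bit(bl, br, c0, c1);
  return blend_7bit(top, bot, c0, c1);
}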
+
+static INLINE void scale_plane_2_to_1_bilinear(
+ const uint8_t *const src, const int src_stride, uint8_t *dst,
+ const int dst_stride, const int w, const int h, const int16_t c0,
+ const int16_t c1) {
+ const int max_width = (w + 15) & ~15;
+ const uint8_t *src0 = src;
+ const uint8_t *src1 = src + src_stride;
+ const uint8x8_t coef0 = vdup_n_u8(c0);
+ const uint8x8_t coef1 = vdup_n_u8(c1);
+ int y = h;
+
+ assert(w && h);
+
+ do {
+ int x = max_width;
+ do {
+ // 000 002 004 006 008 00A 00C 00E 010 012 014 016 018 01A 01C 01E
+ // 001 003 005 007 009 00B 00D 00F 011 013 015 017 019 01B 01D 01F
+ const uint8x16x2_t s0 = vld2q_u8(src0);
+ // 100 102 104 106 108 10A 10C 10E 110 112 114 116 118 11A 11C 11E
+ // 101 103 105 107 109 10B 10D 10F 111 113 115 117 119 11B 11D 11F
+ const uint8x16x2_t s1 = vld2q_u8(src1);
+ scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1],
+ coef0, coef1, dst);
+ src0 += 32;
+ src1 += 32;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src0 += 2 * (src_stride - max_width);
+ src1 += 2 * (src_stride - max_width);
+ dst += dst_stride - max_width;
+ } while (--y);
+}
+
+static INLINE void scale_plane_4_to_1_bilinear(
+ const uint8_t *const src, const int src_stride, uint8_t *dst,
+ const int dst_stride, const int w, const int h, const int16_t c0,
+ const int16_t c1) {
+ const int max_width = (w + 15) & ~15;
+ const uint8_t *src0 = src;
+ const uint8_t *src1 = src + src_stride;
+ const uint8x8_t coef0 = vdup_n_u8(c0);
+ const uint8x8_t coef1 = vdup_n_u8(c1);
+ int y = h;
+
+ assert(w && h);
+
+ do {
+ int x = max_width;
+ do {
+ // (*) -- useless
+ // 000 004 008 00C 010 014 018 01C 020 024 028 02C 030 034 038 03C
+ // 001 005 009 00D 011 015 019 01D 021 025 029 02D 031 035 039 03D
+ // 002 006 00A 00E 012 016 01A 01E 022 026 02A 02E 032 036 03A 03E (*)
+ // 003 007 00B 00F 013 017 01B 01F 023 027 02B 02F 033 037 03B 03F (*)
+ const uint8x16x4_t s0 = vld4q_u8(src0);
+ // 100 104 108 10C 110 114 118 11C 120 124 128 12C 130 134 138 13C
+ // 101 105 109 10D 111 115 119 11D 121 125 129 12D 131 135 139 13D
+ // 102 106 10A 10E 112 116 11A 11E 122 126 12A 12E 132 136 13A 13E (*)
+ // 103 107 10B 10F 113 117 11B 11F 123 127 12B 12F 133 137 13B 13F (*)
+ const uint8x16x4_t s1 = vld4q_u8(src1);
+ scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1],
+ coef0, coef1, dst);
+ src0 += 64;
+ src1 += 64;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src0 += 4 * (src_stride - max_width);
+ src1 += 4 * (src_stride - max_width);
+ dst += dst_stride - max_width;
+ } while (--y);
+}
+
+static INLINE uint8x8_t scale_filter_bilinear(const uint8x8_t *const s,
+ const uint8x8_t *const coef) {
+ const uint16x8_t h0 = vmull_u8(s[0], coef[0]);
+ const uint16x8_t h1 = vmlal_u8(h0, s[1], coef[1]);
+
+ return vrshrn_n_u16(h1, 7);
+}
+
+static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const int16_t *const coef,
+ uint8_t *const temp_buffer) {
+ const int width_hor = (w + 3) & ~3;
+ const int width_ver = (w + 7) & ~7;
+ const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+ const int height_ver = (h + 3) & ~3;
+ const int16x8_t filters = vld1q_s16(coef);
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ uint8x8_t s[14], d[4];
+
+ assert(w && h);
+
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;
+
+ // horizontal 4x8
+ // Note: processing 4x8 is about 20% faster than processing row by row using
+ // vld4_u8().
+ do {
+ load_u8_8x8(src + 2, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+ transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
+ x = width_hor;
+
+ do {
+ src += 8;
+ load_u8_8x8(src, src_stride, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11],
+ &s[12], &s[13]);
+ transpose_u8_8x8(&s[6], &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
+ &s[13]);
+
+ d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70
+ d[1] = scale_filter_8(&s[2], filters); // 01 11 21 31 41 51 61 71
+ d[2] = scale_filter_8(&s[4], filters); // 02 12 22 32 42 52 62 72
+ d[3] = scale_filter_8(&s[6], filters); // 03 13 23 33 43 53 63 73
+ // 00 01 02 03 40 41 42 43
+ // 10 11 12 13 50 51 52 53
+ // 20 21 22 23 60 61 62 63
+ // 30 31 32 33 70 71 72 73
+ transpose_u8_8x4(&d[0], &d[1], &d[2], &d[3]);
+ vst1_lane_u32((uint32_t *)(t + 0 * width_hor), vreinterpret_u32_u8(d[0]),
+ 0);
+ vst1_lane_u32((uint32_t *)(t + 1 * width_hor), vreinterpret_u32_u8(d[1]),
+ 0);
+ vst1_lane_u32((uint32_t *)(t + 2 * width_hor), vreinterpret_u32_u8(d[2]),
+ 0);
+ vst1_lane_u32((uint32_t *)(t + 3 * width_hor), vreinterpret_u32_u8(d[3]),
+ 0);
+ vst1_lane_u32((uint32_t *)(t + 4 * width_hor), vreinterpret_u32_u8(d[0]),
+ 1);
+ vst1_lane_u32((uint32_t *)(t + 5 * width_hor), vreinterpret_u32_u8(d[1]),
+ 1);
+ vst1_lane_u32((uint32_t *)(t + 6 * width_hor), vreinterpret_u32_u8(d[2]),
+ 1);
+ vst1_lane_u32((uint32_t *)(t + 7 * width_hor), vreinterpret_u32_u8(d[3]),
+ 1);
+
+ s[0] = s[8];
+ s[1] = s[9];
+ s[2] = s[10];
+ s[3] = s[11];
+ s[4] = s[12];
+ s[5] = s[13];
+
+ t += 4;
+ x -= 4;
+ } while (x);
+ src += 8 * src_stride - 2 * width_hor;
+ t += 7 * width_hor;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x4
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ load_u8_8x8(t, width_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+ &s[7]);
+ t += 6 * width_hor;
+ y = height_ver;
+
+ do {
+ load_u8_8x8(t, width_hor, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11],
+ &s[12], &s[13]);
+ t += 8 * width_hor;
+
+ d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07
+ d[1] = scale_filter_8(&s[2], filters); // 10 11 12 13 14 15 16 17
+ d[2] = scale_filter_8(&s[4], filters); // 20 21 22 23 24 25 26 27
+ d[3] = scale_filter_8(&s[6], filters); // 30 31 32 33 34 35 36 37
+ vst1_u8(dst + 0 * dst_stride, d[0]);
+ vst1_u8(dst + 1 * dst_stride, d[1]);
+ vst1_u8(dst + 2 * dst_stride, d[2]);
+ vst1_u8(dst + 3 * dst_stride, d[3]);
+
+ s[0] = s[8];
+ s[1] = s[9];
+ s[2] = s[10];
+ s[3] = s[11];
+ s[4] = s[12];
+ s[5] = s[13];
+
+ dst += 4 * dst_stride;
+ y -= 4;
+ } while (y);
+ t -= width_hor * (2 * height_ver + 6);
+ t += 8;
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
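Underneath the 4x8 transposes, each output pixel of this path is an 8-tap convolution evaluated at every second source position: one horizontal pass into temp_buffer, then the same scheme vertically into dst. A scalar sketch of one pass, assuming the usual 7-bit 8-tap kernels and the pointer back-off performed at the top of the function above:

#include <stdint.h>

static uint8_t convolve8_at(const uint8_t *s, const int16_t *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += s[k] * filter[k];
  sum = (sum + 64) >> 7;  /* round, 7-bit kernel; then clamp to 8 bits */
  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
}

/* src is assumed to already point half a tap-width before the first
 * sample, mirroring the src adjustment in the function above. */
static void scale_row_2_to_1(const uint8_t *src, uint8_t *dst, int w,
                             const int16_t *filter) {
  int x;
  for (x = 0; x < w; ++x) dst[x] = convolve8_at(src + 2 * x, filter);
}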
+
+static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const int16_t *const coef,
+ uint8_t *const temp_buffer) {
+ const int width_hor = (w + 1) & ~1;
+ const int width_ver = (w + 7) & ~7;
+ const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+ const int height_ver = (h + 1) & ~1;
+ const int16x8_t filters = vld1q_s16(coef);
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ uint8x8_t s[12], d[2];
+
+ assert(w && h);
+
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3;
+
+ // horizontal 2x8
+ // Note: processing 2x8 is about 20% faster than processing row by row using
+ // vld4_u8().
+ do {
+ load_u8_8x8(src + 4, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+ transpose_u8_4x8(&s[0], &s[1], &s[2], &s[3], s[4], s[5], s[6], s[7]);
+ x = width_hor;
+
+ do {
+ uint8x8x2_t dd;
+ src += 8;
+ load_u8_8x8(src, src_stride, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9],
+ &s[10], &s[11]);
+ transpose_u8_8x8(&s[4], &s[5], &s[6], &s[7], &s[8], &s[9], &s[10],
+ &s[11]);
+
+ d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70
+ d[1] = scale_filter_8(&s[4], filters); // 01 11 21 31 41 51 61 71
+ // dd.val[0]: 00 01 20 21 40 41 60 61
+ // dd.val[1]: 10 11 30 31 50 51 70 71
+ dd = vtrn_u8(d[0], d[1]);
+ vst1_lane_u16((uint16_t *)(t + 0 * width_hor),
+ vreinterpret_u16_u8(dd.val[0]), 0);
+ vst1_lane_u16((uint16_t *)(t + 1 * width_hor),
+ vreinterpret_u16_u8(dd.val[1]), 0);
+ vst1_lane_u16((uint16_t *)(t + 2 * width_hor),
+ vreinterpret_u16_u8(dd.val[0]), 1);
+ vst1_lane_u16((uint16_t *)(t + 3 * width_hor),
+ vreinterpret_u16_u8(dd.val[1]), 1);
+ vst1_lane_u16((uint16_t *)(t + 4 * width_hor),
+ vreinterpret_u16_u8(dd.val[0]), 2);
+ vst1_lane_u16((uint16_t *)(t + 5 * width_hor),
+ vreinterpret_u16_u8(dd.val[1]), 2);
+ vst1_lane_u16((uint16_t *)(t + 6 * width_hor),
+ vreinterpret_u16_u8(dd.val[0]), 3);
+ vst1_lane_u16((uint16_t *)(t + 7 * width_hor),
+ vreinterpret_u16_u8(dd.val[1]), 3);
+
+ s[0] = s[8];
+ s[1] = s[9];
+ s[2] = s[10];
+ s[3] = s[11];
+
+ t += 2;
+ x -= 2;
+ } while (x);
+ src += 8 * src_stride - 4 * width_hor;
+ t += 7 * width_hor;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x2
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ load_u8_8x4(t, width_hor, &s[0], &s[1], &s[2], &s[3]);
+ t += 4 * width_hor;
+ y = height_ver;
+
+ do {
+ load_u8_8x8(t, width_hor, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9],
+ &s[10], &s[11]);
+ t += 8 * width_hor;
+
+ d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07
+ d[1] = scale_filter_8(&s[4], filters); // 10 11 12 13 14 15 16 17
+ vst1_u8(dst + 0 * dst_stride, d[0]);
+ vst1_u8(dst + 1 * dst_stride, d[1]);
+
+ s[0] = s[8];
+ s[1] = s[9];
+ s[2] = s[10];
+ s[3] = s[11];
+
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ t -= width_hor * (4 * height_ver + 4);
+ t += 8;
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+// Notes for 4 to 3 scaling:
+//
+// 1. 6 rows are calculated in each horizontal inner loop, so width_hor must be
+//    a multiple of 6, and no less than w.
+//
+// 2. 8 rows are calculated in each vertical inner loop, so width_ver must be
+//    a multiple of 8, and no less than w.
+//
+// 3. 8 columns are calculated in each horizontal inner loop for further
+//    vertical scaling, so height_hor must be a multiple of 8, and no less
+//    than 4 * h / 3.
+//
+// 4. 6 columns are calculated in each vertical inner loop, so height_ver must
+//    be a multiple of 6, and no less than h.
+//
+// 5. The physical location of the last row of the 4 to 3 scaled frame is
+// decided by phase_scaler, and is always less than 1 pixel below the last row
+// of the original image.
+
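A quick numeric check of the sizing rules above, for the general (8-tap) variant with an illustrative w = 100, h = 60 (not values taken from the library):

#include <stdio.h>

int main(void) {
  const int w = 100, h = 60, taps = 8;  /* taps stands in for SUBPEL_TAPS */
  const int width_hor = (w + 5) - ((w + 5) % 6);           /* 102 */
  const int width_ver = (w + 7) & ~7;                      /* 104 */
  const int height_hor = (4 * h / 3 + taps - 1 + 7) & ~7;  /* 88  */
  const int height_ver = (h + 5) - ((h + 5) % 6);          /* 60  */
  printf("%d %d %d %d\n", width_hor, width_ver, height_hor, height_ver);
  return 0;
}

All four values satisfy notes 1 through 4: multiples of 6, 8, 8, and 6 respectively, and no smaller than the stated minimums.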
+static void scale_plane_4_to_3_bilinear(const uint8_t *src,
+ const int src_stride, uint8_t *dst,
+ const int dst_stride, const int w,
+ const int h, const int phase_scaler,
+ uint8_t *const temp_buffer) {
+ static const int step_q4 = 16 * 4 / 3;
+ const int width_hor = (w + 5) - ((w + 5) % 6);
+ const int stride_hor = width_hor + 2; // store 2 extra pixels
+ const int width_ver = (w + 7) & ~7;
+ // We only need 1 extra row below because there are only 2 bilinear
+ // coefficients.
+ const int height_hor = (4 * h / 3 + 1 + 7) & ~7;
+ const int height_ver = (h + 5) - ((h + 5) % 6);
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ uint8x8_t s[9], d[8], c[6];
+
+ assert(w && h);
+
+ c[0] = vdup_n_u8((uint8_t)vp9_filter_kernels[BILINEAR][phase_scaler][3]);
+ c[1] = vdup_n_u8((uint8_t)vp9_filter_kernels[BILINEAR][phase_scaler][4]);
+ c[2] = vdup_n_u8(
+ (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 1 * step_q4) &
+ SUBPEL_MASK][3]);
+ c[3] = vdup_n_u8(
+ (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 1 * step_q4) &
+ SUBPEL_MASK][4]);
+ c[4] = vdup_n_u8(
+ (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 2 * step_q4) &
+ SUBPEL_MASK][3]);
+ c[5] = vdup_n_u8(
+ (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 2 * step_q4) &
+ SUBPEL_MASK][4]);
+
+ d[6] = vdup_n_u8(0);
+ d[7] = vdup_n_u8(0);
+
+ // horizontal 6x8
+ do {
+ load_u8_8x8(src, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+ src += 1;
+ transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
+ x = width_hor;
+
+ do {
+ load_u8_8x8(src, src_stride, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+ &s[7], &s[8]);
+ src += 8;
+ transpose_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7], &s[8]);
+
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ d[0] = scale_filter_bilinear(&s[0], &c[0]);
+ d[1] =
+ scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]);
+ d[2] =
+ scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]);
+ d[3] = scale_filter_bilinear(&s[4], &c[0]);
+ d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)],
+ &c[2]);
+ d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)],
+ &c[4]);
+
+ // 00 01 02 03 04 05 xx xx
+ // 10 11 12 13 14 15 xx xx
+ // 20 21 22 23 24 25 xx xx
+ // 30 31 32 33 34 35 xx xx
+ // 40 41 42 43 44 45 xx xx
+ // 50 51 52 53 54 55 xx xx
+ // 60 61 62 63 64 65 xx xx
+ // 70 71 72 73 74 75 xx xx
+ transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+ // store 2 extra pixels
+ vst1_u8(t + 0 * stride_hor, d[0]);
+ vst1_u8(t + 1 * stride_hor, d[1]);
+ vst1_u8(t + 2 * stride_hor, d[2]);
+ vst1_u8(t + 3 * stride_hor, d[3]);
+ vst1_u8(t + 4 * stride_hor, d[4]);
+ vst1_u8(t + 5 * stride_hor, d[5]);
+ vst1_u8(t + 6 * stride_hor, d[6]);
+ vst1_u8(t + 7 * stride_hor, d[7]);
+
+ s[0] = s[8];
+
+ t += 6;
+ x -= 6;
+ } while (x);
+ src += 8 * src_stride - 4 * width_hor / 3 - 1;
+ t += 7 * stride_hor + 2;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x6
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+ &s[7]);
+ t += stride_hor;
+ y = height_ver;
+
+ do {
+ load_u8_8x8(t, stride_hor, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+ &s[7], &s[8]);
+ t += 8 * stride_hor;
+
+ d[0] = scale_filter_bilinear(&s[0], &c[0]);
+ d[1] =
+ scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]);
+ d[2] =
+ scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]);
+ d[3] = scale_filter_bilinear(&s[4], &c[0]);
+ d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)],
+ &c[2]);
+ d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)],
+ &c[4]);
+ vst1_u8(dst + 0 * dst_stride, d[0]);
+ vst1_u8(dst + 1 * dst_stride, d[1]);
+ vst1_u8(dst + 2 * dst_stride, d[2]);
+ vst1_u8(dst + 3 * dst_stride, d[3]);
+ vst1_u8(dst + 4 * dst_stride, d[4]);
+ vst1_u8(dst + 5 * dst_stride, d[5]);
+
+ s[0] = s[8];
+
+ dst += 6 * dst_stride;
+ y -= 6;
+ } while (y);
+ t -= stride_hor * (4 * height_ver / 3 + 1);
+ t += 8;
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const InterpKernel *const coef,
+ const int phase_scaler,
+ uint8_t *const temp_buffer) {
+ static const int step_q4 = 16 * 4 / 3;
+ const int width_hor = (w + 5) - ((w + 5) % 6);
+ const int stride_hor = width_hor + 2; // store 2 extra pixels
+ const int width_ver = (w + 7) & ~7;
+ // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
+ // above and (SUBPEL_TAPS / 2) extra rows below.
+ const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+ const int height_ver = (h + 5) - ((h + 5) % 6);
+ const int16x8_t filters0 =
+ vld1q_s16(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK]);
+ const int16x8_t filters1 =
+ vld1q_s16(coef[(phase_scaler + 1 * step_q4) & SUBPEL_MASK]);
+ const int16x8_t filters2 =
+ vld1q_s16(coef[(phase_scaler + 2 * step_q4) & SUBPEL_MASK]);
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ uint8x8_t s[15], d[8];
+
+ assert(w && h);
+
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2;
+ d[6] = vdup_n_u8(0);
+ d[7] = vdup_n_u8(0);
+
+ // horizontal 6x8
+ do {
+ load_u8_8x8(src + 1, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+ transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
+ x = width_hor;
+
+ do {
+ src += 8;
+ load_u8_8x8(src, src_stride, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
+ &s[13], &s[14]);
+ transpose_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11], &s[12], &s[13],
+ &s[14]);
+
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ d[0] = scale_filter_8(&s[0], filters0);
+ d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1);
+ d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2);
+ d[3] = scale_filter_8(&s[4], filters0);
+ d[4] =
+ scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1);
+ d[5] =
+ scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2);
+
+ // 00 01 02 03 04 05 xx xx
+ // 10 11 12 13 14 15 xx xx
+ // 20 21 22 23 24 25 xx xx
+ // 30 31 32 33 34 35 xx xx
+ // 40 41 42 43 44 45 xx xx
+ // 50 51 52 53 54 55 xx xx
+ // 60 61 62 63 64 65 xx xx
+ // 70 71 72 73 74 75 xx xx
+ transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+ // store 2 extra pixels
+ vst1_u8(t + 0 * stride_hor, d[0]);
+ vst1_u8(t + 1 * stride_hor, d[1]);
+ vst1_u8(t + 2 * stride_hor, d[2]);
+ vst1_u8(t + 3 * stride_hor, d[3]);
+ vst1_u8(t + 4 * stride_hor, d[4]);
+ vst1_u8(t + 5 * stride_hor, d[5]);
+ vst1_u8(t + 6 * stride_hor, d[6]);
+ vst1_u8(t + 7 * stride_hor, d[7]);
+
+ s[0] = s[8];
+ s[1] = s[9];
+ s[2] = s[10];
+ s[3] = s[11];
+ s[4] = s[12];
+ s[5] = s[13];
+ s[6] = s[14];
+
+ t += 6;
+ x -= 6;
+ } while (x);
+ src += 8 * src_stride - 4 * width_hor / 3;
+ t += 7 * stride_hor + 2;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x6
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+ &s[7]);
+ t += 7 * stride_hor;
+ y = height_ver;
+
+ do {
+ load_u8_8x8(t, stride_hor, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
+ &s[13], &s[14]);
+ t += 8 * stride_hor;
+
+ d[0] = scale_filter_8(&s[0], filters0);
+ d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1);
+ d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2);
+ d[3] = scale_filter_8(&s[4], filters0);
+ d[4] =
+ scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1);
+ d[5] =
+ scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2);
+ vst1_u8(dst + 0 * dst_stride, d[0]);
+ vst1_u8(dst + 1 * dst_stride, d[1]);
+ vst1_u8(dst + 2 * dst_stride, d[2]);
+ vst1_u8(dst + 3 * dst_stride, d[3]);
+ vst1_u8(dst + 4 * dst_stride, d[4]);
+ vst1_u8(dst + 5 * dst_stride, d[5]);
+
+ s[0] = s[8];
+ s[1] = s[9];
+ s[2] = s[10];
+ s[3] = s[11];
+ s[4] = s[12];
+ s[5] = s[13];
+ s[6] = s[14];
+
+ dst += 6 * dst_stride;
+ y -= 6;
+ } while (y);
+ t -= stride_hor * (4 * height_ver / 3 + 7);
+ t += 8;
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+void vp9_scale_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ INTERP_FILTER filter_type,
+ int phase_scaler) {
+ const int src_w = src->y_crop_width;
+ const int src_h = src->y_crop_height;
+ const int dst_w = dst->y_crop_width;
+ const int dst_h = dst->y_crop_height;
+ const int dst_uv_w = dst_w / 2;
+ const int dst_uv_h = dst_h / 2;
+ int scaled = 0;
+
+ // phase_scaler is usually 0 or 8.
+ assert(phase_scaler >= 0 && phase_scaler < 16);
+
+ if (2 * dst_w == src_w && 2 * dst_h == src_h) {
+ // 2 to 1
+ scaled = 1;
+ if (phase_scaler == 0) {
+ scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, dst_w, dst_h);
+ scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h);
+ scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h);
+ } else if (filter_type == BILINEAR) {
+ const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
+ const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
+ scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, dst_w, dst_h, c0, c1);
+ scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1);
+ scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1);
+ } else {
+ const int buffer_stride = (dst_w + 3) & ~3;
+ const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
+ uint8_t *const temp_buffer =
+ (uint8_t *)malloc(buffer_stride * buffer_height);
+ if (temp_buffer) {
+ scale_plane_2_to_1_general(
+ src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+ dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
+ scale_plane_2_to_1_general(
+ src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+ dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+ temp_buffer);
+ scale_plane_2_to_1_general(
+ src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+ dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+ temp_buffer);
+ free(temp_buffer);
+ } else {
+ scaled = 0;
+ }
+ }
+ } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
+ // 4 to 1
+ scaled = 1;
+ if (phase_scaler == 0) {
+ scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, dst_w, dst_h);
+ scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h);
+ scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h);
+ } else if (filter_type == BILINEAR) {
+ const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
+ const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
+ scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, dst_w, dst_h, c0, c1);
+ scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1);
+ scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1);
+ } else {
+ const int buffer_stride = (dst_w + 1) & ~1;
+ const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
+ uint8_t *const temp_buffer =
+ (uint8_t *)malloc(buffer_stride * buffer_height);
+ if (temp_buffer) {
+ scale_plane_4_to_1_general(
+ src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+ dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
+ scale_plane_4_to_1_general(
+ src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+ dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+ temp_buffer);
+ scale_plane_4_to_1_general(
+ src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+ dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+ temp_buffer);
+ free(temp_buffer);
+ } else {
+ scaled = 0;
+ }
+ }
+ } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
+ // 4 to 3
+ const int buffer_stride = (dst_w + 5) - ((dst_w + 5) % 6) + 2;
+ const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+ uint8_t *const temp_buffer =
+ (uint8_t *)malloc(buffer_stride * buffer_height);
+ if (temp_buffer) {
+ scaled = 1;
+ if (filter_type == BILINEAR) {
+ scale_plane_4_to_3_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, dst_w, dst_h, phase_scaler,
+ temp_buffer);
+ scale_plane_4_to_3_bilinear(src->u_buffer, src->uv_stride,
+ dst->u_buffer, dst->uv_stride, dst_uv_w,
+ dst_uv_h, phase_scaler, temp_buffer);
+ scale_plane_4_to_3_bilinear(src->v_buffer, src->uv_stride,
+ dst->v_buffer, dst->uv_stride, dst_uv_w,
+ dst_uv_h, phase_scaler, temp_buffer);
+ } else {
+ scale_plane_4_to_3_general(
+ src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+ dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer);
+ scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h,
+ vp9_filter_kernels[filter_type],
+ phase_scaler, temp_buffer);
+ scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h,
+ vp9_filter_kernels[filter_type],
+ phase_scaler, temp_buffer);
+ }
+ free(temp_buffer);
+ }
+ }
+
+ if (scaled) {
+ vpx_extend_frame_borders(dst);
+ } else {
+ // Call c version for all other scaling ratios.
+ vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler);
+ }
+}
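Usage-wise, the function only accelerates the three exact ratios checked above and otherwise falls through to vp9_scale_and_extend_frame_c (the NEON paths also fall back to C when the temporary-buffer allocation fails, since `scaled` stays 0). A small sketch of the same dispatch logic with illustrative frame sizes:

#include <stdio.h>

static const char *scale_path(int src_w, int src_h, int dst_w, int dst_h) {
  if (2 * dst_w == src_w && 2 * dst_h == src_h) return "2:1 NEON";
  if (4 * dst_w == src_w && 4 * dst_h == src_h) return "4:1 NEON";
  if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) return "4:3 NEON";
  return "generic C fallback";
}

int main(void) {
  printf("%s\n", scale_path(1280, 720, 640, 360));  /* 2:1 NEON */
  printf("%s\n", scale_path(1280, 720, 960, 540));  /* 4:3 NEON */
  printf("%s\n", scale_path(1280, 720, 853, 480));  /* generic C fallback */
  return 0;
}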
diff --git a/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
index 0b175969b..97a09bdff 100644
--- a/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -9,9 +9,10 @@
*/
#include <arm_neon.h>
-
+#include <assert.h>
#include <math.h>
+#include "./vpx_config.h"
#include "vpx_mem/vpx_mem.h"
#include "vp9/common/vp9_quant_common.h"
@@ -31,86 +32,206 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan,
const int16_t *iscan) {
+ // Quantization pass: All coefficients with index >= zero_flag are
+ // skippable. Note: zero_flag can be zero.
+ int i;
+ const int16x8_t v_zero = vdupq_n_s16(0);
+ const int16x8_t v_one = vdupq_n_s16(1);
+ int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+ int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
+ int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
+ int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
+
+ (void)scan;
+ (void)skip_block;
+ assert(!skip_block);
+
+ // adjust for dc
+ v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
+ v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
+ v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
+ // process dc and the first seven ac coeffs
+ {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
+ const int32x4_t v_tmp_lo =
+ vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
+ const int32x4_t v_tmp_hi =
+ vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
+ const int16x8_t v_tmp2 =
+ vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
+ const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+ const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+ const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+ v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+ store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
+ store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);
+ v_round = vmovq_n_s16(round_ptr[1]);
+ v_quant = vmovq_n_s16(quant_ptr[1]);
+ v_dequant = vmovq_n_s16(dequant_ptr[1]);
+ }
+ // now process the rest of the ac coeffs
+ for (i = 8; i < count; i += 8) {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+ const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
+ const int32x4_t v_tmp_lo =
+ vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
+ const int32x4_t v_tmp_hi =
+ vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
+ const int16x8_t v_tmp2 =
+ vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
+ const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+ const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+ const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+ v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+ store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff);
+ store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff);
+ }
+ {
+ const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
+ vget_high_s16(v_eobmax_76543210));
+ const int64x1_t v_eobmax_xx32 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+ const int16x4_t v_eobmax_tmp =
+ vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+ const int64x1_t v_eobmax_xxx3 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+ const int16x4_t v_eobmax_final =
+ vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+ *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+ }
+}
+
+static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
+ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
+}
+
+void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
+ int skip_block, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan_ptr) {
+ const int16x8_t one = vdupq_n_s16(1);
+ const int16x8_t neg_one = vdupq_n_s16(-1);
+
+ // ROUND_POWER_OF_TWO(round_ptr[], 1)
+ const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
+ const int16x8_t quant = vld1q_s16(quant_ptr);
+ const int16x4_t dequant = vld1_s16(dequant_ptr);
+ // dequant >> 2 is used as a threshold, similar to zbin.
+ const int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2);
+
+ // Process dc and the first seven ac coeffs.
+ const uint16x8_t iscan =
+ vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+ const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+ const int16x8_t coeff_abs = vabsq_s16(coeff);
+ const int16x8_t dequant_mask =
+ vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh));
+
+ int16x8_t qcoeff = vaddq_s16(coeff_abs, round);
+ int32x4_t dqcoeff_0, dqcoeff_1;
+ int16x8_t dqcoeff;
+ uint16x8_t eob_max;
(void)scan;
+ (void)count;
+ (void)skip_block;
+ assert(!skip_block);
+
+ // (coeff * quant_ptr[]) >> 15
+ qcoeff = vqdmulhq_s16(qcoeff, quant);
+
+ // Restore sign.
+ qcoeff = veorq_s16(qcoeff, coeff_sign);
+ qcoeff = vsubq_s16(qcoeff, coeff_sign);
+ qcoeff = vandq_s16(qcoeff, dequant_mask);
+
+ // qcoeff * dequant[] / 2
+ dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), dequant);
+ dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]);
- if (!skip_block) {
- // Quantization pass: All coefficients with index >= zero_flag are
- // skippable. Note: zero_flag can be zero.
+ // Add 1 if negative to round towards zero because the C version uses division.
+ dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+ dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
+ dqcoeff = vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1));
+
+ eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan);
+
+ store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+ store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
+
+ iscan_ptr += 8;
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+
+ {
int i;
- const int16x8_t v_zero = vdupq_n_s16(0);
- const int16x8_t v_one = vdupq_n_s16(1);
- int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
- int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
- int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
- int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
- // adjust for dc
- v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
- v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
- v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
- // process dc and the first seven ac coeffs
- {
- const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
- const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
- const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
- const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
- const int32x4_t v_tmp_lo =
- vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
- const int32x4_t v_tmp_hi =
- vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
- const int16x8_t v_tmp2 =
- vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
- const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
- const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
- const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
- const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
- const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
- const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
- v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
- store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
- store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);
- v_round = vmovq_n_s16(round_ptr[1]);
- v_quant = vmovq_n_s16(quant_ptr[1]);
- v_dequant = vmovq_n_s16(dequant_ptr[1]);
- }
- // now process the rest of the ac coeffs
- for (i = 8; i < count; i += 8) {
- const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
- const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i);
- const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
- const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
- const int32x4_t v_tmp_lo =
- vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
- const int32x4_t v_tmp_hi =
- vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
- const int16x8_t v_tmp2 =
- vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
- const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
- const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
- const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
- const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
- const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
- const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
- v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
- store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff);
- store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff);
+ const int16x8_t round = vrshrq_n_s16(vmovq_n_s16(round_ptr[1]), 1);
+ const int16x8_t quant = vmovq_n_s16(quant_ptr[1]);
+ const int16x8_t dequant_thresh =
+ vshrq_n_s16(vmovq_n_s16(dequant_ptr[1]), 2);
+
+ // Process the rest of the ac coeffs.
+ for (i = 8; i < 32 * 32; i += 8) {
+ const uint16x8_t iscan =
+ vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+ const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+ const int16x8_t coeff_abs = vabsq_s16(coeff);
+ const int16x8_t dequant_mask =
+ vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh));
+
+ int16x8_t qcoeff = vaddq_s16(coeff_abs, round);
+ int32x4_t dqcoeff_0, dqcoeff_1;
+ int16x8_t dqcoeff;
+
+ qcoeff = vqdmulhq_s16(qcoeff, quant);
+ qcoeff = veorq_s16(qcoeff, coeff_sign);
+ qcoeff = vsubq_s16(qcoeff, coeff_sign);
+ qcoeff = vandq_s16(qcoeff, dequant_mask);
+
+ dqcoeff_0 = vmull_n_s16(vget_low_s16(qcoeff), dequant_ptr[1]);
+ dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]);
+
+ dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+ dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
+ dqcoeff =
+ vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1));
+
+ eob_max =
+ vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan));
+
+ store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+ store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
+
+ iscan_ptr += 8;
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
}
+
{
- const int16x4_t v_eobmax_3210 = vmax_s16(
- vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210));
- const int64x1_t v_eobmax_xx32 =
- vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
- const int16x4_t v_eobmax_tmp =
- vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
- const int64x1_t v_eobmax_xxx3 =
- vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
- const int16x4_t v_eobmax_final =
- vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
-
- *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+ const uint16x4_t eob_max_0 =
+ vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+ const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+ const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+ vst1_lane_u16(eob_ptr, eob_max_2, 0);
}
- } else {
- memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
- memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
- *eob_ptr = 0;
}
}
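Both NEON quantizers above implement the same fixed-point pipeline: take abs(coeff), add the rounding constant, multiply by the quantizer and keep the high bits, restore the sign, then multiply back by the dequantizer; the eob is the largest 1-based iscan position that produced a nonzero level. A rough scalar model of the first (non-32x32) function, with illustrative names and no claim to match the exact C reference implementation:

#include <stdint.h>
#include <stdlib.h>

static void quantize_fp_scalar_model(const int16_t *coeff, int n,
                                     const int16_t *round,
                                     const int16_t *quant,
                                     const int16_t *dequant, int16_t *qcoeff,
                                     int16_t *dqcoeff, uint16_t *eob,
                                     const int16_t *iscan) {
  int i;
  *eob = 0;
  for (i = 0; i < n; ++i) {
    const int rc = (i == 0) ? 0 : 1; /* slot 0 holds the dc constants */
    const int abs_coeff = abs(coeff[i]);
    /* vabaq_s16 + vmull_s16 + vshrn_n_s32(., 16) in the NEON code: */
    const int tmp = (int)(((int32_t)(abs_coeff + round[rc]) * quant[rc]) >> 16);
    qcoeff[i] = (int16_t)(coeff[i] < 0 ? -tmp : tmp);
    dqcoeff[i] = (int16_t)(qcoeff[i] * dequant[rc]);
    /* eob: highest 1-based scan position holding a nonzero level. */
    if (tmp != 0 && (uint16_t)(iscan[i] + 1) > *eob)
      *eob = (uint16_t)(iscan[i] + 1);
  }
}

The 32x32 variant differs in three ways visible above: round_ptr is pre-halved with vrshrq_n_s16(., 1), vqdmulhq_s16 supplies the (coeff * quant) >> 15 product, and dqcoeff is qcoeff * dequant / 2, where extract_sign_bit() adds 1 to negative products so the final shift rounds toward zero like C division.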
diff --git a/libvpx/vp9/encoder/vp9_alt_ref_aq.c b/libvpx/vp9/encoder/vp9_alt_ref_aq.c
index 3aeefb584..acc3764c7 100644
--- a/libvpx/vp9/encoder/vp9_alt_ref_aq.c
+++ b/libvpx/vp9/encoder/vp9_alt_ref_aq.c
@@ -15,7 +15,7 @@ struct ALT_REF_AQ {
int dummy;
};
-struct ALT_REF_AQ *vp9_alt_ref_aq_create() {
+struct ALT_REF_AQ *vp9_alt_ref_aq_create(void) {
return (struct ALT_REF_AQ *)vpx_malloc(sizeof(struct ALT_REF_AQ));
}
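This small-looking change is substantive in C: an empty parameter list declares a function with unspecified parameters, while (void) declares a true prototype. An illustrative pair, with hypothetical names:

struct widget *widget_create();     /* unspecified parameters: a call such as
                                       widget_create(42) still compiles */
struct widget *widget_create(void); /* true prototype: widget_create(42) is
                                       now a compile-time error */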
diff --git a/libvpx/vp9/encoder/vp9_alt_ref_aq.h b/libvpx/vp9/encoder/vp9_alt_ref_aq.h
index 18acd8a85..e508cb44a 100644
--- a/libvpx/vp9/encoder/vp9_alt_ref_aq.h
+++ b/libvpx/vp9/encoder/vp9_alt_ref_aq.h
@@ -54,7 +54,7 @@ struct ALT_REF_AQ;
*
* \return Instance of the class
*/
-struct ALT_REF_AQ *vp9_alt_ref_aq_create();
+struct ALT_REF_AQ *vp9_alt_ref_aq_create(void);
/*!\brief Upload segmentation_map to self object
*
diff --git a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
index 048ea629f..2f2f0055a 100644
--- a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -425,9 +425,10 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
int target_refresh = 0;
double weight_segment_target = 0;
double weight_segment = 0;
+ int thresh_low_motion = (cm->width < 720) ? 55 : 20;
cr->apply_cyclic_refresh = 1;
if (cm->frame_type == KEY_FRAME || cpi->svc.temporal_layer_id > 0 ||
- (!cpi->use_svc && rc->avg_frame_low_motion < 55 &&
+ (!cpi->use_svc && rc->avg_frame_low_motion < thresh_low_motion &&
rc->frames_since_key > 40)) {
cr->apply_cyclic_refresh = 0;
return;
diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c
index 8433f4edd..d346cd57a 100644
--- a/libvpx/vp9/encoder/vp9_bitstream.c
+++ b/libvpx/vp9/encoder/vp9_bitstream.c
@@ -919,7 +919,9 @@ int vp9_get_refresh_mask(VP9_COMP *cpi) {
}
}
-static int encode_tile_worker(VP9_COMP *cpi, VP9BitstreamWorkerData *data) {
+static int encode_tile_worker(void *arg1, void *arg2) {
+ VP9_COMP *cpi = (VP9_COMP *)arg1;
+ VP9BitstreamWorkerData *data = (VP9BitstreamWorkerData *)arg2;
MACROBLOCKD *const xd = &data->xd;
const int tile_row = 0;
vpx_start_encode(&data->bit_writer, data->dest);
@@ -995,7 +997,7 @@ static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) {
}
worker->data1 = cpi;
worker->data2 = data;
- worker->hook = (VPxWorkerHook)encode_tile_worker;
+ worker->hook = encode_tile_worker;
worker->had_error = 0;
if (i < num_workers - 1) {
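The old code cast encode_tile_worker to VPxWorkerHook, which is undefined behavior when the function type does not match the hook type exactly; the fix is to define the worker with the hook's (void *, void *) signature and cast the arguments inside. A sketch of the pattern, with the hook typedef as it appears in vpx_util/vpx_thread.h and hypothetical payload types:

typedef int (*VPxWorkerHook)(void *, void *);

/* Matches the hook signature exactly; the casts happen inside the worker. */
static int tile_worker(void *arg1, void *arg2) {
  struct encoder_ctx *enc = (struct encoder_ctx *)arg1; /* hypothetical type */
  struct tile_job *job = (struct tile_job *)arg2;       /* hypothetical type */
  return encode_one_tile(enc, job);                     /* hypothetical call */
}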
diff --git a/libvpx/vp9/encoder/vp9_block.h b/libvpx/vp9/encoder/vp9_block.h
index ab488f48f..724205dd5 100644
--- a/libvpx/vp9/encoder/vp9_block.h
+++ b/libvpx/vp9/encoder/vp9_block.h
@@ -172,6 +172,14 @@ struct macroblock {
uint8_t last_sb_high_content;
+ int sb_use_mv_part;
+
+ int sb_mvcol_part;
+
+ int sb_mvrow_part;
+
+ int sb_pickmode_part;
+
// For each superblock: saves the content value (e.g., low/high sad/sumdiff)
// based on source sad, prior to encoding the frame.
uint8_t content_state_sb;
@@ -181,11 +189,15 @@ struct macroblock {
// 32x32, 9~24 for 16x16.
uint8_t variance_low[25];
- void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
- void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
+ uint8_t arf_frame_usage;
+ uint8_t lastgolden_frame_usage;
+
+ void (*fwd_txfm4x4)(const int16_t *input, tran_low_t *output, int stride);
+ void (*inv_txfm_add)(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob);
#if CONFIG_VP9_HIGHBITDEPTH
- void (*highbd_itxm_add)(const tran_low_t *input, uint16_t *dest, int stride,
- int eob, int bd);
+ void (*highbd_inv_txfm_add)(const tran_low_t *input, uint16_t *dest,
+ int stride, int eob, int bd);
#endif
};
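The renamed members are function pointers bound once per frame so that per-block code can invoke the correct 4x4 transform without re-testing the lossless flag. In sketch form, the non-high-bitdepth binding (the forward line mirrors the encode_frame_internal() hunk at the end of this patch; the inverse line is assumed by symmetry with the high-bitdepth binding shown there):

x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4;
x->inv_txfm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;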
diff --git a/libvpx/vp9/encoder/vp9_context_tree.h b/libvpx/vp9/encoder/vp9_context_tree.h
index 9e4cbb360..73423c075 100644
--- a/libvpx/vp9/encoder/vp9_context_tree.h
+++ b/libvpx/vp9/encoder/vp9_context_tree.h
@@ -65,6 +65,7 @@ typedef struct {
int_mv best_sse_mv;
MV_REFERENCE_FRAME best_reference_frame;
MV_REFERENCE_FRAME best_zeromv_reference_frame;
+ int sb_skip_denoising;
#endif
// motion vector cache for adaptive motion search control in partition
diff --git a/libvpx/vp9/encoder/vp9_denoiser.c b/libvpx/vp9/encoder/vp9_denoiser.c
index e6933f00d..b08ccaa66 100644
--- a/libvpx/vp9/encoder/vp9_denoiser.c
+++ b/libvpx/vp9/encoder/vp9_denoiser.c
@@ -21,8 +21,6 @@
#include "vp9/encoder/vp9_denoiser.h"
#include "vp9/encoder/vp9_encoder.h"
-// OUTPUT_YUV_DENOISED
-
#ifdef OUTPUT_YUV_DENOISED
static void make_grayscale(YV12_BUFFER_CONFIG *yuv);
#endif
@@ -190,11 +188,13 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
VP9_COMMON *const cm, VP9_DENOISER *denoiser, MACROBLOCK *mb, BLOCK_SIZE bs,
int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx,
int motion_magnitude, int is_skin, int *zeromv_filter, int consec_zeromv,
- int num_spatial_layers, int width) {
+ int num_spatial_layers, int width, int lst_fb_idx, int gld_fb_idx,
+ int use_svc, int spatial_layer) {
const int sse_diff = (ctx->newmv_sse == UINT_MAX)
? 0
: ((int)ctx->zeromv_sse - (int)ctx->newmv_sse);
- MV_REFERENCE_FRAME frame;
+ int frame;
+ int denoise_layer_idx = 0;
MACROBLOCKD *filter_mbd = &mb->e_mbd;
MODE_INFO *mi = filter_mbd->mi[0];
MODE_INFO saved_mi;
@@ -202,8 +202,10 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
struct buf_2d saved_dst[MAX_MB_PLANE];
struct buf_2d saved_pre[MAX_MB_PLANE];
RefBuffer *saved_block_refs[2];
+ MV_REFERENCE_FRAME saved_frame;
frame = ctx->best_reference_frame;
+
saved_mi = *mi;
if (is_skin && (motion_magnitude > 0 || consec_zeromv < 4)) return COPY_BLOCK;
@@ -217,7 +219,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
// If the best reference frame uses inter-prediction and there is enough of a
// difference in sum-squared-error, use it.
- if (frame != INTRA_FRAME &&
+ if (frame != INTRA_FRAME && frame != ALTREF_FRAME &&
(frame != GOLDEN_FRAME || num_spatial_layers == 1) &&
sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) {
mi->ref_frame[0] = ctx->best_reference_frame;
@@ -228,7 +230,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
frame = ctx->best_zeromv_reference_frame;
ctx->newmv_sse = ctx->zeromv_sse;
// Bias to last reference.
- if (num_spatial_layers > 1 ||
+ if (num_spatial_layers > 1 || frame == ALTREF_FRAME ||
(frame != LAST_FRAME &&
((ctx->zeromv_lastref_sse < ((5 * ctx->zeromv_sse) >> 2)) ||
denoiser->denoising_level >= kDenHigh))) {
@@ -246,6 +248,19 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
}
}
+ saved_frame = frame;
+ // When using SVC, we need to map REF_FRAME to the frame buffer index.
+ if (use_svc) {
+ if (frame == LAST_FRAME)
+ frame = lst_fb_idx + 1;
+ else if (frame == GOLDEN_FRAME)
+ frame = gld_fb_idx + 1;
+ // Shift for the second spatial layer.
+ if (num_spatial_layers - spatial_layer == 2)
+ frame = frame + denoiser->num_ref_frames;
+ denoise_layer_idx = num_spatial_layers - spatial_layer - 1;
+ }
+
if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
// Restore everything to its original state
*mi = saved_mi;
@@ -279,20 +294,23 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
denoiser->running_avg_y[frame].uv_stride, mi_row, mi_col);
filter_mbd->plane[2].pre[0].stride = denoiser->running_avg_y[frame].uv_stride;
- filter_mbd->plane[0].dst.buf =
- block_start(denoiser->mc_running_avg_y.y_buffer,
- denoiser->mc_running_avg_y.y_stride, mi_row, mi_col);
- filter_mbd->plane[0].dst.stride = denoiser->mc_running_avg_y.y_stride;
- filter_mbd->plane[1].dst.buf =
- block_start(denoiser->mc_running_avg_y.u_buffer,
- denoiser->mc_running_avg_y.uv_stride, mi_row, mi_col);
- filter_mbd->plane[1].dst.stride = denoiser->mc_running_avg_y.uv_stride;
- filter_mbd->plane[2].dst.buf =
- block_start(denoiser->mc_running_avg_y.v_buffer,
- denoiser->mc_running_avg_y.uv_stride, mi_row, mi_col);
- filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.uv_stride;
-
- set_ref_ptrs(cm, filter_mbd, frame, NONE);
+ filter_mbd->plane[0].dst.buf = block_start(
+ denoiser->mc_running_avg_y[denoise_layer_idx].y_buffer,
+ denoiser->mc_running_avg_y[denoise_layer_idx].y_stride, mi_row, mi_col);
+ filter_mbd->plane[0].dst.stride =
+ denoiser->mc_running_avg_y[denoise_layer_idx].y_stride;
+ filter_mbd->plane[1].dst.buf = block_start(
+ denoiser->mc_running_avg_y[denoise_layer_idx].u_buffer,
+ denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col);
+ filter_mbd->plane[1].dst.stride =
+ denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride;
+ filter_mbd->plane[2].dst.buf = block_start(
+ denoiser->mc_running_avg_y[denoise_layer_idx].v_buffer,
+ denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col);
+ filter_mbd->plane[2].dst.stride =
+ denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride;
+
+ set_ref_ptrs(cm, filter_mbd, saved_frame, NONE);
vp9_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs);
// Restore everything to its original state
@@ -314,9 +332,17 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
int zeromv_filter = 0;
VP9_DENOISER *denoiser = &cpi->denoiser;
VP9_DENOISER_DECISION decision = COPY_BLOCK;
- YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
- YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
+
+ const int shift =
+ cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2
+ ? denoiser->num_ref_frames
+ : 0;
+ YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME + shift];
+ const int denoise_layer_index =
+ cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id - 1;
+ YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y[denoise_layer_index];
uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col);
+
uint8_t *mc_avg_start =
block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col);
struct buf_2d src = mb->plane[0].src;
@@ -338,8 +364,8 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
VP9_COMMON *const cm = &cpi->common;
int j, i;
// Loop through the 8x8 sub-blocks.
- const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
- const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+ const int bw = num_8x8_blocks_wide_lookup[bs];
+ const int bh = num_8x8_blocks_high_lookup[bs];
const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
const int block_index = mi_row * cm->mi_cols + mi_col;
@@ -366,14 +392,12 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
}
if (!is_skin && denoiser->denoising_level == kDenHigh) increase_denoising = 1;
- // TODO(marpan): There is an issue with denoising for speed 5,
- // due to the partitioning scheme based on pickmode.
- // Remove this speed constraint when issue is resolved.
- if (denoiser->denoising_level >= kDenLow && cpi->oxcf.speed > 5)
+ if (denoiser->denoising_level >= kDenLow && !ctx->sb_skip_denoising)
decision = perform_motion_compensation(
&cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx,
motion_magnitude, is_skin, &zeromv_filter, consec_zeromv,
- cpi->svc.number_spatial_layers, cpi->Source->y_width);
+ cpi->svc.number_spatial_layers, cpi->Source->y_width, cpi->lst_fb_idx,
+ cpi->gld_fb_idx, cpi->use_svc, cpi->svc.spatial_layer_id);
if (decision == FILTER_BLOCK) {
decision = vp9_denoiser_filter(src.buf, src.stride, mc_avg_start,
@@ -382,12 +406,12 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
}
if (decision == FILTER_BLOCK) {
- vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0,
- NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2,
+ vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0, 0,
+ 0, 0, num_4x4_blocks_wide_lookup[bs] << 2,
num_4x4_blocks_high_lookup[bs] << 2);
} else { // COPY_BLOCK
- vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0,
- NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2,
+ vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0, 0,
+ 0, 0, num_4x4_blocks_wide_lookup[bs] << 2,
num_4x4_blocks_high_lookup[bs] << 2);
}
*denoiser_decision = decision;
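The extra zero arguments track the reworked vpx_dsp convolve interface, in which the two per-axis filter pointers and their strides were replaced by a single kernel pointer plus fixed-point subpel offsets and steps. As I read the updated headers, the function type is:

typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const InterpKernel *filter, int x0_q4,
                              int x_step_q4, int y0_q4, int y_step_q4, int w,
                              int h);

A plain copy does no filtering, hence filter stays NULL and all four q4 parameters are 0.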
@@ -423,7 +447,9 @@ static void swap_frame_buffer(YV12_BUFFER_CONFIG *const dest,
void vp9_denoiser_update_frame_info(
VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type,
int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame,
- int resized, int svc_base_is_key) {
+ int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized,
+ int svc_base_is_key, int second_spatial_layer) {
+ const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0;
// Copy source into denoised reference buffers on KEY_FRAME or
// if the just-encoded frame was resized. For SVC, copy source if the base
// spatial layer was a key frame.
@@ -431,8 +457,10 @@ void vp9_denoiser_update_frame_info(
svc_base_is_key) {
int i;
// Start at 1 so as not to overwrite the INTRA_FRAME
- for (i = 1; i < MAX_REF_FRAMES; ++i)
- copy_frame(&denoiser->running_avg_y[i], &src);
+ for (i = 1; i < denoiser->num_ref_frames; ++i) {
+ if (denoiser->running_avg_y[i + shift].buffer_alloc != NULL)
+ copy_frame(&denoiser->running_avg_y[i + shift], &src);
+ }
denoiser->reset = 0;
return;
}
@@ -440,29 +468,29 @@ void vp9_denoiser_update_frame_info(
// If more than one refresh occurs, must copy frame buffer.
if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) > 1) {
if (refresh_alt_ref_frame) {
- copy_frame(&denoiser->running_avg_y[ALTREF_FRAME],
- &denoiser->running_avg_y[INTRA_FRAME]);
+ copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
}
if (refresh_golden_frame) {
- copy_frame(&denoiser->running_avg_y[GOLDEN_FRAME],
- &denoiser->running_avg_y[INTRA_FRAME]);
+ copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
}
if (refresh_last_frame) {
- copy_frame(&denoiser->running_avg_y[LAST_FRAME],
- &denoiser->running_avg_y[INTRA_FRAME]);
+ copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
}
} else {
if (refresh_alt_ref_frame) {
- swap_frame_buffer(&denoiser->running_avg_y[ALTREF_FRAME],
- &denoiser->running_avg_y[INTRA_FRAME]);
+ swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
}
if (refresh_golden_frame) {
- swap_frame_buffer(&denoiser->running_avg_y[GOLDEN_FRAME],
- &denoiser->running_avg_y[INTRA_FRAME]);
+ swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
}
if (refresh_last_frame) {
- swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME],
- &denoiser->running_avg_y[INTRA_FRAME]);
+ swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
}
}
}
@@ -491,19 +519,110 @@ void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse,
}
}
-int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx,
- int ssy,
+static int vp9_denoiser_realloc_svc_helper(VP9_COMMON *cm,
+ VP9_DENOISER *denoiser, int fb_idx) {
+ int fail = 0;
+ if (denoiser->running_avg_y[fb_idx].buffer_alloc == NULL) {
+ fail =
+ vpx_alloc_frame_buffer(&denoiser->running_avg_y[fb_idx], cm->width,
+ cm->height, cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS, 0);
+ if (fail) {
+ vp9_denoiser_free(denoiser);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser,
+ int svc_buf_shift, int refresh_alt,
+ int refresh_gld, int refresh_lst, int alt_fb_idx,
+ int gld_fb_idx, int lst_fb_idx) {
+ int fail = 0;
+ if (refresh_alt) {
+ // Increase the frame buffer index by 1 to map it to the buffer index in the
+ // denoiser.
+ fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
+ alt_fb_idx + 1 + svc_buf_shift);
+ if (fail) return 1;
+ }
+ if (refresh_gld) {
+ fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
+ gld_fb_idx + 1 + svc_buf_shift);
+ if (fail) return 1;
+ }
+ if (refresh_lst) {
+ fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
+ lst_fb_idx + 1 + svc_buf_shift);
+ if (fail) return 1;
+ }
+ return 0;
+}
+
+int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser,
+ int use_svc, int noise_sen, int width, int height,
+ int ssx, int ssy,
#if CONFIG_VP9_HIGHBITDEPTH
int use_highbitdepth,
#endif
int border) {
- int i, fail;
+ int i, layer, fail, init_num_ref_frames;
const int legacy_byte_alignment = 0;
+ int num_layers = 1;
+ int scaled_width = width;
+ int scaled_height = height;
+ if (use_svc) {
+ LAYER_CONTEXT *lc = &svc->layer_context[svc->spatial_layer_id *
+ svc->number_temporal_layers +
+ svc->temporal_layer_id];
+ get_layer_resolution(width, height, lc->scaling_factor_num,
+ lc->scaling_factor_den, &scaled_width, &scaled_height);
+ // For SVC: denoise at most the two highest spatial layers.
+ if (noise_sen >= 2)
+ // Denoise from one spatial layer below the top.
+ svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 2, 0);
+ else
+ // Only denoise the top spatial layer.
+ svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 1, 0);
+ num_layers = svc->number_spatial_layers - svc->first_layer_denoise;
+ }
assert(denoiser != NULL);
+ denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES;
+ init_num_ref_frames = use_svc ? MAX_REF_FRAMES : NONSVC_REF_FRAMES;
+ denoiser->num_layers = num_layers;
+ CHECK_MEM_ERROR(cm, denoiser->running_avg_y,
+ vpx_calloc(denoiser->num_ref_frames * num_layers,
+ sizeof(denoiser->running_avg_y[0])));
+ CHECK_MEM_ERROR(
+ cm, denoiser->mc_running_avg_y,
+ vpx_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0])));
+
+ for (layer = 0; layer < num_layers; ++layer) {
+ const int denoise_width = (layer == 0) ? width : scaled_width;
+ const int denoise_height = (layer == 0) ? height : scaled_height;
+ for (i = 0; i < init_num_ref_frames; ++i) {
+ fail = vpx_alloc_frame_buffer(
+ &denoiser->running_avg_y[i + denoiser->num_ref_frames * layer],
+ denoise_width, denoise_height, ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+ use_highbitdepth,
+#endif
+ border, legacy_byte_alignment);
+ if (fail) {
+ vp9_denoiser_free(denoiser);
+ return 1;
+ }
+#ifdef OUTPUT_YUV_DENOISED
+ make_grayscale(&denoiser->running_avg_y[i]);
+#endif
+ }
- for (i = 0; i < MAX_REF_FRAMES; ++i) {
- fail = vpx_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height,
- ssx, ssy,
+ fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y[layer],
+ denoise_width, denoise_height, ssx, ssy,
#if CONFIG_VP9_HIGHBITDEPTH
use_highbitdepth,
#endif
@@ -512,22 +631,10 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx,
vp9_denoiser_free(denoiser);
return 1;
}
-#ifdef OUTPUT_YUV_DENOISED
- make_grayscale(&denoiser->running_avg_y[i]);
-#endif
- }
-
- fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height, ssx,
- ssy,
-#if CONFIG_VP9_HIGHBITDEPTH
- use_highbitdepth,
-#endif
- border, legacy_byte_alignment);
- if (fail) {
- vp9_denoiser_free(denoiser);
- return 1;
}
+ // denoiser->last_source is only used for noise estimation, so it is only
+ // allocated for the top layer.
fail = vpx_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy,
#if CONFIG_VP9_HIGHBITDEPTH
use_highbitdepth,
@@ -553,10 +660,18 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser) {
return;
}
denoiser->frame_buffer_initialized = 0;
- for (i = 0; i < MAX_REF_FRAMES; ++i) {
+ for (i = 0; i < denoiser->num_ref_frames * denoiser->num_layers; ++i) {
vpx_free_frame_buffer(&denoiser->running_avg_y[i]);
}
- vpx_free_frame_buffer(&denoiser->mc_running_avg_y);
+ vpx_free(denoiser->running_avg_y);
+ denoiser->running_avg_y = NULL;
+
+ for (i = 0; i < denoiser->num_layers; ++i) {
+ vpx_free_frame_buffer(&denoiser->mc_running_avg_y[i]);
+ }
+
+ vpx_free(denoiser->mc_running_avg_y);
+ denoiser->mc_running_avg_y = NULL;
vpx_free_frame_buffer(&denoiser->last_source);
}
@@ -570,7 +685,8 @@ void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser, int noise_level) {
denoiser->prev_denoising_level = denoiser->denoising_level;
}
-// Scale/increase the partition threshold for denoiser speed-up.
+// Scale/increase the partition threshold
+// for denoiser speed-up.
int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level,
int content_state, int temporal_layer_id) {
if ((content_state == kLowSadLowSumdiff) ||
@@ -585,7 +701,8 @@ int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level,
}
}
-// Scale/increase the ac skip threshold for denoiser speed-up.
+// Scale/increase the ac skip threshold for
+// denoiser speed-up.
int64_t vp9_scale_acskip_thresh(int64_t threshold,
VP9_DENOISER_LEVEL noise_level, int abs_sumdiff,
int temporal_layer_id) {
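The recurring fb_idx + 1 + shift arithmetic is the heart of the SVC change: denoiser buffer 0 holds the current denoised frame, buffers 1..num_ref_frames shadow the encoder's frame-buffer indices, and a whole extra block of num_ref_frames buffers is appended when the second-highest spatial layer is also denoised. A condensed sketch (names mirror the code above; the helper itself is illustrative, not a function in the patch):

static int denoiser_buffer_index(int ref_frame, int lst_fb_idx, int gld_fb_idx,
                                 int use_svc, int second_spatial_layer,
                                 int num_ref_frames) {
  int idx = ref_frame; /* non-SVC: the REF_FRAME enum indexes directly */
  if (use_svc) {
    if (ref_frame == LAST_FRAME)
      idx = lst_fb_idx + 1; /* +1 skips the INTRA/denoised slot */
    else if (ref_frame == GOLDEN_FRAME)
      idx = gld_fb_idx + 1;
  }
  if (second_spatial_layer) idx += num_ref_frames; /* next per-layer block */
  return idx;
}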
diff --git a/libvpx/vp9/encoder/vp9_denoiser.h b/libvpx/vp9/encoder/vp9_denoiser.h
index f0845e113..f4da24cbf 100644
--- a/libvpx/vp9/encoder/vp9_denoiser.h
+++ b/libvpx/vp9/encoder/vp9_denoiser.h
@@ -21,6 +21,14 @@ extern "C" {
#define MOTION_MAGNITUDE_THRESHOLD (8 * 3)
+// The denoiser is used in non-SVC real-time mode, which does not use alt-ref,
+// so no buffer needs to be allocated for it; hence MAX_REF_FRAMES - 1.
+#define NONSVC_REF_FRAMES (MAX_REF_FRAMES - 1)
+
+// Number of frame buffers when SVC is used: [0] holds the current denoised
+// buffer and [1..8] the REF_FRAMES reference buffers.
+#define SVC_REF_FRAMES 9
+
typedef enum vp9_denoiser_decision {
COPY_BLOCK,
FILTER_BLOCK,
@@ -35,11 +43,13 @@ typedef enum vp9_denoiser_level {
} VP9_DENOISER_LEVEL;
typedef struct vp9_denoiser {
- YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES];
- YV12_BUFFER_CONFIG mc_running_avg_y;
+ YV12_BUFFER_CONFIG *running_avg_y;
+ YV12_BUFFER_CONFIG *mc_running_avg_y;
YV12_BUFFER_CONFIG last_source;
int frame_buffer_initialized;
int reset;
+ int num_ref_frames;
+ int num_layers;
VP9_DENOISER_LEVEL denoising_level;
VP9_DENOISER_LEVEL prev_denoising_level;
} VP9_DENOISER;
@@ -57,11 +67,13 @@ typedef struct {
} VP9_PICKMODE_CTX_DEN;
struct VP9_COMP;
+struct SVC;
void vp9_denoiser_update_frame_info(
VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type,
int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame,
- int resized, int svc_base_is_key);
+ int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized,
+ int svc_base_is_key, int second_spatial_layer);
void vp9_denoiser_denoise(struct VP9_COMP *cpi, MACROBLOCK *mb, int mi_row,
int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx,
@@ -73,8 +85,14 @@ void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse,
PREDICTION_MODE mode,
PICK_MODE_CONTEXT *ctx);
-int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx,
- int ssy,
+int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser,
+ int svc_buf_shift, int refresh_alt,
+ int refresh_gld, int refresh_lst, int alt_fb_idx,
+ int gld_fb_idx, int lst_fb_idx);
+
+int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser,
+ int use_svc, int noise_sen, int width, int height,
+ int ssx, int ssy,
#if CONFIG_VP9_HIGHBITDEPTH
int use_highbitdepth,
#endif
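For concreteness, assuming VP9's MAX_REF_FRAMES of 4 (INTRA, LAST, GOLDEN, ALTREF) and its 8 decoder frame buffers: NONSVC_REF_FRAMES works out to 3 running-average slots (the denoised/INTRA slot plus LAST and GOLDEN, since alt-ref is never denoised), while SVC_REF_FRAMES is 9, one denoised slot plus one per frame buffer, so any frame-buffer index an SVC layer references can be shadowed. vp9_denoiser_alloc() then sizes running_avg_y as num_ref_frames * num_layers.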
diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c
index 6215e198c..682477df1 100644
--- a/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -125,19 +125,17 @@ static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = {
};
#endif // CONFIG_VP9_HIGHBITDEPTH
-unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi,
- const struct buf_2d *ref,
- BLOCK_SIZE bs) {
+unsigned int vp9_get_sby_variance(VP9_COMP *cpi, const struct buf_2d *ref,
+ BLOCK_SIZE bs) {
unsigned int sse;
const unsigned int var =
cpi->fn_ptr[bs].vf(ref->buf, ref->stride, VP9_VAR_OFFS, 0, &sse);
- return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
+ return var;
}
#if CONFIG_VP9_HIGHBITDEPTH
-unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi,
- const struct buf_2d *ref,
- BLOCK_SIZE bs, int bd) {
+unsigned int vp9_high_get_sby_variance(VP9_COMP *cpi, const struct buf_2d *ref,
+ BLOCK_SIZE bs, int bd) {
unsigned int var, sse;
switch (bd) {
case 10:
@@ -157,8 +155,24 @@ unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi,
CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8), 0, &sse);
break;
}
- return (unsigned int)ROUND64_POWER_OF_TWO((int64_t)var,
- num_pels_log2_lookup[bs]);
+ return var;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs) {
+ return ROUND_POWER_OF_TWO(vp9_get_sby_variance(cpi, ref, bs),
+ num_pels_log2_lookup[bs]);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs, int bd) {
+ return (unsigned int)ROUND64_POWER_OF_TWO(
+ (int64_t)vp9_high_get_sby_variance(cpi, ref, bs, bd),
+ num_pels_log2_lookup[bs]);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
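The refactor splits the raw block variance from its per-pixel normalization; the wrappers divide by the pixel count with rounding via the vpx_dsp macro, which (from vpx_dsp/vpx_dsp_common.h) is:

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))
/* num_pels_log2_lookup[BLOCK_64X64] is 12 (64 * 64 = 4096 = 2^12 pixels), so
 * the per-pixel variance of a 64x64 block is (var + 2048) >> 12. */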
@@ -287,8 +301,12 @@ static void set_block_size(VP9_COMP *const cpi, MACROBLOCK *const x,
}
typedef struct {
- int64_t sum_square_error;
- int64_t sum_error;
+ // This struct is used for computing variance in choose_partitioning(), where
+ // the max number of samples within a superblock is 16x16 (with 4x4 avg). Even
+ // in high bitdepth, uint32_t is enough for sum_square_error (2^12 * 2^12 * 16
+ // * 16 = 2^32).
+ uint32_t sum_square_error;
+ int32_t sum_error;
int log2_count;
int variance;
} var;
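Spelling out the bound in that comment: with 12-bit samples each squared error is at most (2^12 - 1)^2 = 16,769,025, and a 16x16 grid of them sums to at most 256 * 16,769,025 = 4,292,870,400, which is below UINT32_MAX = 4,294,967,295, so uint32_t is indeed sufficient; the stated 2^32 is the loose power-of-two bound.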
@@ -381,7 +399,7 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
}
// Set variance values given sum square error, sum error, count.
-static void fill_variance(int64_t s2, int64_t s, int c, var *v) {
+static void fill_variance(uint32_t s2, int32_t s, int c, var *v) {
v->sum_square_error = s2;
v->sum_error = s;
v->log2_count = c;
@@ -489,8 +507,9 @@ static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x,
return 0;
}
-int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed, int width,
- int height, int content_state) {
+static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed,
+ int width, int height,
+ int content_state) {
if (speed >= 8) {
if (width <= 640 && height <= 480)
return (5 * threshold_base) >> 2;
@@ -554,6 +573,8 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q,
#endif
thresholds[0] = threshold_base;
thresholds[2] = threshold_base << cpi->oxcf.speed;
+ if (cm->width >= 1280 && cm->height >= 720 && cpi->oxcf.speed < 7)
+ thresholds[2] = thresholds[2] << 1;
if (cm->width <= 352 && cm->height <= 288) {
thresholds[0] = threshold_base >> 3;
thresholds[1] = threshold_base >> 1;
@@ -742,16 +763,7 @@ static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res,
for (i = 0; i < ymis; i += 2) {
for (j = 0; j < xmis; j += 2) {
int bl_index = block_index + i * cm->mi_cols + j;
- int bl_index1 = bl_index + 1;
- int bl_index2 = bl_index + cm->mi_cols;
- int bl_index3 = bl_index2 + 1;
- int consec_zeromv =
- VPXMIN(cpi->consec_zero_mv[bl_index],
- VPXMIN(cpi->consec_zero_mv[bl_index1],
- VPXMIN(cpi->consec_zero_mv[bl_index2],
- cpi->consec_zero_mv[bl_index3])));
- int is_skin = vp9_compute_skin_block(
- ysignal, usignal, vsignal, sp, spuv, BLOCK_16X16, consec_zeromv, 0);
+ int is_skin = cpi->skin_map[bl_index];
num_16x16_skin += is_skin;
num_16x16_nonskin += (1 - is_skin);
if (num_16x16_nonskin > 3) {
@@ -849,7 +861,7 @@ static void copy_partitioning_helper(VP9_COMP *cpi, MACROBLOCK *x,
int start_pos = mi_row * cm->mi_stride + mi_col;
const int bsl = b_width_log2_lookup[bsize];
- const int bs = (1 << bsl) / 4;
+ const int bs = (1 << bsl) >> 2;
BLOCK_SIZE subsize;
PARTITION_TYPE partition;
@@ -895,10 +907,7 @@ static int copy_partitioning(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
int layer = LAYER_IDS_TO_IDX(0, cpi->svc.temporal_layer_id,
cpi->svc.number_temporal_layers);
const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
- if (lc->is_key_frame ||
- (cpi->svc.temporal_layer_id != cpi->svc.number_temporal_layers - 1 &&
- cpi->svc.number_temporal_layers > 1))
- svc_copy_allowed = 0;
+ if (lc->is_key_frame || !cpi->svc.non_reference_frame) svc_copy_allowed = 0;
frames_since_key_thresh = cpi->svc.number_spatial_layers << 1;
}
if (cpi->rc.frames_since_key > frames_since_key_thresh && svc_copy_allowed &&
@@ -917,13 +926,165 @@ static int copy_partitioning(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
return 0;
}
-static void update_prev_partition(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
- int mi_col) {
+static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int mi_row_high, int mi_col_high) {
+ VP9_COMMON *const cm = &cpi->common;
+ SVC *const svc = &cpi->svc;
+ BLOCK_SIZE *prev_part = svc->prev_partition_svc;
+ // Variables with _high are for higher resolution.
+ int bsize_high = 0;
+ int subsize_high = 0;
+ const int bsl_high = b_width_log2_lookup[bsize];
+ const int bs_high = (1 << bsl_high) >> 2;
+ const int has_rows = (mi_row_high + bs_high) < cm->mi_rows;
+ const int has_cols = (mi_col_high + bs_high) < cm->mi_cols;
+
+ const int row_boundary_block_scale_factor[BLOCK_SIZES] = {
+ 13, 13, 13, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0
+ };
+ const int col_boundary_block_scale_factor[BLOCK_SIZES] = {
+ 13, 13, 13, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0
+ };
+ int start_pos;
+ BLOCK_SIZE bsize_low;
+ PARTITION_TYPE partition_high;
+
+ if (mi_row_high >= cm->mi_rows || mi_col_high >= cm->mi_cols) return 0;
+ if (mi_row >= (cm->mi_rows >> 1) || mi_col >= (cm->mi_cols >> 1)) return 0;
+
+ // Find corresponding (mi_col/mi_row) block down-scaled by 2x2.
+ start_pos = mi_row * (svc->mi_stride[svc->spatial_layer_id - 1]) + mi_col;
+ bsize_low = prev_part[start_pos];
+ // The block size is too big for the frame boundary. Do variance-based partitioning.
+ if ((!has_rows || !has_cols) && bsize_low > BLOCK_16X16) return 1;
+
+ // For reference frames: return 1 (do variance-based partitioning) if the
+ // superblock does not have low source SAD and the lower-resolution bsize is
+ // below 32x32.
+ if (!cpi->svc.non_reference_frame && !x->skip_low_source_sad &&
+ bsize_low < BLOCK_32X32)
+ return 1;
+
+ // Scale up block size by 2x2. Force 64x64 for size larger than 32x32.
+ if (bsize_low < BLOCK_32X32) {
+ bsize_high = bsize_low + 3;
+ } else if (bsize_low >= BLOCK_32X32) {
+ bsize_high = BLOCK_64X64;
+ }
+ // Scale up blocks on boundary.
+ if (!has_cols && has_rows) {
+ bsize_high = bsize_low + row_boundary_block_scale_factor[bsize_low];
+ } else if (has_cols && !has_rows) {
+ bsize_high = bsize_low + col_boundary_block_scale_factor[bsize_low];
+ } else if (!has_cols && !has_rows) {
+ bsize_high = bsize_low;
+ }
+
+ partition_high = partition_lookup[bsl_high][bsize_high];
+ subsize_high = get_subsize(bsize, partition_high);
+
+ if (subsize_high < BLOCK_8X8) {
+ set_block_size(cpi, x, xd, mi_row_high, mi_col_high, bsize_high);
+ } else {
+ const int bsl = b_width_log2_lookup[bsize];
+ const int bs = (1 << bsl) >> 2;
+ switch (partition_high) {
+ case PARTITION_NONE:
+ set_block_size(cpi, x, xd, mi_row_high, mi_col_high, bsize_high);
+ break;
+ case PARTITION_HORZ:
+ set_block_size(cpi, x, xd, mi_row_high, mi_col_high, subsize_high);
+ if (subsize_high < BLOCK_64X64)
+ set_block_size(cpi, x, xd, mi_row_high + bs_high, mi_col_high,
+ subsize_high);
+ break;
+ case PARTITION_VERT:
+ set_block_size(cpi, x, xd, mi_row_high, mi_col_high, subsize_high);
+ if (subsize_high < BLOCK_64X64)
+ set_block_size(cpi, x, xd, mi_row_high, mi_col_high + bs_high,
+ subsize_high);
+ break;
+ case PARTITION_SPLIT:
+ if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, mi_col,
+ mi_row_high, mi_col_high))
+ return 1;
+ if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row + (bs >> 1),
+ mi_col, mi_row_high + bs_high, mi_col_high))
+ return 1;
+ if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row,
+ mi_col + (bs >> 1), mi_row_high,
+ mi_col_high + bs_high))
+ return 1;
+ if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row + (bs >> 1),
+ mi_col + (bs >> 1), mi_row_high + bs_high,
+ mi_col_high + bs_high))
+ return 1;
+ break;
+ default: assert(0);
+ }
+ }
+
+ return 0;
+}
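The bsize_low + 3 step above leans on the layout of the BLOCK_SIZES enum, where the square sizes sit three entries apart, so adding 3 doubles both dimensions, matching the 2x spatial scaling between SVC layers. A note for reference (enum values as defined in vp9/common/vp9_enums.h):

/* BLOCK_8X8 = 3, BLOCK_16X16 = 6, BLOCK_32X32 = 9, BLOCK_64X64 = 12, so for
 * square sizes below 32x32:
 *   bsize_high = bsize_low + 3;   // 8x8 -> 16x16, 16x16 -> 32x32
 * while 32x32 and above saturate to BLOCK_64X64, as the code above does. */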
+
+static void update_partition_svc(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ VP9_COMMON *const cm = &cpi->common;
+ BLOCK_SIZE *prev_part = cpi->svc.prev_partition_svc;
+ int start_pos = mi_row * cm->mi_stride + mi_col;
+ const int bsl = b_width_log2_lookup[bsize];
+ const int bs = (1 << bsl) >> 2;
+ BLOCK_SIZE subsize;
+ PARTITION_TYPE partition;
+ const MODE_INFO *mi = NULL;
+ int xx, yy;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ mi = cm->mi_grid_visible[start_pos];
+ partition = partition_lookup[bsl][mi->sb_type];
+ subsize = get_subsize(bsize, partition);
+ if (subsize < BLOCK_8X8) {
+ prev_part[start_pos] = bsize;
+ } else {
+ switch (partition) {
+ case PARTITION_NONE:
+ prev_part[start_pos] = bsize;
+ if (bsize == BLOCK_64X64) {
+ for (xx = 0; xx < 8; xx += 4)
+ for (yy = 0; yy < 8; yy += 4) {
+ if ((mi_row + xx < cm->mi_rows) && (mi_col + yy < cm->mi_cols))
+ prev_part[start_pos + xx * cm->mi_stride + yy] = bsize;
+ }
+ }
+ break;
+ case PARTITION_HORZ:
+ prev_part[start_pos] = subsize;
+ if (mi_row + bs < cm->mi_rows)
+ prev_part[start_pos + bs * cm->mi_stride] = subsize;
+ break;
+ case PARTITION_VERT:
+ prev_part[start_pos] = subsize;
+ if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize;
+ break;
+ case PARTITION_SPLIT:
+ update_partition_svc(cpi, subsize, mi_row, mi_col);
+ update_partition_svc(cpi, subsize, mi_row + bs, mi_col);
+ update_partition_svc(cpi, subsize, mi_row, mi_col + bs);
+ update_partition_svc(cpi, subsize, mi_row + bs, mi_col + bs);
+ break;
+ default: assert(0);
+ }
+ }
+}
+
+static void update_prev_partition_helper(VP9_COMP *cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
VP9_COMMON *const cm = &cpi->common;
BLOCK_SIZE *prev_part = cpi->prev_partition;
int start_pos = mi_row * cm->mi_stride + mi_col;
const int bsl = b_width_log2_lookup[bsize];
- const int bs = (1 << bsl) / 4;
+ const int bs = (1 << bsl) >> 2;
BLOCK_SIZE subsize;
PARTITION_TYPE partition;
const MODE_INFO *mi = NULL;
@@ -948,16 +1109,26 @@ static void update_prev_partition(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize;
break;
case PARTITION_SPLIT:
- update_prev_partition(cpi, subsize, mi_row, mi_col);
- update_prev_partition(cpi, subsize, mi_row + bs, mi_col);
- update_prev_partition(cpi, subsize, mi_row, mi_col + bs);
- update_prev_partition(cpi, subsize, mi_row + bs, mi_col + bs);
+ update_prev_partition_helper(cpi, subsize, mi_row, mi_col);
+ update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col);
+ update_prev_partition_helper(cpi, subsize, mi_row, mi_col + bs);
+ update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col + bs);
break;
default: assert(0);
}
}
}
+static void update_prev_partition(VP9_COMP *cpi, MACROBLOCK *x, int segment_id,
+ int mi_row, int mi_col, int sb_offset) {
+ update_prev_partition_helper(cpi, BLOCK_64X64, mi_row, mi_col);
+ cpi->prev_segment_id[sb_offset] = segment_id;
+ memcpy(&(cpi->prev_variance_low[sb_offset * 25]), x->variance_low,
+ sizeof(x->variance_low));
+ // Reset the counter for copy partitioning
+ cpi->copied_frame_cnt[sb_offset] = 0;
+}
+
static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize,
unsigned int y_sad, int is_key_frame) {
int i;
@@ -989,8 +1160,8 @@ static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize,
}
}
-static void avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift,
- int sb_offset) {
+static uint64_t avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift,
+ int sb_offset) {
unsigned int tmp_sse;
uint64_t tmp_sad;
unsigned int tmp_variance;
@@ -1002,7 +1173,7 @@ static void avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift,
uint64_t avg_source_sad_threshold = 10000;
uint64_t avg_source_sad_threshold2 = 12000;
#if CONFIG_VP9_HIGHBITDEPTH
- if (cpi->common.use_highbitdepth) return;
+ if (cpi->common.use_highbitdepth) return 0;
#endif
src_y += shift;
last_src_y += shift;
@@ -1019,8 +1190,12 @@ static void avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift,
: kHighSadHighSumdiff;
// Detect large lighting change.
- if (tmp_variance < (tmp_sse >> 3) && (tmp_sse - tmp_variance) > 10000)
+ if (cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
+ cpi->oxcf.rc_mode == VPX_CBR && tmp_variance < (tmp_sse >> 3) &&
+ (tmp_sse - tmp_variance) > 10000)
x->content_state_sb = kLowVarHighSumdiff;
+ else if (tmp_sad > (avg_source_sad_threshold << 1))
+ x->content_state_sb = kVeryHighSad;
if (cpi->content_state_sb_fd != NULL) {
if (tmp_sad < avg_source_sad_threshold2) {
@@ -1031,7 +1206,7 @@ static void avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift,
cpi->content_state_sb_fd[sb_offset] = 0;
}
}
- return;
+ return tmp_sad;
}
// This function chooses partitioning based on the variance between source and
@@ -1042,7 +1217,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
MACROBLOCKD *xd = &x->e_mbd;
int i, j, k, m;
v64x64 vt;
- v16x16 vt2[16];
+ v16x16 *vt2 = NULL;
int force_split[21];
int avg_32x32;
int max_var_32x32 = 0;
@@ -1058,6 +1233,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
const uint8_t *d;
int sp;
int dp;
+ int compute_minmax_variance = 1;
unsigned int y_sad = UINT_MAX;
BLOCK_SIZE bsize = BLOCK_64X64;
// Ref frame used in partitioning.
@@ -1082,6 +1258,11 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
segment_id = xd->mi[0]->segment_id;
+ if (cpi->oxcf.speed >= 8 || (cpi->use_svc && cpi->svc.non_reference_frame))
+ compute_minmax_variance = 0;
+
+ memset(x->variance_low, 0, sizeof(x->variance_low));
+
if (cpi->sf.use_source_sad && !is_key_frame) {
int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3);
content_state = x->content_state_sb;
@@ -1092,9 +1273,27 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
x->lowvar_highsumdiff = (content_state == kLowVarHighSumdiff) ? 1 : 0;
if (cpi->content_state_sb_fd != NULL)
x->last_sb_high_content = cpi->content_state_sb_fd[sb_offset2];
+
+ // For SVC on top spatial layer: use/scale the partition from
+ // the lower spatial resolution if svc_use_lowres_part is enabled.
+ if (cpi->sf.svc_use_lowres_part &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 &&
+ cpi->svc.prev_partition_svc != NULL && content_state != kVeryHighSad) {
+ if (!scale_partitioning_svc(cpi, x, xd, BLOCK_64X64, mi_row >> 1,
+ mi_col >> 1, mi_row, mi_col)) {
+ if (cpi->sf.copy_partition_flag) {
+ update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset);
+ }
+ return 0;
+ }
+ }
// If source_sad is low copy the partition without computing the y_sad.
if (x->skip_low_source_sad && cpi->sf.copy_partition_flag &&
copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) {
+ x->sb_use_mv_part = 1;
+ if (cpi->sf.svc_use_lowres_part &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2)
+ update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col);
return 0;
}
}
@@ -1110,8 +1309,6 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
// For non keyframes, disable 4x4 average for low resolution when speed = 8
threshold_4x4avg = (cpi->oxcf.speed < 8) ? thresholds[1] << 1 : INT64_MAX;
- memset(x->variance_low, 0, sizeof(x->variance_low));
-
if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3);
if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3);
@@ -1171,12 +1368,17 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
mi->mv[0].as_int = 0;
mi->interp_filter = BILINEAR;
- if (cpi->oxcf.speed >= 8 && !low_res)
+ if (cpi->oxcf.speed >= 8 && !low_res &&
+ x->content_state_sb != kVeryHighSad) {
y_sad = cpi->fn_ptr[bsize].sdf(
x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf,
xd->plane[0].pre[0].stride);
- else
+ } else {
y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
+ x->sb_use_mv_part = 1;
+ x->sb_mvcol_part = mi->mv[0].as_mv.col;
+ x->sb_mvrow_part = mi->mv[0].as_mv.row;
+ }
y_sad_last = y_sad;
// Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad
@@ -1197,7 +1399,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
- x->sb_is_skin = skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split);
+ if (cpi->use_skin_detection)
+ x->sb_is_skin =
+ skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split);
d = xd->plane[0].dst.buf;
dp = xd->plane[0].dst.stride;
@@ -1212,6 +1416,12 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_64X64);
x->variance_low[0] = 1;
chroma_check(cpi, x, bsize, y_sad, is_key_frame);
+ if (cpi->sf.svc_use_lowres_part &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2)
+ update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col);
+ if (cpi->sf.copy_partition_flag) {
+ update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset);
+ }
return 0;
}
}
@@ -1223,6 +1433,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
if (cpi->sf.copy_partition_flag && y_sad_last < cpi->vbp_threshold_copy &&
copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) {
chroma_check(cpi, x, bsize, y_sad, is_key_frame);
+ if (cpi->sf.svc_use_lowres_part &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2)
+ update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col);
return 0;
}
} else {
@@ -1240,6 +1453,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
#endif // CONFIG_VP9_HIGHBITDEPTH
}
+ if (low_res && threshold_4x4avg < INT64_MAX)
+ CHECK_MEM_ERROR(cm, vt2, vpx_calloc(16, sizeof(*vt2)));
// Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
// for splits.
for (i = 0; i < 4; i++) {
@@ -1276,7 +1491,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
force_split[split_index] = 1;
force_split[i + 1] = 1;
force_split[0] = 1;
- } else if (cpi->oxcf.speed < 8 &&
+ } else if (compute_minmax_variance &&
vt.split[i].split[j].part_variances.none.variance >
thresholds[1] &&
!cyclic_refresh_segment_id_boosted(segment_id)) {
@@ -1288,7 +1503,10 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
xd->cur_buf->flags,
#endif
pixels_wide, pixels_high);
- if (minmax > cpi->vbp_threshold_minmax) {
+ int thresh_minmax = (int)cpi->vbp_threshold_minmax;
+ if (x->content_state_sb == kVeryHighSad)
+ thresh_minmax = thresh_minmax << 1;
+ if (minmax > thresh_minmax) {
force_split[split_index] = 1;
force_split[i + 1] = 1;
force_split[0] = 1;
@@ -1431,21 +1649,20 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
}
if (cm->frame_type != KEY_FRAME && cpi->sf.copy_partition_flag) {
- update_prev_partition(cpi, BLOCK_64X64, mi_row, mi_col);
- cpi->prev_segment_id[sb_offset] = segment_id;
- memcpy(&(cpi->prev_variance_low[sb_offset * 25]), x->variance_low,
- sizeof(x->variance_low));
- // Reset the counter for copy partitioning
- if (cpi->copied_frame_cnt[sb_offset] == cpi->max_copied_frame)
- cpi->copied_frame_cnt[sb_offset] = 0;
+ update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset);
}
+ if (cm->frame_type != KEY_FRAME && cpi->sf.svc_use_lowres_part &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2)
+ update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col);
+
if (cpi->sf.short_circuit_low_temp_var) {
set_low_temp_var_flag(cpi, x, xd, &vt, thresholds, ref_frame_partition,
mi_col, mi_row);
}
chroma_check(cpi, x, bsize, y_sad, is_key_frame);
+ if (vt2) vpx_free(vt2);
return 0;
}
@@ -3480,7 +3697,7 @@ static TX_MODE select_tx_mode(const VP9_COMP *cpi, MACROBLOCKD *const xd) {
static void hybrid_intra_mode_search(VP9_COMP *cpi, MACROBLOCK *const x,
RD_COST *rd_cost, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx) {
- if (bsize < BLOCK_16X16)
+ if (!cpi->sf.nonrd_keyframe && bsize < BLOCK_16X16)
vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
else
vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
@@ -3644,6 +3861,9 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
!force_horz_split && xss <= yss && bsize >= BLOCK_8X8;
(void)*tp_orig;
+ // Avoid checking for rectangular partitions for speed >= 6.
+ if (cpi->oxcf.speed >= 6) do_rect = 0;
+
assert(num_8x8_blocks_wide_lookup[bsize] ==
num_8x8_blocks_high_lookup[bsize]);
@@ -3871,6 +4091,8 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
PARTITION_TYPE partition;
BLOCK_SIZE subsize;
RD_COST this_rdc;
+ BLOCK_SIZE subsize_ref =
+ (cpi->sf.adapt_partition_source_sad) ? BLOCK_8X8 : BLOCK_16X16;
vp9_rd_cost_reset(&this_rdc);
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
@@ -3884,7 +4106,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, rd_cost,
0, INT64_MAX, pc_tree);
} else if (bsize == BLOCK_32X32 && partition != PARTITION_NONE &&
- subsize >= BLOCK_16X16) {
+ subsize >= subsize_ref) {
x->max_partition_size = BLOCK_32X32;
x->min_partition_size = BLOCK_8X8;
nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, rd_cost,
@@ -4132,6 +4354,10 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
(*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row,
sb_col_in_tile);
+ if (cpi->use_skin_detection) {
+ vp9_compute_skin_sb(cpi, BLOCK_16X16, mi_row, mi_col);
+ }
+
x->source_variance = UINT_MAX;
vp9_zero(x->pred_mv);
vp9_rd_cost_init(&dummy_rdc);
@@ -4141,6 +4367,12 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
x->skip_low_source_sad = 0;
x->lowvar_highsumdiff = 0;
x->content_state_sb = 0;
+ x->sb_use_mv_part = 0;
+ x->sb_mvcol_part = 0;
+ x->sb_mvrow_part = 0;
+ x->sb_pickmode_part = 0;
+ x->arf_frame_usage = 0;
+ x->lastgolden_frame_usage = 0;
if (seg->enabled) {
const uint8_t *const map =
@@ -4155,7 +4387,12 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
if (cpi->compute_source_sad_onepass && cpi->sf.use_source_sad) {
int shift = cpi->Source->y_stride * (mi_row << 3) + (mi_col << 3);
int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3);
- avg_source_sad(cpi, x, shift, sb_offset2);
+ int64_t source_sad = avg_source_sad(cpi, x, shift, sb_offset2);
+ if (sf->adapt_partition_source_sad &&
+ (cpi->oxcf.rc_mode == VPX_VBR && !cpi->rc.is_src_frame_alt_ref &&
+ source_sad > sf->adapt_partition_thresh &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)))
+ partition_search_type = REFERENCE_PARTITION;
}
// Set the partition type of the 64X64 block
@@ -4181,12 +4418,14 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
break;
case REFERENCE_PARTITION:
+ x->sb_pickmode_part = 1;
set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
// Use nonrd_pick_partition on scene-cut for VBR mode.
// nonrd_pick_partition does not support 4x4 partition, so avoid it
// on key frame for now.
if ((cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.high_source_sad &&
- cm->frame_type != KEY_FRAME)) {
+ cpi->oxcf.speed < 6 && cm->frame_type != KEY_FRAME &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
// Use lower max_partition_size for low resolutions.
if (cm->width <= 352 && cm->height <= 288)
x->max_partition_size = BLOCK_32X32;
@@ -4213,12 +4452,34 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
default: assert(0); break;
}
+ // Update ref_frame usage for inter frame if this group is an ARF group.
+ if (!cpi->rc.is_src_frame_alt_ref && !cpi->refresh_golden_frame &&
+ !cpi->refresh_alt_ref_frame && cpi->rc.alt_ref_gf_group &&
+ cpi->sf.use_altref_onepass) {
+ int sboffset = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3);
+ if (cpi->count_arf_frame_usage != NULL)
+ cpi->count_arf_frame_usage[sboffset] = x->arf_frame_usage;
+ if (cpi->count_lastgolden_frame_usage != NULL)
+ cpi->count_lastgolden_frame_usage[sboffset] = x->lastgolden_frame_usage;
+ }
+
(*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row,
sb_col_in_tile, num_sb_cols);
}
}
// end RTC play code
+static INLINE uint32_t variance(const diff *const d) {
+ return d->sse - (uint32_t)(((int64_t)d->sum * d->sum) >> 8);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE uint32_t variance_highbd(diff *const d) {
+ const int64_t var = (int64_t)d->sse - (((int64_t)d->sum * d->sum) >> 8);
+ return (var >= 0) ? (uint32_t)var : 0;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
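For context: both helpers compute the population variance of a 16x16 (256-sample) block, sse - sum^2/256, which is why the squared sum is shifted right by 8. The int64_t multiply is what fixes the 32-bit overflow risk in the inline expression removed further down, and the high-bit-depth variant additionally clamps at zero as a guard. A minimal standalone check with invented numbers:

#include <assert.h>
#include <stdint.h>

int main(void) {
  /* Hypothetical 16x16 block stats; not taken from the patch. */
  const uint32_t sse = 70000;
  const int sum = -1280;
  /* The int64_t cast keeps sum * sum safe when |sum| is large;
   * here 1280 * 1280 >> 8 == 6400. */
  const uint32_t var = sse - (uint32_t)(((int64_t)sum * sum) >> 8);
  assert(var == 70000 - 6400);
  return 0;
}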
+
static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
const SPEED_FEATURES *const sf = &cpi->sf;
const VP9_COMMON *const cm = &cpi->common;
@@ -4248,14 +4509,17 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
case VPX_BITS_8:
vpx_highbd_8_get16x16var(src, src_stride, last_src, last_stride,
&var16->sse, &var16->sum);
+ var16->var = variance(var16);
break;
case VPX_BITS_10:
vpx_highbd_10_get16x16var(src, src_stride, last_src, last_stride,
&var16->sse, &var16->sum);
+ var16->var = variance_highbd(var16);
break;
case VPX_BITS_12:
vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride,
&var16->sse, &var16->sum);
+ var16->var = variance_highbd(var16);
break;
default:
assert(0 &&
@@ -4266,12 +4530,13 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
} else {
vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse,
&var16->sum);
+ var16->var = variance(var16);
}
#else
vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse,
&var16->sum);
+ var16->var = variance(var16);
#endif // CONFIG_VP9_HIGHBITDEPTH
- var16->var = var16->sse - (((uint32_t)var16->sum * var16->sum) >> 8);
if (var16->var >= VAR_HIST_MAX_BG_VAR)
hist[VAR_HIST_BINS - 1]++;
@@ -4482,15 +4747,15 @@ static void encode_frame_internal(VP9_COMP *cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth)
- x->fwd_txm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4;
+ x->fwd_txfm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4;
else
- x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4;
- x->highbd_itxm_add =
+ x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4;
+ x->highbd_inv_txfm_add =
xd->lossless ? vp9_highbd_iwht4x4_add : vp9_highbd_idct4x4_add;
#else
- x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4;
+ x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4;
#endif // CONFIG_VP9_HIGHBITDEPTH
- x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
+ x->inv_txfm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
if (xd->lossless) x->optimize = 0;
@@ -4733,8 +4998,31 @@ void vp9_encode_frame(VP9_COMP *cpi) {
}
}
} else {
+ FRAME_COUNTS *counts = cpi->td.counts;
cm->reference_mode = SINGLE_REFERENCE;
+ if (cpi->allow_comp_inter_inter && cpi->sf.use_compound_nonrd_pickmode &&
+ cpi->rc.alt_ref_gf_group && !cpi->rc.is_src_frame_alt_ref &&
+ cm->frame_type != KEY_FRAME)
+ cm->reference_mode = REFERENCE_MODE_SELECT;
+
encode_frame_internal(cpi);
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ int single_count_zero = 0;
+ int comp_count_zero = 0;
+ int i;
+ for (i = 0; i < COMP_INTER_CONTEXTS; i++) {
+ single_count_zero += counts->comp_inter[i][0];
+ comp_count_zero += counts->comp_inter[i][1];
+ }
+ if (comp_count_zero == 0) {
+ cm->reference_mode = SINGLE_REFERENCE;
+ vp9_zero(counts->comp_inter);
+ } else if (single_count_zero == 0) {
+ cm->reference_mode = COMPOUND_REFERENCE;
+ vp9_zero(counts->comp_inter);
+ }
+ }
}
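The counts gathered above gate a frame-level collapse: if compound prediction was never chosen (or was always chosen), the per-block comp_inter flag carries no information, so the frame is re-marked with the mode actually used and the counts are cleared. A self-contained sketch of that decision, with an invented context count and usage numbers:

#include <stdio.h>

int main(void) {
  /* Made-up counts for 3 contexts; [0] = single ref, [1] = compound. */
  const int comp_inter[3][2] = { { 12, 0 }, { 40, 0 }, { 7, 0 } };
  int single = 0, comp = 0;
  int i;
  for (i = 0; i < 3; i++) {
    single += comp_inter[i][0];
    comp += comp_inter[i][1];
  }
  if (comp == 0)
    puts("compound never used -> SINGLE_REFERENCE");
  else if (single == 0)
    puts("compound always used -> COMPOUND_REFERENCE");
  else
    puts("mixed usage -> keep REFERENCE_MODE_SELECT");
  return 0;
}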
// If segmented AQ is enabled compute the average AQ weighting.
diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c
index 7e30499c5..f3c17f255 100644
--- a/libvpx/vp9/encoder/vp9_encodemb.c
+++ b/libvpx/vp9/encoder/vp9_encodemb.c
@@ -49,283 +49,258 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
pd->dst.buf, pd->dst.stride);
}
-typedef struct vp9_token_state {
- int64_t error;
- int rate;
- int16_t next;
- int16_t token;
- tran_low_t qc;
- tran_low_t dqc;
- uint8_t best_index;
-} vp9_token_state;
-
static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
{ 10, 6 }, { 8, 5 },
};
-#define UPDATE_RD_COST() \
- { \
- rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); \
- rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); \
- }
-
-// This function is a place holder for now but may ultimately need
-// to scan previous tokens to work out the correct context.
-static int trellis_get_coeff_context(const int16_t *scan, const int16_t *nb,
- int idx, int token, uint8_t *token_cache) {
- int bak = token_cache[scan[idx]], pt;
- token_cache[scan[idx]] = vp9_pt_energy_class[token];
- pt = get_coef_context(nb, token_cache, idx + 1);
- token_cache[scan[idx]] = bak;
- return pt;
-}
+// 'num' can be negative, but 'shift' must be non-negative.
+#define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \
+ ((num) >= 0) ? (num) >> (shift) : -((-(num)) >> (shift))
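The negate-shift-negate detour matters because right-shifting a negative signed integer is implementation-defined in C90, and even where it is defined it rounds toward negative infinity; shifting the magnitude instead gives sign-symmetric results. A quick standalone check (values are illustrative):

#include <assert.h>

#define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \
  ((num) >= 0) ? (num) >> (shift) : -((-(num)) >> (shift))

int main(void) {
  assert((RIGHT_SHIFT_POSSIBLY_NEGATIVE(5, 1)) == 2);
  /* Magnitude shift: symmetric around zero... */
  assert((RIGHT_SHIFT_POSSIBLY_NEGATIVE(-5, 1)) == -2);
  /* ...whereas an arithmetic shift on a two's-complement target gives
   * -5 >> 1 == -3 (rounding toward negative infinity). */
  return 0;
}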
-static const int16_t band_count_table[TX_SIZES][8] = {
- { 1, 2, 3, 4, 3, 16 - 13, 0 },
- { 1, 2, 3, 4, 11, 64 - 21, 0 },
- { 1, 2, 3, 4, 11, 256 - 21, 0 },
- { 1, 2, 3, 4, 11, 1024 - 21, 0 },
-};
-static const int16_t band_cum_count_table[TX_SIZES][8] = {
- { 0, 1, 3, 6, 10, 13, 16, 0 },
- { 0, 1, 3, 6, 10, 21, 64, 0 },
- { 0, 1, 3, 6, 10, 21, 256, 0 },
- { 0, 1, 3, 6, 10, 21, 1024, 0 },
-};
int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
int ctx) {
MACROBLOCKD *const xd = &mb->e_mbd;
struct macroblock_plane *const p = &mb->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
const int ref = is_inter_block(xd->mi[0]);
- vp9_token_state tokens[1025][2];
uint8_t token_cache[1024];
- const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
+ const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
const int eob = p->eobs[block];
- const PLANE_TYPE type = get_plane_type(plane);
+ const PLANE_TYPE plane_type = get_plane_type(plane);
const int default_eob = 16 << (tx_size << 1);
const int shift = (tx_size == TX_32X32);
const int16_t *const dequant_ptr = pd->dequant;
const uint8_t *const band_translate = get_band_translate(tx_size);
- const scan_order *const so = get_scan(xd, tx_size, type, block);
+ const scan_order *const so = get_scan(xd, tx_size, plane_type, block);
const int16_t *const scan = so->scan;
const int16_t *const nb = so->neighbors;
- const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift };
- int next = eob, sz = 0;
- const int64_t rdmult = ((int64_t)mb->rdmult * plane_rd_mult[ref][type]) >> 1;
+ const int64_t rdmult =
+ ((int64_t)mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1;
const int64_t rddiv = mb->rddiv;
int64_t rd_cost0, rd_cost1;
- int rate0, rate1;
- int64_t error0, error1;
+ int64_t rate0, rate1;
int16_t t0, t1;
- int best, band = (eob < default_eob) ? band_translate[eob]
- : band_translate[eob - 1];
- int pt, i, final_eob;
+ int i, final_eob;
#if CONFIG_VP9_HIGHBITDEPTH
const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
#else
const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8);
#endif
- unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
- mb->token_costs[tx_size][type][ref];
- const int16_t *band_counts = &band_count_table[tx_size][band];
- int16_t band_left = eob - band_cum_count_table[tx_size][band] + 1;
-
- token_costs += band;
-
- assert((!type && !plane) || (type && plane));
+ unsigned int(*const token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+ mb->token_costs[tx_size][plane_type][ref];
+ unsigned int(*token_costs_cur)[2][COEFF_CONTEXTS][ENTROPY_TOKENS];
+ int64_t eob_cost0, eob_cost1;
+ const int ctx0 = ctx;
+ int64_t accu_rate = 0;
+ // Initialized to the worst possible error for the largest transform size.
+ // This ensures that it never goes negative.
+ int64_t accu_error = ((int64_t)1) << 50;
+ int64_t best_block_rd_cost = INT64_MAX;
+ int x_prev = 1;
+ tran_low_t before_best_eob_qc = 0;
+ tran_low_t before_best_eob_dqc = 0;
+
+ assert((!plane_type && !plane) || (plane_type && plane));
assert(eob <= default_eob);
- /* Now set up a Viterbi trellis to evaluate alternative roundings. */
- /* Initialize the sentinel node of the trellis. */
- tokens[eob][0].rate = 0;
- tokens[eob][0].error = 0;
- tokens[eob][0].next = default_eob;
- tokens[eob][0].token = EOB_TOKEN;
- tokens[eob][0].qc = 0;
- tokens[eob][1] = tokens[eob][0];
-
for (i = 0; i < eob; i++) {
const int rc = scan[i];
token_cache[rc] = vp9_pt_energy_class[vp9_get_token(qcoeff[rc])];
}
+ final_eob = 0;
- for (i = eob; i-- > 0;) {
- int base_bits, d2, dx;
+ // Initial RD cost.
+ token_costs_cur = token_costs + band_translate[0];
+ rate0 = (*token_costs_cur)[0][ctx0][EOB_TOKEN];
+ best_block_rd_cost = RDCOST(rdmult, rddiv, rate0, accu_error);
+
+ // For each token, pick one of two choices greedily:
+ // (i) First candidate: Keep current quantized value, OR
+ // (ii) Second candidate: Reduce the magnitude of the quantized value by 1.
+ for (i = 0; i < eob; i++) {
const int rc = scan[i];
- int x = qcoeff[rc];
- /* Only add a trellis state for non-zero coefficients. */
- if (x) {
- error0 = tokens[next][0].error;
- error1 = tokens[next][1].error;
- /* Evaluate the first possibility for this state. */
- rate0 = tokens[next][0].rate;
- rate1 = tokens[next][1].rate;
- base_bits = vp9_get_token_cost(x, &t0, cat6_high_cost);
- /* Consider both possible successor states. */
- if (next < default_eob) {
- pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
- rate0 += (*token_costs)[0][pt][tokens[next][0].token];
- rate1 += (*token_costs)[0][pt][tokens[next][1].token];
- }
- UPDATE_RD_COST();
- /* And pick the best. */
- best = rd_cost1 < rd_cost0;
- dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
+ const int x = qcoeff[rc];
+ const int band_cur = band_translate[i];
+ const int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i);
+ const int token_tree_sel_cur = (x_prev == 0);
+ token_costs_cur = token_costs + band_cur;
+ if (x == 0) { // No need to search
+ const int token = vp9_get_token(x);
+ rate0 = (*token_costs_cur)[token_tree_sel_cur][ctx_cur][token];
+ accu_rate += rate0;
+ x_prev = 0;
+ // Note: accu_error does not change.
+ } else {
+ const int dqv = dequant_ptr[rc != 0];
+ // Compute the distortion for quantizing to 0.
+ const int diff_for_zero_raw = (0 - coeff[rc]) * (1 << shift);
+ const int diff_for_zero =
#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- dx >>= xd->bd - 8;
- }
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff_for_zero_raw, xd->bd - 8)
+ :
+#endif
+ diff_for_zero_raw;
+ const int64_t distortion_for_zero =
+ (int64_t)diff_for_zero * diff_for_zero;
+
+ // Compute the distortion for the first candidate
+ const int diff0_raw = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
+ const int diff0 =
+#if CONFIG_VP9_HIGHBITDEPTH
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff0_raw, xd->bd - 8)
+ :
#endif // CONFIG_VP9_HIGHBITDEPTH
- d2 = dx * dx;
- tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
- tokens[i][0].error = d2 + (best ? error1 : error0);
- tokens[i][0].next = next;
- tokens[i][0].token = t0;
- tokens[i][0].qc = x;
- tokens[i][0].dqc = dqcoeff[rc];
- tokens[i][0].best_index = best;
-
- /* Evaluate the second possibility for this state. */
- rate0 = tokens[next][0].rate;
- rate1 = tokens[next][1].rate;
-
- if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) &&
- (abs(x) * dequant_ptr[rc != 0] <
- (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])) {
- sz = -(x < 0);
- x -= 2 * sz + 1;
+ diff0_raw;
+ const int64_t distortion0 = (int64_t)diff0 * diff0;
+
+ // Compute the distortion for the second candidate
+ const int sign = -(x < 0); // -1 if x is negative and 0 otherwise.
+ const int x1 = x - 2 * sign - 1; // abs(x1) = abs(x) - 1.
+ int64_t distortion1;
+ if (x1 != 0) {
+ const int dqv_step =
+#if CONFIG_VP9_HIGHBITDEPTH
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? dqv >> (xd->bd - 8)
+ :
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ dqv;
+ const int diff_step = (dqv_step + sign) ^ sign;
+ const int diff1 = diff0 - diff_step;
+ assert(dqv > 0); // We aren't right shifting a negative number above.
+ distortion1 = (int64_t)diff1 * diff1;
} else {
- tokens[i][1] = tokens[i][0];
- next = i;
-
- if (!(--band_left)) {
- --band_counts;
- band_left = *band_counts;
- --token_costs;
- }
- continue;
+ distortion1 = distortion_for_zero;
}
-
- /* Consider both possible successor states. */
- if (!x) {
- /* If we reduced this coefficient to zero, check to see if
- * we need to move the EOB back here.
- */
- t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
- t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
- base_bits = 0;
- } else {
- base_bits = vp9_get_token_cost(x, &t0, cat6_high_cost);
- t1 = t0;
+ {
+ // Calculate RDCost for current coeff for the two candidates.
+ const int64_t base_bits0 = vp9_get_token_cost(x, &t0, cat6_high_cost);
+ const int64_t base_bits1 = vp9_get_token_cost(x1, &t1, cat6_high_cost);
+ rate0 =
+ base_bits0 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t0];
+ rate1 =
+ base_bits1 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t1];
}
- if (next < default_eob) {
- if (t0 != EOB_TOKEN) {
- pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
- rate0 += (*token_costs)[!x][pt][tokens[next][0].token];
- }
- if (t1 != EOB_TOKEN) {
- pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
- rate1 += (*token_costs)[!x][pt][tokens[next][1].token];
+ {
+ int rdcost_better_for_x1, eob_rdcost_better_for_x1;
+ int dqc0, dqc1;
+ int64_t best_eob_cost_cur;
+ int use_x1;
+
+ // Calculate RD Cost effect on the next coeff for the two candidates.
+ int64_t next_bits0 = 0;
+ int64_t next_bits1 = 0;
+ int64_t next_eob_bits0 = 0;
+ int64_t next_eob_bits1 = 0;
+ if (i < default_eob - 1) {
+ int ctx_next, token_tree_sel_next;
+ const int band_next = band_translate[i + 1];
+ const int token_next =
+ (i + 1 != eob) ? vp9_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN;
+ unsigned int(
+ *const token_costs_next)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+ token_costs + band_next;
+ token_cache[rc] = vp9_pt_energy_class[t0];
+ ctx_next = get_coef_context(nb, token_cache, i + 1);
+ token_tree_sel_next = (x == 0);
+ next_bits0 =
+ (*token_costs_next)[token_tree_sel_next][ctx_next][token_next];
+ next_eob_bits0 =
+ (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN];
+ token_cache[rc] = vp9_pt_energy_class[t1];
+ ctx_next = get_coef_context(nb, token_cache, i + 1);
+ token_tree_sel_next = (x1 == 0);
+ next_bits1 =
+ (*token_costs_next)[token_tree_sel_next][ctx_next][token_next];
+ if (x1 != 0) {
+ next_eob_bits1 =
+ (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN];
+ }
}
- }
- UPDATE_RD_COST();
- /* And pick the best. */
- best = rd_cost1 < rd_cost0;
+ // Compare the total RD costs for two candidates.
+ rd_cost0 = RDCOST(rdmult, rddiv, (rate0 + next_bits0), distortion0);
+ rd_cost1 = RDCOST(rdmult, rddiv, (rate1 + next_bits1), distortion1);
+ rdcost_better_for_x1 = (rd_cost1 < rd_cost0);
+ eob_cost0 = RDCOST(rdmult, rddiv, (accu_rate + rate0 + next_eob_bits0),
+ (accu_error + distortion0 - distortion_for_zero));
+ eob_cost1 = eob_cost0;
+ if (x1 != 0) {
+ eob_cost1 =
+ RDCOST(rdmult, rddiv, (accu_rate + rate1 + next_eob_bits1),
+ (accu_error + distortion1 - distortion_for_zero));
+ eob_rdcost_better_for_x1 = (eob_cost1 < eob_cost0);
+ } else {
+ eob_rdcost_better_for_x1 = 0;
+ }
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz;
- } else {
- dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
- }
-#else
- dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
-#endif // CONFIG_VP9_HIGHBITDEPTH
- d2 = dx * dx;
-
- tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
- tokens[i][1].error = d2 + (best ? error1 : error0);
- tokens[i][1].next = next;
- tokens[i][1].token = best ? t1 : t0;
- tokens[i][1].qc = x;
-
- if (x) {
- tran_low_t offset = dq_step[rc != 0];
- // The 32x32 transform coefficient uses half quantization step size.
- // Account for the rounding difference in the dequantized coefficeint
- // value when the quantization index is dropped from an even number
- // to an odd number.
- if (shift & x) offset += (dequant_ptr[rc != 0] & 0x01);
-
- if (sz == 0)
- tokens[i][1].dqc = dqcoeff[rc] - offset;
- else
- tokens[i][1].dqc = dqcoeff[rc] + offset;
- } else {
- tokens[i][1].dqc = 0;
- }
+ // Calculate the two candidate de-quantized values.
+ dqc0 = dqcoeff[rc];
+ dqc1 = 0;
+ if (rdcost_better_for_x1 + eob_rdcost_better_for_x1) {
+ if (x1 != 0) {
+ dqc1 = RIGHT_SHIFT_POSSIBLY_NEGATIVE(x1 * dqv, shift);
+ } else {
+ dqc1 = 0;
+ }
+ }
- tokens[i][1].best_index = best;
- /* Finally, make this the new head of the trellis. */
- next = i;
- } else {
- /* There's no choice to make for a zero coefficient, so we don't
- * add a new trellis node, but we do need to update the costs.
- */
- pt = get_coef_context(nb, token_cache, i + 1);
- t0 = tokens[next][0].token;
- t1 = tokens[next][1].token;
- /* Update the cost of each path if we're past the EOB token. */
- if (t0 != EOB_TOKEN) {
- tokens[next][0].rate += (*token_costs)[1][pt][t0];
- tokens[next][0].token = ZERO_TOKEN;
- }
- if (t1 != EOB_TOKEN) {
- tokens[next][1].rate += (*token_costs)[1][pt][t1];
- tokens[next][1].token = ZERO_TOKEN;
+ // Pick and record the better quantized and de-quantized values.
+ if (rdcost_better_for_x1) {
+ qcoeff[rc] = x1;
+ dqcoeff[rc] = dqc1;
+ accu_rate += rate1;
+ accu_error += distortion1 - distortion_for_zero;
+ assert(distortion1 <= distortion_for_zero);
+ token_cache[rc] = vp9_pt_energy_class[t1];
+ } else {
+ accu_rate += rate0;
+ accu_error += distortion0 - distortion_for_zero;
+ assert(distortion0 <= distortion_for_zero);
+ token_cache[rc] = vp9_pt_energy_class[t0];
+ }
+ assert(accu_error >= 0);
+ x_prev = qcoeff[rc]; // Update based on selected quantized value.
+
+ use_x1 = (x1 != 0) && eob_rdcost_better_for_x1;
+ best_eob_cost_cur = use_x1 ? eob_cost1 : eob_cost0;
+
+ // Determine whether to move the eob position to i+1
+ if (best_eob_cost_cur < best_block_rd_cost) {
+ best_block_rd_cost = best_eob_cost_cur;
+ final_eob = i + 1;
+ if (use_x1) {
+ before_best_eob_qc = x1;
+ before_best_eob_dqc = dqc1;
+ } else {
+ before_best_eob_qc = x;
+ before_best_eob_dqc = dqc0;
+ }
+ }
}
- tokens[i][0].best_index = tokens[i][1].best_index = 0;
- /* Don't update next, because we didn't add a new node. */
- }
-
- if (!(--band_left)) {
- --band_counts;
- band_left = *band_counts;
- --token_costs;
}
}
-
- /* Now pick the best path through the whole trellis. */
- rate0 = tokens[next][0].rate;
- rate1 = tokens[next][1].rate;
- error0 = tokens[next][0].error;
- error1 = tokens[next][1].error;
- t0 = tokens[next][0].token;
- t1 = tokens[next][1].token;
- rate0 += (*token_costs)[0][ctx][t0];
- rate1 += (*token_costs)[0][ctx][t1];
- UPDATE_RD_COST();
- best = rd_cost1 < rd_cost0;
- final_eob = -1;
-
- for (i = next; i < eob; i = next) {
- const int x = tokens[i][best].qc;
- const int rc = scan[i];
- if (x) final_eob = i;
- qcoeff[rc] = x;
- dqcoeff[rc] = tokens[i][best].dqc;
- next = tokens[i][best].next;
- best = tokens[i][best].best_index;
+ assert(final_eob <= eob);
+ if (final_eob > 0) {
+ int rc;
+ assert(before_best_eob_qc != 0);
+ i = final_eob - 1;
+ rc = scan[i];
+ qcoeff[rc] = before_best_eob_qc;
+ dqcoeff[rc] = before_best_eob_dqc;
+ }
+ for (i = final_eob; i < eob; i++) {
+ int rc = scan[i];
+ qcoeff[rc] = 0;
+ dqcoeff[rc] = 0;
}
- final_eob++;
-
mb->plane[plane].eobs[block] = final_eob;
return final_eob;
}
+#undef RIGHT_SHIFT_POSSIBLY_NEGATIVE
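Net effect of the hunk above: the two-state Viterbi trellis is replaced by a single greedy forward pass that, for each nonzero coefficient, keeps either the quantized value or its magnitude minus one, whichever is cheaper in rate-distortion terms, while separately tracking the best end-of-block cut (omitted below). The toy program that follows reproduces only the per-coefficient choice with an invented cost model (lambda * |q| stands in for the token rate); it is a sketch of the idea, not the encoder's actual cost functions:

#include <stdio.h>
#include <stdlib.h>

int main(void) {
  /* Invented inputs: original transform values and their quantized
   * levels with dequant step 8. */
  const int coeff[4] = { 37, -12, 9, 3 };
  int q[4] = { 5, -2, 1, 0 };
  const int dqv = 8, lambda = 40;
  int i;
  for (i = 0; i < 4; i++) {
    int sign, q1;
    long d0, d1, cost0, cost1;
    if (q[i] == 0) continue; /* zeros: nothing to search */
    sign = q[i] < 0 ? -1 : 1;
    q1 = q[i] - sign; /* |q1| == |q| - 1, may reach zero */
    d0 = (long)(q[i] * dqv - coeff[i]) * (q[i] * dqv - coeff[i]);
    d1 = (long)(q1 * dqv - coeff[i]) * (q1 * dqv - coeff[i]);
    cost0 = (long)lambda * abs(q[i]) + d0;
    cost1 = (long)lambda * abs(q1) + d1;
    if (cost1 < cost0) q[i] = q1; /* greedy: keep the cheaper candidate */
    printf("i=%d keep=%ld reduce=%ld -> q=%d\n", i, cost0, cost1, q[i]);
  }
  return 0;
}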
static INLINE void fdct32x32(int rd_transform, const int16_t *src,
tran_low_t *dst, int src_stride) {
@@ -358,6 +333,8 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col,
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
const int16_t *src_diff;
src_diff = &p->src_diff[4 * (row * diff_stride + col)];
+ // skip block condition should be handled before this is called.
+ assert(!x->skip_block);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -381,7 +358,7 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col,
scan_order->scan, scan_order->iscan);
break;
case TX_4X4:
- x->fwd_txm4x4(src_diff, coeff, diff_stride);
+ x->fwd_txfm4x4(src_diff, coeff, diff_stride);
vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->round_fp,
p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob,
scan_order->scan, scan_order->iscan);
@@ -411,7 +388,7 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col,
eob, scan_order->scan, scan_order->iscan);
break;
case TX_4X4:
- x->fwd_txm4x4(src_diff, coeff, diff_stride);
+ x->fwd_txfm4x4(src_diff, coeff, diff_stride);
vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp,
qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
scan_order->iscan);
@@ -432,6 +409,9 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col,
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
const int16_t *src_diff;
src_diff = &p->src_diff[4 * (row * diff_stride + col)];
+ // skip block condition should be handled before this is called.
+ assert(!x->skip_block);
+
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
switch (tx_size) {
@@ -454,7 +434,7 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col,
eob);
break;
case TX_4X4:
- x->fwd_txm4x4(src_diff, coeff, diff_stride);
+ x->fwd_txfm4x4(src_diff, coeff, diff_stride);
vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round,
p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0],
eob);
@@ -482,7 +462,7 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col,
qcoeff, dqcoeff, pd->dequant[0], eob);
break;
case TX_4X4:
- x->fwd_txm4x4(src_diff, coeff, diff_stride);
+ x->fwd_txfm4x4(src_diff, coeff, diff_stride);
vpx_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0],
qcoeff, dqcoeff, pd->dequant[0], eob);
break;
@@ -503,6 +483,8 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
const int16_t *src_diff;
src_diff = &p->src_diff[4 * (row * diff_stride + col)];
+ // skip block condition should be handled before this is called.
+ assert(!x->skip_block);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -529,7 +511,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
scan_order->iscan);
break;
case TX_4X4:
- x->fwd_txm4x4(src_diff, coeff, diff_stride);
+ x->fwd_txfm4x4(src_diff, coeff, diff_stride);
vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, eob, scan_order->scan,
@@ -562,7 +544,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
scan_order->scan, scan_order->iscan);
break;
case TX_4X4:
- x->fwd_txm4x4(src_diff, coeff, diff_stride);
+ x->fwd_txfm4x4(src_diff, coeff, diff_stride);
vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
scan_order->scan, scan_order->iscan);
@@ -655,8 +637,8 @@ static void encode_block(int plane, int block, int row, int col,
// this is like vp9_short_idct4x4 but has a special case around eob<=1
// which is significant (not just an optimization) for the lossless
// case.
- x->highbd_itxm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
- xd->bd);
+ x->highbd_inv_txfm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
+ xd->bd);
break;
default: assert(0 && "Invalid transform size");
}
@@ -678,7 +660,7 @@ static void encode_block(int plane, int block, int row, int col,
// this is like vp9_short_idct4x4 but has a special case around eob<=1
// which is significant (not just an optimization) for the lossless
// case.
- x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+ x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
break;
default: assert(0 && "Invalid transform size"); break;
}
@@ -700,12 +682,12 @@ static void encode_block_pass1(int plane, int block, int row, int col,
if (p->eobs[block] > 0) {
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- x->highbd_itxm_add(dqcoeff, CONVERT_TO_SHORTPTR(dst), pd->dst.stride,
- p->eobs[block], xd->bd);
+ x->highbd_inv_txfm_add(dqcoeff, CONVERT_TO_SHORTPTR(dst), pd->dst.stride,
+ p->eobs[block], xd->bd);
return;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
- x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+ x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
}
}
@@ -799,6 +781,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
(x->skip_encode || x->fp_src_pred) ? src_stride : dst_stride, dst,
dst_stride, col, row, plane);
+ // skip block condition should be handled before this is called.
+ assert(!x->skip_block);
+
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
@@ -869,7 +854,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
if (tx_type != DCT_DCT)
vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
else
- x->fwd_txm4x4(src_diff, coeff, diff_stride);
+ x->fwd_txfm4x4(src_diff, coeff, diff_stride);
vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, eob, scan_order->scan,
@@ -883,7 +868,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
// this is like vp9_short_idct4x4 but has a special case around
// eob<=1 which is significant (not just an optimization) for the
// lossless case.
- x->highbd_itxm_add(dqcoeff, dst16, dst_stride, *eob, xd->bd);
+ x->highbd_inv_txfm_add(dqcoeff, dst16, dst_stride, *eob, xd->bd);
} else {
vp9_highbd_iht4x4_16_add(dqcoeff, dst16, dst_stride, tx_type,
xd->bd);
@@ -951,7 +936,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
if (tx_type != DCT_DCT)
vp9_fht4x4(src_diff, coeff, diff_stride, tx_type);
else
- x->fwd_txm4x4(src_diff, coeff, diff_stride);
+ x->fwd_txfm4x4(src_diff, coeff, diff_stride);
vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
scan_order->scan, scan_order->iscan);
@@ -964,7 +949,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
// this is like vp9_short_idct4x4 but has a special case around eob<=1
// which is significant (not just an optimization) for the lossless
// case.
- x->itxm_add(dqcoeff, dst, dst_stride, *eob);
+ x->inv_txfm_add(dqcoeff, dst, dst_stride, *eob);
else
vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type);
}
diff --git a/libvpx/vp9/encoder/vp9_encoder.c b/libvpx/vp9/encoder/vp9_encoder.c
index f57f40dbe..2ae59dd98 100644
--- a/libvpx/vp9/encoder/vp9_encoder.c
+++ b/libvpx/vp9/encoder/vp9_encoder.c
@@ -71,7 +71,6 @@
// mv. Choose a very high value for
// now so that HIGH_PRECISION is always
// chosen.
-// #define OUTPUT_YUV_REC
#define FRAME_SIZE_FACTOR 128 // empirical params for context model threshold
#define FRAME_RATE_FACTOR 8
@@ -80,7 +79,7 @@
FILE *yuv_denoised_file = NULL;
#endif
#ifdef OUTPUT_YUV_SKINMAP
-FILE *yuv_skinmap_file = NULL;
+static FILE *yuv_skinmap_file = NULL;
#endif
#ifdef OUTPUT_YUV_REC
FILE *yuv_rec_file;
@@ -438,34 +437,37 @@ static int is_psnr_calc_enabled(VP9_COMP *cpi) {
/* clang-format off */
const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = {
- { LEVEL_1, 829440, 36864, 200, 400, 2, 1, 4, 8 },
- { LEVEL_1_1, 2764800, 73728, 800, 1000, 2, 1, 4, 8 },
- { LEVEL_2, 4608000, 122880, 1800, 1500, 2, 1, 4, 8 },
- { LEVEL_2_1, 9216000, 245760, 3600, 2800, 2, 2, 4, 8 },
- { LEVEL_3, 20736000, 552960, 7200, 6000, 2, 4, 4, 8 },
- { LEVEL_3_1, 36864000, 983040, 12000, 10000, 2, 4, 4, 8 },
- { LEVEL_4, 83558400, 2228224, 18000, 16000, 4, 4, 4, 8 },
- { LEVEL_4_1, 160432128, 2228224, 30000, 18000, 4, 4, 5, 6 },
- { LEVEL_5, 311951360, 8912896, 60000, 36000, 6, 8, 6, 4 },
- { LEVEL_5_1, 588251136, 8912896, 120000, 46000, 8, 8, 10, 4 },
+ // sample rate size breadth bitrate cpb
+ { LEVEL_1, 829440, 36864, 512, 200, 400, 2, 1, 4, 8 },
+ { LEVEL_1_1, 2764800, 73728, 768, 800, 1000, 2, 1, 4, 8 },
+ { LEVEL_2, 4608000, 122880, 960, 1800, 1500, 2, 1, 4, 8 },
+ { LEVEL_2_1, 9216000, 245760, 1344, 3600, 2800, 2, 2, 4, 8 },
+ { LEVEL_3, 20736000, 552960, 2048, 7200, 6000, 2, 4, 4, 8 },
+ { LEVEL_3_1, 36864000, 983040, 2752, 12000, 10000, 2, 4, 4, 8 },
+ { LEVEL_4, 83558400, 2228224, 4160, 18000, 16000, 4, 4, 4, 8 },
+ { LEVEL_4_1, 160432128, 2228224, 4160, 30000, 18000, 4, 4, 5, 6 },
+ { LEVEL_5, 311951360, 8912896, 8384, 60000, 36000, 6, 8, 6, 4 },
+ { LEVEL_5_1, 588251136, 8912896, 8384, 120000, 46000, 8, 8, 10, 4 },
// TODO(huisu): update max_cpb_size for level 5_2 ~ 6_2 when
// they are finalized (currently tentative).
- { LEVEL_5_2, 1176502272, 8912896, 180000, 90000, 8, 8, 10, 4 },
- { LEVEL_6, 1176502272, 35651584, 180000, 90000, 8, 16, 10, 4 },
- { LEVEL_6_1, 2353004544u, 35651584, 240000, 180000, 8, 16, 10, 4 },
- { LEVEL_6_2, 4706009088u, 35651584, 480000, 360000, 8, 16, 10, 4 },
+ { LEVEL_5_2, 1176502272, 8912896, 8384, 180000, 90000, 8, 8, 10, 4 },
+ { LEVEL_6, 1176502272, 35651584, 16832, 180000, 90000, 8, 16, 10, 4 },
+ { LEVEL_6_1, 2353004544u, 35651584, 16832, 240000, 180000, 8, 16, 10, 4 },
+ { LEVEL_6_2, 4706009088u, 35651584, 16832, 480000, 360000, 8, 16, 10, 4 },
};
/* clang-format on */
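The new fourth column is the maximum luma picture breadth, which (judging from the check added to vp9_get_level() below) bounds the larger of the two picture dimensions. This catches extreme aspect ratios that the picture-size limit alone would miss; a worked example with invented dimensions:

#include <assert.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

int main(void) {
  /* A hypothetical 4096x160 stream: its picture size fits LEVEL_3_1
   * (655360 <= 983040), but its breadth max(4096, 160) = 4096 exceeds
   * LEVEL_3_1's 2752, so the new column forces a higher level. */
  const int w = 4096, h = 160;
  assert((long)w * h <= 983040L);
  assert(MAX2(w, h) > 2752);
  return 0;
}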
-static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] =
- { "The average bit-rate is too high.",
- "The picture size is too large.",
- "The luma sample rate is too large.",
- "The CPB size is too large.",
- "The compression ratio is too small",
- "Too many column tiles are used.",
- "The alt-ref distance is too small.",
- "Too many reference buffers are used." };
+static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = {
+ "The average bit-rate is too high.",
+ "The picture size is too large.",
+ "The picture width/height is too large.",
+ "The luma sample rate is too large.",
+ "The CPB size is too large.",
+ "The compression ratio is too small",
+ "Too many column tiles are used.",
+ "The alt-ref distance is too small.",
+ "Too many reference buffers are used."
+};
static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) {
switch (mode) {
@@ -567,6 +569,8 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {
(double)this_level->max_luma_sample_rate *
(1 + SAMPLE_RATE_GRACE_P) ||
level_spec->max_luma_picture_size > this_level->max_luma_picture_size ||
+ level_spec->max_luma_picture_breadth >
+ this_level->max_luma_picture_breadth ||
level_spec->average_bitrate > this_level->average_bitrate ||
level_spec->max_cpb_size > this_level->max_cpb_size ||
level_spec->compression_ratio < this_level->compression_ratio ||
@@ -739,7 +743,9 @@ void vp9_initialize_enc(void) {
vp9_init_me_luts();
vp9_rc_init_minq_luts();
vp9_entropy_mv_init();
+#if !CONFIG_REALTIME_ONLY
vp9_temporal_filter_init();
+#endif
init_done = 1;
}
}
@@ -779,9 +785,15 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
cpi->nmvsadcosts_hp[0] = NULL;
cpi->nmvsadcosts_hp[1] = NULL;
+ vpx_free(cpi->skin_map);
+ cpi->skin_map = NULL;
+
vpx_free(cpi->prev_partition);
cpi->prev_partition = NULL;
+ vpx_free(cpi->svc.prev_partition_svc);
+ cpi->svc.prev_partition_svc = NULL;
+
vpx_free(cpi->prev_segment_id);
cpi->prev_segment_id = NULL;
@@ -794,6 +806,11 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
vpx_free(cpi->content_state_sb_fd);
cpi->content_state_sb_fd = NULL;
+ vpx_free(cpi->count_arf_frame_usage);
+ cpi->count_arf_frame_usage = NULL;
+ vpx_free(cpi->count_lastgolden_frame_usage);
+ cpi->count_lastgolden_frame_usage = NULL;
+
vp9_cyclic_refresh_free(cpi->cyclic_refresh);
cpi->cyclic_refresh = NULL;
@@ -911,6 +928,7 @@ static void restore_coding_context(VP9_COMP *cpi) {
*cm->fc = cc->fc;
}
+#if !CONFIG_REALTIME_ONLY
static void configure_static_seg_features(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
const RATE_CONTROL *const rc = &cpi->rc;
@@ -1034,6 +1052,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
}
}
}
+#endif // !CONFIG_REALTIME_ONLY
static void update_reference_segmentation_map(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
@@ -1203,6 +1222,14 @@ static void set_tile_limits(VP9_COMP *cpi) {
clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols);
cm->log2_tile_rows = cpi->oxcf.tile_rows;
}
+
+ if (cpi->oxcf.target_level == LEVEL_AUTO) {
+ const int level_tile_cols =
+ log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height);
+ if (cm->log2_tile_cols > level_tile_cols) {
+ cm->log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols);
+ }
+ }
}
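For LEVEL_AUTO, the block just added caps the requested tile columns at what the auto-selected level permits for the picture size. A self-contained sketch of the clamp; the log_tile_cols_from_picsize_level() stub below is invented to illustrate only the shape of the real helper, which is added elsewhere in this patch:

#include <stdio.h>

#define VPXMAX(a, b) ((a) > (b) ? (a) : (b))

/* Invented stand-in for the real helper. */
static int log_tile_cols_from_picsize_level(int width, int height) {
  (void)height;
  return width > 2048 ? 2 : 1;
}

int main(void) {
  int log2_tile_cols = 4;           /* user requested 16 tile columns */
  const int min_log2_tile_cols = 0; /* illustrative codec minimum */
  const int limit = log_tile_cols_from_picsize_level(1920, 1080);
  if (log2_tile_cols > limit)
    log2_tile_cols = VPXMAX(limit, min_log2_tile_cols);
  printf("clamped to %d tile columns\n", 1 << log2_tile_cols);
  return 0;
}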
static void update_frame_size(VP9_COMP *cpi) {
@@ -1318,14 +1345,12 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc,
}
#if CONFIG_VP9_HIGHBITDEPTH
-#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \
- cpi->fn_ptr[BT].sdf = SDF; \
- cpi->fn_ptr[BT].sdaf = SDAF; \
- cpi->fn_ptr[BT].vf = VF; \
- cpi->fn_ptr[BT].svf = SVF; \
- cpi->fn_ptr[BT].svaf = SVAF; \
- cpi->fn_ptr[BT].sdx3f = SDX3F; \
- cpi->fn_ptr[BT].sdx8f = SDX8F; \
+#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \
+ cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
+ cpi->fn_ptr[BT].vf = VF; \
+ cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
cpi->fn_ptr[BT].sdx4df = SDX4DF;
#define MAKE_BFP_SAD_WRAPPER(fnname) \
@@ -1364,47 +1389,6 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc,
4; \
}
-#define MAKE_BFP_SAD3_WRAPPER(fnname) \
- static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
- const uint8_t *ref_ptr, int ref_stride, \
- unsigned int *sad_array) { \
- fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
- } \
- static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
- const uint8_t *ref_ptr, int ref_stride, \
- unsigned int *sad_array) { \
- int i; \
- fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
- for (i = 0; i < 3; i++) sad_array[i] >>= 2; \
- } \
- static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
- const uint8_t *ref_ptr, int ref_stride, \
- unsigned int *sad_array) { \
- int i; \
- fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
- for (i = 0; i < 3; i++) sad_array[i] >>= 4; \
- }
-
-#define MAKE_BFP_SAD8_WRAPPER(fnname) \
- static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
- const uint8_t *ref_ptr, int ref_stride, \
- unsigned int *sad_array) { \
- fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
- } \
- static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
- const uint8_t *ref_ptr, int ref_stride, \
- unsigned int *sad_array) { \
- int i; \
- fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
- for (i = 0; i < 8; i++) sad_array[i] >>= 2; \
- } \
- static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
- const uint8_t *ref_ptr, int ref_stride, \
- unsigned int *sad_array) { \
- int i; \
- fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
- for (i = 0; i < 8; i++) sad_array[i] >>= 4; \
- }
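The surviving wrappers (and the x3/x8 variants deleted here) exist to normalize high-bit-depth SADs back into the 8-bit range: a given pixel difference is 4x larger at 10 bits and 16x larger at 12 bits, hence the >>= 2 and >>= 4 in the wrapper bodies. A self-contained sanity check with invented values:

#include <assert.h>
#include <stdint.h>

int main(void) {
  const uint32_t sad8 = 900;        /* hypothetical SAD at 8 bits */
  const uint32_t sad10 = sad8 << 2; /* same content coded at 10 bits */
  const uint32_t sad12 = sad8 << 4; /* same content coded at 12 bits */
  assert((sad10 >> 2) == sad8); /* the _bits10 wrappers' >>= 2 */
  assert((sad12 >> 4) == sad8); /* the _bits12 wrappers' >>= 4 */
  return 0;
}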
#define MAKE_BFP_SAD4D_WRAPPER(fnname) \
static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
const uint8_t *const ref_ptr[], int ref_stride, \
@@ -1440,46 +1424,30 @@ MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg)
-MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad32x32x3)
-MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad32x32x8)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg)
-MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad64x64x3)
-MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad64x64x8)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg)
-MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x16x3)
-MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x16x8)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg)
-MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x8x3)
-MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x8x8)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg)
-MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x16x3)
-MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x16x8)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg)
-MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x8x3)
-MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x8x8)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg)
-MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x4x8)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg)
-MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x8x8)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg)
-MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad4x4x3)
-MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x4x8)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d)
static void highbd_set_var_fns(VP9_COMP *const cpi) {
@@ -1490,253 +1458,236 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits8,
vpx_highbd_sad32x16_avg_bits8, vpx_highbd_8_variance32x16,
vpx_highbd_8_sub_pixel_variance32x16,
- vpx_highbd_8_sub_pixel_avg_variance32x16, NULL, NULL,
+ vpx_highbd_8_sub_pixel_avg_variance32x16,
vpx_highbd_sad32x16x4d_bits8)
HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits8,
vpx_highbd_sad16x32_avg_bits8, vpx_highbd_8_variance16x32,
vpx_highbd_8_sub_pixel_variance16x32,
- vpx_highbd_8_sub_pixel_avg_variance16x32, NULL, NULL,
+ vpx_highbd_8_sub_pixel_avg_variance16x32,
vpx_highbd_sad16x32x4d_bits8)
HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits8,
vpx_highbd_sad64x32_avg_bits8, vpx_highbd_8_variance64x32,
vpx_highbd_8_sub_pixel_variance64x32,
- vpx_highbd_8_sub_pixel_avg_variance64x32, NULL, NULL,
+ vpx_highbd_8_sub_pixel_avg_variance64x32,
vpx_highbd_sad64x32x4d_bits8)
HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits8,
vpx_highbd_sad32x64_avg_bits8, vpx_highbd_8_variance32x64,
vpx_highbd_8_sub_pixel_variance32x64,
- vpx_highbd_8_sub_pixel_avg_variance32x64, NULL, NULL,
+ vpx_highbd_8_sub_pixel_avg_variance32x64,
vpx_highbd_sad32x64x4d_bits8)
HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits8,
vpx_highbd_sad32x32_avg_bits8, vpx_highbd_8_variance32x32,
vpx_highbd_8_sub_pixel_variance32x32,
vpx_highbd_8_sub_pixel_avg_variance32x32,
- vpx_highbd_sad32x32x3_bits8, vpx_highbd_sad32x32x8_bits8,
vpx_highbd_sad32x32x4d_bits8)
HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits8,
vpx_highbd_sad64x64_avg_bits8, vpx_highbd_8_variance64x64,
vpx_highbd_8_sub_pixel_variance64x64,
vpx_highbd_8_sub_pixel_avg_variance64x64,
- vpx_highbd_sad64x64x3_bits8, vpx_highbd_sad64x64x8_bits8,
vpx_highbd_sad64x64x4d_bits8)
HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits8,
vpx_highbd_sad16x16_avg_bits8, vpx_highbd_8_variance16x16,
vpx_highbd_8_sub_pixel_variance16x16,
vpx_highbd_8_sub_pixel_avg_variance16x16,
- vpx_highbd_sad16x16x3_bits8, vpx_highbd_sad16x16x8_bits8,
vpx_highbd_sad16x16x4d_bits8)
- HIGHBD_BFP(
- BLOCK_16X8, vpx_highbd_sad16x8_bits8, vpx_highbd_sad16x8_avg_bits8,
- vpx_highbd_8_variance16x8, vpx_highbd_8_sub_pixel_variance16x8,
- vpx_highbd_8_sub_pixel_avg_variance16x8, vpx_highbd_sad16x8x3_bits8,
- vpx_highbd_sad16x8x8_bits8, vpx_highbd_sad16x8x4d_bits8)
+ HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits8,
+ vpx_highbd_sad16x8_avg_bits8, vpx_highbd_8_variance16x8,
+ vpx_highbd_8_sub_pixel_variance16x8,
+ vpx_highbd_8_sub_pixel_avg_variance16x8,
+ vpx_highbd_sad16x8x4d_bits8)
- HIGHBD_BFP(
- BLOCK_8X16, vpx_highbd_sad8x16_bits8, vpx_highbd_sad8x16_avg_bits8,
- vpx_highbd_8_variance8x16, vpx_highbd_8_sub_pixel_variance8x16,
- vpx_highbd_8_sub_pixel_avg_variance8x16, vpx_highbd_sad8x16x3_bits8,
- vpx_highbd_sad8x16x8_bits8, vpx_highbd_sad8x16x4d_bits8)
+ HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits8,
+ vpx_highbd_sad8x16_avg_bits8, vpx_highbd_8_variance8x16,
+ vpx_highbd_8_sub_pixel_variance8x16,
+ vpx_highbd_8_sub_pixel_avg_variance8x16,
+ vpx_highbd_sad8x16x4d_bits8)
HIGHBD_BFP(
BLOCK_8X8, vpx_highbd_sad8x8_bits8, vpx_highbd_sad8x8_avg_bits8,
vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8,
- vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits8,
- vpx_highbd_sad8x8x8_bits8, vpx_highbd_sad8x8x4d_bits8)
+ vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x4d_bits8)
- HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits8,
- vpx_highbd_sad8x4_avg_bits8, vpx_highbd_8_variance8x4,
- vpx_highbd_8_sub_pixel_variance8x4,
- vpx_highbd_8_sub_pixel_avg_variance8x4, NULL,
- vpx_highbd_sad8x4x8_bits8, vpx_highbd_sad8x4x4d_bits8)
+ HIGHBD_BFP(
+ BLOCK_8X4, vpx_highbd_sad8x4_bits8, vpx_highbd_sad8x4_avg_bits8,
+ vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4,
+ vpx_highbd_8_sub_pixel_avg_variance8x4, vpx_highbd_sad8x4x4d_bits8)
- HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits8,
- vpx_highbd_sad4x8_avg_bits8, vpx_highbd_8_variance4x8,
- vpx_highbd_8_sub_pixel_variance4x8,
- vpx_highbd_8_sub_pixel_avg_variance4x8, NULL,
- vpx_highbd_sad4x8x8_bits8, vpx_highbd_sad4x8x4d_bits8)
+ HIGHBD_BFP(
+ BLOCK_4X8, vpx_highbd_sad4x8_bits8, vpx_highbd_sad4x8_avg_bits8,
+ vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8,
+ vpx_highbd_8_sub_pixel_avg_variance4x8, vpx_highbd_sad4x8x4d_bits8)
HIGHBD_BFP(
BLOCK_4X4, vpx_highbd_sad4x4_bits8, vpx_highbd_sad4x4_avg_bits8,
vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4,
- vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits8,
- vpx_highbd_sad4x4x8_bits8, vpx_highbd_sad4x4x4d_bits8)
+ vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x4d_bits8)
break;
case VPX_BITS_10:
HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits10,
vpx_highbd_sad32x16_avg_bits10, vpx_highbd_10_variance32x16,
vpx_highbd_10_sub_pixel_variance32x16,
- vpx_highbd_10_sub_pixel_avg_variance32x16, NULL, NULL,
+ vpx_highbd_10_sub_pixel_avg_variance32x16,
vpx_highbd_sad32x16x4d_bits10)
HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits10,
vpx_highbd_sad16x32_avg_bits10, vpx_highbd_10_variance16x32,
vpx_highbd_10_sub_pixel_variance16x32,
- vpx_highbd_10_sub_pixel_avg_variance16x32, NULL, NULL,
+ vpx_highbd_10_sub_pixel_avg_variance16x32,
vpx_highbd_sad16x32x4d_bits10)
HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits10,
vpx_highbd_sad64x32_avg_bits10, vpx_highbd_10_variance64x32,
vpx_highbd_10_sub_pixel_variance64x32,
- vpx_highbd_10_sub_pixel_avg_variance64x32, NULL, NULL,
+ vpx_highbd_10_sub_pixel_avg_variance64x32,
vpx_highbd_sad64x32x4d_bits10)
HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits10,
vpx_highbd_sad32x64_avg_bits10, vpx_highbd_10_variance32x64,
vpx_highbd_10_sub_pixel_variance32x64,
- vpx_highbd_10_sub_pixel_avg_variance32x64, NULL, NULL,
+ vpx_highbd_10_sub_pixel_avg_variance32x64,
vpx_highbd_sad32x64x4d_bits10)
HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits10,
vpx_highbd_sad32x32_avg_bits10, vpx_highbd_10_variance32x32,
vpx_highbd_10_sub_pixel_variance32x32,
vpx_highbd_10_sub_pixel_avg_variance32x32,
- vpx_highbd_sad32x32x3_bits10, vpx_highbd_sad32x32x8_bits10,
vpx_highbd_sad32x32x4d_bits10)
HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits10,
vpx_highbd_sad64x64_avg_bits10, vpx_highbd_10_variance64x64,
vpx_highbd_10_sub_pixel_variance64x64,
vpx_highbd_10_sub_pixel_avg_variance64x64,
- vpx_highbd_sad64x64x3_bits10, vpx_highbd_sad64x64x8_bits10,
vpx_highbd_sad64x64x4d_bits10)
HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits10,
vpx_highbd_sad16x16_avg_bits10, vpx_highbd_10_variance16x16,
vpx_highbd_10_sub_pixel_variance16x16,
vpx_highbd_10_sub_pixel_avg_variance16x16,
- vpx_highbd_sad16x16x3_bits10, vpx_highbd_sad16x16x8_bits10,
vpx_highbd_sad16x16x4d_bits10)
HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits10,
vpx_highbd_sad16x8_avg_bits10, vpx_highbd_10_variance16x8,
vpx_highbd_10_sub_pixel_variance16x8,
vpx_highbd_10_sub_pixel_avg_variance16x8,
- vpx_highbd_sad16x8x3_bits10, vpx_highbd_sad16x8x8_bits10,
vpx_highbd_sad16x8x4d_bits10)
HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits10,
vpx_highbd_sad8x16_avg_bits10, vpx_highbd_10_variance8x16,
vpx_highbd_10_sub_pixel_variance8x16,
vpx_highbd_10_sub_pixel_avg_variance8x16,
- vpx_highbd_sad8x16x3_bits10, vpx_highbd_sad8x16x8_bits10,
vpx_highbd_sad8x16x4d_bits10)
- HIGHBD_BFP(
- BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad8x8_avg_bits10,
- vpx_highbd_10_variance8x8, vpx_highbd_10_sub_pixel_variance8x8,
- vpx_highbd_10_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits10,
- vpx_highbd_sad8x8x8_bits10, vpx_highbd_sad8x8x4d_bits10)
+ HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits10,
+ vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8,
+ vpx_highbd_10_sub_pixel_variance8x8,
+ vpx_highbd_10_sub_pixel_avg_variance8x8,
+ vpx_highbd_sad8x8x4d_bits10)
HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits10,
vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4,
vpx_highbd_10_sub_pixel_variance8x4,
- vpx_highbd_10_sub_pixel_avg_variance8x4, NULL,
- vpx_highbd_sad8x4x8_bits10, vpx_highbd_sad8x4x4d_bits10)
+ vpx_highbd_10_sub_pixel_avg_variance8x4,
+ vpx_highbd_sad8x4x4d_bits10)
HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits10,
vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8,
vpx_highbd_10_sub_pixel_variance4x8,
- vpx_highbd_10_sub_pixel_avg_variance4x8, NULL,
- vpx_highbd_sad4x8x8_bits10, vpx_highbd_sad4x8x4d_bits10)
-
- HIGHBD_BFP(
- BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad4x4_avg_bits10,
- vpx_highbd_10_variance4x4, vpx_highbd_10_sub_pixel_variance4x4,
- vpx_highbd_10_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits10,
- vpx_highbd_sad4x4x8_bits10, vpx_highbd_sad4x4x4d_bits10)
+ vpx_highbd_10_sub_pixel_avg_variance4x8,
+ vpx_highbd_sad4x8x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits10,
+ vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4,
+ vpx_highbd_10_sub_pixel_variance4x4,
+ vpx_highbd_10_sub_pixel_avg_variance4x4,
+ vpx_highbd_sad4x4x4d_bits10)
break;
case VPX_BITS_12:
HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12,
vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16,
vpx_highbd_12_sub_pixel_variance32x16,
- vpx_highbd_12_sub_pixel_avg_variance32x16, NULL, NULL,
+ vpx_highbd_12_sub_pixel_avg_variance32x16,
vpx_highbd_sad32x16x4d_bits12)
HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits12,
vpx_highbd_sad16x32_avg_bits12, vpx_highbd_12_variance16x32,
vpx_highbd_12_sub_pixel_variance16x32,
- vpx_highbd_12_sub_pixel_avg_variance16x32, NULL, NULL,
+ vpx_highbd_12_sub_pixel_avg_variance16x32,
vpx_highbd_sad16x32x4d_bits12)
HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits12,
vpx_highbd_sad64x32_avg_bits12, vpx_highbd_12_variance64x32,
vpx_highbd_12_sub_pixel_variance64x32,
- vpx_highbd_12_sub_pixel_avg_variance64x32, NULL, NULL,
+ vpx_highbd_12_sub_pixel_avg_variance64x32,
vpx_highbd_sad64x32x4d_bits12)
HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits12,
vpx_highbd_sad32x64_avg_bits12, vpx_highbd_12_variance32x64,
vpx_highbd_12_sub_pixel_variance32x64,
- vpx_highbd_12_sub_pixel_avg_variance32x64, NULL, NULL,
+ vpx_highbd_12_sub_pixel_avg_variance32x64,
vpx_highbd_sad32x64x4d_bits12)
HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits12,
vpx_highbd_sad32x32_avg_bits12, vpx_highbd_12_variance32x32,
vpx_highbd_12_sub_pixel_variance32x32,
vpx_highbd_12_sub_pixel_avg_variance32x32,
- vpx_highbd_sad32x32x3_bits12, vpx_highbd_sad32x32x8_bits12,
vpx_highbd_sad32x32x4d_bits12)
HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits12,
vpx_highbd_sad64x64_avg_bits12, vpx_highbd_12_variance64x64,
vpx_highbd_12_sub_pixel_variance64x64,
vpx_highbd_12_sub_pixel_avg_variance64x64,
- vpx_highbd_sad64x64x3_bits12, vpx_highbd_sad64x64x8_bits12,
vpx_highbd_sad64x64x4d_bits12)
HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits12,
vpx_highbd_sad16x16_avg_bits12, vpx_highbd_12_variance16x16,
vpx_highbd_12_sub_pixel_variance16x16,
vpx_highbd_12_sub_pixel_avg_variance16x16,
- vpx_highbd_sad16x16x3_bits12, vpx_highbd_sad16x16x8_bits12,
vpx_highbd_sad16x16x4d_bits12)
HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits12,
vpx_highbd_sad16x8_avg_bits12, vpx_highbd_12_variance16x8,
vpx_highbd_12_sub_pixel_variance16x8,
vpx_highbd_12_sub_pixel_avg_variance16x8,
- vpx_highbd_sad16x8x3_bits12, vpx_highbd_sad16x8x8_bits12,
vpx_highbd_sad16x8x4d_bits12)
HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits12,
vpx_highbd_sad8x16_avg_bits12, vpx_highbd_12_variance8x16,
vpx_highbd_12_sub_pixel_variance8x16,
vpx_highbd_12_sub_pixel_avg_variance8x16,
- vpx_highbd_sad8x16x3_bits12, vpx_highbd_sad8x16x8_bits12,
vpx_highbd_sad8x16x4d_bits12)
- HIGHBD_BFP(
- BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad8x8_avg_bits12,
- vpx_highbd_12_variance8x8, vpx_highbd_12_sub_pixel_variance8x8,
- vpx_highbd_12_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits12,
- vpx_highbd_sad8x8x8_bits12, vpx_highbd_sad8x8x4d_bits12)
+ HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits12,
+ vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8,
+ vpx_highbd_12_sub_pixel_variance8x8,
+ vpx_highbd_12_sub_pixel_avg_variance8x8,
+ vpx_highbd_sad8x8x4d_bits12)
HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits12,
vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4,
vpx_highbd_12_sub_pixel_variance8x4,
- vpx_highbd_12_sub_pixel_avg_variance8x4, NULL,
- vpx_highbd_sad8x4x8_bits12, vpx_highbd_sad8x4x4d_bits12)
+ vpx_highbd_12_sub_pixel_avg_variance8x4,
+ vpx_highbd_sad8x4x4d_bits12)
HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits12,
vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8,
vpx_highbd_12_sub_pixel_variance4x8,
- vpx_highbd_12_sub_pixel_avg_variance4x8, NULL,
- vpx_highbd_sad4x8x8_bits12, vpx_highbd_sad4x8x4d_bits12)
-
- HIGHBD_BFP(
- BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad4x4_avg_bits12,
- vpx_highbd_12_variance4x4, vpx_highbd_12_sub_pixel_variance4x4,
- vpx_highbd_12_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits12,
- vpx_highbd_sad4x4x8_bits12, vpx_highbd_sad4x4x4d_bits12)
+ vpx_highbd_12_sub_pixel_avg_variance4x8,
+ vpx_highbd_sad4x8x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits12,
+ vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4,
+ vpx_highbd_12_sub_pixel_variance4x4,
+ vpx_highbd_12_sub_pixel_avg_variance4x4,
+ vpx_highbd_sad4x4x4d_bits12)
break;
default:
@@ -1902,6 +1853,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv));
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
vp9_cyclic_refresh_reset_resize(cpi);
+ rc->rc_1_frame = 0;
+ rc->rc_2_frame = 0;
}
if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
@@ -1912,6 +1865,24 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
(int)cpi->oxcf.target_bandwidth);
}
+ // Reset the rc flags (rc_1_frame, rc_2_frame) if the configuration
+ // change causes a large change in avg_frame_bandwidth. For SVC, the
+ // check uses the spatial layer's average bandwidth.
+ // Also reset buffer level to optimal level.
+ if (cm->current_video_frame > 0) {
+ if (cpi->use_svc) {
+ vp9_svc_check_reset_layer_rc_flag(cpi);
+ } else {
+ if (rc->avg_frame_bandwidth > (3 * rc->last_avg_frame_bandwidth >> 1) ||
+ rc->avg_frame_bandwidth < (rc->last_avg_frame_bandwidth >> 1)) {
+ rc->rc_1_frame = 0;
+ rc->rc_2_frame = 0;
+ rc->bits_off_target = rc->optimal_buffer_level;
+ rc->buffer_level = rc->optimal_buffer_level;
+ }
+ }
+ }
+
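The reset fires when the new average frame bandwidth leaves roughly the [0.5x, 1.5x] band around the previous value; 3 * x >> 1 is the integer form of 1.5x. A numeric check with invented bandwidths:

#include <assert.h>

int main(void) {
  const int last = 20000; /* hypothetical last_avg_frame_bandwidth */
  const int now = 32000;  /* hypothetical new avg_frame_bandwidth */
  assert((3 * last >> 1) == 30000); /* upper bound: 1.5x */
  assert((last >> 1) == 10000);     /* lower bound: 0.5x */
  /* 32000 > 30000, so the rc flags and buffer level would be reset. */
  assert(now > (3 * last >> 1) || now < (last >> 1));
  return 0;
}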
cpi->alt_ref_source = NULL;
rc->is_src_frame_alt_ref = 0;
@@ -2046,6 +2017,9 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
realloc_segmentation_maps(cpi);
+ CHECK_MEM_ERROR(cm, cpi->skin_map, vpx_calloc(cm->mi_rows * cm->mi_cols,
+ sizeof(cpi->skin_map[0])));
+
CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create());
CHECK_MEM_ERROR(
@@ -2162,7 +2136,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
#endif
#endif
#ifdef OUTPUT_YUV_SKINMAP
- yuv_skinmap_file = fopen("skinmap.yuv", "ab");
+ yuv_skinmap_file = fopen("skinmap.yuv", "wb");
#endif
#ifdef OUTPUT_YUV_REC
yuv_rec_file = fopen("rec.yuv", "wb");
@@ -2175,6 +2149,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
+#if !CONFIG_REALTIME_ONLY
if (oxcf->pass == 1) {
vp9_init_first_pass(cpi);
} else if (oxcf->pass == 2) {
@@ -2239,6 +2214,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
vp9_init_second_pass(cpi);
}
}
+#endif // !CONFIG_REALTIME_ONLY
vp9_set_speed_features_framesize_independent(cpi);
vp9_set_speed_features_framesize_dependent(cpi);
@@ -2248,67 +2224,61 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
cpi->source_var_thresh = 0;
cpi->frames_till_next_var_check = 0;
-#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \
- cpi->fn_ptr[BT].sdf = SDF; \
- cpi->fn_ptr[BT].sdaf = SDAF; \
- cpi->fn_ptr[BT].vf = VF; \
- cpi->fn_ptr[BT].svf = SVF; \
- cpi->fn_ptr[BT].svaf = SVAF; \
- cpi->fn_ptr[BT].sdx3f = SDX3F; \
- cpi->fn_ptr[BT].sdx8f = SDX8F; \
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \
+ cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
+ cpi->fn_ptr[BT].vf = VF; \
+ cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
cpi->fn_ptr[BT].sdx4df = SDX4DF;
BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16,
- vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16, NULL, NULL,
+ vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16,
vpx_sad32x16x4d)
BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vpx_variance16x32,
- vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32, NULL, NULL,
+ vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32,
vpx_sad16x32x4d)
BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32,
- vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32, NULL, NULL,
+ vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32,
vpx_sad64x32x4d)
BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64,
- vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64, NULL, NULL,
+ vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64,
vpx_sad32x64x4d)
BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32,
vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32,
- vpx_sad32x32x3, vpx_sad32x32x8, vpx_sad32x32x4d)
+ vpx_sad32x32x4d)
BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64,
vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64,
- vpx_sad64x64x3, vpx_sad64x64x8, vpx_sad64x64x4d)
+ vpx_sad64x64x4d)
BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vpx_variance16x16,
vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16,
- vpx_sad16x16x3, vpx_sad16x16x8, vpx_sad16x16x4d)
+ vpx_sad16x16x4d)
BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8,
- vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8, vpx_sad16x8x3,
- vpx_sad16x8x8, vpx_sad16x8x4d)
+ vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8,
+ vpx_sad16x8x4d)
BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16,
- vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16, vpx_sad8x16x3,
- vpx_sad8x16x8, vpx_sad8x16x4d)
+ vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16,
+ vpx_sad8x16x4d)
BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8,
- vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x3,
- vpx_sad8x8x8, vpx_sad8x8x4d)
+ vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d)
BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4,
- vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, NULL,
- vpx_sad8x4x8, vpx_sad8x4x4d)
+ vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d)
BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8,
- vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, NULL,
- vpx_sad4x8x8, vpx_sad4x8x4d)
+ vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d)
BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4,
- vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x3,
- vpx_sad4x4x8, vpx_sad4x4x4d)
+ vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d)
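With the x3/x8 multi-SAD slots removed, motion search in this table leans entirely on the four-candidate sdx4df entry. A minimal sketch of the remaining call shape (wrapper name hypothetical; signature per vpx_dsp's multi-SAD convention, libvpx headers assumed in scope):

static void sad_4d_sketch(const vp9_variance_fn_ptr_t *fn, const uint8_t *src,
                          int src_stride, const uint8_t *const refs[4],
                          int ref_stride, uint32_t sads[4]) {
  // One call scores four candidate reference blocks at once, which is what
  // made the separate x3/x8 variants redundant.
  fn->sdx4df(src, src_stride, refs, ref_stride, sads);
}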
#if CONFIG_VP9_HIGHBITDEPTH
highbd_set_var_fns(cpi);
@@ -2375,16 +2345,20 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
snprintf(headings, sizeof(headings),
"Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
"VPXSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
- "WstPsnr\tWstSsim\tWstFast\tWstHVS");
+ "WstPsnr\tWstSsim\tWstFast\tWstHVS\t"
+ "AVPsnrY\tAPsnrCb\tAPsnrCr");
snprintf(results, sizeof(results),
"%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
"%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
- "%7.3f\t%7.3f\t%7.3f\t%7.3f",
+ "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f",
dr, cpi->psnr.stat[ALL] / cpi->count, total_psnr,
cpi->psnrp.stat[ALL] / cpi->count, totalp_psnr, total_ssim,
totalp_ssim, cpi->fastssim.stat[ALL] / cpi->count,
cpi->psnrhvs.stat[ALL] / cpi->count, cpi->psnr.worst,
- cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst);
+ cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst,
+ cpi->psnr.stat[Y] / cpi->count, cpi->psnr.stat[U] / cpi->count,
+ cpi->psnr.stat[V] / cpi->count);
if (cpi->b_calculate_blockiness) {
SNPRINT(headings, "\t Block\tWstBlck");
@@ -2557,7 +2531,7 @@ int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
YV12_BUFFER_CONFIG *sd) {
YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag);
if (cfg) {
- vp8_yv12_copy_frame(cfg, sd);
+ vpx_yv12_copy_frame(cfg, sd);
return 0;
} else {
return -1;
@@ -2568,7 +2542,7 @@ int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
YV12_BUFFER_CONFIG *sd) {
YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag);
if (cfg) {
- vp8_yv12_copy_frame(sd, cfg);
+ vpx_yv12_copy_frame(sd, cfg);
return 0;
} else {
return -1;
@@ -2581,38 +2555,6 @@ int vp9_update_entropy(VP9_COMP *cpi, int update) {
return 0;
}
-#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP)
-// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it
-// as YUV 420. We simply use the top-left pixels of the UV buffers, since we do
-// not denoise the UV channels at this time. If ever we implement UV channel
-// denoising we will have to modify this.
-void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
- uint8_t *src = s->y_buffer;
- int h = s->y_height;
-
- do {
- fwrite(src, s->y_width, 1, f);
- src += s->y_stride;
- } while (--h);
-
- src = s->u_buffer;
- h = s->uv_height;
-
- do {
- fwrite(src, s->uv_width, 1, f);
- src += s->uv_stride;
- } while (--h);
-
- src = s->v_buffer;
- h = s->uv_height;
-
- do {
- fwrite(src, s->uv_width, 1, f);
- src += s->uv_stride;
- } while (--h);
-}
-#endif
-
#ifdef OUTPUT_YUV_REC
void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
YV12_BUFFER_CONFIG *s = cm->frame_to_show;
@@ -2748,15 +2690,14 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
vpx_highbd_convolve8(CONVERT_TO_SHORTPTR(src_ptr), src_stride,
- CONVERT_TO_SHORTPTR(dst_ptr), dst_stride,
- kernel[x_q4 & 0xf], 16 * src_w / dst_w,
- kernel[y_q4 & 0xf], 16 * src_h / dst_h,
- 16 / factor, 16 / factor, bd);
+ CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, kernel,
+ x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
+ 16 * src_h / dst_h, 16 / factor, 16 / factor,
+ bd);
} else {
- vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
- kernel[x_q4 & 0xf], 16 * src_w / dst_w,
- kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor,
- 16 / factor);
+ vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
+ x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
+ 16 * src_h / dst_h, 16 / factor, 16 / factor);
}
}
}
@@ -2782,11 +2723,33 @@ static int scale_down(VP9_COMP *cpi, int q) {
return scale;
}
-static int big_rate_miss(VP9_COMP *cpi, int high_limit, int low_limit) {
+static int big_rate_miss_high_threshold(VP9_COMP *cpi) {
const RATE_CONTROL *const rc = &cpi->rc;
+ int big_miss_high;
- return (rc->projected_frame_size > ((high_limit * 3) / 2)) ||
- (rc->projected_frame_size < (low_limit / 2));
+ if (frame_is_kf_gf_arf(cpi))
+ big_miss_high = rc->this_frame_target * 3 / 2;
+ else
+ big_miss_high = rc->this_frame_target * 2;
+
+ return big_miss_high;
+}
+
+static int big_rate_miss(VP9_COMP *cpi) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int big_miss_high;
+ int big_miss_low;
+
+ // Ignore for overlay frames
+ if (rc->is_src_frame_alt_ref) {
+ return 0;
+ } else {
+ big_miss_low = (rc->this_frame_target / 2);
+ big_miss_high = big_rate_miss_high_threshold(cpi);
+
+ return (rc->projected_frame_size > big_miss_high) ||
+ (rc->projected_frame_size < big_miss_low);
+ }
}
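Worked numbers for the new thresholds (illustrative, assuming rc->this_frame_target is 100,000 bits): a kf/gf/arf frame flags a big miss above 150,000 or below 50,000 bits; any other frame above 200,000 or below 50,000. A condensed restatement (helper name hypothetical, overlay case omitted):

static int is_big_rate_miss_sketch(int projected, int target,
                                   int is_kf_gf_arf) {
  // Same arithmetic as big_rate_miss() above.
  const int high = is_kf_gf_arf ? target * 3 / 2 : target * 2;
  const int low = target / 2;
  return projected > high || projected < low;
}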
// test in two pass for the first
@@ -2811,8 +2774,7 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q,
int force_recode = 0;
if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
- big_rate_miss(cpi, high_limit, low_limit) ||
- (cpi->sf.recode_loop == ALLOW_RECODE) ||
+ big_rate_miss(cpi) || (cpi->sf.recode_loop == ALLOW_RECODE) ||
(two_pass_first_group_inter(cpi) &&
(cpi->sf.recode_loop == ALLOW_RECODE_FIRST)) ||
(frame_is_kfgfarf && (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF))) {
@@ -2822,8 +2784,13 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q,
cpi->resize_pending = 1;
return 1;
}
- // Force recode if projected_frame_size > max_frame_bandwidth
- if (rc->projected_frame_size >= rc->max_frame_bandwidth) return 1;
+
+ // Force recode for extreme overshoot.
+ if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+ (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF &&
+ rc->projected_frame_size >= big_rate_miss_high_threshold(cpi))) {
+ return 1;
+ }
// TODO(agrange) high_limit could be greater than the scale-down threshold.
if ((rc->projected_frame_size > high_limit && q < maxq) ||
@@ -2914,17 +2881,38 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
cpi->denoiser.denoising_level > kDenLowLow) {
int svc_base_is_key = 0;
+ int denoise_svc_second_layer = 0;
if (cpi->use_svc) {
+ int realloc_fail = 0;
+ const int svc_buf_shift =
+ cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2
+ ? cpi->denoiser.num_ref_frames
+ : 0;
int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id,
cpi->svc.temporal_layer_id,
cpi->svc.number_temporal_layers);
LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
svc_base_is_key = lc->is_key_frame;
+ denoise_svc_second_layer =
+ cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 ? 1
+ : 0;
+      // Check if we need to allocate extra buffers in the denoiser
+      // for refreshed frames.

+ realloc_fail = vp9_denoiser_realloc_svc(
+ cm, &cpi->denoiser, svc_buf_shift, cpi->refresh_alt_ref_frame,
+ cpi->refresh_golden_frame, cpi->refresh_last_frame, cpi->alt_fb_idx,
+ cpi->gld_fb_idx, cpi->lst_fb_idx);
+ if (realloc_fail)
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to re-allocate denoiser for SVC");
}
vp9_denoiser_update_frame_info(
&cpi->denoiser, *cpi->Source, cpi->common.frame_type,
cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame,
- cpi->refresh_last_frame, cpi->resize_pending, svc_base_is_key);
+ cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx,
+ cpi->lst_fb_idx, cpi->resize_pending, svc_base_is_key,
+ denoise_svc_second_layer);
}
#endif
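The shift rule above selects a second bank of denoiser buffers for exactly one layer. A worked case, assuming a 3-spatial-layer encode: layer id 1 satisfies 3 - 1 == 2 and shifts by num_ref_frames; the top layer (id 2) gets shift 0. A sketch (helper name hypothetical):

static int svc_denoiser_buf_shift_sketch(int num_spatial_layers,
                                         int spatial_layer_id,
                                         int num_ref_frames) {
  // Mirrors the svc_buf_shift expression in the hunk above.
  return (num_spatial_layers - spatial_layer_id == 2) ? num_ref_frames : 0;
}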
if (is_one_pass_cbr_svc(cpi)) {
@@ -3195,15 +3183,37 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
dc_quant_devisor = 4.0;
#endif
- fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d"
- "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
- "%10"PRId64" %10"PRId64" %10d "
- "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
- "%6d %6d %5d %5d %5d "
- "%10"PRId64" %10.3lf"
- "%10lf %8u %10"PRId64" %10d %10d %10d %10d %10d\n",
+ if (!cm->current_video_frame) {
+ fprintf(f, "frame, width, height, last ts, last end ts, "
+ "source_alt_ref_pending, source_alt_ref_active, "
+ "this_frame_target, projected_frame_size, "
+ "projected_frame_size / MBs, "
+ "projected_frame_size - this_frame_target, "
+ "vbr_bits_off_target, vbr_bits_off_target_fast, "
+ "twopass.extend_minq, twopass.extend_minq_fast, "
+ "total_target_vs_actual, "
+ "starting_buffer_level - bits_off_target, "
+ "total_actual_bits, base_qindex, q for base_qindex, "
+ "dc quant, q for active_worst_quality, avg_q, q for oxcf.cq_level, "
+ "refresh_last_frame, refresh_golden_frame, refresh_alt_ref_frame, "
+ "frame_type, gfu_boost, "
+ "twopass.bits_left, "
+ "twopass.total_left_stats.coded_error, "
+ "twopass.bits_left / (1 + twopass.total_left_stats.coded_error), "
+ "tot_recode_hits, recon_err, kf_boost, "
+ "twopass.kf_zeromotion_pct, twopass.fr_content_type, "
+ "filter_level, seg.aq_av_offset\n");
+ }
+
+ fprintf(f, "%10u, %d, %d, %10"PRId64", %10"PRId64", %d, %d, %10d, %10d, "
+ "%10d, %10d, %10"PRId64", %10"PRId64", %5d, %5d, %10"PRId64", "
+ "%10"PRId64", %10"PRId64", %10d, %7.2lf, %7.2lf, %7.2lf, %7.2lf, "
+ "%7.2lf, %6d, %6d, %5d, %5d, %5d, %10"PRId64", %10.3lf, %10lf, %8u, "
+ "%10"PRId64", %10d, %10d, %10d, %10d, %10d\n",
cpi->common.current_video_frame,
cm->width, cm->height,
+ cpi->last_time_stamp_seen,
+ cpi->last_end_time_stamp_seen,
cpi->rc.source_alt_ref_pending,
cpi->rc.source_alt_ref_active,
cpi->rc.this_frame_target,
@@ -3291,7 +3301,6 @@ static void set_size_independent_vars(VP9_COMP *cpi) {
static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index,
int *top_index) {
VP9_COMMON *const cm = &cpi->common;
- const VP9EncoderConfig *const oxcf = &cpi->oxcf;
// Setup variables that depend on the dimensions of the frame.
vp9_set_speed_features_framesize_dependent(cpi);
@@ -3303,17 +3312,19 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index,
vp9_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH);
}
+#if !CONFIG_REALTIME_ONLY
// Configure experimental use of segmentation for enhanced coding of
// static regions if indicated.
// Only allowed in the second pass of a two pass encode, as it requires
// lagged coding, and if the relevant speed feature flag is set.
- if (oxcf->pass == 2 && cpi->sf.static_segmentation)
+ if (cpi->oxcf.pass == 2 && cpi->sf.static_segmentation)
configure_static_seg_features(cpi);
+#endif // !CONFIG_REALTIME_ONLY
#if CONFIG_VP9_POSTPROC && !(CONFIG_VP9_TEMPORAL_DENOISING)
- if (oxcf->noise_sensitivity > 0) {
+ if (cpi->oxcf.noise_sensitivity > 0) {
int l = 0;
- switch (oxcf->noise_sensitivity) {
+ switch (cpi->oxcf.noise_sensitivity) {
case 1: l = 20; break;
case 2: l = 40; break;
case 3: l = 60; break;
@@ -3336,7 +3347,8 @@ static void setup_denoiser_buffer(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
if (cpi->oxcf.noise_sensitivity > 0 &&
!cpi->denoiser.frame_buffer_initialized) {
- if (vp9_denoiser_alloc(&cpi->denoiser, cm->width, cm->height,
+ if (vp9_denoiser_alloc(cm, &cpi->svc, &cpi->denoiser, cpi->use_svc,
+ cpi->oxcf.noise_sensitivity, cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
@@ -3364,6 +3376,7 @@ static void set_frame_size(VP9_COMP *cpi) {
VP9EncoderConfig *const oxcf = &cpi->oxcf;
MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+#if !CONFIG_REALTIME_ONLY
if (oxcf->pass == 2 && oxcf->rc_mode == VPX_VBR &&
((oxcf->resize_mode == RESIZE_FIXED && cm->current_video_frame == 0) ||
(oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending))) {
@@ -3374,6 +3387,7 @@ static void set_frame_size(VP9_COMP *cpi) {
vp9_set_size_literal(cpi, oxcf->scaled_frame_width,
oxcf->scaled_frame_height);
}
+#endif // !CONFIG_REALTIME_ONLY
if (oxcf->pass == 0 && oxcf->rc_mode == VPX_CBR && !cpi->use_svc &&
oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending != 0) {
@@ -3466,8 +3480,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
  // Flag to check if it's valid to compute the source sad (used for
// scene detection and for superblock content state in CBR mode).
// The flag may get reset below based on SVC or resizing state.
- cpi->compute_source_sad_onepass =
- cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5 && cm->show_frame;
+ cpi->compute_source_sad_onepass = cpi->oxcf.mode == REALTIME;
vpx_clear_system_state();
@@ -3518,6 +3531,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
if ((cpi->use_svc &&
(cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1 ||
+ cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1 ||
cpi->svc.current_superframe < 1)) ||
cpi->resize_pending || cpi->resize_state || cpi->external_resize ||
cpi->resize_state != ORIG) {
@@ -3555,12 +3569,14 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
vp9_update_noise_estimate(cpi);
- // Scene detection is used for VBR mode or screen-content case.
- // Make sure compute_source_sad_onepass is set (which handles SVC case
- // and dynamic resize).
- if (cpi->compute_source_sad_onepass &&
+ // Scene detection is always used for VBR mode or screen-content case.
+ // For other cases (e.g., CBR mode) use it for 5 <= speed < 8 for now
+ // (need to check encoding time cost for doing this for speed 8).
+ cpi->rc.high_source_sad = 0;
+ if (cpi->compute_source_sad_onepass && cm->show_frame &&
(cpi->oxcf.rc_mode == VPX_VBR ||
- cpi->oxcf.content == VP9E_CONTENT_SCREEN))
+ cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
+ (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8 && !cpi->use_svc)))
vp9_scene_detection_onepass(cpi);
// For 1 pass CBR SVC, only ZEROMV is allowed for spatial reference frame
@@ -3576,6 +3592,16 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
if (cpi->sf.copy_partition_flag) alloc_copy_partition_data(cpi);
+ if (cpi->sf.svc_use_lowres_part &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) {
+ if (cpi->svc.prev_partition_svc == NULL) {
+ CHECK_MEM_ERROR(
+ cm, cpi->svc.prev_partition_svc,
+ (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows,
+ sizeof(*cpi->svc.prev_partition_svc)));
+ }
+ }
+
if (cpi->oxcf.speed >= 5 && cpi->oxcf.pass == 0 &&
cpi->oxcf.rc_mode == VPX_CBR &&
cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
@@ -3660,6 +3686,7 @@ static int get_qstep_adj(int rate_excess, int rate_limit) {
static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
uint8_t *dest) {
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
int bottom_index, top_index;
@@ -3696,9 +3723,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
qrange_adj = VPXMAX(1, (top_index - bottom_index) / 2);
bottom_index =
- VPXMAX(bottom_index - qrange_adj / 2, cpi->oxcf.best_allowed_q);
- top_index =
- VPXMIN(cpi->oxcf.worst_allowed_q, top_index + qrange_adj / 2);
+ VPXMAX(bottom_index - qrange_adj / 2, oxcf->best_allowed_q);
+ top_index = VPXMIN(oxcf->worst_allowed_q, top_index + qrange_adj / 2);
}
#endif
// TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
@@ -3726,7 +3752,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
cpi->Source =
vp9_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source,
- (cpi->oxcf.pass == 0), EIGHTTAP, 0);
+ (oxcf->pass == 0), EIGHTTAP, 0);
// Unfiltered raw source used in metrics calculation if the source
// has been filtered.
@@ -3735,7 +3761,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
if (is_spatial_denoise_enabled(cpi)) {
cpi->raw_source_frame = vp9_scale_if_required(
cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source,
- (cpi->oxcf.pass == 0), EIGHTTAP, 0);
+ (oxcf->pass == 0), EIGHTTAP, 0);
} else {
cpi->raw_source_frame = cpi->Source;
}
@@ -3745,9 +3771,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
}
if (cpi->unscaled_last_source != NULL)
- cpi->Last_Source = vp9_scale_if_required(
- cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
- (cpi->oxcf.pass == 0), EIGHTTAP, 0);
+ cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
+ &cpi->scaled_last_source,
+ (oxcf->pass == 0), EIGHTTAP, 0);
if (frame_is_intra_only(cm) == 0) {
if (loop_count > 0) {
@@ -3762,13 +3788,13 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
// Variance adaptive and in frame q adjustment experiments are mutually
// exclusive.
- if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+ if (oxcf->aq_mode == VARIANCE_AQ) {
vp9_vaq_frame_setup(cpi);
- } else if (cpi->oxcf.aq_mode == EQUATOR360_AQ) {
+ } else if (oxcf->aq_mode == EQUATOR360_AQ) {
vp9_360aq_frame_setup(cpi);
- } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+ } else if (oxcf->aq_mode == COMPLEXITY_AQ) {
vp9_setup_in_frame_q_adj(cpi);
- } else if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) {
+ } else if (oxcf->aq_mode == LOOKAHEAD_AQ) {
vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi);
}
@@ -3792,7 +3818,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
}
- if (cpi->oxcf.rc_mode == VPX_Q) {
+ if (oxcf->rc_mode == VPX_Q) {
loop = 0;
} else {
if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced &&
@@ -3872,11 +3898,16 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
// Frame is too large
if (rc->projected_frame_size > rc->this_frame_target) {
// Special case if the projected size is > the max allowed.
- if (rc->projected_frame_size >= rc->max_frame_bandwidth) {
+ if ((q == q_high) &&
+ ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+ (rc->projected_frame_size >=
+ big_rate_miss_high_threshold(cpi)))) {
+ int max_rate = VPXMAX(1, VPXMIN(rc->max_frame_bandwidth,
+ big_rate_miss_high_threshold(cpi)));
double q_val_high;
q_val_high = vp9_convert_qindex_to_q(q_high, cm->bit_depth);
- q_val_high = q_val_high * ((double)rc->projected_frame_size /
- rc->max_frame_bandwidth);
+ q_val_high =
+ q_val_high * ((double)rc->projected_frame_size / max_rate);
q_high = vp9_convert_q_to_qindex(q_val_high, cm->bit_depth);
q_high = clamp(q_high, rc->best_quality, rc->worst_quality);
}
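A worked case for the rescale above (numbers illustrative): with q_val_high = 20.0, projected_frame_size = 300,000 and max_rate = 200,000, the scaled value is 20.0 * 300000 / 200000 = 30.0 before conversion back to a q index and clamping. A sketch of the arithmetic:

static double rescale_q_for_overshoot_sketch(double q_val_high, int projected,
                                             int max_rate) {
  // Grow q in proportion to how far the projected size overshot the cap.
  return q_val_high * ((double)projected / max_rate);
}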
@@ -3885,7 +3916,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
qstep =
get_qstep_adj(rc->projected_frame_size, rc->this_frame_target);
q_low = VPXMIN(q + qstep, q_high);
- // q_low = q < q_high ? q + 1 : q_high;
if (undershoot_seen || loop_at_this_size > 1) {
// Update rate_correction_factor unless
@@ -3913,31 +3943,29 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
qstep =
get_qstep_adj(rc->this_frame_target, rc->projected_frame_size);
q_high = VPXMAX(q - qstep, q_low);
- // q_high = q > q_low ? q - 1 : q_low;
if (overshoot_seen || loop_at_this_size > 1) {
vp9_rc_update_rate_correction_factors(cpi);
q = (q_high + q_low) / 2;
} else {
vp9_rc_update_rate_correction_factors(cpi);
- q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
- top_index);
+ q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
+ VPXMIN(q_low, bottom_index), top_index);
// Special case reset for qlow for constrained quality.
// This should only trigger where there is very substantial
// undershoot on a frame and the auto cq level is above
          // the user passed in value.
- if (cpi->oxcf.rc_mode == VPX_CQ && q < q_low) {
+ if (oxcf->rc_mode == VPX_CQ && q < q_low) {
q_low = q;
}
while (q > q_high && retries < 10) {
vp9_rc_update_rate_correction_factors(cpi);
- q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
- top_index);
+ q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
+ VPXMIN(q_low, bottom_index), top_index);
retries++;
}
}
-
undershoot_seen = 1;
}
@@ -3971,9 +3999,21 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
#ifdef AGGRESSIVE_VBR
if (two_pass_first_group_inter(cpi)) {
cpi->twopass.active_worst_quality =
- VPXMIN(q + qrange_adj, cpi->oxcf.worst_allowed_q);
- }
+ VPXMIN(q + qrange_adj, oxcf->worst_allowed_q);
+ } else if (!frame_is_kf_gf_arf(cpi)) {
+#else
+ if (!frame_is_kf_gf_arf(cpi)) {
#endif
+ // Have we been forced to adapt Q outside the expected range by an extreme
+    // rate miss? If so, adjust the active maxQ for the subsequent frames.
+ if (q > cpi->twopass.active_worst_quality) {
+ cpi->twopass.active_worst_quality = q;
+ } else if (oxcf->vbr_corpus_complexity && q == q_low &&
+ rc->projected_frame_size < rc->this_frame_target) {
+ cpi->twopass.active_worst_quality =
+ VPXMAX(q, cpi->twopass.active_worst_quality - 1);
+ }
+ }
if (enable_acl) {
// Skip recoding, if model diff is below threshold
@@ -4448,14 +4488,14 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size,
#if CONFIG_VP9_TEMPORAL_DENOISING
#ifdef OUTPUT_YUV_DENOISED
if (oxcf->noise_sensitivity > 0 && denoise_svc(cpi)) {
- vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME],
- yuv_denoised_file);
+ vpx_write_yuv_frame(yuv_denoised_file,
+ &cpi->denoiser.running_avg_y[INTRA_FRAME]);
}
#endif
#endif
#ifdef OUTPUT_YUV_SKINMAP
if (cpi->common.current_video_frame > 1) {
- vp9_compute_skin_map(cpi, yuv_skinmap_file);
+ vp9_output_skin_map(cpi, yuv_skinmap_file);
}
#endif
@@ -4592,6 +4632,7 @@ static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
encode_frame_to_data_rate(cpi, size, dest, frame_flags);
}
+#if !CONFIG_REALTIME_ONLY
static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
unsigned int *frame_flags) {
cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
@@ -4600,6 +4641,7 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING))
vp9_twopass_postencode_update(cpi);
}
+#endif // !CONFIG_REALTIME_ONLY
static void init_ref_frame_bufs(VP9_COMMON *cm) {
int i;
@@ -4822,6 +4864,7 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
int i, idx;
uint64_t luma_samples, dur_end;
const uint32_t luma_pic_size = cm->width * cm->height;
+ const uint32_t luma_pic_breadth = VPXMAX(cm->width, cm->height);
LevelConstraint *const level_constraint = &cpi->level_constraint;
const int8_t level_index = level_constraint->level_index;
double cpb_data_size;
@@ -4925,6 +4968,11 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
level_spec->max_luma_picture_size = luma_pic_size;
}
+ // update max_luma_picture_breadth
+ if (luma_pic_breadth > level_spec->max_luma_picture_breadth) {
+ level_spec->max_luma_picture_breadth = luma_pic_breadth;
+ }
+
// update compression_ratio
level_spec->compression_ratio = (double)level_stats->total_uncompressed_size *
cm->bit_depth /
@@ -4945,6 +4993,15 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
level_fail_messages[LUMA_PIC_SIZE_TOO_LARGE]);
}
+ if (level_spec->max_luma_picture_breadth >
+ vp9_level_defs[level_index].max_luma_picture_breadth) {
+ level_constraint->fail_flag |= (1 << LUMA_PIC_BREADTH_TOO_LARGE);
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Failed to encode to the target level %d. %s",
+ vp9_level_defs[level_index].level,
+ level_fail_messages[LUMA_PIC_BREADTH_TOO_LARGE]);
+ }
+
if ((double)level_spec->max_luma_sample_rate >
(double)vp9_level_defs[level_index].max_luma_sample_rate *
(1 + SAMPLE_RATE_GRACE_P)) {
@@ -5094,7 +5151,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
}
cpi->svc.layer_context[cpi->svc.spatial_layer_id].has_alt_frame = 1;
#endif
-
+#if !CONFIG_REALTIME_ONLY
if ((oxcf->mode != REALTIME) && (oxcf->arnr_max_frames > 0) &&
(oxcf->arnr_strength > 0)) {
int bitrate = cpi->rc.avg_frame_bandwidth / 40;
@@ -5114,7 +5171,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
force_src_buffer = &cpi->alt_ref_buffer;
}
-
+#endif
cm->show_frame = 0;
cm->intra_only = 0;
cpi->refresh_alt_ref_frame = 1;
@@ -5145,8 +5202,6 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
cm->intra_only = 0;
    // if the flags indicate an intra frame, but the current picture is for a
    // non-zero spatial layer, it should not be an intra picture.
- // TODO(Won Kap): this needs to change if per-layer intra frame is
- // allowed.
if ((source->flags & VPX_EFLAG_FORCE_KF) &&
cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode) {
source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF);
@@ -5175,10 +5230,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
} else {
*size = 0;
+#if !CONFIG_REALTIME_ONLY
if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
vp9_end_first_pass(cpi); /* get last stats packet */
cpi->twopass.first_pass_done = 1;
}
+#endif // !CONFIG_REALTIME_ONLY
return -1;
}
@@ -5225,6 +5282,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
cpi->frame_flags = *frame_flags;
+#if !CONFIG_REALTIME_ONLY
if ((oxcf->pass == 2) &&
(!cpi->use_svc || (is_two_pass_svc(cpi) &&
cpi->svc.encode_empty_frame_state != ENCODING))) {
@@ -5232,6 +5290,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
} else if (oxcf->pass == 1) {
set_frame_size(cpi);
}
+#endif // !CONFIG_REALTIME_ONLY
if (oxcf->pass != 1 && cpi->level_constraint.level_index >= 0 &&
cpi->level_constraint.fail_flag == 0)
@@ -5242,20 +5301,28 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
}
cpi->td.mb.fp_src_pred = 0;
+#if CONFIG_REALTIME_ONLY
+ if (cpi->use_svc) {
+ SvcEncode(cpi, size, dest, frame_flags);
+ } else {
+ // One pass encode
+ Pass0Encode(cpi, size, dest, frame_flags);
+ }
+#else // !CONFIG_REALTIME_ONLY
if (oxcf->pass == 1 && (!cpi->use_svc || is_two_pass_svc(cpi))) {
const int lossless = is_lossless_requested(oxcf);
#if CONFIG_VP9_HIGHBITDEPTH
if (cpi->oxcf.use_highbitdepth)
- cpi->td.mb.fwd_txm4x4 =
+ cpi->td.mb.fwd_txfm4x4 =
lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4;
else
- cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4;
- cpi->td.mb.highbd_itxm_add =
+ cpi->td.mb.fwd_txfm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4;
+ cpi->td.mb.highbd_inv_txfm_add =
lossless ? vp9_highbd_iwht4x4_add : vp9_highbd_idct4x4_add;
#else
- cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4;
+ cpi->td.mb.fwd_txfm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4;
#endif // CONFIG_VP9_HIGHBITDEPTH
- cpi->td.mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
+ cpi->td.mb.inv_txfm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
vp9_first_pass(cpi, source);
} else if (oxcf->pass == 2 && (!cpi->use_svc || is_two_pass_svc(cpi))) {
Pass2Encode(cpi, size, dest, frame_flags);
@@ -5265,6 +5332,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
// One pass encode
Pass0Encode(cpi, size, dest, frame_flags);
}
+#endif // CONFIG_REALTIME_ONLY
if (cm->refresh_frame_context)
cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
@@ -5631,7 +5699,7 @@ void vp9_set_row_mt(VP9_COMP *cpi) {
cpi->row_mt = 1;
}
- if (cpi->row_mt && cpi->oxcf.max_threads > 1)
+ if (cpi->row_mt)
cpi->row_mt_bit_exact = 1;
else
cpi->row_mt_bit_exact = 0;
diff --git a/libvpx/vp9/encoder/vp9_encoder.h b/libvpx/vp9/encoder/vp9_encoder.h
index 672c83bfd..d723d93cb 100644
--- a/libvpx/vp9/encoder/vp9_encoder.h
+++ b/libvpx/vp9/encoder/vp9_encoder.h
@@ -138,6 +138,7 @@ typedef enum {
kHighSadLowSumdiff = 3,
kHighSadHighSumdiff = 4,
kLowVarHighSumdiff = 5,
+ kVeryHighSad = 6,
} CONTENT_STATE_SB;
typedef struct VP9EncoderConfig {
@@ -208,6 +209,7 @@ typedef struct VP9EncoderConfig {
int two_pass_vbrbias; // two pass datarate control tweaks
int two_pass_vbrmin_section;
int two_pass_vbrmax_section;
+ int vbr_corpus_complexity; // 0 indicates corpus vbr disabled
// END DATARATE CONTROL OPTIONS
// ----------------------------------------------------------------
@@ -359,6 +361,7 @@ typedef struct IMAGE_STAT {
typedef enum {
LEVEL_UNKNOWN = 0,
+ LEVEL_AUTO = 1,
LEVEL_1 = 10,
LEVEL_1_1 = 11,
LEVEL_2 = 20,
@@ -380,6 +383,7 @@ typedef struct {
VP9_LEVEL level;
uint64_t max_luma_sample_rate;
uint32_t max_luma_picture_size;
+ uint32_t max_luma_picture_breadth;
double average_bitrate; // in kilobits per second
double max_cpb_size; // in kilobits
double compression_ratio;
@@ -419,14 +423,15 @@ typedef struct {
typedef enum {
BITRATE_TOO_LARGE = 0,
- LUMA_PIC_SIZE_TOO_LARGE = 1,
- LUMA_SAMPLE_RATE_TOO_LARGE = 2,
- CPB_TOO_LARGE = 3,
- COMPRESSION_RATIO_TOO_SMALL = 4,
- TOO_MANY_COLUMN_TILE = 5,
- ALTREF_DIST_TOO_SMALL = 6,
- TOO_MANY_REF_BUFFER = 7,
- TARGET_LEVEL_FAIL_IDS = 8
+ LUMA_PIC_SIZE_TOO_LARGE,
+ LUMA_PIC_BREADTH_TOO_LARGE,
+ LUMA_SAMPLE_RATE_TOO_LARGE,
+ CPB_TOO_LARGE,
+ COMPRESSION_RATIO_TOO_SMALL,
+ TOO_MANY_COLUMN_TILE,
+ ALTREF_DIST_TOO_SMALL,
+ TOO_MANY_REF_BUFFER,
+ TARGET_LEVEL_FAIL_IDS
} TARGET_LEVEL_FAIL_ID;
typedef struct {
@@ -541,6 +546,8 @@ typedef struct VP9_COMP {
uint8_t *segmentation_map;
+ uint8_t *skin_map;
+
  // segment threshold for encode breakout
int segment_encode_breakout[MAX_SEGMENTS];
@@ -548,7 +555,6 @@ typedef struct VP9_COMP {
ActiveMap active_map;
fractional_mv_step_fp *find_fractional_mv_step;
- vp9_full_search_fn_t full_search_sad;
vp9_diamond_search_fn_t diamond_search_sad;
vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES];
uint64_t time_receive_data;
@@ -714,6 +720,9 @@ typedef struct VP9_COMP {
int compute_source_sad_onepass;
LevelConstraint level_constraint;
+
+ uint8_t *count_arf_frame_usage;
+ uint8_t *count_lastgolden_frame_usage;
} VP9_COMP;
void vp9_initialize_enc(void);
@@ -861,13 +870,14 @@ static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) {
static INLINE int denoise_svc(const struct VP9_COMP *const cpi) {
return (!cpi->use_svc ||
(cpi->use_svc &&
- cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1));
+ cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise));
}
#endif
+#define MIN_LOOKAHEAD_FOR_ARFS 4
static INLINE int is_altref_enabled(const VP9_COMP *const cpi) {
return !(cpi->oxcf.mode == REALTIME && cpi->oxcf.rc_mode == VPX_CBR) &&
- cpi->oxcf.lag_in_frames > 0 &&
+ cpi->oxcf.lag_in_frames >= MIN_LOOKAHEAD_FOR_ARFS &&
(cpi->oxcf.enable_auto_arf &&
(!is_two_pass_svc(cpi) ||
cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id]));
@@ -910,6 +920,22 @@ static INLINE int get_level_index(VP9_LEVEL level) {
return -1;
}
+// Return the log2 value of max column tiles corresponding to the level that
+// the picture size fits into.
+static INLINE int log_tile_cols_from_picsize_level(uint32_t width,
+ uint32_t height) {
+ int i;
+ const uint32_t pic_size = width * height;
+ const uint32_t pic_breadth = VPXMAX(width, height);
+ for (i = LEVEL_1; i < LEVEL_MAX; ++i) {
+ if (vp9_level_defs[i].max_luma_picture_size >= pic_size &&
+ vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) {
+ return get_msb(vp9_level_defs[i].max_col_tiles);
+ }
+ }
+ return INT_MAX;
+}
+
VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec);
void vp9_new_framerate(VP9_COMP *cpi, double framerate);
diff --git a/libvpx/vp9/encoder/vp9_ethread.c b/libvpx/vp9/encoder/vp9_ethread.c
index 51664112a..0bd2e2145 100644
--- a/libvpx/vp9/encoder/vp9_ethread.c
+++ b/libvpx/vp9/encoder/vp9_ethread.c
@@ -35,7 +35,8 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
td_t->rd_counts.coef_counts[i][j][k][l][m][n];
}
-static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
+static int enc_worker_hook(void *arg1, void *unused) {
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
VP9_COMP *const cpi = thread_data->cpi;
const VP9_COMMON *const cm = &cpi->common;
const int tile_cols = 1 << cm->log2_tile_cols;
@@ -64,6 +65,13 @@ static int get_max_tile_cols(VP9_COMP *cpi) {
vp9_get_tile_n_bits(mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
log2_tile_cols =
clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols);
+ if (cpi->oxcf.target_level == LEVEL_AUTO) {
+ const int level_tile_cols =
+ log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height);
+ if (log2_tile_cols > level_tile_cols) {
+ log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols);
+ }
+ }
return (1 << log2_tile_cols);
}
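Condensed form of the LEVEL_AUTO clamp just added (helper name hypothetical): the configured tile columns can only shrink to what the picture-size-implied level permits, never below the codec minimum.

static int clamp_log2_tile_cols_sketch(int requested, uint32_t w, uint32_t h,
                                       int min_log2_tile_cols) {
  const int level_log2 = log_tile_cols_from_picsize_level(w, h);
  return requested > level_log2 ? VPXMAX(level_log2, min_log2_tile_cols)
                                : requested;
}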
@@ -135,7 +143,7 @@ static void launch_enc_workers(VP9_COMP *cpi, VPxWorkerHook hook, void *data2,
for (i = 0; i < num_workers; i++) {
VPxWorker *const worker = &cpi->workers[i];
- worker->hook = (VPxWorkerHook)hook;
+ worker->hook = hook;
worker->data1 = &cpi->tile_thr_data[i];
worker->data2 = data2;
}
@@ -203,7 +211,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
}
}
- launch_enc_workers(cpi, (VPxWorkerHook)enc_worker_hook, NULL, num_workers);
+ launch_enc_workers(cpi, enc_worker_hook, NULL, num_workers);
for (i = 0; i < num_workers; i++) {
VPxWorker *const worker = &cpi->workers[i];
@@ -217,6 +225,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
}
}
+#if !CONFIG_REALTIME_ONLY
static void accumulate_fp_tile_stat(TileDataEnc *tile_data,
TileDataEnc *tile_data_t) {
tile_data->fp_data.intra_factor += tile_data_t->fp_data.intra_factor;
@@ -251,6 +260,7 @@ static void accumulate_fp_tile_stat(TileDataEnc *tile_data,
: VPXMIN(tile_data->fp_data.image_data_start_row,
tile_data_t->fp_data.image_data_start_row);
}
+#endif // !CONFIG_REALTIME_ONLY
// Allocate memory for row synchronization
void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm,
@@ -379,6 +389,7 @@ void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c,
return;
}
+#if !CONFIG_REALTIME_ONLY
static int first_pass_worker_hook(EncWorkerData *const thread_data,
MultiThreadHandle *multi_thread_ctxt) {
VP9_COMP *const cpi = thread_data->cpi;
@@ -545,6 +556,7 @@ void vp9_temporal_filter_row_mt(VP9_COMP *cpi) {
launch_enc_workers(cpi, (VPxWorkerHook)temporal_filter_worker_hook,
multi_thread_ctxt, num_workers);
}
+#endif // !CONFIG_REALTIME_ONLY
static int enc_row_mt_worker_hook(EncWorkerData *const thread_data,
MultiThreadHandle *multi_thread_ctxt) {
diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c
index b6e327548..fb6b132a5 100644
--- a/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/libvpx/vp9/encoder/vp9_firstpass.c
@@ -41,9 +41,9 @@
#define OUTPUT_FPF 0
#define ARF_STATS_OUTPUT 0
+#define COMPLEXITY_STATS_OUTPUT 0
#define FIRST_PASS_Q 10.0
-#define GF_MAX_BOOST 96.0
#define INTRA_MODE_PENALTY 1024
#define MIN_ARF_GF_BOOST 240
#define MIN_DECAY_FACTOR 0.01
@@ -103,7 +103,7 @@ static void output_stats(FIRSTPASS_STATS *stats,
fpfile = fopen("firstpass.stt", "a");
fprintf(fpfile,
- "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf"
+ "%12.0lf %12.4lf %12.2lf %12.2lf %12.2lf %12.0lf %12.4lf %12.4lf"
"%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf"
"%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.0lf %12.0lf"
"%12.4lf"
@@ -235,16 +235,25 @@ static double calculate_active_area(const VP9_COMP *cpi,
return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA);
}
+// Get the average weighted error for the clip (or corpus)
+static double get_distribution_av_err(VP9_COMP *cpi, TWO_PASS *const twopass) {
+ const double av_weight =
+ twopass->total_stats.weight / twopass->total_stats.count;
+
+ if (cpi->oxcf.vbr_corpus_complexity)
+ return av_weight * twopass->mean_mod_score;
+ else
+ return (twopass->total_stats.coded_error * av_weight) /
+ twopass->total_stats.count;
+}
+
+#define ACT_AREA_CORRECTION 0.5
// Calculate a modified error used in distributing bits between easier and
// harder frames.
-#define ACT_AREA_CORRECTION 0.5
static double calculate_mod_frame_score(const VP9_COMP *cpi,
- const TWO_PASS *twopass,
const VP9EncoderConfig *oxcf,
- const FIRSTPASS_STATS *this_frame) {
- const FIRSTPASS_STATS *const stats = &twopass->total_stats;
- const double av_weight = stats->weight / stats->count;
- const double av_err = (stats->coded_error * av_weight) / stats->count;
+ const FIRSTPASS_STATS *this_frame,
+ const double av_err) {
double modified_score =
av_err * pow(this_frame->coded_error * this_frame->weight /
DOUBLE_DIVIDE_CHECK(av_err),
@@ -260,13 +269,12 @@ static double calculate_mod_frame_score(const VP9_COMP *cpi,
return modified_score;
}
+
static double calculate_norm_frame_score(const VP9_COMP *cpi,
const TWO_PASS *twopass,
const VP9EncoderConfig *oxcf,
- const FIRSTPASS_STATS *this_frame) {
- const FIRSTPASS_STATS *const stats = &twopass->total_stats;
- const double av_weight = stats->weight / stats->count;
- const double av_err = (stats->coded_error * av_weight) / stats->count;
+ const FIRSTPASS_STATS *this_frame,
+ const double av_err) {
double modified_score =
av_err * pow(this_frame->coded_error * this_frame->weight /
DOUBLE_DIVIDE_CHECK(av_err),
@@ -723,8 +731,9 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps,
// Exclude any image dead zone
if (fp_acc_data->image_data_start_row > 0) {
fp_acc_data->intra_skip_count =
- VPXMAX(0, fp_acc_data->intra_skip_count -
- (fp_acc_data->image_data_start_row * cm->mb_cols * 2));
+ VPXMAX(0,
+ fp_acc_data->intra_skip_count -
+ (fp_acc_data->image_data_start_row * cm->mb_cols * 2));
}
fp_acc_data->intra_factor = fp_acc_data->intra_factor / (double)num_mbs;
@@ -1583,6 +1592,7 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err,
const RATE_CONTROL *const rc = &cpi->rc;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
TWO_PASS *const twopass = &cpi->twopass;
+ double last_group_rate_err;
// Clamp the target rate to VBR min / max limts.
const int target_rate =
@@ -1591,6 +1601,18 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err,
noise_factor = fclamp(noise_factor, NOISE_FACTOR_MIN, NOISE_FACTOR_MAX);
inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
+// TODO(jimbankoski): remove #if here or below when this has been
+// well tested.
+#if CONFIG_ALWAYS_ADJUST_BPM
+ // based on recent history adjust expectations of bits per macroblock.
+ last_group_rate_err =
+ (double)twopass->rolling_arf_group_actual_bits /
+ DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits);
+ last_group_rate_err = VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err));
+ twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0;
+ twopass->bpm_factor = VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor));
+#endif
+
if (target_rate <= 0) {
return rc->worst_quality; // Highest value allowed
} else {
@@ -1601,11 +1623,13 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err,
const int active_mbs = (int)VPXMAX(1, (double)num_mbs * active_pct);
const double av_err_per_mb = section_err / active_pct;
const double speed_term = 1.0 + 0.04 * oxcf->speed;
- double last_group_rate_err;
const int target_norm_bits_per_mb =
(int)(((uint64_t)target_rate << BPER_MB_NORMBITS) / active_mbs);
int q;
+// TODO(jimbankoski): remove #if here or above when this has been
+// well tested.
+#if !CONFIG_ALWAYS_ADJUST_BPM
// based on recent history adjust expectations of bits per macroblock.
last_group_rate_err =
(double)twopass->rolling_arf_group_actual_bits /
@@ -1613,6 +1637,7 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err,
last_group_rate_err = VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err));
twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0;
twopass->bpm_factor = VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor));
+#endif
// Try and pick a max Q that will be high enough to encode the
// content at the given rate.
@@ -1666,7 +1691,7 @@ void calculate_coded_size(VP9_COMP *cpi, int *scaled_frame_width,
void vp9_init_second_pass(VP9_COMP *cpi) {
SVC *const svc = &cpi->svc;
- const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ VP9EncoderConfig *const oxcf = &cpi->oxcf;
const int is_two_pass_svc =
(svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1);
RATE_CONTROL *const rc = &cpi->rc;
@@ -1686,6 +1711,63 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
*stats = *twopass->stats_in_end;
twopass->total_left_stats = *stats;
+ // Scan the first pass file and calculate a modified score for each
+ // frame that is used to distribute bits. The modified score is assumed
+  // to provide a linear basis for bit allocation. I.e. a frame A with a score
+ // that is double that of frame B will be allocated 2x as many bits.
+ {
+ double modified_score_total = 0.0;
+ const FIRSTPASS_STATS *s = twopass->stats_in;
+ double av_err;
+
+ if (oxcf->vbr_corpus_complexity) {
+ twopass->mean_mod_score = (double)oxcf->vbr_corpus_complexity / 10.0;
+ av_err = get_distribution_av_err(cpi, twopass);
+ } else {
+ av_err = get_distribution_av_err(cpi, twopass);
+ // The first scan is unclamped and gives a raw average.
+ while (s < twopass->stats_in_end) {
+ modified_score_total += calculate_mod_frame_score(cpi, oxcf, s, av_err);
+ ++s;
+ }
+
+ // The average error from this first scan is used to define the midpoint
+ // error for the rate distribution function.
+ twopass->mean_mod_score =
+ modified_score_total / DOUBLE_DIVIDE_CHECK(stats->count);
+ }
+
+ // Second scan using clamps based on the previous cycle average.
+  // This may modify the total and average somewhat but we don't bother with
+  // further iterations.
+ modified_score_total = 0.0;
+ s = twopass->stats_in;
+ while (s < twopass->stats_in_end) {
+ modified_score_total +=
+ calculate_norm_frame_score(cpi, twopass, oxcf, s, av_err);
+ ++s;
+ }
+ twopass->normalized_score_left = modified_score_total;
+
+  // If using corpus-wide VBR mode then update the clip target bandwidth to
+ // reflect how the clip compares to the rest of the corpus.
+ if (oxcf->vbr_corpus_complexity) {
+ oxcf->target_bandwidth =
+ (int64_t)((double)oxcf->target_bandwidth *
+ (twopass->normalized_score_left / stats->count));
+ }
+
+#if COMPLEXITY_STATS_OUTPUT
+ {
+ FILE *compstats;
+ compstats = fopen("complexity_stats.stt", "a");
+ fprintf(compstats, "%10.3lf\n",
+ twopass->normalized_score_left / stats->count);
+ fclose(compstats);
+ }
+#endif
+ }
+
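Worked case for the corpus-wide rescale above (numbers illustrative): if the clip's normalized scores average 0.8, i.e. the clip is easier than the corpus midpoint, a 1000 kbps target becomes 800 kbps. As a sketch:

static int64_t corpus_scaled_target_sketch(int64_t target_bandwidth,
                                           double normalized_score_left,
                                           double frame_count) {
  // Same arithmetic as the oxcf->target_bandwidth update above.
  return (int64_t)((double)target_bandwidth *
                   (normalized_score_left / frame_count));
}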
frame_rate = 10000000.0 * stats->count / stats->duration;
// Each frame can have a different duration, as the frame rate in the source
// isn't guaranteed to be constant. The frame rate prior to the first frame
@@ -1708,37 +1790,6 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
// This variable monitors how far behind the second ref update is lagging.
twopass->sr_update_lag = 1;
- // Scan the first pass file and calculate a modified score for each
- // frame that is used to distribute bits. The modified score is assumed
- // to provide a linear basis for bit allocation. I.e a frame A with a score
- // that is double that of frame B will be allocated 2x as many bits.
- {
- const FIRSTPASS_STATS *s = twopass->stats_in;
- double modified_score_total = 0.0;
-
- // The first scan is unclamped and gives a raw average.
- while (s < twopass->stats_in_end) {
- modified_score_total += calculate_mod_frame_score(cpi, twopass, oxcf, s);
- ++s;
- }
-
- // The average error from this first scan is used to define the midpoint
- // error for the rate distribution function.
- twopass->mean_mod_score =
- modified_score_total / DOUBLE_DIVIDE_CHECK(stats->count);
-
- // Second scan using clamps based on the previous cycle average.
- // This may modify the total and average somewhat but we dont bother with
- // further itterations.
- s = twopass->stats_in;
- modified_score_total = 0.0;
- while (s < twopass->stats_in_end) {
- modified_score_total += calculate_norm_frame_score(cpi, twopass, oxcf, s);
- ++s;
- }
- twopass->normalized_score_left = modified_score_total;
- }
-
// Reset the vbr bits off target counters
rc->vbr_bits_off_target = 0;
rc->vbr_bits_off_target_fast = 0;
@@ -1897,9 +1948,9 @@ static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
}
#define BASELINE_ERR_PER_MB 12500.0
+#define GF_MAX_BOOST 96.0
static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame,
- double *sr_accumulator,
- double this_frame_mv_in_out, double max_boost) {
+ double this_frame_mv_in_out) {
double frame_boost;
const double lq = vp9_convert_qindex_to_q(
cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth);
@@ -1908,13 +1959,7 @@ static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame,
// Underlying boost factor is based on inter error ratio.
frame_boost = (BASELINE_ERR_PER_MB * active_area) /
- DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator);
-
- // Update the accumulator for second ref error difference.
- // This is intended to give an indication of how much the coded error is
- // increasing over time.
- *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error);
- *sr_accumulator = VPXMAX(0.0, *sr_accumulator);
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
// Small adjustment for cases where there is a zoom out
if (this_frame_mv_in_out > 0.0)
@@ -1923,7 +1968,7 @@ static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame,
  // Q correction and scaling
frame_boost = frame_boost * boost_q_correction;
- return VPXMIN(frame_boost, max_boost * boost_q_correction);
+ return VPXMIN(frame_boost, GF_MAX_BOOST * boost_q_correction);
}
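Worked numbers for the simplified boost (zoom adjustment omitted, divide guard approximated): with full active area and coded_error = 6250, the raw ratio is 12500 / 6250 = 2.0, scaled by the q correction and capped at 96.0 times that correction. A sketch:

static double frame_boost_sketch(double active_area, double coded_error,
                                 double boost_q_correction) {
  double boost = (12500.0 * active_area) / (coded_error + 0.000001);
  boost *= boost_q_correction;
  return VPXMIN(boost, 96.0 * boost_q_correction);
}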
#define KF_BASELINE_ERR_PER_MB 12500.0
@@ -1958,8 +2003,7 @@ static double calc_kf_frame_boost(VP9_COMP *cpi,
return VPXMIN(frame_boost, max_boost * boost_q_correction);
}
-static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames,
- int *f_boost, int *b_boost) {
+static int calc_arf_boost(VP9_COMP *cpi, int f_frames, int b_frames) {
TWO_PASS *const twopass = &cpi->twopass;
int i;
double boost_score = 0.0;
@@ -1968,13 +2012,12 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames,
double this_frame_mv_in_out = 0.0;
double mv_in_out_accumulator = 0.0;
double abs_mv_in_out_accumulator = 0.0;
- double sr_accumulator = 0.0;
int arf_boost;
int flash_detected = 0;
// Search forward from the proposed arf/next gf position.
for (i = 0; i < f_frames; ++i) {
- const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+ const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i);
if (this_frame == NULL) break;
// Update the motion related elements to the boost calculation.
@@ -1984,8 +2027,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames,
// We want to discount the flash frame itself and the recovery
// frame that follows as both will have poor scores.
- flash_detected = detect_flash(twopass, i + offset) ||
- detect_flash(twopass, i + offset + 1);
+ flash_detected = detect_flash(twopass, i) || detect_flash(twopass, i + 1);
// Accumulate the effect of prediction quality decay.
if (!flash_detected) {
@@ -1994,14 +2036,11 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames,
? MIN_DECAY_FACTOR
: decay_accumulator;
}
-
- sr_accumulator = 0.0;
boost_score += decay_accumulator *
- calc_frame_boost(cpi, this_frame, &sr_accumulator,
- this_frame_mv_in_out, GF_MAX_BOOST);
+ calc_frame_boost(cpi, this_frame, this_frame_mv_in_out);
}
- *f_boost = (int)boost_score;
+ arf_boost = (int)boost_score;
// Reset for backward looking loop.
boost_score = 0.0;
@@ -2010,11 +2049,10 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames,
this_frame_mv_in_out = 0.0;
mv_in_out_accumulator = 0.0;
abs_mv_in_out_accumulator = 0.0;
- sr_accumulator = 0.0;
// Search backward towards last gf position.
for (i = -1; i >= -b_frames; --i) {
- const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+ const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i);
if (this_frame == NULL) break;
// Update the motion related elements to the boost calculation.
@@ -2024,8 +2062,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames,
    // We want to discount the flash frame itself and the recovery
// frame that follows as both will have poor scores.
- flash_detected = detect_flash(twopass, i + offset) ||
- detect_flash(twopass, i + offset + 1);
+ flash_detected = detect_flash(twopass, i) || detect_flash(twopass, i + 1);
// Cumulative effect of prediction quality decay.
if (!flash_detected) {
@@ -2034,17 +2071,13 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames,
? MIN_DECAY_FACTOR
: decay_accumulator;
}
-
- sr_accumulator = 0.0;
boost_score += decay_accumulator *
- calc_frame_boost(cpi, this_frame, &sr_accumulator,
- this_frame_mv_in_out, GF_MAX_BOOST);
+ calc_frame_boost(cpi, this_frame, this_frame_mv_in_out);
}
- *b_boost = (int)boost_score;
+ arf_boost += (int)boost_score;
- arf_boost = (*f_boost + *b_boost);
- if (arf_boost < ((b_frames + f_frames) * 20))
- arf_boost = ((b_frames + f_frames) * 20);
+ if (arf_boost < ((b_frames + f_frames) * 40))
+ arf_boost = ((b_frames + f_frames) * 40);
arf_boost = VPXMAX(arf_boost, MIN_ARF_GF_BOOST);
return arf_boost;
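The per-frame floor doubles from 20 to 40 units of boost, now applied to the combined forward and backward totals. Worked case: 7 forward plus 7 backward frames give a floor of (7 + 7) * 40 = 560, well above MIN_ARF_GF_BOOST. A sketch:

static int arf_boost_floor_sketch(int f_frames, int b_frames) {
  const int floor_boost = (b_frames + f_frames) * 40;
  return VPXMAX(floor_boost, 240);  // 240 == MIN_ARF_GF_BOOST
}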
@@ -2105,7 +2138,7 @@ static int calculate_boost_bits(int frame_count, int boost,
int allocation_chunks;
// return 0 for invalid inputs (could arise e.g. through rounding errors)
- if (!boost || (total_group_bits <= 0) || (frame_count <= 0)) return 0;
+ if (!boost || (total_group_bits <= 0) || (frame_count < 0)) return 0;
allocation_chunks = (frame_count * 100) + boost;
@@ -2133,8 +2166,33 @@ static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) {
arf_buffer_indices[1] = ARF_SLOT2;
}
+// Used in corpus VBR: calculates the total normalized group complexity score
+// for a given number of frames starting at the current position in the stats
+// file.
+static double calculate_group_score(VP9_COMP *cpi, double av_score,
+ int frame_count) {
+ VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->twopass;
+ const FIRSTPASS_STATS *s = twopass->stats_in;
+ double score_total = 0.0;
+ int i = 0;
+
+  // We don't ever want to return a 0 score here.
+ if (frame_count == 0) return 1.0;
+
+ while ((i < frame_count) && (s < twopass->stats_in_end)) {
+ score_total += calculate_norm_frame_score(cpi, twopass, oxcf, s, av_score);
+ ++s;
+ ++i;
+ }
+ assert(i == frame_count);
+
+ return score_total;
+}
+
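The group score feeds the per-frame split in allocate_gf_group_bits below: allocation is linear in normalized score. Worked case (numbers illustrative): total_group_bits = 800,000, tot_norm_frame_score = 4.0 and this_frame_score = 0.5 give this frame 800000 * 0.5 / 4.0 = 100,000 bits.

static int corpus_frame_bits_sketch(int64_t total_group_bits,
                                    double this_frame_score,
                                    double tot_norm_frame_score) {
  // Same proportional split as the corpus VBR branch further down.
  return (int)((double)total_group_bits *
               (this_frame_score / tot_norm_frame_score));
}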
static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
int gf_arf_bits) {
+ VP9EncoderConfig *const oxcf = &cpi->oxcf;
RATE_CONTROL *const rc = &cpi->rc;
TWO_PASS *const twopass = &cpi->twopass;
GF_GROUP *const gf_group = &twopass->gf_group;
@@ -2143,7 +2201,7 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
int frame_index = 1;
int target_frame_size;
int key_frame;
- const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
+ const int max_bits = frame_max_bits(&cpi->rc, oxcf);
int64_t total_group_bits = gf_group_bits;
int mid_boost_bits = 0;
int mid_frame_idx;
@@ -2153,8 +2211,10 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1;
int normal_frames;
int normal_frame_bits;
- int last_frame_bits;
- int last_frame_reduction;
+ int last_frame_reduction = 0;
+ double av_score = 1.0;
+ double tot_norm_frame_score = 1.0;
+ double this_frame_score = 1.0;
// Only encode alt reference frame in temporal base layer.
if (has_temporal_layers) alt_frame_index = cpi->svc.number_temporal_layers;
@@ -2226,17 +2286,14 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
normal_frames = (rc->baseline_gf_interval - rc->source_alt_ref_pending);
-
- // The last frame in the group is used less as a predictor so reduce
- // its allocation a little.
- if (normal_frames > 1) {
+ if (normal_frames > 1)
normal_frame_bits = (int)(total_group_bits / normal_frames);
- last_frame_reduction = normal_frame_bits / 16;
- last_frame_bits = normal_frame_bits - last_frame_reduction;
- } else {
+ else
normal_frame_bits = (int)total_group_bits;
- last_frame_bits = normal_frame_bits;
- last_frame_reduction = 0;
+
+ if (oxcf->vbr_corpus_complexity) {
+ av_score = get_distribution_av_err(cpi, twopass);
+ tot_norm_frame_score = calculate_group_score(cpi, av_score, normal_frames);
}
// Allocate bits to the other frames in the group.
@@ -2248,11 +2305,18 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
++frame_index;
}
- target_frame_size = (i == (normal_frames - 1))
- ? last_frame_bits
- : (i == mid_frame_idx)
- ? normal_frame_bits + last_frame_reduction
- : normal_frame_bits;
+ if (oxcf->vbr_corpus_complexity) {
+ this_frame_score = calculate_norm_frame_score(cpi, twopass, oxcf,
+ &frame_stats, av_score);
+ normal_frame_bits = (int)((double)total_group_bits *
+ (this_frame_score / tot_norm_frame_score));
+ }
+
+ target_frame_size = normal_frame_bits;
+ if ((i == (normal_frames - 1)) && (i >= 1)) {
+ last_frame_reduction = normal_frame_bits / 16;
+ target_frame_size -= last_frame_reduction;
+ }
if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) {
mid_boost_bits += (target_frame_size >> 4);
@@ -2273,6 +2337,9 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
++frame_index;
}
+ // Add in some extra bits for the middle frame in the group.
+ gf_group->bit_allocation[mid_frame_idx] += last_frame_reduction;
+
// Note:
// We need to configure the frame at the end of the sequence + 1 that will be
// the start frame for the next group. Otherwise prior to the call to
@@ -2316,6 +2383,8 @@ static void adjust_group_arnr_filter(VP9_COMP *cpi, double section_noise,
// Analyse and define a gf/arf group.
#define ARF_DECAY_BREAKOUT 0.10
+#define ARF_ABS_ZOOM_THRESH 4.0
+
static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
@@ -2325,8 +2394,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
int i;
- double boost_score = 0.0;
- double old_boost_score = 0.0;
double gf_group_err = 0.0;
double gf_group_raw_error = 0.0;
double gf_group_noise = 0.0;
@@ -2338,7 +2405,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
double mod_frame_err = 0.0;
double mv_ratio_accumulator = 0.0;
- double decay_accumulator = 1.0;
double zero_motion_accumulator = 1.0;
double loop_decay_rate = 1.00;
double last_loop_decay_rate = 1.00;
@@ -2347,13 +2413,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
double mv_in_out_accumulator = 0.0;
double abs_mv_in_out_accumulator = 0.0;
double mv_ratio_accumulator_thresh;
- double mv_in_out_thresh;
double abs_mv_in_out_thresh;
double sr_accumulator = 0.0;
+ const double av_err = get_distribution_av_err(cpi, twopass);
unsigned int allow_alt_ref = is_altref_enabled(cpi);
- int f_boost = 0;
- int b_boost = 0;
int flash_detected;
int active_max_gf_interval;
int active_min_gf_interval;
@@ -2372,7 +2436,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
vp9_zero(next_frame);
// Load stats for the current frame.
- mod_frame_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
+ mod_frame_err =
+ calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err);
// Note the error of the frame at the start of the group. This will be
// the GF frame error if we code a normal gf.
@@ -2393,8 +2458,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Motion breakout threshold for loop below depends on image size.
mv_ratio_accumulator_thresh =
(cpi->initial_height + cpi->initial_width) / 4.0;
- mv_in_out_thresh = (cpi->initial_height + cpi->initial_width) / 300.0;
- abs_mv_in_out_thresh = (cpi->initial_height + cpi->initial_width) / 200.0;
+ abs_mv_in_out_thresh = ARF_ABS_ZOOM_THRESH;
// Set a maximum and minimum interval for the GF group.
// If the image appears almost completely static we can extend beyond this.
@@ -2438,7 +2502,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
++i;
// Accumulate error score of frames in this gf group.
- mod_frame_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
+ mod_frame_err =
+ calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err);
gf_group_err += mod_frame_err;
gf_group_raw_error += this_frame->coded_error;
gf_group_noise += this_frame->frame_noise_energy;
@@ -2463,8 +2528,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
last_loop_decay_rate = loop_decay_rate;
loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
- decay_accumulator = decay_accumulator * loop_decay_rate;
-
// Monitor for static sections.
zero_motion_accumulator = VPXMIN(
zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
@@ -2476,13 +2539,16 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
allow_alt_ref = 0;
break;
}
- }
- // Calculate a boost number for this frame.
- sr_accumulator = 0.0;
- boost_score += decay_accumulator *
- calc_frame_boost(cpi, &next_frame, &sr_accumulator,
- this_frame_mv_in_out, GF_MAX_BOOST);
+ // Update the accumulator for second ref error difference.
+ // This is intended to give an indication of how much the coded error is
+ // increasing over time.
+ if (i == 1) {
+ sr_accumulator += next_frame.coded_error;
+ } else {
+ sr_accumulator += (next_frame.sr_coded_error - next_frame.coded_error);
+ }
+ }
// Break out conditions.
if (
@@ -2496,14 +2562,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
(!flash_detected) &&
((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
(abs_mv_in_out_accumulator > abs_mv_in_out_thresh) ||
- (mv_in_out_accumulator < -mv_in_out_thresh) ||
- (decay_accumulator < ARF_DECAY_BREAKOUT)))) {
- boost_score = old_boost_score;
+ (sr_accumulator > next_frame.intra_error)))) {
break;
}
*this_frame = next_frame;
- old_boost_score = boost_score;
}
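// Sketch (not part of the patch) of the second-reference accumulator added
// above: after the first frame, each step adds (sr_coded_error - coded_error),
// so the total grows as prediction from the older reference degrades; the
// group loop breaks out once it exceeds next_frame.intra_error. The stats
// arrays here are hypothetical inputs.
static double sr_accumulator_sketch(const double *coded_error,
                                    const double *sr_coded_error, int n) {
  double acc = 0.0;
  int i;
  for (i = 0; i < n; ++i) {
    if (i == 0)
      acc += coded_error[i]; /* first frame: raw coded error */
    else
      acc += sr_coded_error[i] - coded_error[i];
  }
  return acc;
}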
// Was the group length constrained by the requirement for a new KF?
@@ -2512,9 +2575,12 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Should we use the alternate reference frame.
if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
(i >= rc->min_gf_interval)) {
+ const int forward_frames = (rc->frames_to_key - i >= i - 1)
+ ? i - 1
+ : VPXMAX(0, rc->frames_to_key - i);
+
// Calculate the boost for alt ref.
- rc->gfu_boost =
- calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
+ rc->gfu_boost = calc_arf_boost(cpi, forward_frames, (i - 1));
rc->source_alt_ref_pending = 1;
// Test to see if multi arf is appropriate.
@@ -2524,7 +2590,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
? 1
: 0;
} else {
- rc->gfu_boost = VPXMAX((int)boost_score, MIN_ARF_GF_BOOST);
+ rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1));
rc->source_alt_ref_pending = 0;
}
@@ -2548,7 +2614,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) {
if (EOF == input_stats(twopass, this_frame)) break;
gf_group_err +=
- calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
+ calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err);
gf_group_raw_error += this_frame->coded_error;
gf_group_noise += this_frame->frame_noise_energy;
gf_group_skip_pct += this_frame->intra_skip_pct;
@@ -2587,6 +2653,12 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
group_av_noise, vbr_group_bits_per_frame);
twopass->active_worst_quality =
(tmp_q + (twopass->active_worst_quality * 3)) >> 2;
+
+#if CONFIG_ALWAYS_ADJUST_BPM
+ // Reset rolling actual and target bits counters for ARF groups.
+ twopass->rolling_arf_group_target_bits = 0;
+ twopass->rolling_arf_group_actual_bits = 0;
+#endif
}
// Context Adjustment of ARNR filter strength
@@ -2621,10 +2693,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Default to starting GF groups at normal frame size.
cpi->rc.next_frame_size_selector = UNSCALED;
}
-
+#if !CONFIG_ALWAYS_ADJUST_BPM
// Reset rolling actual and target bits counters for ARF groups.
twopass->rolling_arf_group_target_bits = 0;
twopass->rolling_arf_group_actual_bits = 0;
+#endif
}
// Threshold for use of the lagging second reference frame. High second ref
@@ -2769,7 +2842,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
double kf_group_err = 0.0;
double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
double sr_accumulator = 0.0;
-
+ const double av_err = get_distribution_av_err(cpi, twopass);
vp9_zero(next_frame);
cpi->common.frame_type = KEY_FRAME;
@@ -2793,7 +2866,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
twopass->kf_group_bits = 0; // Total bits available to kf group
twopass->kf_group_error_left = 0.0; // Group modified error score.
- kf_mod_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
+ kf_mod_err =
+ calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err);
// Initialize the decay rates for the recent frames to check
for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
@@ -2803,7 +2877,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
while (twopass->stats_in < twopass->stats_in_end &&
rc->frames_to_key < cpi->oxcf.key_freq) {
// Accumulate kf group error.
- kf_group_err += calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
+ kf_group_err +=
+ calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err);
// Load the next frame's stats.
last_frame = *this_frame;
@@ -2864,7 +2939,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Rescan to get the correct error data for the forced kf group.
for (i = 0; i < rc->frames_to_key; ++i) {
kf_group_err +=
- calculate_norm_frame_score(cpi, twopass, oxcf, &tmp_frame);
+ calculate_norm_frame_score(cpi, twopass, oxcf, &tmp_frame, av_err);
input_stats(twopass, &tmp_frame);
}
rc->next_key_frame_forced = 1;
@@ -2882,7 +2957,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) {
if (EOF == input_stats(twopass, this_frame)) break;
kf_group_err +=
- calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
+ calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err);
}
rc->frames_to_key = new_frame_to_key;
}
@@ -2890,7 +2965,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Special case for the last key frame of the file.
if (twopass->stats_in >= twopass->stats_in_end) {
// Accumulate kf group error.
- kf_group_err += calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
+ kf_group_err +=
+ calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err);
}
// Calculate the number of bits that should be assigned to the kf group.
diff --git a/libvpx/vp9/encoder/vp9_frame_scale.c b/libvpx/vp9/encoder/vp9_frame_scale.c
index e58628388..a410d0407 100644
--- a/libvpx/vp9/encoder/vp9_frame_scale.c
+++ b/libvpx/vp9/encoder/vp9_frame_scale.c
@@ -20,8 +20,6 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
INTERP_FILTER filter_type, int phase_scaler) {
const int src_w = src->y_crop_width;
const int src_h = src->y_crop_height;
- const int dst_w = dst->y_crop_width;
- const int dst_h = dst->y_crop_height;
const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer,
src->v_buffer };
const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
@@ -30,23 +28,86 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
const InterpKernel *const kernel = vp9_filter_kernels[filter_type];
int x, y, i;
- for (i = 0; i < MAX_MB_PLANE; ++i) {
- const int factor = (i == 0 || i == 3 ? 1 : 2);
- const int src_stride = src_strides[i];
- const int dst_stride = dst_strides[i];
- for (y = 0; y < dst_h; y += 16) {
- const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler;
- for (x = 0; x < dst_w; x += 16) {
- const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler;
- const uint8_t *src_ptr = srcs[i] +
- (y / factor) * src_h / dst_h * src_stride +
- (x / factor) * src_w / dst_w;
- uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
+#if HAVE_SSSE3 || HAVE_NEON
+ // TODO(linfengz): The 4:3 specialized C code is disabled by default since
+ // it's much slower than the general version which calls vpx_scaled_2d() even
+ // if vpx_scaled_2d() is not optimized. It will only be enabled as a reference
+ // for the platforms which have faster optimization.
+ if (4 * dst->y_crop_width == 3 * src_w &&
+ 4 * dst->y_crop_height == 3 * src_h) {
+ // Specialize 4 to 3 scaling.
+ // Example pixel locations.
+ // (O: Original pixel. S: Scaled pixel. X: Overlapped pixel.)
+ // phase_scaler = 0 | phase_scaler = 8
+ // |
+ // X O S O S O X | O O O O O
+ // |
+ // |
+ // | S S S
+ // |
+ // |
+ // O O O O O | O O O O O
+ // |
+ // S S S S |
+ // |
+ // |
+ // | S S S
+ // O O O O O | O O O O O
+ // |
+ // |
+ // |
+ // S S S S |
+ // |
+ // O O O O O | O O O O O
+ // | S S S
+ // |
+ // |
+ // |
+ // |
+ // X O S O S O X | O O O O O
- vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
- kernel[x_q4 & 0xf], 16 * src_w / dst_w,
- kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor,
- 16 / factor);
+ const int dst_ws[3] = { dst->y_crop_width, dst->uv_crop_width,
+ dst->uv_crop_width };
+ const int dst_hs[3] = { dst->y_crop_height, dst->uv_crop_height,
+ dst->uv_crop_height };
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ const int dst_w = dst_ws[i];
+ const int dst_h = dst_hs[i];
+ const int src_stride = src_strides[i];
+ const int dst_stride = dst_strides[i];
+ for (y = 0; y < dst_h; y += 3) {
+ for (x = 0; x < dst_w; x += 3) {
+ const uint8_t *src_ptr = srcs[i] + 4 * y / 3 * src_stride + 4 * x / 3;
+ uint8_t *dst_ptr = dsts[i] + y * dst_stride + x;
+
+        // Must call the C function; its optimizations don't support 3x3.
+ vpx_scaled_2d_c(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
+ phase_scaler, 64 / 3, phase_scaler, 64 / 3, 3, 3);
+ }
+ }
+ }
+ } else
+#endif
+ {
+ const int dst_w = dst->y_crop_width;
+ const int dst_h = dst->y_crop_height;
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ const int factor = (i == 0 || i == 3 ? 1 : 2);
+ const int src_stride = src_strides[i];
+ const int dst_stride = dst_strides[i];
+ for (y = 0; y < dst_h; y += 16) {
+ const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler;
+ for (x = 0; x < dst_w; x += 16) {
+ const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler;
+ const uint8_t *src_ptr = srcs[i] +
+ (y / factor) * src_h / dst_h * src_stride +
+ (x / factor) * src_w / dst_w;
+ uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
+
+ vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
+ x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
+ 16 * src_h / dst_h, 16 / factor, 16 / factor);
+ }
}
}
}
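// Sketch (not part of the patch) of the 4:3 coordinate mapping used above:
// each 3x3 output tile reads from a 4x4 input tile, so the source offset
// advances by 4 * y / 3 rows and 4 * x / 3 columns, and the q4 subpel step
// is 16 * 4 / 3 = 64 / 3 (truncated to 21, as passed to vpx_scaled_2d_c).
// The stride below is hypothetical.
#include <stdio.h>

int main(void) {
  const int src_stride = 64;
  int x, y;
  for (y = 0; y < 6; y += 3)
    for (x = 0; x < 6; x += 3)
      printf("dst(%d,%d) -> src offset %d, step_q4 %d\n", x, y,
             4 * y / 3 * src_stride + 4 * x / 3, 64 / 3);
  return 0;
}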
diff --git a/libvpx/vp9/encoder/vp9_mcomp.c b/libvpx/vp9/encoder/vp9_mcomp.c
index 24e23af3b..44f01be25 100644
--- a/libvpx/vp9/encoder/vp9_mcomp.c
+++ b/libvpx/vp9/encoder/vp9_mcomp.c
@@ -361,7 +361,7 @@ static unsigned int setup_center_error(
#endif // CONFIG_VP9_HIGHBITDEPTH
}
-static INLINE int divide_and_round(const int n, const int d) {
+static INLINE int64_t divide_and_round(const int64_t n, const int64_t d) {
return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
}
@@ -379,10 +379,13 @@ static INLINE int is_cost_list_wellbehaved(int *cost_list) {
// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
// The code below is an integerized version of that.
static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) {
- *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)),
- (cost_list[1] - 2 * cost_list[0] + cost_list[3]));
- *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)),
- (cost_list[4] - 2 * cost_list[0] + cost_list[2]));
+ const int64_t x0 = (int64_t)cost_list[1] - cost_list[3];
+ const int64_t y0 = cost_list[1] - 2 * (int64_t)cost_list[0] + cost_list[3];
+ const int64_t x1 = (int64_t)cost_list[4] - cost_list[2];
+ const int64_t y1 = cost_list[4] - 2 * (int64_t)cost_list[0] + cost_list[2];
+ const int b = 1 << (bits - 1);
+ *ic = (int)divide_and_round(x0 * b, y0);
+ *ir = (int)divide_and_round(x1 * b, y1);
}
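// Sketch (not part of the patch) of why the widening above matters: a
// near-INT32_MAX cost delta multiplied by 1 << (bits - 1) would wrap in
// 32-bit arithmetic, so both operands are promoted to int64_t before the
// rounded division. The numbers below are hypothetical.
#include <inttypes.h>
#include <stdio.h>

static int64_t divide_and_round(const int64_t n, const int64_t d) {
  return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
}

int main(void) {
  const int64_t x0 = 2000000000; /* large cost delta */
  const int b = 1 << 1;          /* bits == 2 */
  /* x0 * b exceeds INT32_MAX but is exact in 64 bits. */
  printf("%" PRId64 "\n", divide_and_round(x0 * b, 3));
  return 0;
}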
uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv,
@@ -441,7 +444,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore(
cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
int ir, ic;
- unsigned int minpt;
+ unsigned int minpt = INT_MAX;
get_cost_surf_min(cost_list, &ir, &ic, 2);
if (ir != 0 || ic != 0) {
CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic);
@@ -2039,197 +2042,6 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x,
return bestsme;
}
-int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
- int sad_per_bit, int distance,
- const vp9_variance_fn_ptr_t *fn_ptr,
- const MV *center_mv, MV *best_mv) {
- int r, c;
- const MACROBLOCKD *const xd = &x->e_mbd;
- const struct buf_2d *const what = &x->plane[0].src;
- const struct buf_2d *const in_what = &xd->plane[0].pre[0];
- const int row_min = VPXMAX(ref_mv->row - distance, x->mv_limits.row_min);
- const int row_max = VPXMIN(ref_mv->row + distance, x->mv_limits.row_max);
- const int col_min = VPXMAX(ref_mv->col - distance, x->mv_limits.col_min);
- const int col_max = VPXMIN(ref_mv->col + distance, x->mv_limits.col_max);
- const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
- int best_sad =
- fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
- in_what->stride) +
- mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
- *best_mv = *ref_mv;
-
- for (r = row_min; r < row_max; ++r) {
- for (c = col_min; c < col_max; ++c) {
- const MV mv = { r, c };
- const int sad =
- fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
- in_what->stride) +
- mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
- if (sad < best_sad) {
- best_sad = sad;
- *best_mv = mv;
- }
- }
- }
- return best_sad;
-}
-
-int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv,
- int sad_per_bit, int distance,
- const vp9_variance_fn_ptr_t *fn_ptr,
- const MV *center_mv, MV *best_mv) {
- int r;
- const MACROBLOCKD *const xd = &x->e_mbd;
- const struct buf_2d *const what = &x->plane[0].src;
- const struct buf_2d *const in_what = &xd->plane[0].pre[0];
- const int row_min = VPXMAX(ref_mv->row - distance, x->mv_limits.row_min);
- const int row_max = VPXMIN(ref_mv->row + distance, x->mv_limits.row_max);
- const int col_min = VPXMAX(ref_mv->col - distance, x->mv_limits.col_min);
- const int col_max = VPXMIN(ref_mv->col + distance, x->mv_limits.col_max);
- const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
- unsigned int best_sad =
- fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
- in_what->stride) +
- mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
- *best_mv = *ref_mv;
-
- for (r = row_min; r < row_max; ++r) {
- int c = col_min;
- const uint8_t *check_here = &in_what->buf[r * in_what->stride + c];
-
- if (fn_ptr->sdx3f != NULL) {
- while ((c + 2) < col_max) {
- int i;
- DECLARE_ALIGNED(16, uint32_t, sads[3]);
-
- fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
- sads);
-
- for (i = 0; i < 3; ++i) {
- unsigned int sad = sads[i];
- if (sad < best_sad) {
- const MV mv = { r, c };
- sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
- if (sad < best_sad) {
- best_sad = sad;
- *best_mv = mv;
- }
- }
- ++check_here;
- ++c;
- }
- }
- }
-
- while (c < col_max) {
- unsigned int sad =
- fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride);
- if (sad < best_sad) {
- const MV mv = { r, c };
- sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
- if (sad < best_sad) {
- best_sad = sad;
- *best_mv = mv;
- }
- }
- ++check_here;
- ++c;
- }
- }
-
- return best_sad;
-}
-
-int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
- int sad_per_bit, int distance,
- const vp9_variance_fn_ptr_t *fn_ptr,
- const MV *center_mv, MV *best_mv) {
- int r;
- const MACROBLOCKD *const xd = &x->e_mbd;
- const struct buf_2d *const what = &x->plane[0].src;
- const struct buf_2d *const in_what = &xd->plane[0].pre[0];
- const int row_min = VPXMAX(ref_mv->row - distance, x->mv_limits.row_min);
- const int row_max = VPXMIN(ref_mv->row + distance, x->mv_limits.row_max);
- const int col_min = VPXMAX(ref_mv->col - distance, x->mv_limits.col_min);
- const int col_max = VPXMIN(ref_mv->col + distance, x->mv_limits.col_max);
- const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
- unsigned int best_sad =
- fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
- in_what->stride) +
- mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
- *best_mv = *ref_mv;
-
- for (r = row_min; r < row_max; ++r) {
- int c = col_min;
- const uint8_t *check_here = &in_what->buf[r * in_what->stride + c];
-
- if (fn_ptr->sdx8f != NULL) {
- while ((c + 7) < col_max) {
- int i;
- DECLARE_ALIGNED(16, uint32_t, sads[8]);
-
- fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride,
- sads);
-
- for (i = 0; i < 8; ++i) {
- unsigned int sad = sads[i];
- if (sad < best_sad) {
- const MV mv = { r, c };
- sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
- if (sad < best_sad) {
- best_sad = sad;
- *best_mv = mv;
- }
- }
- ++check_here;
- ++c;
- }
- }
- }
-
- if (fn_ptr->sdx3f != NULL) {
- while ((c + 2) < col_max) {
- int i;
- DECLARE_ALIGNED(16, uint32_t, sads[3]);
-
- fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
- sads);
-
- for (i = 0; i < 3; ++i) {
- unsigned int sad = sads[i];
- if (sad < best_sad) {
- const MV mv = { r, c };
- sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
- if (sad < best_sad) {
- best_sad = sad;
- *best_mv = mv;
- }
- }
- ++check_here;
- ++c;
- }
- }
- }
-
- while (c < col_max) {
- unsigned int sad =
- fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride);
- if (sad < best_sad) {
- const MV mv = { r, c };
- sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
- if (sad < best_sad) {
- best_sad = sad;
- *best_mv = mv;
- }
- }
- ++check_here;
- ++c;
- }
- }
-
- return best_sad;
-}
-
int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
int search_range,
const vp9_variance_fn_ptr_t *fn_ptr,
diff --git a/libvpx/vp9/encoder/vp9_noise_estimate.c b/libvpx/vp9/encoder/vp9_noise_estimate.c
index e2239b44b..276a0c785 100644
--- a/libvpx/vp9/encoder/vp9_noise_estimate.c
+++ b/libvpx/vp9/encoder/vp9_noise_estimate.c
@@ -21,6 +21,15 @@
#include "vp9/encoder/vp9_noise_estimate.h"
#include "vp9/encoder/vp9_encoder.h"
+#if CONFIG_VP9_TEMPORAL_DENOISING
+// For SVC: only do noise estimation on top spatial layer.
+static INLINE int noise_est_svc(const struct VP9_COMP *const cpi) {
+ return (!cpi->use_svc ||
+ (cpi->use_svc &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1));
+}
+#endif
+
void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) {
ne->enabled = 0;
ne->level = kLowLow;
@@ -34,7 +43,7 @@ void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) {
} else if (width * height >= 1280 * 720) {
ne->thresh = 140;
} else if (width * height >= 640 * 360) {
- ne->thresh = 100;
+ ne->thresh = 115;
}
ne->num_frames_estimate = 15;
}
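// Sketch (not part of the patch) of the tiered threshold selection above,
// with the 640x360 tier raised from 100 to 115. Only the tiers visible in
// this hunk are shown; the helper name and the fallback are ours.
static int noise_thresh_sketch(int width, int height) {
  int thresh = 0; /* placeholder for the default set earlier in init */
  if (width * height >= 1280 * 720)
    thresh = 140;
  else if (width * height >= 640 * 360)
    thresh = 115;
  return thresh;
}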
@@ -45,7 +54,7 @@ static int enable_noise_estimation(VP9_COMP *const cpi) {
#endif
// Enable noise estimation if denoising is on.
#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) &&
cpi->common.width >= 320 && cpi->common.height >= 180)
return 1;
#endif
@@ -56,8 +65,8 @@ static int enable_noise_estimation(VP9_COMP *const cpi) {
if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR &&
cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 &&
cpi->resize_state == ORIG && cpi->resize_pending == 0 && !cpi->use_svc &&
- cpi->oxcf.content != VP9E_CONTENT_SCREEN && cpi->common.width >= 640 &&
- cpi->common.height >= 360)
+ cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
+ cpi->common.width * cpi->common.height >= 640 * 360)
return 1;
else
return 0;
@@ -111,7 +120,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
// Estimate is between current source and last source.
YV12_BUFFER_CONFIG *last_source = cpi->Last_Source;
#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi)) {
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) {
last_source = &cpi->denoiser.last_source;
// Tune these thresholds for different resolutions when denoising is
// enabled.
@@ -131,7 +140,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
(cpi->svc.number_spatial_layers == 1 &&
(ne->last_w != cm->width || ne->last_h != cm->height))) {
#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi))
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
copy_frame(&cpi->denoiser.last_source, cpi->Source);
#endif
if (last_source != NULL) {
@@ -146,7 +155,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
ne->count = 0;
ne->num_frames_estimate = 10;
#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) &&
cpi->svc.current_superframe > 1) {
vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level);
copy_frame(&cpi->denoiser.last_source, cpi->Source);
@@ -190,44 +199,42 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
int bl_index1 = bl_index + 1;
int bl_index2 = bl_index + cm->mi_cols;
int bl_index3 = bl_index2 + 1;
- // Only consider blocks that are likely steady background. i.e, have
- // been encoded as zero/low motion x (= thresh_consec_zeromv) frames
- // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all
- // 4 sub-blocks for 16x16 block. Also, avoid skin blocks.
int consec_zeromv =
VPXMIN(cpi->consec_zero_mv[bl_index],
VPXMIN(cpi->consec_zero_mv[bl_index1],
VPXMIN(cpi->consec_zero_mv[bl_index2],
cpi->consec_zero_mv[bl_index3])));
- int is_skin = 0;
- if (cpi->use_skin_detection) {
- is_skin =
- vp9_compute_skin_block(src_y, src_u, src_v, src_ystride,
- src_uvstride, bsize, consec_zeromv, 0);
- }
- if (frame_low_motion &&
- cpi->consec_zero_mv[bl_index] > thresh_consec_zeromv &&
- cpi->consec_zero_mv[bl_index1] > thresh_consec_zeromv &&
- cpi->consec_zero_mv[bl_index2] > thresh_consec_zeromv &&
- cpi->consec_zero_mv[bl_index3] > thresh_consec_zeromv &&
- !is_skin) {
- // Compute variance.
- unsigned int sse;
- unsigned int variance = cpi->fn_ptr[bsize].vf(
- src_y, src_ystride, last_src_y, last_src_ystride, &sse);
- // Only consider this block as valid for noise measurement if the
- // average term (sse - variance = N * avg^{2}, N = 16X16) of the
- // temporal residual is small (avoid effects from lighting change).
- if ((sse - variance) < thresh_sum_diff) {
- unsigned int sse2;
- const unsigned int spatial_variance = cpi->fn_ptr[bsize].vf(
- src_y, src_ystride, const_source, 0, &sse2);
- // Avoid blocks with high brightness and high spatial variance.
- if ((sse2 - spatial_variance) < thresh_sum_spatial &&
- spatial_variance < thresh_spatial_var) {
- avg_est += low_res ? variance >> 4
- : variance / ((spatial_variance >> 9) + 1);
- num_samples++;
+      // Only consider blocks that are likely steady background, i.e., have
+      // been encoded as zero/low motion for x (= thresh_consec_zeromv) frames
+ // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all
+ // 4 sub-blocks for 16x16 block. Also, avoid skin blocks.
+ if (frame_low_motion && consec_zeromv > thresh_consec_zeromv) {
+ int is_skin = 0;
+ if (cpi->use_skin_detection) {
+ is_skin =
+ vp9_compute_skin_block(src_y, src_u, src_v, src_ystride,
+ src_uvstride, bsize, consec_zeromv, 0);
+ }
+ if (!is_skin) {
+ unsigned int sse;
+ // Compute variance.
+ unsigned int variance = cpi->fn_ptr[bsize].vf(
+ src_y, src_ystride, last_src_y, last_src_ystride, &sse);
+ // Only consider this block as valid for noise measurement if the
+ // average term (sse - variance = N * avg^{2}, N = 16X16) of the
+ // temporal residual is small (avoid effects from lighting
+ // change).
+ if ((sse - variance) < thresh_sum_diff) {
+ unsigned int sse2;
+ const unsigned int spatial_variance = cpi->fn_ptr[bsize].vf(
+ src_y, src_ystride, const_source, 0, &sse2);
+ // Avoid blocks with high brightness and high spatial variance.
+ if ((sse2 - spatial_variance) < thresh_sum_spatial &&
+ spatial_variance < thresh_spatial_var) {
+ avg_est += low_res ? variance >> 4
+ : variance / ((spatial_variance >> 9) + 1);
+ num_samples++;
+ }
}
}
}
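// Sketch (not part of the patch) of the "average term" test above: for an
// N-pixel block, sse - variance == N * avg^2 with avg the mean temporal
// difference, so a small result means the block changed uniformly (e.g. no
// lighting shift) and is a safe noise sample. Numbers are hypothetical.
#include <stdio.h>

int main(void) {
  const unsigned int n = 16 * 16; /* pixels in a 16x16 block */
  const unsigned int avg = 2;     /* mean per-pixel frame difference */
  const unsigned int variance = 500;
  const unsigned int sse = variance + n * avg * avg;
  printf("average term = %u\n", sse - variance); /* 1024 = 256 * 4 */
  return 0;
}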
@@ -259,14 +266,14 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
ne->count = 0;
ne->level = vp9_noise_estimate_extract_level(ne);
#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi))
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level);
#endif
}
}
}
#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi))
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
copy_frame(&cpi->denoiser.last_source, cpi->Source);
#endif
}
diff --git a/libvpx/vp9/encoder/vp9_pickmode.c b/libvpx/vp9/encoder/vp9_pickmode.c
index b05f4184b..f2f323a28 100644
--- a/libvpx/vp9/encoder/vp9_pickmode.c
+++ b/libvpx/vp9/encoder/vp9_pickmode.c
@@ -158,6 +158,7 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
const MvLimits tmp_mv_limits = x->mv_limits;
int rv = 0;
int cost_list[5];
+ int search_subpel = 1;
const YV12_BUFFER_CONFIG *scaled_ref_frame =
vp9_get_scaled_ref_frame(cpi, ref);
if (scaled_ref_frame) {
@@ -192,9 +193,14 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
else
center_mv = tmp_mv->as_mv;
- vp9_full_pixel_search(
- cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb,
- cond_cost_list(cpi, cost_list), &center_mv, &tmp_mv->as_mv, INT_MAX, 0);
+ if (x->sb_use_mv_part) {
+ tmp_mv->as_mv.row = x->sb_mvrow_part >> 3;
+ tmp_mv->as_mv.col = x->sb_mvcol_part >> 3;
+ } else {
+ vp9_full_pixel_search(
+ cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb,
+ cond_cost_list(cpi, cost_list), &center_mv, &tmp_mv->as_mv, INT_MAX, 0);
+ }
x->mv_limits = tmp_mv_limits;
@@ -210,8 +216,14 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
rv =
!(RDCOST(x->rdmult, x->rddiv, (*rate_mv + rate_mode), 0) > best_rd_sofar);
- if (rv) {
- const int subpel_force_stop = cpi->sf.mv.subpel_force_stop;
+ // For SVC on non-reference frame, avoid subpel for (0, 0) motion.
+ if (cpi->use_svc && cpi->svc.non_reference_frame) {
+ if (mvp_full.row == 0 && mvp_full.col == 0) search_subpel = 0;
+ }
+
+ if (rv && search_subpel) {
+ int subpel_force_stop = cpi->sf.mv.subpel_force_stop;
+ if (use_base_mv && cpi->sf.base_mv_aggressive) subpel_force_stop = 2;
cpi->find_fractional_mv_step(
x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv,
x->errorperbit, &cpi->fn_ptr[bsize], subpel_force_stop,
@@ -318,7 +330,8 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
MACROBLOCK *x, MACROBLOCKD *xd,
int *out_rate_sum, int64_t *out_dist_sum,
unsigned int *var_y, unsigned int *sse_y,
- int mi_row, int mi_col, int *early_term) {
+ int mi_row, int mi_col, int *early_term,
+ int *flag_preduv_computed) {
// Note our transform coeffs are 8 times an orthogonal transform.
// Hence quantizer step is also 8 times. To get effective quantizer
// we need to divide by 8 before sending to modeling function.
@@ -475,6 +488,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
int j = i - 1;
vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i);
+ flag_preduv_computed[i - 1] = 1;
var_uv[j] = cpi->fn_ptr[uv_bsize].vf(
p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]);
@@ -664,7 +678,9 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
#endif
if (cpi->sf.use_simple_block_yrd && cpi->common.frame_type != KEY_FRAME &&
- bsize < BLOCK_32X32) {
+ (bsize < BLOCK_32X32 ||
+ (cpi->use_svc &&
+ (bsize < BLOCK_32X32 || cpi->svc.temporal_layer_id > 0)))) {
unsigned int var_y, sse_y;
(void)tx_size;
if (!rd_computed)
@@ -711,7 +727,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
scan_order->iscan);
break;
case TX_4X4:
- x->fwd_txm4x4(src_diff, coeff, diff_stride);
+ x->fwd_txfm4x4(src_diff, coeff, diff_stride);
vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp,
qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
scan_order->iscan);
@@ -846,13 +862,11 @@ static void free_pred_buffer(PRED_BUFFER *p) {
if (p != NULL) p->in_use = 0;
}
-static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
- int mi_row, int mi_col,
- MV_REFERENCE_FRAME ref_frame,
- PREDICTION_MODE this_mode, unsigned int var_y,
- unsigned int sse_y,
- struct buf_2d yv12_mb[][MAX_MB_PLANE],
- int *rate, int64_t *dist) {
+static void encode_breakout_test(
+ VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col,
+ MV_REFERENCE_FRAME ref_frame, PREDICTION_MODE this_mode, unsigned int var_y,
+ unsigned int sse_y, struct buf_2d yv12_mb[][MAX_MB_PLANE], int *rate,
+ int64_t *dist, int *flag_preduv_computed) {
MACROBLOCKD *xd = &x->e_mbd;
MODE_INFO *const mi = xd->mi[0];
const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
@@ -862,6 +876,7 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
// Skipping threshold for dc.
unsigned int thresh_dc;
int motion_low = 1;
+ if (cpi->use_svc && ref_frame == GOLDEN_FRAME) return;
if (mi->mv[0].as_mv.row > 64 || mi->mv[0].as_mv.row < -64 ||
mi->mv[0].as_mv.col > 64 || mi->mv[0].as_mv.col < -64)
motion_low = 0;
@@ -912,9 +927,7 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
thresh_dc_uv = 0;
}
- // Skip UV prediction unless breakout is zero (lossless) to save
- // computation with low impact on the result
- if (x->encode_breakout == 0) {
+ if (!flag_preduv_computed[0] || !flag_preduv_computed[1]) {
xd->plane[1].pre[0] = yv12_mb[ref_frame][1];
xd->plane[2].pre[0] = yv12_mb[ref_frame][2];
vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize);
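// Sketch (not part of the patch) of the flag_preduv_computed bookkeeping
// used above: each chroma plane is predicted at most once per mode, with the
// flags recording which planes already have predictors so this breakout test
// can skip the rebuild. The real builder call is elided here.
static void build_uv_once(int flag_preduv_computed[2]) {
  int plane;
  for (plane = 0; plane < 2; ++plane) {
    if (!flag_preduv_computed[plane]) {
      /* vp9_build_inter_predictors_sbp(...) would run here for plane + 1 */
      flag_preduv_computed[plane] = 1;
    }
  }
}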
@@ -1163,33 +1176,22 @@ static const REF_MODE ref_mode_set[RT_INTER_MODES] = {
{ ALTREF_FRAME, ZEROMV }, { ALTREF_FRAME, NEARESTMV },
{ ALTREF_FRAME, NEARMV }, { ALTREF_FRAME, NEWMV }
};
-static const REF_MODE ref_mode_set_svc[RT_INTER_MODES] = {
+
+#define RT_INTER_MODES_SVC 8
+static const REF_MODE ref_mode_set_svc[RT_INTER_MODES_SVC] = {
{ LAST_FRAME, ZEROMV }, { LAST_FRAME, NEARESTMV },
{ LAST_FRAME, NEARMV }, { GOLDEN_FRAME, ZEROMV },
{ GOLDEN_FRAME, NEARESTMV }, { GOLDEN_FRAME, NEARMV },
{ LAST_FRAME, NEWMV }, { GOLDEN_FRAME, NEWMV }
};
-static int set_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize) {
- const VP9_COMMON *const cm = &cpi->common;
- // Reduce the intra cost penalty for small blocks (<=16x16).
- int reduction_fac =
- (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0;
- if (cpi->noise_estimate.enabled && cpi->noise_estimate.level == kHigh)
- // Don't reduce intra cost penalty if estimated noise level is high.
- reduction_fac = 0;
- return vp9_get_intra_cost_penalty(cm->base_qindex, cm->y_dc_delta_q,
- cm->bit_depth) >>
- reduction_fac;
-}
-
static INLINE void find_predictors(
VP9_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
int const_motion[MAX_REF_FRAMES], int *ref_frame_skip_mask,
const int flag_list[4], TileDataEnc *tile_data, int mi_row, int mi_col,
struct buf_2d yv12_mb[4][MAX_MB_PLANE], BLOCK_SIZE bsize,
- int force_skip_low_temp_var) {
+ int force_skip_low_temp_var, int comp_pred_allowed) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
@@ -1203,7 +1205,7 @@ static INLINE void find_predictors(
int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
- if (cm->use_prev_frame_mvs) {
+ if (cm->use_prev_frame_mvs || comp_pred_allowed) {
vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col,
x->mbmi_ext->mode_context);
} else {
@@ -1425,10 +1427,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
struct macroblockd_plane *const pd = &xd->plane[0];
PREDICTION_MODE best_mode = ZEROMV;
MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME;
- MV_REFERENCE_FRAME usable_ref_frame;
+ MV_REFERENCE_FRAME usable_ref_frame, second_ref_frame;
TX_SIZE best_tx_size = TX_SIZES;
INTERP_FILTER best_pred_filter = EIGHTTAP;
int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+ uint8_t mode_checked[MB_MODE_COUNT][MAX_REF_FRAMES];
struct buf_2d yv12_mb[4][MAX_MB_PLANE];
static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
VP9_ALT_FLAG };
@@ -1437,7 +1440,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
// var_y and sse_y are saved to be used in skipping checking
unsigned int var_y = UINT_MAX;
unsigned int sse_y = UINT_MAX;
- const int intra_cost_penalty = set_intra_cost_penalty(cpi, bsize);
+ const int intra_cost_penalty =
+ vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q);
int64_t inter_mode_thresh =
RDCOST(x->rdmult, x->rddiv, intra_cost_penalty, 0);
const int *const rd_threshes = cpi->rd.threshes[mi->segment_id][bsize];
@@ -1483,6 +1487,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
int force_skip_low_temp_var = 0;
int skip_ref_find_pred[4] = { 0 };
unsigned int sse_zeromv_normalized = UINT_MAX;
+ unsigned int best_sse_sofar = UINT_MAX;
unsigned int thresh_svc_skip_golden = 500;
#if CONFIG_VP9_TEMPORAL_DENOISING
VP9_PICKMODE_CTX_DEN ctx_den;
@@ -1490,9 +1495,17 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
int denoise_svc_pickmode = 1;
#endif
INTERP_FILTER filter_gf_svc = EIGHTTAP;
+ MV_REFERENCE_FRAME best_second_ref_frame = NONE;
+ int comp_modes = 0;
+ int num_inter_modes = (cpi->use_svc) ? RT_INTER_MODES_SVC : RT_INTER_MODES;
+ int flag_svc_subpel = 0;
+ int svc_mv_col = 0;
+ int svc_mv_row = 0;
init_ref_frame_cost(cm, xd, ref_frame_cost);
+ memset(&mode_checked[0][0], 0, MB_MODE_COUNT * MAX_REF_FRAMES);
+
if (reuse_inter_pred) {
int i;
for (i = 0; i < 3; i++) {
@@ -1561,7 +1574,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
}
#endif
- if (cpi->rc.frames_since_golden == 0 && !cpi->use_svc) {
+ if (cpi->rc.frames_since_golden == 0 && !cpi->use_svc &&
+ !cpi->rc.alt_ref_gf_group && !cpi->rc.last_frame_is_src_altref) {
usable_ref_frame = LAST_FRAME;
} else {
usable_ref_frame = GOLDEN_FRAME;
@@ -1575,6 +1589,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
skip_ref_find_pred[LAST_FRAME] = 1;
skip_ref_find_pred[GOLDEN_FRAME] = 1;
}
+ if (!cm->show_frame) {
+ if (cpi->rc.frames_since_key == 1) {
+ usable_ref_frame = LAST_FRAME;
+ skip_ref_find_pred[GOLDEN_FRAME] = 1;
+ skip_ref_find_pred[ALTREF_FRAME] = 1;
+ }
+ }
}
// For svc mode, on spatial_layer_id > 0: if the reference has different scale
@@ -1609,18 +1630,39 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (cpi->oxcf.speed >= 8 && !cpi->use_svc &&
((cpi->rc.frames_since_golden + 1) < x->last_sb_high_content ||
- x->last_sb_high_content > 40))
+ x->last_sb_high_content > 40 || cpi->rc.frames_since_golden > 120))
usable_ref_frame = LAST_FRAME;
+ // Compound prediction modes: (0,0) on LAST/GOLDEN and ARF.
+ if (cm->reference_mode == REFERENCE_MODE_SELECT &&
+ cpi->sf.use_compound_nonrd_pickmode && usable_ref_frame == ALTREF_FRAME)
+ comp_modes = 2;
+
for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
if (!skip_ref_find_pred[ref_frame]) {
find_predictors(cpi, x, ref_frame, frame_mv, const_motion,
&ref_frame_skip_mask, flag_list, tile_data, mi_row,
- mi_col, yv12_mb, bsize, force_skip_low_temp_var);
+ mi_col, yv12_mb, bsize, force_skip_low_temp_var,
+ comp_modes > 0);
}
}
- for (idx = 0; idx < RT_INTER_MODES; ++idx) {
+ if (cpi->use_svc || cpi->oxcf.speed <= 7 || bsize < BLOCK_32X32)
+ x->sb_use_mv_part = 0;
+
+ // Set the flag_svc_subpel to 1 for SVC if the lower spatial layer used
+ // an averaging filter for downsampling (phase = 8). If so, we will test
+  // a nonzero motion mode on the spatial (golden) reference.
+ // The nonzero motion is half pixel shifted to left and top (-4, -4).
+ if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 &&
+ svc_force_zero_mode[GOLDEN_FRAME - 1] &&
+ cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id - 1] == 8) {
+ svc_mv_col = -4;
+ svc_mv_row = -4;
+ flag_svc_subpel = 1;
+ }
+
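// Sketch (not part of the patch) of the (-4, -4) motion set above: motion
// vectors are in 1/8-pel units, so -4 is a half-pixel shift left and up,
// matching the half-sample phase (8/16) of the averaging downsampling filter.
#include <stdio.h>

int main(void) {
  const int svc_mv_col = -4, svc_mv_row = -4; /* 1/8-pel units */
  printf("shift = (%.1f, %.1f) pels\n", svc_mv_col / 8.0, svc_mv_row / 8.0);
  return 0;
}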
+ for (idx = 0; idx < num_inter_modes + comp_modes; ++idx) {
int rate_mv = 0;
int mode_rd_thresh;
int mode_index;
@@ -1629,17 +1671,56 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
int is_skippable;
int this_early_term = 0;
int rd_computed = 0;
+ int flag_preduv_computed[2] = { 0 };
+ int inter_mv_mode = 0;
+ int skip_this_mv = 0;
+ int comp_pred = 0;
+ int force_gf_mv = 0;
+ PREDICTION_MODE this_mode;
+ second_ref_frame = NONE;
+
+ if (idx < num_inter_modes) {
+ this_mode = ref_mode_set[idx].pred_mode;
+ ref_frame = ref_mode_set[idx].ref_frame;
+
+ if (cpi->use_svc) {
+ this_mode = ref_mode_set_svc[idx].pred_mode;
+ ref_frame = ref_mode_set_svc[idx].ref_frame;
+ }
+ } else {
+ // Add (0,0) compound modes.
+ this_mode = ZEROMV;
+ ref_frame = LAST_FRAME;
+ if (idx == num_inter_modes + comp_modes - 1) ref_frame = GOLDEN_FRAME;
+ second_ref_frame = ALTREF_FRAME;
+ comp_pred = 1;
+ }
- PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode;
+ if (ref_frame > usable_ref_frame) continue;
+ if (skip_ref_find_pred[ref_frame]) continue;
- ref_frame = ref_mode_set[idx].ref_frame;
+ if (flag_svc_subpel && ref_frame == GOLDEN_FRAME) {
+ force_gf_mv = 1;
+ // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row),
+ // otherwise set NEWMV to (svc_mv_col, svc_mv_row).
+ if (this_mode == NEWMV) {
+ frame_mv[this_mode][ref_frame].as_mv.col = svc_mv_col;
+ frame_mv[this_mode][ref_frame].as_mv.row = svc_mv_row;
+ } else if (frame_mv[this_mode][ref_frame].as_mv.col != svc_mv_col ||
+ frame_mv[this_mode][ref_frame].as_mv.row != svc_mv_row) {
+ continue;
+ }
+ }
- if (cpi->use_svc) {
- this_mode = ref_mode_set_svc[idx].pred_mode;
- ref_frame = ref_mode_set_svc[idx].ref_frame;
+ if (comp_pred) {
+ const struct segmentation *const seg = &cm->seg;
+ if (!cpi->allow_comp_inter_inter) continue;
+ // Skip compound inter modes if ARF is not available.
+ if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue;
+ // Do not allow compound prediction if the segment level reference frame
+ // feature is in use as in this case there can only be one reference.
+ if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue;
}
- if (ref_frame > usable_ref_frame) continue;
- if (skip_ref_find_pred[ref_frame]) continue;
// For SVC, skip the golden (spatial) reference search if sse of zeromv_last
// is below threshold.
@@ -1660,13 +1741,18 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
frame_mv[this_mode][ref_frame].as_int != 0))
continue;
- if (cpi->rc.alt_ref_gf_group &&
+ if (!cm->show_frame && ref_frame == ALTREF_FRAME &&
+ frame_mv[this_mode][ref_frame].as_int != 0)
+ continue;
+
+ if (cpi->rc.alt_ref_gf_group && cm->show_frame &&
cpi->rc.frames_since_golden > (cpi->rc.baseline_gf_interval >> 1) &&
ref_frame == GOLDEN_FRAME &&
frame_mv[this_mode][ref_frame].as_int != 0)
continue;
- if (cpi->rc.alt_ref_gf_group &&
+ if (cpi->rc.alt_ref_gf_group && cm->show_frame &&
+ cpi->rc.frames_since_golden > 0 &&
cpi->rc.frames_since_golden < (cpi->rc.baseline_gf_interval >> 1) &&
ref_frame == ALTREF_FRAME &&
frame_mv[this_mode][ref_frame].as_int != 0)
@@ -1680,12 +1766,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
// Skip non-zeromv mode search for golden frame if force_skip_low_temp_var
// is set. If nearestmv for golden frame is 0, zeromv mode will be skipped
// later.
- if (force_skip_low_temp_var && ref_frame == GOLDEN_FRAME &&
+ if (!force_gf_mv && force_skip_low_temp_var && ref_frame == GOLDEN_FRAME &&
frame_mv[this_mode][ref_frame].as_int != 0) {
continue;
}
- if ((cpi->sf.short_circuit_low_temp_var >= 2 ||
+ if (x->content_state_sb != kVeryHighSad &&
+ (cpi->sf.short_circuit_low_temp_var >= 2 ||
(cpi->sf.short_circuit_low_temp_var == 1 && bsize == BLOCK_64X64)) &&
force_skip_low_temp_var && ref_frame == LAST_FRAME &&
this_mode == NEWMV) {
@@ -1693,7 +1780,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
}
if (cpi->use_svc) {
- if (svc_force_zero_mode[ref_frame - 1] &&
+ if (!force_gf_mv && svc_force_zero_mode[ref_frame - 1] &&
frame_mv[this_mode][ref_frame].as_int != 0)
continue;
}
@@ -1723,11 +1810,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (ref_frame_skip_mask & (1 << ref_frame)) continue;
// Select prediction reference frames.
- for (i = 0; i < MAX_MB_PLANE; i++)
+ for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+ if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+ }
mi->ref_frame[0] = ref_frame;
- set_ref_ptrs(cm, xd, ref_frame, NONE);
+ mi->ref_frame[1] = second_ref_frame;
+ set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)];
mode_rd_thresh = best_mode_skip_txfm ? rd_threshes[mode_index] << 1
@@ -1747,12 +1837,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
&rd_thresh_freq_fact[mode_index])))
continue;
- if (this_mode == NEWMV) {
+ if (this_mode == NEWMV && !force_gf_mv) {
if (ref_frame > LAST_FRAME && !cpi->use_svc &&
cpi->oxcf.rc_mode == VPX_CBR) {
int tmp_sad;
uint32_t dis;
- int cost_list[5];
+ int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX };
if (bsize < BLOCK_16X16) continue;
@@ -1780,17 +1870,37 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
} else if (svc->use_base_mv && svc->spatial_layer_id) {
if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) {
const int pre_stride = xd->plane[0].pre[0].stride;
- int base_mv_sad = INT_MAX;
- const float base_mv_bias = sf->base_mv_aggressive ? 1.5f : 1.0f;
+ unsigned int base_mv_sse = UINT_MAX;
+ int scale = (cpi->rc.avg_frame_low_motion > 60) ? 2 : 4;
const uint8_t *const pre_buf =
xd->plane[0].pre[0].buf +
(frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride +
(frame_mv[NEWMV][ref_frame].as_mv.col >> 3);
- base_mv_sad = cpi->fn_ptr[bsize].sdf(
- x->plane[0].src.buf, x->plane[0].src.stride, pre_buf, pre_stride);
+ cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+ pre_buf, pre_stride, &base_mv_sse);
+
+ // Exit NEWMV search if base_mv is (0,0) && bsize < BLOCK_16x16,
+ // for SVC encoding.
+ if (cpi->use_svc && cpi->svc.use_base_mv && bsize < BLOCK_16X16 &&
+ frame_mv[NEWMV][ref_frame].as_mv.row == 0 &&
+ frame_mv[NEWMV][ref_frame].as_mv.col == 0)
+ continue;
- if (base_mv_sad < (int)(base_mv_bias * x->pred_mv_sad[ref_frame])) {
+ // Exit NEWMV search if base_mv_sse is large.
+ if (sf->base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale))
+ continue;
+ if (base_mv_sse < (best_sse_sofar << 1)) {
// Base layer mv is good.
+ // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since
+ // (0, 0) mode is already tested.
+ unsigned int base_mv_sse_normalized =
+ base_mv_sse >>
+ (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+ if (sf->base_mv_aggressive && base_mv_sse <= best_sse_sofar &&
+ base_mv_sse_normalized < 400 &&
+ frame_mv[NEWMV][ref_frame].as_mv.row == 0 &&
+ frame_mv[NEWMV][ref_frame].as_mv.col == 0)
+ continue;
if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
&frame_mv[NEWMV][ref_frame], &rate_mv,
best_rdc.rdcost, 1)) {
@@ -1813,6 +1923,22 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
}
}
+  // TODO(jianj): Skipping the testing of duplicate non-zero motion vectors
+  // causes some regression, so for now only duplicate zero MVs are skipped,
+  // until the regression issue is resolved.
+ for (inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV; inter_mv_mode++) {
+ if (inter_mv_mode == this_mode || comp_pred) continue;
+ if (mode_checked[inter_mv_mode][ref_frame] &&
+ frame_mv[this_mode][ref_frame].as_int ==
+ frame_mv[inter_mv_mode][ref_frame].as_int &&
+ frame_mv[inter_mv_mode][ref_frame].as_int == 0) {
+ skip_this_mv = 1;
+ break;
+ }
+ }
+
+ if (skip_this_mv) continue;
+
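// Sketch (not part of the patch) of the duplicate-(0,0) check above: once
// any mode has been evaluated with a zero MV for a reference frame, later
// modes whose candidate MV for that reference is also (0,0) are redundant
// and skipped. The fixed-size arrays stand in for mode_checked[][] and
// frame_mv[][].
static int is_duplicate_zero_mv(const unsigned char checked[4],
                                const int mv_as_int[4], int this_mode) {
  int m;
  for (m = 0; m < 4; ++m) {
    if (m == this_mode) continue;
    if (checked[m] && mv_as_int[m] == mv_as_int[this_mode] &&
        mv_as_int[m] == 0)
      return 1;
  }
  return 0;
}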
// If use_golden_nonzeromv is false, NEWMV mode is skipped for golden, no
// need to compute best_pred_sad which is only used to skip golden NEWMV.
if (use_golden_nonzeromv && this_mode == NEWMV && ref_frame == LAST_FRAME &&
@@ -1827,13 +1953,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
x->pred_mv_sad[LAST_FRAME] = best_pred_sad;
}
- if (this_mode != NEARESTMV &&
+ if (this_mode != NEARESTMV && !comp_pred &&
frame_mv[this_mode][ref_frame].as_int ==
frame_mv[NEARESTMV][ref_frame].as_int)
continue;
mi->mode = this_mode;
mi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
+ mi->mv[1].as_int = 0;
// Search for the best prediction filter type, when the resulting
// motion vector is at sub-pixel accuracy level for luma component, i.e.,
@@ -1851,7 +1978,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
pred_filter_search &&
(ref_frame == LAST_FRAME ||
- (ref_frame == GOLDEN_FRAME &&
+ (ref_frame == GOLDEN_FRAME && !force_gf_mv &&
(cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) &&
(((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) {
int pf_rate[3];
@@ -1907,9 +2034,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
pd->dst.stride = this_mode_pred->stride;
}
} else {
- const int large_block = (x->sb_is_skin || cpi->oxcf.speed < 7)
- ? bsize > BLOCK_32X32
- : bsize >= BLOCK_32X32;
+ // For low motion content use x->sb_is_skin in addition to VeryHighSad
+ // for setting large_block.
+ const int large_block =
+ (x->content_state_sb == kVeryHighSad ||
+ (x->sb_is_skin && cpi->rc.avg_frame_low_motion > 70) ||
+ cpi->oxcf.speed < 7)
+ ? bsize > BLOCK_32X32
+ : bsize >= BLOCK_32X32;
mi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref;
if (cpi->use_svc && ref_frame == GOLDEN_FRAME &&
@@ -1924,7 +2056,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
cm->base_qindex) {
model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate,
&this_rdc.dist, &var_y, &sse_y, mi_row, mi_col,
- &this_early_term);
+ &this_early_term, flag_preduv_computed);
} else {
rd_computed = 1;
model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
@@ -1936,6 +2068,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
sse_zeromv_normalized =
sse_y >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
}
+ if (sse_y < best_sse_sofar) best_sse_sofar = sse_y;
}
if (!this_early_term) {
@@ -1968,13 +2101,18 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
}
- if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
+ if (!this_early_term &&
+ (x->color_sensitivity[0] || x->color_sensitivity[1])) {
RD_COST rdc_uv;
const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, &xd->plane[1]);
- if (x->color_sensitivity[0])
+ if (x->color_sensitivity[0] && !flag_preduv_computed[0]) {
vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
- if (x->color_sensitivity[1])
+ flag_preduv_computed[0] = 1;
+ }
+ if (x->color_sensitivity[1] && !flag_preduv_computed[1]) {
vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
+ flag_preduv_computed[1] = 1;
+ }
model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, &var_y, &sse_y, 1, 2);
this_rdc.rate += rdc_uv.rate;
this_rdc.dist += rdc_uv.dist;
@@ -1983,6 +2121,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
this_rdc.rate += rate_mv;
this_rdc.rate += cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]]
[INTER_OFFSET(this_mode)];
+ // TODO(marpan): Add costing for compound mode.
this_rdc.rate += ref_frame_cost[ref_frame];
this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
@@ -2002,7 +2141,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (cpi->allow_encode_breakout) {
encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, this_mode,
var_y, sse_y, yv12_mb, &this_rdc.rate,
- &this_rdc.dist);
+ &this_rdc.dist, flag_preduv_computed);
if (x->skip) {
this_rdc.rate += rate_mv;
this_rdc.rdcost =
@@ -2022,6 +2161,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
(void)ctx;
#endif
+ mode_checked[this_mode][ref_frame] = 1;
+
if (this_rdc.rdcost < best_rdc.rdcost || x->skip) {
best_rdc = this_rdc;
best_mode = this_mode;
@@ -2030,6 +2171,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
best_ref_frame = ref_frame;
best_mode_skip_txfm = x->skip_txfm[0];
best_early_term = this_early_term;
+ best_second_ref_frame = second_ref_frame;
if (reuse_inter_pred) {
free_pred_buffer(best_pred);
@@ -2056,6 +2198,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
mi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int;
xd->mi[0]->bmi[0].as_mv[0].as_int = mi->mv[0].as_int;
x->skip_txfm[0] = best_mode_skip_txfm;
+ mi->ref_frame[1] = best_second_ref_frame;
  // For spatial enhancement layer: perform intra prediction only if base
// layer is chosen as the reference. Always perform intra prediction if
@@ -2074,7 +2217,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
// Perform intra prediction search, if the best SAD is above a certain
// threshold.
if (best_rdc.rdcost == INT64_MAX ||
- ((!force_skip_low_temp_var || bsize < BLOCK_32X32) &&
+ ((!force_skip_low_temp_var || bsize < BLOCK_32X32 ||
+ x->content_state_sb == kVeryHighSad) &&
perform_intra_pred && !x->skip && best_rdc.rdcost > inter_mode_thresh &&
bsize <= cpi->sf.max_intra_bsize && !x->skip_low_source_sad &&
!x->lowvar_highsumdiff)) {
@@ -2095,15 +2239,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
vpx_highbd_convolve_copy(
CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride,
CONVERT_TO_SHORTPTR(this_mode_pred->data), this_mode_pred->stride,
- NULL, 0, NULL, 0, bw, bh, xd->bd);
+ NULL, 0, 0, 0, 0, bw, bh, xd->bd);
else
vpx_convolve_copy(best_pred->data, best_pred->stride,
this_mode_pred->data, this_mode_pred->stride, NULL,
- 0, NULL, 0, bw, bh);
+ 0, 0, 0, 0, bw, bh);
#else
vpx_convolve_copy(best_pred->data, best_pred->stride,
this_mode_pred->data, this_mode_pred->stride, NULL, 0,
- NULL, 0, bw, bh);
+ 0, 0, 0, bw, bh);
#endif // CONFIG_VP9_HIGHBITDEPTH
best_pred = this_mode_pred;
}
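// Sketch (not part of the patch) of the convolve-copy signature change
// above: the pair of filter pointers became a kernel table plus explicit q4
// offsets and steps (x0_q4, x_step_q4, y0_q4, y_step_q4), all ignored for a
// plain copy. This stub only mimics the argument order; it is not the real
// libvpx entry point.
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void convolve_copy_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const void *filter, int x0_q4, int x_step_q4,
                                 int y0_q4, int y_step_q4, int w, int h) {
  int y;
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;
  for (y = 0; y < h; ++y)
    memcpy(dst + y * dst_stride, src + y * src_stride, (size_t)w);
}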
@@ -2168,8 +2312,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
best_mode = this_mode;
best_intra_tx_size = mi->tx_size;
best_ref_frame = INTRA_FRAME;
+ best_second_ref_frame = NONE;
mi->uv_mode = this_mode;
mi->mv[0].as_int = INVALID_MV;
+ mi->mv[1].as_int = INVALID_MV;
best_mode_skip_txfm = x->skip_txfm[0];
}
}
@@ -2185,6 +2331,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
pd->dst = orig_dst;
mi->mode = best_mode;
mi->ref_frame[0] = best_ref_frame;
+ mi->ref_frame[1] = best_second_ref_frame;
x->skip_txfm[0] = best_mode_skip_txfm;
if (!is_inter_block(mi)) {
@@ -2197,14 +2344,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (cm->use_highbitdepth)
vpx_highbd_convolve_copy(
CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride,
- CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, NULL, 0,
+ CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, 0, 0, 0,
bw, bh, xd->bd);
else
vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,
- pd->dst.stride, NULL, 0, NULL, 0, bw, bh);
+ pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh);
#else
vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,
- pd->dst.stride, NULL, 0, NULL, 0, bw, bh);
+ pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh);
#endif // CONFIG_VP9_HIGHBITDEPTH
}
}
@@ -2214,6 +2361,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow &&
cpi->denoiser.reset == 0) {
VP9_DENOISER_DECISION decision = COPY_BLOCK;
+ ctx->sb_skip_denoising = 0;
+ // TODO(marpan): There is an issue with denoising when the
+ // superblock partitioning scheme is based on the pickmode.
+ // Remove this condition when the issue is resolved.
+ if (x->sb_pickmode_part) ctx->sb_skip_denoising = 1;
vp9_pickmode_ctx_den_update(&ctx_den, zero_last_cost_orig, ref_frame_cost,
frame_mv, reuse_inter_pred, best_tx_size,
best_mode, best_ref_frame, best_pred_filter,
@@ -2225,6 +2377,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
}
#endif
+ if (best_ref_frame == ALTREF_FRAME || best_second_ref_frame == ALTREF_FRAME)
+ x->arf_frame_usage++;
+ else if (best_ref_frame != INTRA_FRAME)
+ x->lastgolden_frame_usage++;
+
if (cpi->sf.adaptive_rd_thresh) {
THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mi->mode)];
diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c
index f2a59a4af..09f61ead2 100644
--- a/libvpx/vp9/encoder/vp9_quantize.c
+++ b/libvpx/vp9/encoder/vp9_quantize.c
@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
#include <math.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_mem/vpx_mem.h"
@@ -28,33 +29,33 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *iscan) {
int i, eob = -1;
(void)iscan;
+ (void)skip_block;
+ assert(!skip_block);
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
- if (!skip_block) {
- // Quantization pass: All coefficients with index >= zero_flag are
- // skippable. Note: zero_flag can be zero.
- for (i = 0; i < n_coeffs; i++) {
- const int rc = scan[i];
- const int coeff = coeff_ptr[rc];
- const int coeff_sign = (coeff >> 31);
- const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ // Quantization pass: All coefficients with index >= zero_flag are
+ // skippable. Note: zero_flag can be zero.
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
- tmp = (tmp * quant_ptr[rc != 0]) >> 16;
+ int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+ tmp = (tmp * quant_ptr[rc != 0]) >> 16;
- qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
- dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
- if (tmp) eob = i;
- }
+ if (tmp) eob = i;
}
*eob_ptr = eob + 1;
}
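// Sketch (not part of the patch) of the fixed-point quantize step above:
// quant_ptr holds roughly a Q16 reciprocal of the step size, so
// (abs + round) * quant >> 16 approximates (abs + round) / step. The codec
// derives its round/quant tables differently; values below are hypothetical.
#include <stdio.h>

int main(void) {
  const int step = 32;                /* dequant value */
  const int quant = (1 << 16) / step; /* Q16 reciprocal: 2048 */
  const int round = step / 2;
  const int abs_coeff = 100;
  const int tmp = ((abs_coeff + round) * quant) >> 16; /* 3 */
  printf("q=%d dq=%d\n", tmp, tmp * step);             /* 3, 96 */
  return 0;
}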
#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count,
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *round_ptr,
const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
@@ -64,24 +65,24 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count,
int eob = -1;
(void)iscan;
+ (void)skip_block;
+ assert(!skip_block);
- memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
- memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
-
- if (!skip_block) {
- // Quantization pass: All coefficients with index >= zero_flag are
- // skippable. Note: zero_flag can be zero.
- for (i = 0; i < count; i++) {
- const int rc = scan[i];
- const int coeff = coeff_ptr[rc];
- const int coeff_sign = (coeff >> 31);
- const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- const int64_t tmp = abs_coeff + round_ptr[rc != 0];
- const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 16);
- qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
- dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
- if (abs_qcoeff) eob = i;
- }
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Quantization pass: All coefficients with index >= zero_flag are
+ // skippable. Note: zero_flag can be zero.
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp = abs_coeff + round_ptr[rc != 0];
+ const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 16);
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+ if (abs_qcoeff) eob = i;
}
*eob_ptr = eob + 1;
}
@@ -97,28 +98,28 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *scan, const int16_t *iscan) {
int i, eob = -1;
(void)iscan;
+ (void)skip_block;
+ assert(!skip_block);
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
- if (!skip_block) {
- for (i = 0; i < n_coeffs; i++) {
- const int rc = scan[i];
- const int coeff = coeff_ptr[rc];
- const int coeff_sign = (coeff >> 31);
- int tmp = 0;
- int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
- if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
- abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
- abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
- tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15;
- qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
- dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
- }
-
- if (tmp) eob = i;
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ int tmp = 0;
+ int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
+ abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+ tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15;
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
}
+
+ if (tmp) eob = i;
}
*eob_ptr = eob + 1;
}
@@ -132,28 +133,27 @@ void vp9_highbd_quantize_fp_32x32_c(
int i, eob = -1;
(void)iscan;
+ (void)skip_block;
+ assert(!skip_block);
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
- if (!skip_block) {
- for (i = 0; i < n_coeffs; i++) {
- uint32_t abs_qcoeff = 0;
- const int rc = scan[i];
- const int coeff = coeff_ptr[rc];
- const int coeff_sign = (coeff >> 31);
- const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
- if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
- const int64_t tmp =
- abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
- abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 15);
- qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
- dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
- }
-
- if (abs_qcoeff) eob = i;
+ for (i = 0; i < n_coeffs; i++) {
+ int abs_qcoeff = 0;
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
+ const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 15);
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
}
+
+ if (abs_qcoeff) eob = i;
}
*eob_ptr = eob + 1;
}
@@ -164,22 +164,28 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
MACROBLOCKD *const xd = &x->e_mbd;
struct macroblock_plane *p = &x->plane[plane];
struct macroblockd_plane *pd = &xd->plane[plane];
+ tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block),
+ *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ const int n_coeffs = 4 * 4;
+
+ if (x->skip_block) {
+ memset(qcoeff, 0, n_coeffs * sizeof(*qcoeff));
+ memset(dqcoeff, 0, n_coeffs * sizeof(*dqcoeff));
+ return;
+ }
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), 16, x->skip_block,
- p->zbin, p->round, p->quant, p->quant_shift,
- BLOCK_OFFSET(p->qcoeff, block),
- BLOCK_OFFSET(pd->dqcoeff, block), pd->dequant,
+ vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs,
+ x->skip_block, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant,
&p->eobs[block], scan, iscan);
return;
}
#endif
- vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), 16, x->skip_block, p->zbin,
- p->round, p->quant, p->quant_shift,
- BLOCK_OFFSET(p->qcoeff, block),
- BLOCK_OFFSET(pd->dqcoeff, block), pd->dequant, &p->eobs[block],
- scan, iscan);
+ vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, x->skip_block,
+ p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, &p->eobs[block], scan, iscan);
}
static void invert_quant(int16_t *quant, int16_t *shift, int d) {
diff --git a/libvpx/vp9/encoder/vp9_ratectrl.c b/libvpx/vp9/encoder/vp9_ratectrl.c
index 27fea5d4e..b7f3a0e89 100644
--- a/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -44,11 +44,6 @@
#define MIN_BPB_FACTOR 0.005
#define MAX_BPB_FACTOR 50
-#define FRAME_OVERHEAD_BITS 200
-
-// Use this macro to turn on/off use of alt-refs in one-pass vbr mode.
-#define USE_ALTREF_FOR_ONE_PASS 0
-
#if CONFIG_VP9_HIGHBITDEPTH
#define ASSIGN_MINQ_TABLE(bit_depth, name) \
do { \
@@ -209,24 +204,29 @@ int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
const int bpm =
(int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth));
return VPXMAX(FRAME_OVERHEAD_BITS,
- (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
+ (int)(((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS));
}
int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
const RATE_CONTROL *rc = &cpi->rc;
const VP9EncoderConfig *oxcf = &cpi->oxcf;
- const int min_frame_target =
- VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5);
- if (target < min_frame_target) target = min_frame_target;
- if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) {
- // If there is an active ARF at this location use the minimum
- // bits on this frame even if it is a constructed arf.
- // The active maximum quantizer insures that an appropriate
- // number of bits will be spent if needed for constructed ARFs.
- target = min_frame_target;
+
+ if (cpi->oxcf.pass != 2) {
+ const int min_frame_target =
+ VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5);
+ if (target < min_frame_target) target = min_frame_target;
+ if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) {
+ // If there is an active ARF at this location use the minimum
+ // bits on this frame even if it is a constructed arf.
+ // The active maximum quantizer ensures that an appropriate
+ // number of bits will be spent if needed for constructed ARFs.
+ target = min_frame_target;
+ }
}
+
// Clip the frame target to the maximum allowed value.
if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
+
if (oxcf->rc_max_inter_bitrate_pct) {
const int max_rate =
rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
@@ -353,8 +353,10 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
rc->af_ratio_onepass_vbr = 10;
rc->prev_avg_source_sad_lag = 0;
rc->high_source_sad = 0;
+ rc->reset_high_source_sad = 0;
rc->high_source_sad_lagindex = -1;
rc->alt_ref_gf_group = 0;
+ rc->last_frame_is_src_altref = 0;
rc->fac_active_worst_inter = 150;
rc->fac_active_worst_gf = 100;
rc->force_qpmin = 0;
@@ -585,7 +587,7 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
// In CBR mode, this makes sure q is between oscillating Qs to prevent
// resonance.
- if (cpi->oxcf.rc_mode == VPX_CBR &&
+ if (cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.reset_high_source_sad &&
(!cpi->oxcf.gf_cbr_boost_pct ||
!(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) &&
(cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) &&
@@ -593,13 +595,6 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
q = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
}
-#if USE_ALTREF_FOR_ONE_PASS
- if (cpi->oxcf.enable_auto_arf && cpi->oxcf.pass == 0 &&
- cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 &&
- cpi->rc.is_src_frame_alt_ref && !cpi->rc.alt_ref_gf_group) {
- q = VPXMIN(q, (q + cpi->rc.last_boosted_qindex) >> 1);
- }
-#endif
return q;
}
@@ -679,7 +674,8 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) {
int active_worst_quality;
int ambient_qp;
unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers;
- if (cm->frame_type == KEY_FRAME) return rc->worst_quality;
+ if (cm->frame_type == KEY_FRAME || rc->reset_high_source_sad)
+ return rc->worst_quality;
// For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME]
// for the first few frames following key frame. These are both initialized
// to worst_quality and updated with (3/4, 1/4) average in postencode_update.
@@ -1011,6 +1007,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
qdelta = vp9_compute_qdelta_by_rate(
&cpi->rc, cm->frame_type, active_worst_quality, 1.75, cm->bit_depth);
}
+ if (rc->high_source_sad && cpi->sf.use_altref_onepass) qdelta = 0;
*top_index = active_worst_quality + qdelta;
*top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
}
@@ -1339,6 +1336,28 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
}
}
+static void update_altref_usage(VP9_COMP *const cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ int sum_ref_frame_usage = 0;
+ int arf_frame_usage = 0;
+ int mi_row, mi_col;
+ if (cpi->rc.alt_ref_gf_group && !cpi->rc.is_src_frame_alt_ref &&
+ !cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame)
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8) {
+ int sboffset = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3);
+ sum_ref_frame_usage += cpi->count_arf_frame_usage[sboffset] +
+ cpi->count_lastgolden_frame_usage[sboffset];
+ arf_frame_usage += cpi->count_arf_frame_usage[sboffset];
+ }
+ }
+ if (sum_ref_frame_usage > 0) {
+ double altref_count = 100.0 * arf_frame_usage / sum_ref_frame_usage;
+ cpi->rc.perc_arf_usage =
+ 0.75 * cpi->rc.perc_arf_usage + 0.25 * altref_count;
+ }
+}
+
static void compute_frame_low_motion(VP9_COMP *const cpi) {
VP9_COMMON *const cm = &cpi->common;
int mi_row, mi_col;
@@ -1462,8 +1481,15 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
}
if (oxcf->pass == 0) {
- if (cm->frame_type != KEY_FRAME) compute_frame_low_motion(cpi);
+ if (cm->frame_type != KEY_FRAME) {
+ compute_frame_low_motion(cpi);
+ if (cpi->sf.use_altref_onepass) update_altref_usage(cpi);
+ }
+ cpi->rc.last_frame_is_src_altref = cpi->rc.is_src_frame_alt_ref;
}
+ if (cm->frame_type != KEY_FRAME) rc->reset_high_source_sad = 0;
+
+ rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth;
}
void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
@@ -1556,8 +1582,9 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
// Adjust boost and af_ratio based on avg_frame_low_motion, which varies
// between 0 and 100 (stationary, 100% zero/small motion).
rc->gfu_boost =
- VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) /
- (rc->avg_frame_low_motion + 100));
+ VPXMAX(500,
+ DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) /
+ (rc->avg_frame_low_motion + 100));
rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400));
}
adjust_gfint_frame_constraint(cpi, rc->frames_to_key);
@@ -1565,12 +1592,10 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
cpi->refresh_golden_frame = 1;
rc->source_alt_ref_pending = 0;
rc->alt_ref_gf_group = 0;
-#if USE_ALTREF_FOR_ONE_PASS
- if (cpi->oxcf.enable_auto_arf) {
+ if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) {
rc->source_alt_ref_pending = 1;
rc->alt_ref_gf_group = 1;
}
-#endif
}
if (cm->frame_type == KEY_FRAME)
target = calc_iframe_target_size_one_pass_vbr(cpi);
@@ -1847,6 +1872,26 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
// Clamp min to max
rc->min_gf_interval = VPXMIN(rc->min_gf_interval, rc->max_gf_interval);
+
+ if (oxcf->target_level == LEVEL_AUTO) {
+ const uint32_t pic_size = cpi->common.width * cpi->common.height;
+ const uint32_t pic_breadth =
+ VPXMAX(cpi->common.width, cpi->common.height);
+ int i;
+ for (i = LEVEL_1; i < LEVEL_MAX; ++i) {
+ if (vp9_level_defs[i].max_luma_picture_size >= pic_size &&
+ vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) {
+ if (rc->min_gf_interval <=
+ (int)vp9_level_defs[i].min_altref_distance) {
+ rc->min_gf_interval =
+ (int)vp9_level_defs[i].min_altref_distance + 1;
+ rc->max_gf_interval =
+ VPXMAX(rc->max_gf_interval, rc->min_gf_interval);
+ }
+ break;
+ }
+ }
+ }
}
}
@@ -1933,9 +1978,11 @@ void vp9_set_target_rate(VP9_COMP *cpi) {
else
target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
- // Correction to rate target based on prior over or under shoot.
- if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ)
- vbr_rate_correction(cpi, &target_rate);
+ if (!cpi->oxcf.vbr_corpus_complexity) {
+ // Correction to rate target based on prior over or under shoot.
+ if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ)
+ vbr_rate_correction(cpi, &target_rate);
+ }
vp9_rc_set_frame_target(cpi, target_rate);
}
@@ -2070,7 +2117,8 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
return resize_action;
}
-void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) {
+static void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi,
+ uint64_t avg_sad_current) {
VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
int target;
@@ -2081,7 +2129,7 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) {
uint64_t avg_source_sad_lag = avg_sad_current;
int high_source_sad_lagindex = -1;
int steady_sad_lagindex = -1;
- uint32_t sad_thresh1 = 60000;
+ uint32_t sad_thresh1 = 70000;
uint32_t sad_thresh2 = 120000;
int low_content = 0;
int high_content = 0;
@@ -2185,11 +2233,16 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) {
rc->af_ratio_onepass_vbr = 5;
rc->gfu_boost = DEFAULT_GF_BOOST >> 2;
}
-#if USE_ALTREF_FOR_ONE_PASS
- if (cpi->oxcf.enable_auto_arf) {
- // Don't use alt-ref if there is a scene cut within the group,
- // or content is not low.
- if ((rc->high_source_sad_lagindex > 0 &&
+ if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) {
+ // Flag to disable usage of ARF based on past usage; only allow this
+ // disabling if the current frame/group does not start with a key frame
+ // or scene cut. Note perc_arf_usage is only computed for speed >= 5.
+ int arf_usage_low =
+ (cm->frame_type != KEY_FRAME && !rc->high_source_sad &&
+ cpi->rc.perc_arf_usage < 15 && cpi->oxcf.speed >= 5);
+ // Don't use alt-ref for this group under certain conditions.
+ if (arf_usage_low ||
+ (rc->high_source_sad_lagindex > 0 &&
rc->high_source_sad_lagindex <= rc->frames_till_gf_update_due) ||
(avg_source_sad_lag > 3 * sad_thresh1 >> 3)) {
rc->source_alt_ref_pending = 0;
@@ -2198,12 +2251,12 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) {
rc->source_alt_ref_pending = 1;
rc->alt_ref_gf_group = 1;
// If alt-ref is used for this gf group, limit the interval.
- if (rc->baseline_gf_interval > 10 &&
- rc->baseline_gf_interval < rc->frames_to_key)
- rc->baseline_gf_interval = 10;
+ if (rc->baseline_gf_interval > 12) {
+ rc->baseline_gf_interval = 12;
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ }
}
}
-#endif
target = calc_pframe_target_size_one_pass_vbr(cpi);
vp9_rc_set_frame_target(cpi, target);
}
@@ -2233,11 +2286,14 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
int start_frame = 0;
int frames_to_buffer = 1;
int frame = 0;
+ int scene_cut_force_key_frame = 0;
uint64_t avg_sad_current = 0;
uint32_t min_thresh = 4000;
float thresh = 8.0f;
+ uint32_t thresh_key = 140000;
+ if (cpi->oxcf.speed <= 5) thresh_key = 240000;
if (cpi->oxcf.rc_mode == VPX_VBR) {
- min_thresh = 60000;
+ min_thresh = 65000;
thresh = 2.1f;
}
if (cpi->oxcf.lag_in_frames > 0) {
@@ -2263,6 +2319,8 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
rc->high_source_sad = 1;
else
rc->high_source_sad = 0;
+ if (rc->high_source_sad && avg_sad_current > thresh_key)
+ scene_cut_force_key_frame = 1;
// Update recursive average for current frame.
if (avg_sad_current > 0)
rc->avg_source_sad[0] =
@@ -2323,6 +2381,8 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
rc->high_source_sad = 1;
else
rc->high_source_sad = 0;
+ if (rc->high_source_sad && avg_sad > thresh_key)
+ scene_cut_force_key_frame = 1;
if (avg_sad > 0 || cpi->oxcf.rc_mode == VPX_CBR)
rc->avg_source_sad[0] = (3 * rc->avg_source_sad[0] + avg_sad) >> 2;
} else {
@@ -2330,6 +2390,23 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
}
}
}
+ // For CBR non-screen content mode, check if we should reset the rate
+ // control. Reset is done if high_source_sad is detected and the rate
+ // control is at very low QP with rate correction factor at min level.
+ if (cpi->oxcf.rc_mode == VPX_CBR &&
+ cpi->oxcf.content != VP9E_CONTENT_SCREEN && !cpi->use_svc) {
+ if (rc->high_source_sad && rc->last_q[INTER_FRAME] == rc->best_quality &&
+ rc->avg_frame_qindex[INTER_FRAME] < (rc->best_quality << 1) &&
+ rc->rate_correction_factors[INTER_NORMAL] == MIN_BPB_FACTOR) {
+ rc->rate_correction_factors[INTER_NORMAL] = 0.5;
+ rc->avg_frame_qindex[INTER_FRAME] = rc->worst_quality;
+ rc->buffer_level = rc->optimal_buffer_level;
+ rc->bits_off_target = rc->optimal_buffer_level;
+ rc->reset_high_source_sad = 1;
+ }
+ if (cm->frame_type != KEY_FRAME && rc->reset_high_source_sad)
+ rc->this_frame_target = rc->avg_frame_bandwidth;
+ }
// For VBR, under scene change/high content change, force golden refresh.
if (cpi->oxcf.rc_mode == VPX_VBR && cm->frame_type != KEY_FRAME &&
rc->high_source_sad && rc->frames_to_key > 3 &&
@@ -2337,10 +2414,10 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
cpi->ext_refresh_frame_flags_pending == 0) {
int target;
cpi->refresh_golden_frame = 1;
+ if (scene_cut_force_key_frame) cm->frame_type = KEY_FRAME;
rc->source_alt_ref_pending = 0;
-#if USE_ALTREF_FOR_ONE_PASS
- if (cpi->oxcf.enable_auto_arf) rc->source_alt_ref_pending = 1;
-#endif
+ if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf)
+ rc->source_alt_ref_pending = 1;
rc->gfu_boost = DEFAULT_GF_BOOST >> 1;
rc->baseline_gf_interval =
VPXMIN(20, VPXMAX(10, rc->baseline_gf_interval));
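
Two additions in vp9_ratectrl.c above cooperate: update_altref_usage folds the fraction of superblocks that actually used alt-ref prediction into rc.perc_arf_usage with a 0.75/0.25 recursive average, and adjust_gf_boost_lag_one_pass_vbr later disables the alt-ref for a group once that estimate drops below 15 (for non-key, non-scene-cut frames at speed >= 5). A hedged sketch of the blend; update_perc_arf_usage is an illustrative name.

static double update_perc_arf_usage(double prev_perc, int arf_usage,
                                    int total_usage) {
  /* Ignore frames that produced no reference-frame usage samples. */
  if (total_usage <= 0) return prev_perc;
  return 0.75 * prev_perc + 0.25 * (100.0 * arf_usage / total_usage);
}
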
diff --git a/libvpx/vp9/encoder/vp9_ratectrl.h b/libvpx/vp9/encoder/vp9_ratectrl.h
index 9e4623195..c1b210677 100644
--- a/libvpx/vp9/encoder/vp9_ratectrl.h
+++ b/libvpx/vp9/encoder/vp9_ratectrl.h
@@ -32,6 +32,8 @@ extern "C" {
#define FIXED_GF_INTERVAL 8 // Used in some testing modes only
#define ONEHALFONLY_RESIZE 0
+#define FRAME_OVERHEAD_BITS 200
+
typedef enum {
INTER_NORMAL = 0,
INTER_HIGH = 1,
@@ -150,6 +152,8 @@ typedef struct {
int rc_2_frame;
int q_1_frame;
int q_2_frame;
+ // Keep track of the last target average frame bandwidth.
+ int last_avg_frame_bandwidth;
// Auto frame-scaling variables.
FRAME_SCALE_LEVEL frame_size_selector;
@@ -164,11 +168,14 @@ typedef struct {
uint64_t prev_avg_source_sad_lag;
int high_source_sad_lagindex;
int alt_ref_gf_group;
+ int last_frame_is_src_altref;
int high_source_sad;
int count_last_scene_change;
int avg_frame_low_motion;
int af_ratio_onepass_vbr;
int force_qpmin;
+ int reset_high_source_sad;
+ double perc_arf_usage;
} RATE_CONTROL;
struct VP9_COMP;
diff --git a/libvpx/vp9/encoder/vp9_rd.c b/libvpx/vp9/encoder/vp9_rd.c
index 39a7742f0..6b2306ce9 100644
--- a/libvpx/vp9/encoder/vp9_rd.c
+++ b/libvpx/vp9/encoder/vp9_rd.c
@@ -670,19 +670,21 @@ void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
}
}
-int vp9_get_intra_cost_penalty(int qindex, int qdelta,
- vpx_bit_depth_t bit_depth) {
- const int q = vp9_dc_quant(qindex, qdelta, bit_depth);
-#if CONFIG_VP9_HIGHBITDEPTH
- switch (bit_depth) {
- case VPX_BITS_8: return 20 * q;
- case VPX_BITS_10: return 5 * q;
- case VPX_BITS_12: return ROUND_POWER_OF_TWO(5 * q, 2);
- default:
- assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
- return -1;
- }
-#else
- return 20 * q;
-#endif // CONFIG_VP9_HIGHBITDEPTH
+int vp9_get_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize,
+ int qindex, int qdelta) {
+ // Reduce the intra cost penalty for small blocks (<=16x16).
+ int reduction_fac =
+ (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0;
+
+ if (cpi->noise_estimate.enabled && cpi->noise_estimate.level == kHigh)
+ // Don't reduce intra cost penalty if estimated noise level is high.
+ reduction_fac = 0;
+
+ // Always use VPX_BITS_8 as input here because the penalty is applied
+ // to rate not distortion so we want a consistent penalty for all bit
+ // depths. If the actual bit depth were passed in here then the value
+ // returned by vp9_dc_quant() would scale with the bit depth and we would
+ // then need to apply inverse scaling to correct back to a bit depth
+ // independent rate penalty.
+ return (20 * vp9_dc_quant(qindex, qdelta, VPX_BITS_8)) >> reduction_fac;
}
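
The rewritten vp9_get_intra_cost_penalty above always evaluates vp9_dc_quant() at VPX_BITS_8 for a bit-depth-independent rate penalty, then shifts the classic 20 * q value down by 2 (/4) for blocks at or below 16x16 and by 4 (/16) at or below 8x8, keeping the full penalty when the noise estimate is high. A reduced model under illustrative boolean parameters (the real function takes cpi and a BLOCK_SIZE):

static int intra_penalty_sketch(int dc_quant, int bsize_le_16x16,
                                int bsize_le_8x8, int high_noise) {
  /* Smaller blocks get a smaller penalty, except in noisy content. */
  int reduction_fac = bsize_le_16x16 ? (bsize_le_8x8 ? 4 : 2) : 0;
  if (high_noise) reduction_fac = 0;
  return (20 * dc_quant) >> reduction_fac;
}
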
diff --git a/libvpx/vp9/encoder/vp9_rd.h b/libvpx/vp9/encoder/vp9_rd.h
index 1e1176866..59022c106 100644
--- a/libvpx/vp9/encoder/vp9_rd.h
+++ b/libvpx/vp9/encoder/vp9_rd.h
@@ -191,13 +191,18 @@ void vp9_setup_pred_block(const MACROBLOCKD *xd,
const struct scale_factors *scale,
const struct scale_factors *scale_uv);
-int vp9_get_intra_cost_penalty(int qindex, int qdelta,
- vpx_bit_depth_t bit_depth);
+int vp9_get_intra_cost_penalty(const struct VP9_COMP *const cpi,
+ BLOCK_SIZE bsize, int qindex, int qdelta);
+unsigned int vp9_get_sby_variance(struct VP9_COMP *cpi,
+ const struct buf_2d *ref, BLOCK_SIZE bs);
unsigned int vp9_get_sby_perpixel_variance(struct VP9_COMP *cpi,
const struct buf_2d *ref,
BLOCK_SIZE bs);
#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vp9_high_get_sby_variance(struct VP9_COMP *cpi,
+ const struct buf_2d *ref, BLOCK_SIZE bs,
+ int bd);
unsigned int vp9_high_get_sby_perpixel_variance(struct VP9_COMP *cpi,
const struct buf_2d *ref,
BLOCK_SIZE bs, int bd);
diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c
index bf0fec3d8..2ba6378c5 100644
--- a/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/libvpx/vp9/encoder/vp9_rdopt.c
@@ -600,7 +600,7 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16,
- 32, NULL, 0, NULL, 0, bs, bs, xd->bd);
+ 32, NULL, 0, 0, 0, 0, bs, bs, xd->bd);
if (xd->lossless) {
vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd);
} else {
@@ -623,7 +623,7 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
recon = CONVERT_TO_BYTEPTR(recon16);
} else {
#endif // CONFIG_VP9_HIGHBITDEPTH
- vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, bs, bs);
+ vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, 0, 0, 0, bs, bs);
switch (tx_size) {
case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, *eob); break;
case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, *eob); break;
@@ -632,7 +632,7 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
// this is like vp9_short_idct4x4 but has a special case around
// eob<=1, which is significant (not just an optimization) for
// the lossless case.
- x->itxm_add(dqcoeff, recon, 32, *eob);
+ x->inv_txfm_add(dqcoeff, recon, 32, *eob);
break;
default: assert(0 && "Invalid transform size"); break;
}
@@ -730,7 +730,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
}
} else {
// SKIP_TXFM_AC_DC
- // skip forward transform
+ // Skip the forward transform. Because this is handled here, the
+ // quantization does not need to do it.
x->plane[plane].eobs[block] = 0;
sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
dist = sse;
@@ -1576,8 +1577,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x,
k += (idy * 2 + idx);
coeff_ctx = combine_entropy_contexts(ta[k & 1], tl[k >> 1]);
coeff = BLOCK_OFFSET(p->coeff, k);
- x->fwd_txm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
- coeff, 8);
+ x->fwd_txfm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
+ coeff, 8);
vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
#if CONFIG_VP9_HIGHBITDEPTH
thisdistortion += vp9_highbd_block_error_dispatch(
@@ -2875,57 +2876,82 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
// This function is designed to apply a bias or adjustment to an rd value based
// on the relative variance of the source and reconstruction.
-#define LOW_VAR_THRESH 16
-#define VLOW_ADJ_MAX 25
-#define VHIGH_ADJ_MAX 8
+#define VERY_LOW_VAR_THRESH 2
+#define LOW_VAR_THRESH 5
+#define VAR_MULT 100
+static unsigned int max_var_adjust[VP9E_CONTENT_INVALID] = { 16, 16, 100 };
+
static void rd_variance_adjustment(VP9_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, int64_t *this_rd,
MV_REFERENCE_FRAME ref_frame,
unsigned int source_variance) {
MACROBLOCKD *const xd = &x->e_mbd;
- unsigned int recon_variance;
+ unsigned int rec_variance;
+ unsigned int src_variance;
+ unsigned int src_rec_min;
unsigned int absvar_diff = 0;
- int64_t var_error = 0;
- int64_t var_factor = 0;
+ unsigned int var_factor = 0;
+ unsigned int adj_max;
+ vp9e_tune_content content_type = cpi->oxcf.content;
if (*this_rd == INT64_MAX) return;
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- recon_variance = vp9_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst,
+ if (source_variance > 0) {
+ rec_variance = vp9_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst,
bsize, xd->bd);
+ src_variance = source_variance;
+ } else {
+ rec_variance =
+ vp9_high_get_sby_variance(cpi, &xd->plane[0].dst, bsize, xd->bd);
+ src_variance =
+ vp9_high_get_sby_variance(cpi, &x->plane[0].src, bsize, xd->bd);
+ }
} else {
- recon_variance =
- vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+ if (source_variance > 0) {
+ rec_variance =
+ vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+ src_variance = source_variance;
+ } else {
+ rec_variance = vp9_get_sby_variance(cpi, &xd->plane[0].dst, bsize);
+ src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize);
+ }
}
#else
- recon_variance = vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+ if (source_variance > 0) {
+ rec_variance = vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+ src_variance = source_variance;
+ } else {
+ rec_variance = vp9_get_sby_variance(cpi, &xd->plane[0].dst, bsize);
+ src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize);
+ }
#endif // CONFIG_VP9_HIGHBITDEPTH
- if ((source_variance + recon_variance) > LOW_VAR_THRESH) {
- absvar_diff = (source_variance > recon_variance)
- ? (source_variance - recon_variance)
- : (recon_variance - source_variance);
+ // Lower of source (raw per-pixel value) and recon variance. Note that
+ // if the source per-pixel variance is 0 then the recon variance here
+ // will not be per pixel (see above) and so will likely be much larger.
+ src_rec_min = VPXMIN(source_variance, rec_variance);
- var_error = ((int64_t)200 * source_variance * recon_variance) /
- (((int64_t)source_variance * source_variance) +
- ((int64_t)recon_variance * recon_variance));
- var_error = 100 - var_error;
- }
+ if (src_rec_min > LOW_VAR_THRESH) return;
+
+ absvar_diff = (src_variance > rec_variance) ? (src_variance - rec_variance)
+ : (rec_variance - src_variance);
+
+ adj_max = max_var_adjust[content_type];
+
+ var_factor =
+ (unsigned int)((int64_t)VAR_MULT * absvar_diff) / VPXMAX(1, src_variance);
+ var_factor = VPXMIN(adj_max, var_factor);
- // Source variance above a threshold and ref frame is intra.
- // This case is targeted mainly at discouraging intra modes that give rise
- // to a predictor with a low spatial complexity compared to the source.
- if ((source_variance > LOW_VAR_THRESH) && (ref_frame == INTRA_FRAME) &&
- (source_variance > recon_variance)) {
- var_factor = VPXMIN(absvar_diff, VPXMIN(VLOW_ADJ_MAX, var_error));
- // A second possible case of interest is where the source variance
- // is very low and we wish to discourage false texture or motion trails.
- } else if ((source_variance < (LOW_VAR_THRESH >> 1)) &&
- (recon_variance > source_variance)) {
- var_factor = VPXMIN(absvar_diff, VPXMIN(VHIGH_ADJ_MAX, var_error));
- }
*this_rd += (*this_rd * var_factor) / 100;
+
+ if (content_type == VP9E_CONTENT_FILM) {
+ if (src_rec_min <= VERY_LOW_VAR_THRESH) {
+ if (ref_frame == INTRA_FRAME) *this_rd *= 2;
+ if (bsize > 6) *this_rd *= 2;
+ }
+ }
}
// Do we have an internal image edge (e.g. formatting bars).
@@ -3037,8 +3063,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
int64_t dist_uv[TX_SIZES];
int skip_uv[TX_SIZES];
PREDICTION_MODE mode_uv[TX_SIZES];
- const int intra_cost_penalty = vp9_get_intra_cost_penalty(
- cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+ const int intra_cost_penalty =
+ vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q);
int best_skip2 = 0;
uint8_t ref_frame_skip_mask[2] = { 0 };
uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
@@ -3801,8 +3827,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
int64_t dist_uv;
int skip_uv;
PREDICTION_MODE mode_uv = DC_PRED;
- const int intra_cost_penalty = vp9_get_intra_cost_penalty(
- cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+ const int intra_cost_penalty =
+ vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q);
int_mv seg_mvs[4][MAX_REF_FRAMES];
b_mode_info best_bmodes[4];
int best_skip2 = 0;
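
The reworked rd_variance_adjustment above now applies only when the lower of source and recon variance is at or below LOW_VAR_THRESH, scales the rd bias by the relative variance mismatch capped per content type (max_var_adjust: 16 for default and screen, 100 for film), and for film additionally doubles the cost of intra references and of blocks larger than 16x16 (bsize > 6 in VP9's BLOCK_SIZE enum) at very low variance. A condensed model of the core factor, with illustrative names:

static long long adjust_rd_sketch(long long rd, unsigned int src_var,
                                  unsigned int rec_var,
                                  unsigned int adj_max) {
  const unsigned int low_var_thresh = 5; /* LOW_VAR_THRESH */
  unsigned int absdiff, var_factor;
  if (src_var > low_var_thresh && rec_var > low_var_thresh) return rd;
  absdiff = (src_var > rec_var) ? src_var - rec_var : rec_var - src_var;
  /* VAR_MULT * |src - rec| / max(1, src), capped at adj_max. */
  var_factor = (unsigned int)((100LL * absdiff) / (src_var > 0 ? src_var : 1));
  if (var_factor > adj_max) var_factor = adj_max;
  return rd + (rd * var_factor) / 100;
}
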
diff --git a/libvpx/vp9/encoder/vp9_skin_detection.c b/libvpx/vp9/encoder/vp9_skin_detection.c
index 3f3d48fb9..cc6c96776 100644
--- a/libvpx/vp9/encoder/vp9_skin_detection.c
+++ b/libvpx/vp9/encoder/vp9_skin_detection.c
@@ -15,75 +15,6 @@
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_skin_detection.h"
-#define MODEL_MODE 1
-
-// Fixed-point skin color model parameters.
-static const int skin_mean[5][2] = { { 7463, 9614 },
- { 6400, 10240 },
- { 7040, 10240 },
- { 8320, 9280 },
- { 6800, 9614 } };
-static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16
-static const int skin_threshold[6] = { 1570636, 1400000, 800000,
- 800000, 800000, 800000 }; // q18
-
-// Thresholds on luminance.
-static const int y_low = 40;
-static const int y_high = 220;
-
-// Evaluates the Mahalanobis distance measure for the input CbCr values.
-static int evaluate_skin_color_difference(int cb, int cr, int idx) {
- const int cb_q6 = cb << 6;
- const int cr_q6 = cr << 6;
- const int cb_diff_q12 =
- (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]);
- const int cbcr_diff_q12 =
- (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]);
- const int cr_diff_q12 =
- (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]);
- const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
- const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
- const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
- const int skin_diff =
- skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 +
- skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2;
- return skin_diff;
-}
-
-int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr,
- int motion) {
- if (y < y_low || y > y_high) {
- return 0;
- } else {
- if (MODEL_MODE == 0) {
- return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]);
- } else {
- int i = 0;
- // Exit on grey.
- if (cb == 128 && cr == 128) return 0;
- // Exit on very strong cb.
- if (cb > 150 && cr < 110) return 0;
- for (; i < 5; i++) {
- int skin_color_diff = evaluate_skin_color_difference(cb, cr, i);
- if (skin_color_diff < skin_threshold[i + 1]) {
- if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2))
- return 0;
- else if (motion == 0 &&
- skin_color_diff > (skin_threshold[i + 1] >> 1))
- return 0;
- else
- return 1;
- }
- // Exit if difference is much large than the threshold.
- if (skin_color_diff > (skin_threshold[i + 1] << 3)) {
- return 0;
- }
- }
- return 0;
- }
- }
-}
-
int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
int stride, int strideuv, int bsize,
int consec_zeromv, int curr_motion_magn) {
@@ -100,31 +31,113 @@ int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
const uint8_t ysource = y[y_height_shift * stride + y_width_shift];
const uint8_t usource = u[uv_height_shift * strideuv + uv_width_shift];
const uint8_t vsource = v[uv_height_shift * strideuv + uv_width_shift];
+
if (consec_zeromv > 25 && curr_motion_magn == 0) motion = 0;
- return vp9_skin_pixel(ysource, usource, vsource, motion);
+ return vpx_skin_pixel(ysource, usource, vsource, motion);
+ }
+}
+
+void vp9_compute_skin_sb(VP9_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ int i, j, num_bl;
+ VP9_COMMON *const cm = &cpi->common;
+ const uint8_t *src_y = cpi->Source->y_buffer;
+ const uint8_t *src_u = cpi->Source->u_buffer;
+ const uint8_t *src_v = cpi->Source->v_buffer;
+ const int src_ystride = cpi->Source->y_stride;
+ const int src_uvstride = cpi->Source->uv_stride;
+ const int y_bsize = 4 << b_width_log2_lookup[bsize];
+ const int uv_bsize = y_bsize >> 1;
+ const int shy = (y_bsize == 8) ? 3 : 4;
+ const int shuv = shy - 1;
+ const int fac = y_bsize / 8;
+ const int y_shift = src_ystride * (mi_row << 3) + (mi_col << 3);
+ const int uv_shift = src_uvstride * (mi_row << 2) + (mi_col << 2);
+ const int mi_row_limit = VPXMIN(mi_row + 8, cm->mi_rows - 2);
+ const int mi_col_limit = VPXMIN(mi_col + 8, cm->mi_cols - 2);
+ src_y += y_shift;
+ src_u += uv_shift;
+ src_v += uv_shift;
+
+ for (i = mi_row; i < mi_row_limit; i += fac) {
+ num_bl = 0;
+ for (j = mi_col; j < mi_col_limit; j += fac) {
+ int consec_zeromv = 0;
+ int bl_index = i * cm->mi_cols + j;
+ int bl_index1 = bl_index + 1;
+ int bl_index2 = bl_index + cm->mi_cols;
+ int bl_index3 = bl_index2 + 1;
+ // Don't detect skin on the boundary.
+ if (i == 0 || j == 0) continue;
+ if (bsize == BLOCK_8X8)
+ consec_zeromv = cpi->consec_zero_mv[bl_index];
+ else
+ consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index],
+ VPXMIN(cpi->consec_zero_mv[bl_index1],
+ VPXMIN(cpi->consec_zero_mv[bl_index2],
+ cpi->consec_zero_mv[bl_index3])));
+ cpi->skin_map[bl_index] =
+ vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, src_uvstride,
+ bsize, consec_zeromv, 0);
+ num_bl++;
+ src_y += y_bsize;
+ src_u += uv_bsize;
+ src_v += uv_bsize;
+ }
+ src_y += (src_ystride << shy) - (num_bl << shy);
+ src_u += (src_uvstride << shuv) - (num_bl << shuv);
+ src_v += (src_uvstride << shuv) - (num_bl << shuv);
+ }
+
+ // Remove isolated skin blocks (none of their neighbors are skin) and
+ // isolated non-skin blocks (all of their neighbors are skin).
+ // Skip 4 corner blocks which have only 3 neighbors to remove isolated skin
+ // blocks. Skip superblock borders to remove isolated non-skin blocks.
+ for (i = mi_row; i < mi_row_limit; i += fac) {
+ for (j = mi_col; j < mi_col_limit; j += fac) {
+ int bl_index = i * cm->mi_cols + j;
+ int num_neighbor = 0;
+ int mi, mj;
+ int non_skin_threshold = 8;
+ // Skip 4 corners.
+ if ((i == mi_row && (j == mi_col || j == mi_col_limit - fac)) ||
+ (i == mi_row_limit - fac && (j == mi_col || j == mi_col_limit - fac)))
+ continue;
+ // There are only 5 neighbors for non-skin blocks on the border.
+ if (i == mi_row || i == mi_row_limit - fac || j == mi_col ||
+ j == mi_col_limit - fac)
+ non_skin_threshold = 5;
+
+ for (mi = -fac; mi <= fac; mi += fac) {
+ for (mj = -fac; mj <= fac; mj += fac) {
+ if (i + mi >= mi_row && i + mi < mi_row_limit && j + mj >= mi_col &&
+ j + mj < mi_col_limit) {
+ int bl_neighbor_index = (i + mi) * cm->mi_cols + j + mj;
+ if (cpi->skin_map[bl_neighbor_index]) num_neighbor++;
+ }
+ }
+ }
+
+ if (cpi->skin_map[bl_index] && num_neighbor < 2)
+ cpi->skin_map[bl_index] = 0;
+ if (!cpi->skin_map[bl_index] && num_neighbor == non_skin_threshold)
+ cpi->skin_map[bl_index] = 1;
+ }
}
}
#ifdef OUTPUT_YUV_SKINMAP
// For viewing skin map on input source.
-void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) {
+void vp9_output_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) {
int i, j, mi_row, mi_col, num_bl;
VP9_COMMON *const cm = &cpi->common;
uint8_t *y;
const uint8_t *src_y = cpi->Source->y_buffer;
- const uint8_t *src_u = cpi->Source->u_buffer;
- const uint8_t *src_v = cpi->Source->v_buffer;
const int src_ystride = cpi->Source->y_stride;
- const int src_uvstride = cpi->Source->uv_stride;
- int y_bsize = 16; // Use 8x8 or 16x16.
- int uv_bsize = y_bsize >> 1;
- int ypos = y_bsize >> 1;
- int uvpos = uv_bsize >> 1;
- int shy = (y_bsize == 8) ? 3 : 4;
- int shuv = shy - 1;
- int fac = y_bsize / 8;
- // Use center pixel or average of center 2x2 pixels.
- int mode_filter = 0;
+ const int y_bsize = 16; // Use 8x8 or 16x16.
+ const int shy = (y_bsize == 8) ? 3 : 4;
+ const int fac = y_bsize / 8;
+
YV12_BUFFER_CONFIG skinmap;
memset(&skinmap, 0, sizeof(YV12_BUFFER_CONFIG));
if (vpx_alloc_frame_buffer(&skinmap, cm->width, cm->height, cm->subsampling_x,
@@ -141,65 +154,21 @@ void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) {
for (mi_row = 0; mi_row < cm->mi_rows - 1; mi_row += fac) {
num_bl = 0;
for (mi_col = 0; mi_col < cm->mi_cols - 1; mi_col += fac) {
- int is_skin = 0;
- if (mode_filter == 1) {
- // Use 2x2 average at center.
- uint8_t ysource = src_y[ypos * src_ystride + ypos];
- uint8_t usource = src_u[uvpos * src_uvstride + uvpos];
- uint8_t vsource = src_v[uvpos * src_uvstride + uvpos];
- uint8_t ysource2 = src_y[(ypos + 1) * src_ystride + ypos];
- uint8_t usource2 = src_u[(uvpos + 1) * src_uvstride + uvpos];
- uint8_t vsource2 = src_v[(uvpos + 1) * src_uvstride + uvpos];
- uint8_t ysource3 = src_y[ypos * src_ystride + (ypos + 1)];
- uint8_t usource3 = src_u[uvpos * src_uvstride + (uvpos + 1)];
- uint8_t vsource3 = src_v[uvpos * src_uvstride + (uvpos + 1)];
- uint8_t ysource4 = src_y[(ypos + 1) * src_ystride + (ypos + 1)];
- uint8_t usource4 = src_u[(uvpos + 1) * src_uvstride + (uvpos + 1)];
- uint8_t vsource4 = src_v[(uvpos + 1) * src_uvstride + (uvpos + 1)];
- ysource = (ysource + ysource2 + ysource3 + ysource4) >> 2;
- usource = (usource + usource2 + usource3 + usource4) >> 2;
- vsource = (vsource + vsource2 + vsource3 + vsource4) >> 2;
- is_skin = vp9_skin_pixel(ysource, usource, vsource, 1);
- } else {
- int block_size = BLOCK_8X8;
- int consec_zeromv = 0;
- int bl_index = mi_row * cm->mi_cols + mi_col;
- int bl_index1 = bl_index + 1;
- int bl_index2 = bl_index + cm->mi_cols;
- int bl_index3 = bl_index2 + 1;
- if (y_bsize == 8)
- consec_zeromv = cpi->consec_zero_mv[bl_index];
- else
- consec_zeromv =
- VPXMIN(cpi->consec_zero_mv[bl_index],
- VPXMIN(cpi->consec_zero_mv[bl_index1],
- VPXMIN(cpi->consec_zero_mv[bl_index2],
- cpi->consec_zero_mv[bl_index3])));
- if (y_bsize == 16) block_size = BLOCK_16X16;
- is_skin =
- vp9_compute_skin_block(src_y, src_u, src_v, src_ystride,
- src_uvstride, block_size, consec_zeromv, 0);
- }
+ const int block_index = mi_row * cm->mi_cols + mi_col;
+ const int is_skin = cpi->skin_map[block_index];
for (i = 0; i < y_bsize; i++) {
for (j = 0; j < y_bsize; j++) {
- if (is_skin)
- y[i * src_ystride + j] = 255;
- else
- y[i * src_ystride + j] = src_y[i * src_ystride + j];
+ y[i * src_ystride + j] = is_skin ? 255 : src_y[i * src_ystride + j];
}
}
num_bl++;
y += y_bsize;
src_y += y_bsize;
- src_u += uv_bsize;
- src_v += uv_bsize;
}
y += (src_ystride << shy) - (num_bl << shy);
src_y += (src_ystride << shy) - (num_bl << shy);
- src_u += (src_uvstride << shuv) - (num_bl << shuv);
- src_v += (src_uvstride << shuv) - (num_bl << shuv);
}
- vp9_write_yuv_frame_420(&skinmap, yuv_skinmap_file);
+ vpx_write_yuv_frame(yuv_skinmap_file, &skinmap);
vpx_free_frame_buffer(&skinmap);
}
#endif
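
The second pass of vp9_compute_skin_sb above is a 3x3 morphological cleanup of the per-block skin map. Its interior case restated on a plain int map (clean_skin_map_interior is an illustrative name; the hunk additionally lowers the non-skin threshold to 5 on superblock borders and skips the 4 corners):

static void clean_skin_map_interior(int *map, int rows, int cols) {
  int r, c, dr, dc;
  for (r = 1; r < rows - 1; ++r) {
    for (c = 1; c < cols - 1; ++c) {
      int n = 0; /* 3x3 sum; includes the center, matching the hunk. */
      for (dr = -1; dr <= 1; ++dr)
        for (dc = -1; dc <= 1; ++dc) n += map[(r + dr) * cols + (c + dc)];
      /* Clear a skin block with no skin neighbor... */
      if (map[r * cols + c] && n < 2) map[r * cols + c] = 0;
      /* ...and fill a non-skin block surrounded by 8 skin neighbors. */
      if (!map[r * cols + c] && n == 8) map[r * cols + c] = 1;
    }
  }
}
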
diff --git a/libvpx/vp9/encoder/vp9_skin_detection.h b/libvpx/vp9/encoder/vp9_skin_detection.h
index c77382dbd..8880bff46 100644
--- a/libvpx/vp9/encoder/vp9_skin_detection.h
+++ b/libvpx/vp9/encoder/vp9_skin_detection.h
@@ -12,6 +12,8 @@
#define VP9_ENCODER_VP9_SKIN_MAP_H_
#include "vp9/common/vp9_blockd.h"
+#include "vpx_dsp/skin_detection.h"
+#include "vpx_util/vpx_write_yuv_frame.h"
#ifdef __cplusplus
extern "C" {
@@ -19,19 +21,16 @@ extern "C" {
struct VP9_COMP;
-// #define OUTPUT_YUV_SKINMAP
-
-int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr,
- int motion);
-
int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
int stride, int strideuv, int bsize,
int consec_zeromv, int curr_motion_magn);
+void vp9_compute_skin_sb(struct VP9_COMP *const cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col);
+
#ifdef OUTPUT_YUV_SKINMAP
// For viewing skin map on input source.
-void vp9_compute_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file);
-extern void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f);
+void vp9_output_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file);
#endif
#ifdef __cplusplus
diff --git a/libvpx/vp9/encoder/vp9_speed_features.c b/libvpx/vp9/encoder/vp9_speed_features.c
index 8d9e2e8c3..a05db60c6 100644
--- a/libvpx/vp9/encoder/vp9_speed_features.c
+++ b/libvpx/vp9/encoder/vp9_speed_features.c
@@ -157,6 +157,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
VP9_COMMON *cm,
SPEED_FEATURES *sf,
int speed) {
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
const int boosted = frame_is_boosted(cpi);
int i;
@@ -182,7 +183,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
}
if (speed >= 1) {
- if (cpi->oxcf.pass == 2) {
+ if (oxcf->pass == 2) {
TWO_PASS *const twopass = &cpi->twopass;
if ((twopass->fr_content_type == FC_GRAPHICS_ANIMATION) ||
vp9_internal_image_edge(cpi)) {
@@ -225,12 +226,16 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
}
if (speed >= 2) {
- sf->recode_loop = ALLOW_RECODE_KFARFGF;
+ if (oxcf->vbr_corpus_complexity)
+ sf->recode_loop = ALLOW_RECODE_FIRST;
+ else
+ sf->recode_loop = ALLOW_RECODE_KFARFGF;
+
sf->tx_size_search_method =
frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL;
// Reference masking is not supported in dynamic scaling mode.
- sf->reference_masking = cpi->oxcf.resize_mode != RESIZE_DYNAMIC ? 1 : 0;
+ sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC ? 1 : 0;
sf->mode_search_skip_flags =
(cm->frame_type == KEY_FRAME)
@@ -240,7 +245,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
sf->disable_filter_search_var_thresh = 100;
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
- sf->allow_partition_search_skip = 1;
sf->recode_tolerance_low = 15;
sf->recode_tolerance_high = 45;
@@ -271,6 +275,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
sf->adaptive_interp_filter_search = 1;
+ sf->allow_partition_search_skip = 1;
if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) {
for (i = 0; i < MAX_MESH_STEP; ++i) {
@@ -364,6 +369,11 @@ static void set_rt_speed_feature_framesize_independent(
sf->copy_partition_flag = 0;
sf->use_source_sad = 0;
sf->use_simple_block_yrd = 0;
+ sf->adapt_partition_source_sad = 0;
+ sf->use_altref_onepass = 0;
+ sf->use_compound_nonrd_pickmode = 0;
+ sf->nonrd_keyframe = 0;
+ sf->svc_use_lowres_part = 0;
if (speed >= 1) {
sf->allow_txfm_domain_distortion = 1;
@@ -441,6 +451,8 @@ static void set_rt_speed_feature_framesize_independent(
if (speed >= 4) {
int i;
+ if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0)
+ sf->use_altref_onepass = 1;
sf->last_partitioning_redo_frequency = 4;
sf->adaptive_rd_thresh = 5;
sf->use_fast_coef_costing = 0;
@@ -466,6 +478,7 @@ static void set_rt_speed_feature_framesize_independent(
}
if (speed >= 5) {
+ sf->use_altref_onepass = 0;
sf->use_quant_fp = !is_keyframe;
sf->auto_min_max_partition_size =
is_keyframe ? RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX;
@@ -521,12 +534,30 @@ static void set_rt_speed_feature_framesize_independent(
}
if (speed >= 6) {
+ if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0) {
+ sf->use_altref_onepass = 1;
+ sf->use_compound_nonrd_pickmode = 1;
+ }
sf->partition_search_type = VAR_BASED_PARTITION;
// Turn on this to use non-RD key frame coding mode.
sf->use_nonrd_pick_mode = 1;
sf->mv.search_method = NSTEP;
sf->mv.reduce_first_step_size = 1;
sf->skip_encode_sb = 0;
+
+ if (!cpi->external_resize) sf->use_source_sad = 1;
+
+ if (sf->use_source_sad) {
+ sf->adapt_partition_source_sad = 1;
+ sf->adapt_partition_thresh =
+ (cm->width * cm->height <= 640 * 360) ? 40000 : 60000;
+ if (cpi->content_state_sb_fd == NULL &&
+ (!cpi->use_svc ||
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
+ cpi->content_state_sb_fd = (uint8_t *)vpx_calloc(
+ (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t));
+ }
+ }
if (cpi->oxcf.rc_mode == VPX_CBR && content != VP9E_CONTENT_SCREEN) {
// Enable short circuit for low temporal variance.
sf->short_circuit_low_temp_var = 1;
@@ -534,53 +565,64 @@ static void set_rt_speed_feature_framesize_independent(
if (cpi->svc.temporal_layer_id > 0) {
sf->adaptive_rd_thresh = 4;
sf->limit_newmv_early_exit = 0;
- sf->mv.subpel_force_stop = (cpi->svc.temporal_layer_id == 1) ? 1 : 2;
- sf->base_mv_aggressive =
- (cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1)
- ? 1
- : 0;
+ sf->base_mv_aggressive = 1;
}
}
if (speed >= 7) {
+ sf->adapt_partition_source_sad = 0;
sf->adaptive_rd_thresh = 3;
sf->mv.search_method = FAST_DIAMOND;
sf->mv.fullpel_search_step_param = 10;
+ // For SVC: use better mv search on base temporal layer, and only
+ // on base spatial layer if highest resolution is above 640x360.
if (cpi->svc.number_temporal_layers > 2 &&
- cpi->svc.temporal_layer_id == 0) {
+ cpi->svc.temporal_layer_id == 0 &&
+ (cpi->svc.spatial_layer_id == 0 ||
+ cpi->oxcf.width * cpi->oxcf.height <= 640 * 360)) {
sf->mv.search_method = NSTEP;
sf->mv.fullpel_search_step_param = 6;
}
- if (!cpi->external_resize) sf->use_source_sad = 1;
- if (sf->use_source_sad) {
- if (cpi->content_state_sb_fd == NULL &&
- (!cpi->use_svc ||
- cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
- cpi->content_state_sb_fd = (uint8_t *)vpx_calloc(
- (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t));
- }
+ if (cpi->svc.temporal_layer_id > 0 || cpi->svc.spatial_layer_id > 1) {
+ sf->use_simple_block_yrd = 1;
+ if (cpi->svc.non_reference_frame)
+ sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_EVENMORE;
}
- }
-
- if (speed >= 8) {
- sf->adaptive_rd_thresh = 4;
- // Enable partition copy. For SVC, only enabled for top resolution layer,
+ if (cpi->use_svc && cpi->row_mt && cpi->oxcf.max_threads > 1)
+ sf->adaptive_rd_thresh_row_mt = 1;
+ // Enable partition copy. For SVC, this is enabled only for the top
+ // spatial resolution layer.
+ cpi->max_copied_frame = 0;
if (!cpi->last_frame_dropped && cpi->resize_state == ORIG &&
!cpi->external_resize &&
(!cpi->use_svc ||
cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
sf->copy_partition_flag = 1;
- cpi->max_copied_frame = 4;
+ cpi->max_copied_frame = 2;
+ // Frames in the top temporal enhancement layer (for number of temporal
+ // layers > 1) are non-reference frames, so use a large/max value for
+ // max_copied_frame.
+ if (cpi->svc.number_temporal_layers > 1 &&
+ cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1)
+ cpi->max_copied_frame = 255;
}
+ // For SVC: enable use of lower resolution partition for higher resolution,
+ // only for 3 spatial layers and when config/top resolution is above VGA.
+ // Enable only for non-base temporal layer frames.
+ if (cpi->use_svc && cpi->svc.number_spatial_layers == 3 &&
+ cpi->svc.temporal_layer_id > 0 &&
+ cpi->oxcf.width * cpi->oxcf.height > 640 * 480)
+ sf->svc_use_lowres_part = 1;
+ }
+ if (speed >= 8) {
+ sf->adaptive_rd_thresh = 4;
+ sf->skip_encode_sb = 1;
+ sf->nonrd_keyframe = 1;
+ if (!cpi->use_svc) cpi->max_copied_frame = 4;
if (cpi->row_mt && cpi->oxcf.max_threads > 1)
sf->adaptive_rd_thresh_row_mt = 1;
- if (content == VP9E_CONTENT_SCREEN)
- sf->mv.subpel_force_stop = 3;
- else if (cm->width * cm->height > 352 * 288)
- sf->mv.subpel_force_stop = 2;
-
+ if (content == VP9E_CONTENT_SCREEN) sf->mv.subpel_force_stop = 3;
if (content == VP9E_CONTENT_SCREEN) sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
// Only keep INTRA_DC mode for speed 8.
if (!is_keyframe) {
@@ -610,6 +652,20 @@ static void set_rt_speed_feature_framesize_independent(
sf->limit_newmv_early_exit = 0;
sf->use_simple_block_yrd = 1;
}
+ if (sf->use_altref_onepass) {
+ if (cpi->rc.is_src_frame_alt_ref && cm->frame_type != KEY_FRAME) {
+ sf->partition_search_type = FIXED_PARTITION;
+ sf->always_this_block_size = BLOCK_64X64;
+ }
+ if (cpi->count_arf_frame_usage == NULL)
+ cpi->count_arf_frame_usage =
+ (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
+ sizeof(*cpi->count_arf_frame_usage));
+ if (cpi->count_lastgolden_frame_usage == NULL)
+ cpi->count_lastgolden_frame_usage =
+ (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
+ sizeof(*cpi->count_lastgolden_frame_usage));
+ }
}
void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) {
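
The count_arf_frame_usage and count_lastgolden_frame_usage maps allocated above hold one byte per 64x64 superblock. The index math that update_altref_usage in vp9_ratectrl.c uses to address them, extracted under an illustrative name:

static int sb_offset(int mi_cols, int mi_row, int mi_col) {
  /* Row-major offset on the 8x8-MI (64x64-pixel) superblock grid. */
  return ((mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3);
}
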
@@ -651,7 +707,8 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) {
// and multiple threads match.
// It can be used in realtime when adaptive_rd_thresh_row_mt is enabled since
// adaptive_rd_thresh is defined per-row for non-rd pickmode.
- if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact)
+ if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact &&
+ oxcf->max_threads > 1)
sf->adaptive_rd_thresh = 0;
// This is only used in motion vector unit test.
@@ -768,7 +825,6 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
else if (oxcf->mode == GOOD)
set_good_speed_feature_framesize_independent(cpi, cm, sf, oxcf->speed);
- cpi->full_search_sad = vp9_full_search_sad;
cpi->diamond_search_sad = vp9_diamond_search_sad;
// Slow quant, dct and trellis not worthwhile for first pass
@@ -808,7 +864,8 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
// and multiple threads match.
// It can be used in realtime when adaptive_rd_thresh_row_mt is enabled since
// adaptive_rd_thresh is defined per-row for non-rd pickmode.
- if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact)
+ if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact &&
+ oxcf->max_threads > 1)
sf->adaptive_rd_thresh = 0;
// This is only used in motion vector unit test.
diff --git a/libvpx/vp9/encoder/vp9_speed_features.h b/libvpx/vp9/encoder/vp9_speed_features.h
index ee485a35f..50d52bc23 100644
--- a/libvpx/vp9/encoder/vp9_speed_features.h
+++ b/libvpx/vp9/encoder/vp9_speed_features.h
@@ -490,6 +490,24 @@ typedef struct SPEED_FEATURES {
int use_source_sad;
int use_simple_block_yrd;
+
+ // If the source sad of a superblock is high (> adapt_partition_thresh),
+ // switch from VARIANCE_PARTITION to REFERENCE_PARTITION (which selects
+ // the partition based on the nonrd-pickmode).
+ int adapt_partition_source_sad;
+ int adapt_partition_thresh;
+
+ // Enable use of alt-refs in 1 pass VBR.
+ int use_altref_onepass;
+
+ // Enable use of compound prediction, for nonrd_pickmode with nonzero lag.
+ int use_compound_nonrd_pickmode;
+
+ // Always use nonrd_pick_intra for all block sizes on keyframes.
+ int nonrd_keyframe;
+
+ // For SVC: enables use of partition from lower spatial resolution.
+ int svc_use_lowres_part;
} SPEED_FEATURES;
struct VP9_COMP;
diff --git a/libvpx/vp9/encoder/vp9_svc_layercontext.c b/libvpx/vp9/encoder/vp9_svc_layercontext.c
index 5867a6c38..2636bd9a5 100644
--- a/libvpx/vp9/encoder/vp9_svc_layercontext.c
+++ b/libvpx/vp9/encoder/vp9_svc_layercontext.c
@@ -36,6 +36,8 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
svc->scaled_temp_is_alloc = 0;
svc->scaled_one_half = 0;
svc->current_superframe = 0;
+ svc->non_reference_frame = 0;
+
for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1;
for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
svc->ext_frame_flags[sl] = 0;
@@ -173,7 +175,7 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
RATE_CONTROL *const lrc = &lc->rc;
lc->spatial_layer_target_bandwidth = spatial_layer_target;
- bitrate_alloc = (float)lc->target_bandwidth / spatial_layer_target;
+ bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
lrc->starting_buffer_level =
(int64_t)(rc->starting_buffer_level * bitrate_alloc);
lrc->optimal_buffer_level =
@@ -351,6 +353,7 @@ void vp9_save_layer_context(VP9_COMP *const cpi) {
}
}
+#if !CONFIG_REALTIME_ONLY
void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) {
SVC *const svc = &cpi->svc;
int i;
@@ -366,6 +369,7 @@ void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) {
}
svc->spatial_layer_id = 0;
}
+#endif // !CONFIG_REALTIME_ONLY
void vp9_inc_frame_in_layer(VP9_COMP *const cpi) {
LAYER_CONTEXT *const lc =
@@ -386,9 +390,9 @@ int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) {
.is_key_frame;
}
-static void get_layer_resolution(const int width_org, const int height_org,
- const int num, const int den, int *width_out,
- int *height_out) {
+void get_layer_resolution(const int width_org, const int height_org,
+ const int num, const int den, int *width_out,
+ int *height_out) {
int w, h;
if (width_out == NULL || height_out == NULL || den == 0) return;
@@ -603,6 +607,7 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
LAYER_CONTEXT *lc = NULL;
if (cpi->svc.number_spatial_layers > 1) cpi->svc.use_base_mv = 1;
cpi->svc.force_zero_mode_spatial_ref = 1;
+ cpi->svc.mi_stride[cpi->svc.spatial_layer_id] = cpi->common.mi_stride;
if (cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) {
set_flags_and_fb_idx_for_temporal_mode3(cpi);
@@ -652,9 +657,9 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
lc->scaling_factor_num, lc->scaling_factor_den, &width,
&height);
- // For low resolutions: set phase of the filter = 8 (for symmetric averaging
- // filter), use bilinear for now.
- if (width <= 320 && height <= 240) {
+ // For resolutions <= VGA: set phase of the filter = 8 (for symmetric
+ // averaging filter), use bilinear for now.
+ if (width * height <= 640 * 480) {
cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] = BILINEAR;
cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] = 8;
}
@@ -677,6 +682,12 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
}
}
+ cpi->svc.non_reference_frame = 0;
+ if (cpi->common.frame_type != KEY_FRAME && !cpi->ext_refresh_last_frame &&
+ !cpi->ext_refresh_golden_frame && !cpi->ext_refresh_alt_ref_frame) {
+ cpi->svc.non_reference_frame = 1;
+ }
+
if (vp9_set_size_literal(cpi, width, height) != 0)
return VPX_CODEC_INVALID_PARAM;
@@ -851,3 +862,28 @@ void vp9_svc_reset_key_frame(VP9_COMP *const cpi) {
vp9_update_temporal_layer_framerate(cpi);
vp9_restore_layer_context(cpi);
}
+
+void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) {
+ SVC *svc = &cpi->svc;
+ int sl, tl;
+ for (sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ // Check for reset based on avg_frame_bandwidth for spatial layer sl.
+ int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1,
+ svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ if (lrc->avg_frame_bandwidth > (3 * lrc->last_avg_frame_bandwidth >> 1) ||
+ lrc->avg_frame_bandwidth < (lrc->last_avg_frame_bandwidth >> 1)) {
+ // Reset for all temporal layers with spatial layer sl.
+ for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ lrc->rc_1_frame = 0;
+ lrc->rc_2_frame = 0;
+ lrc->bits_off_target = lrc->optimal_buffer_level;
+ lrc->buffer_level = lrc->optimal_buffer_level;
+ }
+ }
+ }
+}
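
vp9_svc_check_reset_layer_rc_flag above keys off last_avg_frame_bandwidth, which vp9_rc_postencode_update now records every frame. Its trigger, restated with an illustrative helper: a spatial layer's rate control is reinitialized when the new average frame bandwidth leaves the [0.5x, 1.5x] window around the previous one (the shifts are the integer forms of those factors).

static int needs_layer_rc_reset(int avg_bw, int last_avg_bw) {
  return avg_bw > (3 * last_avg_bw >> 1) || avg_bw < (last_avg_bw >> 1);
}
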
diff --git a/libvpx/vp9/encoder/vp9_svc_layercontext.h b/libvpx/vp9/encoder/vp9_svc_layercontext.h
index d8e6772b2..b7cdfd962 100644
--- a/libvpx/vp9/encoder/vp9_svc_layercontext.h
+++ b/libvpx/vp9/encoder/vp9_svc_layercontext.h
@@ -49,7 +49,7 @@ typedef struct {
uint8_t speed;
} LAYER_CONTEXT;
-typedef struct {
+typedef struct SVC {
int spatial_layer_id;
int temporal_layer_id;
int number_spatial_layers;
@@ -87,6 +87,7 @@ typedef struct {
int ref_frame_index[REF_FRAMES];
int force_zero_mode_spatial_ref;
int current_superframe;
+ int non_reference_frame;
int use_base_mv;
// Used to control the downscaling filter for source scaling, for 1 pass CBR.
// downsample_filter_phase: = 0 will do sub-sampling (no weighted average),
@@ -95,6 +96,11 @@ typedef struct {
// eighttap_smooth, eighttap_sharp, and bilinear.
INTERP_FILTER downsample_filter_type[VPX_SS_MAX_LAYERS];
int downsample_filter_phase[VPX_SS_MAX_LAYERS];
+
+ BLOCK_SIZE *prev_partition_svc;
+ int mi_stride[VPX_MAX_LAYERS];
+
+ int first_layer_denoise;
} SVC;
struct VP9_COMP;
@@ -124,6 +130,10 @@ void vp9_save_layer_context(struct VP9_COMP *const cpi);
// Initialize second pass rc for spatial svc.
void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi);
+void get_layer_resolution(const int width_org, const int height_org,
+ const int num, const int den, int *width_out,
+ int *height_out);
+
// Increment number of video frames in layer
void vp9_inc_frame_in_layer(struct VP9_COMP *const cpi);
@@ -144,6 +154,8 @@ void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi);
void vp9_svc_reset_key_frame(struct VP9_COMP *const cpi);
+void vp9_svc_check_reset_layer_rc_flag(struct VP9_COMP *const cpi);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.c b/libvpx/vp9/encoder/vp9_temporal_filter.c
index 630794156..2758c42ae 100644
--- a/libvpx/vp9/encoder/vp9_temporal_filter.c
+++ b/libvpx/vp9/encoder/vp9_temporal_filter.c
@@ -350,6 +350,27 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
td->mb.mv_limits.col_max =
((mb_cols - 1 - mb_col) * 16) + (17 - 2 * VP9_INTERP_EXTEND);
+ if (cpi->oxcf.content == VP9E_CONTENT_FILM) {
+ unsigned int src_variance;
+ struct buf_2d src;
+
+ src.buf = f->y_buffer + mb_y_offset;
+ src.stride = f->y_stride;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ src_variance =
+ vp9_high_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16, mbd->bd);
+ } else {
+ src_variance = vp9_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16);
+ }
+#else
+ src_variance = vp9_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ if (src_variance <= 2) strength = VPXMAX(0, (int)strength - 2);
+ }
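[Editor's note: the block above weakens the temporal filter on very flat 16x16 source blocks (per-pixel variance <= 2) so film grain is preserved rather than smeared. A scalar sketch of the same clamp, with a hypothetical function name:]

  /* Sketch of the film-content strength adjustment: flat blocks lose two
   * strength steps, floored at 0 (equivalent to VPXMAX(0, strength - 2)). */
  static int adjust_strength_for_flat_block(int strength,
                                            unsigned int src_variance) {
    if (src_variance <= 2) strength = strength > 2 ? strength - 2 : 0;
    return strength;
  }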
+
for (frame = 0; frame < frame_count; frame++) {
const uint32_t thresh_low = 10000;
const uint32_t thresh_high = 20000;
diff --git a/libvpx/vp9/encoder/x86/temporal_filter_sse4.c b/libvpx/vp9/encoder/x86/temporal_filter_sse4.c
index be4cd8685..460dab659 100644
--- a/libvpx/vp9/encoder/x86/temporal_filter_sse4.c
+++ b/libvpx/vp9/encoder/x86/temporal_filter_sse4.c
@@ -11,6 +11,7 @@
#include <assert.h>
#include <smmintrin.h>
+#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
diff --git a/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c
index 09a1e48fc..dbd243ac1 100644
--- a/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c
+++ b/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c
@@ -15,6 +15,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/x86/fwd_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
#include "vpx_ports/mem.h"
@@ -71,7 +72,7 @@ static INLINE void transpose_4x4(__m128i *res) {
}
static void fdct4_sse2(__m128i *in) {
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
@@ -193,7 +194,7 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
// When we use them, in one case, they are all the same. In all others
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
@@ -706,61 +707,9 @@ static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
store_output(&res[7], (output + 7 * stride));
}
-// perform in-place transpose
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
- const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
- const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
- const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 44 54 45 55 46 56 47 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 25 35
- // 44 54 64 74 45 55 65 75
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
- res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
- res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
- res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
- res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
- res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
- res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
- res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
-}
-
static void fdct8_sse2(__m128i *in) {
// constants
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
@@ -895,7 +844,7 @@ static void fdct8_sse2(__m128i *in) {
in[7] = _mm_packs_epi32(v6, v7);
// transpose
- array_transpose_8x8(in, in);
+ transpose_16bit_8x8(in, in);
}
static void fadst8_sse2(__m128i *in) {
@@ -912,7 +861,7 @@ static void fadst8_sse2(__m128i *in) {
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__const_0 = _mm_set1_epi16(0);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
@@ -1125,7 +1074,7 @@ static void fadst8_sse2(__m128i *in) {
in[7] = _mm_sub_epi16(k__const_0, s1);
// transpose
- array_transpose_8x8(in, in);
+ transpose_16bit_8x8(in, in);
}
void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
@@ -1182,23 +1131,6 @@ static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
}
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
- __m128i tbuf[8];
- array_transpose_8x8(res0, res0);
- array_transpose_8x8(res1, tbuf);
- array_transpose_8x8(res0 + 8, res1);
- array_transpose_8x8(res1 + 8, res1 + 8);
-
- res0[8] = tbuf[0];
- res0[9] = tbuf[1];
- res0[10] = tbuf[2];
- res0[11] = tbuf[3];
- res0[12] = tbuf[4];
- res0[13] = tbuf[5];
- res0[14] = tbuf[6];
- res0[15] = tbuf[7];
-}
-
static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
// perform rounding operations
right_shift_8x8(res0, 2);
@@ -1210,7 +1142,7 @@ static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
static void fdct16_8col(__m128i *in) {
// perform 16x16 1-D DCT for 8 columns
__m128i i[8], s[8], p[8], t[8], u[16], v[16];
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
@@ -1557,8 +1489,8 @@ static void fadst16_8col(__m128i *in) {
const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
@@ -2002,13 +1934,13 @@ static void fadst16_8col(__m128i *in) {
static void fdct16_sse2(__m128i *in0, __m128i *in1) {
fdct16_8col(in0);
fdct16_8col(in1);
- array_transpose_16x16(in0, in1);
+ transpose_16bit_16x16(in0, in1);
}
static void fadst16_sse2(__m128i *in0, __m128i *in1) {
fadst16_8col(in0);
fadst16_8col(in1);
- array_transpose_16x16(in0, in1);
+ transpose_16bit_16x16(in0, in1);
}
void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
diff --git a/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c b/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c
index db57ee1f1..bf874a09e 100644
--- a/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -31,7 +31,7 @@ void vp9_fdct8x8_quant_ssse3(
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
diff --git a/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c b/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
index 91d0602f9..5930bf491 100644
--- a/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
+++ b/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
@@ -13,7 +13,6 @@
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
-#include "vpx_ports/emmintrin_compat.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/encoder/vp9_context_tree.h"
diff --git a/libvpx/vp9/encoder/x86/vp9_error_avx2.c b/libvpx/vp9/encoder/x86/vp9_error_avx2.c
index e228bd8b7..99fef31d1 100644
--- a/libvpx/vp9/encoder/x86/vp9_error_avx2.c
+++ b/libvpx/vp9/encoder/x86/vp9_error_avx2.c
@@ -1,7 +1,7 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
- * Usee of this source code is governed by a BSD-style license
+ * Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
@@ -105,3 +105,57 @@ int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
_mm_storel_epi64((__m128i *)(ssz), ssz_128);
return sse;
}
+
+int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff, int block_size) {
+ int i;
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i sse_256 = zero;
+ __m256i sse_hi;
+ __m128i sse_128;
+ int64_t sse;
+
+ if (block_size == 16) {
+ // Load 16 elements for coeff and dqcoeff.
+ const __m256i _coeff = load_tran_low(coeff);
+ const __m256i _dqcoeff = load_tran_low(dqcoeff);
+ // dqcoeff - coeff
+ const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
+ // madd (dqcoeff - coeff)
+ const __m256i error_lo = _mm256_madd_epi16(diff, diff);
+    // Save the upper 64 bits of each 128-bit lane.
+ const __m256i error_hi = _mm256_srli_si256(error_lo, 8);
+    // Add the upper 64 bits to the lower 64 bits.
+ const __m256i error = _mm256_add_epi32(error_lo, error_hi);
+    // Expand each doubleword in the lower 64 bits to a quadword.
+ sse_256 = _mm256_unpacklo_epi32(error, zero);
+ } else {
+ for (i = 0; i < block_size; i += 16) {
+ // Load 16 elements for coeff and dqcoeff.
+ const __m256i _coeff = load_tran_low(coeff);
+ const __m256i _dqcoeff = load_tran_low(dqcoeff);
+ const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
+ const __m256i error = _mm256_madd_epi16(diff, diff);
+      // Expand each doubleword of madd(dqcoeff - coeff) to a quadword.
+ const __m256i exp_error_lo = _mm256_unpacklo_epi32(error, zero);
+ const __m256i exp_error_hi = _mm256_unpackhi_epi32(error, zero);
+      // Accumulate each quadword of madd(dqcoeff - coeff).
+ sse_256 = _mm256_add_epi64(sse_256, exp_error_lo);
+ sse_256 = _mm256_add_epi64(sse_256, exp_error_hi);
+ coeff += 16;
+ dqcoeff += 16;
+ }
+ }
+  // Save the upper 64 bits of each 128-bit lane.
+  sse_hi = _mm256_srli_si256(sse_256, 8);
+  // Add the upper 64 bits to the lower 64 bits.
+ sse_256 = _mm256_add_epi64(sse_256, sse_hi);
+
+  // Add the 64-bit sums from the two 128-bit lanes of the 256-bit register.
+ sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256),
+ _mm256_extractf128_si256(sse_256, 1));
+
+ // Store the results.
+ _mm_storel_epi64((__m128i *)&sse, sse_128);
+ return sse;
+}
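[Editor's note: for reference, the quantity this AVX2 routine accumulates is simply the sum of squared differences between the dequantized and original coefficients. A scalar model follows; int16_t stands in for tran_low_t in the low-bitdepth build, block_size is a multiple of 16 as in the vector loop, and the function name is hypothetical:]

  #include <stdint.h>

  /* Scalar model of vp9_block_error_fp: sum over the block of
   * (dqcoeff[i] - coeff[i])^2, widened to 64 bits. */
  static int64_t block_error_fp_sketch(const int16_t *coeff,
                                       const int16_t *dqcoeff,
                                       int block_size) {
    int64_t sse = 0;
    int i;
    for (i = 0; i < block_size; ++i) {
      const int diff = dqcoeff[i] - coeff[i];
      sse += (int64_t)diff * diff;
    }
    return sse;
  }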
diff --git a/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
index b53714a02..7685e7bc3 100644
--- a/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
+++ b/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -13,159 +13,738 @@
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
+#include "vpx_dsp/x86/convolve_ssse3.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_scale/yv12config.h"
-extern void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
- YV12_BUFFER_CONFIG *dst,
- uint8_t filter_type, int phase_scaler);
+static INLINE __m128i scale_plane_2_to_1_phase_0_kernel(
+ const uint8_t *const src, const __m128i *const mask) {
+ const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0]));
+ const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16]));
+ const __m128i a_and = _mm_and_si128(a, *mask);
+ const __m128i b_and = _mm_and_si128(b, *mask);
+ return _mm_packus_epi16(a_and, b_and);
+}
-static void downsample_2_to_1_ssse3(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride, int w,
- int h) {
+static void scale_plane_2_to_1_phase_0(const uint8_t *src,
+ const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const int dst_w, const int dst_h) {
+ const int max_width = (dst_w + 15) & ~15;
const __m128i mask = _mm_set1_epi16(0x00FF);
- const int max_width = w & ~15;
- int y;
- for (y = 0; y < h; ++y) {
- int x;
- for (x = 0; x < max_width; x += 16) {
- const __m128i a = _mm_loadu_si128((const __m128i *)(src + x * 2 + 0));
- const __m128i b = _mm_loadu_si128((const __m128i *)(src + x * 2 + 16));
- const __m128i a_and = _mm_and_si128(a, mask);
- const __m128i b_and = _mm_and_si128(b, mask);
- const __m128i c = _mm_packus_epi16(a_and, b_and);
- _mm_storeu_si128((__m128i *)(dst + x), c);
- }
- for (; x < w; ++x) dst[x] = src[x * 2];
- src += src_stride * 2;
- dst += dst_stride;
- }
+ int y = dst_h;
+
+ do {
+ int x = max_width;
+ do {
+ const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask);
+ _mm_storeu_si128((__m128i *)dst, d);
+ src += 32;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src += 2 * (src_stride - max_width);
+ dst += dst_stride - max_width;
+ } while (--y);
}
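[Editor's note: the phase-0 2:1 path keeps every second pixel; the 0x00FF mask plus packus above is a vectorized way of gathering the even bytes, and the 4:1 variant below keeps every fourth pixel analogously. A scalar model, with a hypothetical function name:]

  #include <stdint.h>

  /* Scalar model of phase-0 2:1 downsampling: dst[x] = src[2 * x], and rows
   * advance by two source strides per output row. */
  static void scale_2_to_1_phase_0_sketch(const uint8_t *src, int src_stride,
                                          uint8_t *dst, int dst_stride,
                                          int dst_w, int dst_h) {
    int x, y;
    for (y = 0; y < dst_h; ++y) {
      for (x = 0; x < dst_w; ++x) dst[x] = src[2 * x];
      src += 2 * src_stride;
      dst += dst_stride;
    }
  }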
-static INLINE __m128i filter(const __m128i *const a, const __m128i *const b,
- const __m128i *const c, const __m128i *const d,
- const __m128i *const e, const __m128i *const f,
- const __m128i *const g, const __m128i *const h) {
- const __m128i coeffs_ab =
- _mm_set_epi8(6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1);
- const __m128i coeffs_cd = _mm_set_epi8(78, -19, 78, -19, 78, -19, 78, -19, 78,
- -19, 78, -19, 78, -19, 78, -19);
- const __m128i const64_x16 = _mm_set1_epi16(64);
- const __m128i ab = _mm_unpacklo_epi8(*a, *b);
- const __m128i cd = _mm_unpacklo_epi8(*c, *d);
- const __m128i fe = _mm_unpacklo_epi8(*f, *e);
- const __m128i hg = _mm_unpacklo_epi8(*h, *g);
- const __m128i ab_terms = _mm_maddubs_epi16(ab, coeffs_ab);
- const __m128i cd_terms = _mm_maddubs_epi16(cd, coeffs_cd);
- const __m128i fe_terms = _mm_maddubs_epi16(fe, coeffs_cd);
- const __m128i hg_terms = _mm_maddubs_epi16(hg, coeffs_ab);
- // can not overflow
- const __m128i abcd_terms = _mm_add_epi16(ab_terms, cd_terms);
- // can not overflow
- const __m128i fehg_terms = _mm_add_epi16(fe_terms, hg_terms);
- // can overflow, use saturating add
- const __m128i terms = _mm_adds_epi16(abcd_terms, fehg_terms);
- const __m128i round = _mm_adds_epi16(terms, const64_x16);
- const __m128i shift = _mm_srai_epi16(round, 7);
- return _mm_packus_epi16(shift, shift);
+static void scale_plane_4_to_1_phase_0(const uint8_t *src,
+ const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const int dst_w, const int dst_h) {
+ const int max_width = (dst_w + 15) & ~15;
+ const __m128i mask = _mm_set1_epi32(0x000000FF);
+ int y = dst_h;
+
+ do {
+ int x = max_width;
+ do {
+ const __m128i d0 = scale_plane_2_to_1_phase_0_kernel(&src[0], &mask);
+ const __m128i d1 = scale_plane_2_to_1_phase_0_kernel(&src[32], &mask);
+ const __m128i d2 = _mm_packus_epi16(d0, d1);
+ _mm_storeu_si128((__m128i *)dst, d2);
+ src += 64;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src += 4 * (src_stride - max_width);
+ dst += dst_stride - max_width;
+ } while (--y);
}
-static void eight_tap_row_ssse3(const uint8_t *src, uint8_t *dst, int w) {
- const int max_width = w & ~7;
- int x = 0;
- for (; x < max_width; x += 8) {
- const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x + 0));
- const __m128i b = _mm_loadl_epi64((const __m128i *)(src + x + 1));
- const __m128i c = _mm_loadl_epi64((const __m128i *)(src + x + 2));
- const __m128i d = _mm_loadl_epi64((const __m128i *)(src + x + 3));
- const __m128i e = _mm_loadl_epi64((const __m128i *)(src + x + 4));
- const __m128i f = _mm_loadl_epi64((const __m128i *)(src + x + 5));
- const __m128i g = _mm_loadl_epi64((const __m128i *)(src + x + 6));
- const __m128i h = _mm_loadl_epi64((const __m128i *)(src + x + 7));
- const __m128i pack = filter(&a, &b, &c, &d, &e, &f, &g, &h);
- _mm_storel_epi64((__m128i *)(dst + x), pack);
- }
+static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s,
+ const __m128i c0c1) {
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1);
+ const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1);
+  // Round, then shift each 16-bit element right by 7 bits.
+ const __m128i t2 = _mm_adds_epi16(t0, k_64);
+ const __m128i t3 = _mm_adds_epi16(t1, k_64);
+ const __m128i t4 = _mm_srai_epi16(t2, 7);
+ const __m128i t5 = _mm_srai_epi16(t3, 7);
+ return _mm_packus_epi16(t4, t5);
}
-static void upsample_1_to_2_ssse3(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride, int dst_w,
- int dst_h) {
- dst_w /= 2;
- dst_h /= 2;
- {
- DECLARE_ALIGNED(16, uint8_t, tmp[1920 * 8]);
- uint8_t *tmp0 = tmp + dst_w * 0;
- uint8_t *tmp1 = tmp + dst_w * 1;
- uint8_t *tmp2 = tmp + dst_w * 2;
- uint8_t *tmp3 = tmp + dst_w * 3;
- uint8_t *tmp4 = tmp + dst_w * 4;
- uint8_t *tmp5 = tmp + dst_w * 5;
- uint8_t *tmp6 = tmp + dst_w * 6;
- uint8_t *tmp7 = tmp + dst_w * 7;
- uint8_t *tmp8 = NULL;
- const int max_width = dst_w & ~7;
- int y;
- eight_tap_row_ssse3(src - src_stride * 3 - 3, tmp0, dst_w);
- eight_tap_row_ssse3(src - src_stride * 2 - 3, tmp1, dst_w);
- eight_tap_row_ssse3(src - src_stride * 1 - 3, tmp2, dst_w);
- eight_tap_row_ssse3(src + src_stride * 0 - 3, tmp3, dst_w);
- eight_tap_row_ssse3(src + src_stride * 1 - 3, tmp4, dst_w);
- eight_tap_row_ssse3(src + src_stride * 2 - 3, tmp5, dst_w);
- eight_tap_row_ssse3(src + src_stride * 3 - 3, tmp6, dst_w);
- for (y = 0; y < dst_h; y++) {
- int x;
- eight_tap_row_ssse3(src + src_stride * 4 - 3, tmp7, dst_w);
- for (x = 0; x < max_width; x += 8) {
- const __m128i A = _mm_loadl_epi64((const __m128i *)(src + x));
- const __m128i B = _mm_loadl_epi64((const __m128i *)(tmp3 + x));
- const __m128i AB = _mm_unpacklo_epi8(A, B);
- __m128i C, D, CD;
- _mm_storeu_si128((__m128i *)(dst + x * 2), AB);
- {
- const __m128i a =
- _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 3));
- const __m128i b =
- _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 2));
- const __m128i c =
- _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 1));
- const __m128i d =
- _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 0));
- const __m128i e =
- _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 1));
- const __m128i f =
- _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 2));
- const __m128i g =
- _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 3));
- const __m128i h =
- _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 4));
- C = filter(&a, &b, &c, &d, &e, &f, &g, &h);
- }
- {
- const __m128i a = _mm_loadl_epi64((const __m128i *)(tmp0 + x));
- const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp1 + x));
- const __m128i c = _mm_loadl_epi64((const __m128i *)(tmp2 + x));
- const __m128i d = _mm_loadl_epi64((const __m128i *)(tmp3 + x));
- const __m128i e = _mm_loadl_epi64((const __m128i *)(tmp4 + x));
- const __m128i f = _mm_loadl_epi64((const __m128i *)(tmp5 + x));
- const __m128i g = _mm_loadl_epi64((const __m128i *)(tmp6 + x));
- const __m128i h = _mm_loadl_epi64((const __m128i *)(tmp7 + x));
- D = filter(&a, &b, &c, &d, &e, &f, &g, &h);
- }
- CD = _mm_unpacklo_epi8(C, D);
- _mm_storeu_si128((__m128i *)(dst + x * 2 + dst_stride), CD);
- }
- src += src_stride;
- dst += dst_stride * 2;
- tmp8 = tmp0;
- tmp0 = tmp1;
- tmp1 = tmp2;
- tmp2 = tmp3;
- tmp3 = tmp4;
- tmp4 = tmp5;
- tmp5 = tmp6;
- tmp6 = tmp7;
- tmp7 = tmp8;
+static void scale_plane_2_to_1_bilinear(const uint8_t *src,
+ const ptrdiff_t src_stride,
+ uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const int dst_w, const int dst_h,
+ const __m128i c0c1) {
+ const int max_width = (dst_w + 15) & ~15;
+ int y = dst_h;
+
+ do {
+ int x = max_width;
+ do {
+ __m128i s[2], d[2];
+
+ // Horizontal
+ // Even rows
+ s[0] = _mm_loadu_si128((const __m128i *)(src + 0));
+ s[1] = _mm_loadu_si128((const __m128i *)(src + 16));
+ d[0] = scale_plane_bilinear_kernel(s, c0c1);
+
+      // Odd rows
+ s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
+ s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
+ d[1] = scale_plane_bilinear_kernel(s, c0c1);
+
+ // Vertical
+ s[0] = _mm_unpacklo_epi8(d[0], d[1]);
+ s[1] = _mm_unpackhi_epi8(d[0], d[1]);
+ d[0] = scale_plane_bilinear_kernel(s, c0c1);
+
+ _mm_storeu_si128((__m128i *)dst, d[0]);
+ src += 32;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src += 2 * (src_stride - max_width);
+ dst += dst_stride - max_width;
+ } while (--y);
+}
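[Editor's note: each call to scale_plane_bilinear_kernel applies the same 2-tap filter, first horizontally on adjacent pixel pairs and then vertically on the two horizontal results. A scalar model of one tap application; c0 and c1 are the two nonzero bilinear coefficients at the chosen phase, non-negative and summing to 128, so no clamp is needed. The function name is hypothetical:]

  #include <stdint.h>

  /* Scalar model of the bilinear kernel: weighted average with rounding
   * (+64) and shift (>> 7), matching the adds/srai pair in the SIMD code. */
  static uint8_t bilinear_tap_sketch(uint8_t p0, uint8_t p1, int c0, int c1) {
    return (uint8_t)((c0 * p0 + c1 * p1 + 64) >> 7);
  }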
+
+static void scale_plane_4_to_1_bilinear(const uint8_t *src,
+ const ptrdiff_t src_stride,
+ uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const int dst_w, const int dst_h,
+ const __m128i c0c1) {
+ const int max_width = (dst_w + 15) & ~15;
+ int y = dst_h;
+
+ do {
+ int x = max_width;
+ do {
+ __m128i s[8], d[8];
+
+      // Note: Using _mm_packus_epi32() from SSE4.1 could be faster.
+      // Here we avoid shuffle instructions, which are slow on some x86
+      // CPUs.
+
+ // Horizontal
+ // 000 001 xx xx 004 005 xx xx 008 009 xx xx 00C 00D xx xx
+ // 010 011 xx xx 014 015 xx xx 018 019 xx xx 01C 01D xx xx
+ // 020 021 xx xx 024 025 xx xx 028 029 xx xx 02C 02D xx xx
+ // 030 031 xx xx 034 035 xx xx 038 039 xx xx 03C 03D xx xx
+ // 100 101 xx xx 104 105 xx xx 108 109 xx xx 10C 10D xx xx
+ // 110 111 xx xx 114 115 xx xx 118 119 xx xx 11C 11D xx xx
+ // 120 121 xx xx 124 125 xx xx 128 129 xx xx 12C 12D xx xx
+ // 130 131 xx xx 134 135 xx xx 138 139 xx xx 13C 13D xx xx
+ s[0] = _mm_loadu_si128((const __m128i *)(&src[0]));
+ s[1] = _mm_loadu_si128((const __m128i *)(&src[16]));
+ s[2] = _mm_loadu_si128((const __m128i *)(&src[32]));
+ s[3] = _mm_loadu_si128((const __m128i *)(&src[48]));
+ s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
+ s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
+ s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32));
+ s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48));
+
+ // 000 001 100 101 xx xx xx xx 004 005 104 105 xx xx xx xx
+ // 008 009 108 109 xx xx xx xx 00C 00D 10C 10D xx xx xx xx
+ // 010 011 110 111 xx xx xx xx 014 015 114 115 xx xx xx xx
+ // 018 019 118 119 xx xx xx xx 01C 01D 11C 11D xx xx xx xx
+ // 020 021 120 121 xx xx xx xx 024 025 124 125 xx xx xx xx
+ // 028 029 128 129 xx xx xx xx 02C 02D 12C 12D xx xx xx xx
+ // 030 031 130 131 xx xx xx xx 034 035 134 135 xx xx xx xx
+ // 038 039 138 139 xx xx xx xx 03C 03D 13C 13D xx xx xx xx
+ d[0] = _mm_unpacklo_epi16(s[0], s[4]);
+ d[1] = _mm_unpackhi_epi16(s[0], s[4]);
+ d[2] = _mm_unpacklo_epi16(s[1], s[5]);
+ d[3] = _mm_unpackhi_epi16(s[1], s[5]);
+ d[4] = _mm_unpacklo_epi16(s[2], s[6]);
+ d[5] = _mm_unpackhi_epi16(s[2], s[6]);
+ d[6] = _mm_unpacklo_epi16(s[3], s[7]);
+ d[7] = _mm_unpackhi_epi16(s[3], s[7]);
+
+ // 000 001 100 101 008 009 108 109 xx xx xx xx xx xx xx xx
+ // 004 005 104 105 00C 00D 10C 10D xx xx xx xx xx xx xx xx
+ // 010 011 110 111 018 019 118 119 xx xx xx xx xx xx xx xx
+ // 014 015 114 115 01C 01D 11C 11D xx xx xx xx xx xx xx xx
+ // 020 021 120 121 028 029 128 129 xx xx xx xx xx xx xx xx
+ // 024 025 124 125 02C 02D 12C 12D xx xx xx xx xx xx xx xx
+ // 030 031 130 131 038 039 138 139 xx xx xx xx xx xx xx xx
+ // 034 035 134 135 03C 03D 13C 13D xx xx xx xx xx xx xx xx
+ s[0] = _mm_unpacklo_epi32(d[0], d[1]);
+ s[1] = _mm_unpackhi_epi32(d[0], d[1]);
+ s[2] = _mm_unpacklo_epi32(d[2], d[3]);
+ s[3] = _mm_unpackhi_epi32(d[2], d[3]);
+ s[4] = _mm_unpacklo_epi32(d[4], d[5]);
+ s[5] = _mm_unpackhi_epi32(d[4], d[5]);
+ s[6] = _mm_unpacklo_epi32(d[6], d[7]);
+ s[7] = _mm_unpackhi_epi32(d[6], d[7]);
+
+ // 000 001 100 101 004 005 104 105 008 009 108 109 00C 00D 10C 10D
+ // 010 011 110 111 014 015 114 115 018 019 118 119 01C 01D 11C 11D
+ // 020 021 120 121 024 025 124 125 028 029 128 129 02C 02D 12C 12D
+ // 030 031 130 131 034 035 134 135 038 039 138 139 03C 03D 13C 13D
+ d[0] = _mm_unpacklo_epi32(s[0], s[1]);
+ d[1] = _mm_unpacklo_epi32(s[2], s[3]);
+ d[2] = _mm_unpacklo_epi32(s[4], s[5]);
+ d[3] = _mm_unpacklo_epi32(s[6], s[7]);
+
+ d[0] = scale_plane_bilinear_kernel(&d[0], c0c1);
+ d[1] = scale_plane_bilinear_kernel(&d[2], c0c1);
+
+ // Vertical
+ d[0] = scale_plane_bilinear_kernel(d, c0c1);
+
+ _mm_storeu_si128((__m128i *)dst, d[0]);
+ src += 64;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src += 4 * (src_stride - max_width);
+ dst += dst_stride - max_width;
+ } while (--y);
+}
+
+static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const int16_t *const coef,
+ uint8_t *const temp_buffer) {
+ const int width_hor = (w + 3) & ~3;
+ const int width_ver = (w + 7) & ~7;
+ const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+ const int height_ver = (h + 3) & ~3;
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ __m128i s[11], d[4];
+ __m128i f[4];
+
+ assert(w && h);
+
+ shuffle_filter_ssse3(coef, f);
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;
+
+ // horizontal 4x8
+ do {
+ load_8bit_8x8(src + 2, src_stride, s);
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped)
+ transpose_16bit_4x8(s, s);
+ x = width_hor;
+
+ do {
+ src += 8;
+ load_8bit_8x8(src, src_stride, &s[3]);
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
+ // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79
+ // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B
+ // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D
+ transpose_16bit_4x8(&s[3], &s[3]);
+
+ d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70
+ d[1] = convolve8_8_ssse3(&s[1], f); // 01 11 21 31 41 51 61 71
+ d[2] = convolve8_8_ssse3(&s[2], f); // 02 12 22 32 42 52 62 72
+ d[3] = convolve8_8_ssse3(&s[3], f); // 03 13 23 33 43 53 63 73
+
+ // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72
+ // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73
+ d[0] = _mm_packus_epi16(d[0], d[2]);
+ d[1] = _mm_packus_epi16(d[1], d[3]);
+ // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71
+ // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73
+ d[2] = _mm_unpacklo_epi16(d[0], d[1]);
+ d[3] = _mm_unpackhi_epi16(d[0], d[1]);
+ // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33
+ // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73
+ d[0] = _mm_unpacklo_epi32(d[2], d[3]);
+ d[1] = _mm_unpackhi_epi32(d[2], d[3]);
+ store_8bit_8x4_from_16x2(d, t, 2 * width_hor);
+
+ s[0] = s[4];
+ s[1] = s[5];
+ s[2] = s[6];
+
+ t += 8;
+ x -= 4;
+ } while (x);
+ src += 8 * src_stride - 2 * width_hor;
+ t += 6 * width_hor;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x4
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
+ s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
+ s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor));
+ t += 6 * width_hor;
+ y = height_ver;
+
+ do {
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+      // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97
+      // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7
+      // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7
+ loadu_8bit_16x4(t, 2 * width_hor, &s[3]);
+ t += 8 * width_hor;
+
+ d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07
+ d[1] = convolve8_8_ssse3(&s[1], f); // 10 11 12 13 14 15 16 17
+ d[2] = convolve8_8_ssse3(&s[2], f); // 20 21 22 23 24 25 26 27
+ d[3] = convolve8_8_ssse3(&s[3], f); // 30 31 32 33 34 35 36 37
+
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
+ d[0] = _mm_packus_epi16(d[0], d[1]);
+ d[1] = _mm_packus_epi16(d[2], d[3]);
+ store_8bit_8x4_from_16x2(d, dst, dst_stride);
+
+ s[0] = s[4];
+ s[1] = s[5];
+ s[2] = s[6];
+
+ dst += 4 * dst_stride;
+ y -= 4;
+ } while (y);
+ t -= width_hor * (2 * height_ver + 6);
+ t += 16;
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const int16_t *const coef,
+ uint8_t *const temp_buffer) {
+ const int width_hor = (w + 1) & ~1;
+ const int width_ver = (w + 7) & ~7;
+ const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+ const int height_ver = (h + 1) & ~1;
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ __m128i s[11], d[4];
+ __m128i f[4];
+
+ assert(w && h);
+
+ shuffle_filter_ssse3(coef, f);
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3;
+
+ // horizontal 2x8
+ do {
+ load_8bit_8x8(src + 4, src_stride, s);
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 (overlapped)
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped)
+ transpose_16bit_4x8(s, s);
+ x = width_hor;
+
+ do {
+ src += 8;
+ load_8bit_8x8(src, src_stride, &s[2]);
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
+ // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79
+ // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B
+ transpose_16bit_4x8(&s[2], &s[2]);
+
+ d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70
+ d[1] = convolve8_8_ssse3(&s[2], f); // 01 11 21 31 41 51 61 71
+
+ // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
+ // 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+ d[0] = _mm_packus_epi16(d[0], d[0]);
+ d[1] = _mm_packus_epi16(d[1], d[1]);
+ // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71
+ d[0] = _mm_unpacklo_epi16(d[0], d[1]);
+ store_8bit_4x4_sse2(d[0], t, 2 * width_hor);
+
+ s[0] = s[4];
+ s[1] = s[5];
+
+ t += 4;
+ x -= 2;
+ } while (x);
+ src += 8 * src_stride - 4 * width_hor;
+ t += 6 * width_hor;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x2
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
+ s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
+ t += 4 * width_hor;
+ y = height_ver;
+
+ do {
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+      // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97
+      // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7
+ loadu_8bit_16x4(t, 2 * width_hor, &s[2]);
+ t += 8 * width_hor;
+
+ d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07
+ d[1] = convolve8_8_ssse3(&s[2], f); // 10 11 12 13 14 15 16 17
+
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ d[0] = _mm_packus_epi16(d[0], d[1]);
+ _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
+ _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
+
+ s[0] = s[4];
+ s[1] = s[5];
+
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ t -= width_hor * (4 * height_ver + 4);
+ t += 16;
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+typedef void (*shuffle_filter_funcs)(const int16_t *const filter,
+ __m128i *const f);
+
+typedef __m128i (*convolve8_funcs)(const __m128i *const s,
+ const __m128i *const f);
+
+static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const InterpKernel *const coef,
+ const int phase_scaler,
+ uint8_t *const temp_buffer) {
+ static const int step_q4 = 16 * 4 / 3;
+ const int width_hor = (w + 5) - ((w + 5) % 6);
+ const int stride_hor = 2 * width_hor + 4; // store 4 extra pixels
+ const int width_ver = (w + 7) & ~7;
+ // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
+ // above and (SUBPEL_TAPS / 2) extra rows below.
+ const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+ const int height_ver = (h + 5) - ((h + 5) % 6);
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ __m128i s[12], d[6], dd[4];
+ __m128i f0[4], f1[5], f2[5];
+ // The offset of the first row is always less than 1 pixel.
+ const int offset1_q4 = phase_scaler + 1 * step_q4;
+ const int offset2_q4 = phase_scaler + 2 * step_q4;
+  // offset_idxx indicates whether the pixel offset is even (0) or odd (1).
+  // It is used to choose the src offset and the filter coefficient offset.
+ const int offset_idx1 = (offset1_q4 >> 4) & 1;
+ const int offset_idx2 = (offset2_q4 >> 4) & 1;
+ static const shuffle_filter_funcs shuffle_filter_funcs[2] = {
+ shuffle_filter_ssse3, shuffle_filter_odd_ssse3
+ };
+ static const convolve8_funcs convolve8_funcs[2] = {
+ convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3
+ };
+
+ assert(w && h);
+
+ shuffle_filter_ssse3(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK], f0);
+ shuffle_filter_funcs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1);
+ shuffle_filter_funcs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2);
+
+  // Subtract 64 to avoid overflow.
+  // A coefficient of 128 would be treated as -128 by PMADDUBSW, so subtract
+  // 64 from it. Coefficient 128 is in either fx[1] or fx[2], depending on
+  // the phase index. When the filter phase index is 1, the two biggest
+  // coefficients are shuffled together and their sum is always at least 128,
+  // so subtract 64 there as well. After the subtraction, as long as the sum
+  // of all positive coefficients is no larger than 128 and the sum of all
+  // negative coefficients is no less than -128, the convolve8 functions
+  // cannot overflow.
+ f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64));
+ f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64));
+ f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64));
+
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1;
+
+ // horizontal 6x8
+ do {
+ load_8bit_8x8(src, src_stride, s);
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
+ transpose_16bit_4x8(s, s);
+ x = width_hor;
+
+ do {
+ src += 8;
+ load_8bit_8x8(src, src_stride, &s[4]);
+ // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79
+ // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B
+      // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D
+ // 0E 0F 1E 1F 2E 2F 3E 3F 4E 4F 5E 5F 6E 6F 7E 7F
+ transpose_16bit_4x8(&s[4], &s[4]);
+
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
+ d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
+ d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
+ d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
+ d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+ d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+
+ // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72
+ // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx
+ // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx
+ dd[0] = _mm_packus_epi16(d[0], d[2]);
+ dd[1] = _mm_packus_epi16(d[1], d[3]);
+ dd[2] = _mm_packus_epi16(d[4], d[4]);
+ dd[3] = _mm_packus_epi16(d[5], d[5]);
+
+ // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71
+ // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73
+ // 04 14 05 15 24 34 25 35 44 54 45 55 64 74 65 75
+ d[0] = _mm_unpacklo_epi16(dd[0], dd[1]);
+ d[1] = _mm_unpackhi_epi16(dd[0], dd[1]);
+ d[2] = _mm_unpacklo_epi16(dd[2], dd[3]);
+
+ // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33
+ // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73
+ // 04 14 05 15 xx xx xx xx 24 34 25 35 xx xx xx xx
+ // 44 54 45 55 xx xx xx xx 64 74 65 75 xx xx xx xx
+ dd[0] = _mm_unpacklo_epi32(d[0], d[1]);
+ dd[1] = _mm_unpackhi_epi32(d[0], d[1]);
+ dd[2] = _mm_unpacklo_epi32(d[2], d[2]);
+ dd[3] = _mm_unpackhi_epi32(d[2], d[2]);
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 xx xx xx xx
+ // 20 30 21 31 22 32 23 33 24 34 25 35 xx xx xx xx
+ // 40 50 41 51 42 52 43 53 44 54 45 55 xx xx xx xx
+ // 60 70 61 71 62 72 63 73 64 74 65 75 xx xx xx xx
+ d[0] = _mm_unpacklo_epi64(dd[0], dd[2]);
+ d[1] = _mm_unpackhi_epi64(dd[0], dd[2]);
+ d[2] = _mm_unpacklo_epi64(dd[1], dd[3]);
+ d[3] = _mm_unpackhi_epi64(dd[1], dd[3]);
+
+ // store 4 extra pixels
+ storeu_8bit_16x4(d, t, stride_hor);
+
+ s[0] = s[4];
+ s[1] = s[5];
+ s[2] = s[6];
+ s[3] = s[7];
+
+ t += 12;
+ x -= 6;
+ } while (x);
+ src += 8 * src_stride - 4 * width_hor / 3;
+ t += 3 * stride_hor + 4;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x6
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ loadu_8bit_16x4(t, stride_hor, s);
+ y = height_ver;
+
+ do {
+ // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97
+ // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7
+ // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7
+ // E0 F0 E1 F1 E2 F2 E3 F3 E4 F4 E5 F5 E6 F6 E7 F7
+ t += 4 * stride_hor;
+ loadu_8bit_16x4(t, stride_hor, &s[4]);
+
+ d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
+ d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
+ d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
+ d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
+ d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+ d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57
+ d[0] = _mm_packus_epi16(d[0], d[1]);
+ d[2] = _mm_packus_epi16(d[2], d[3]);
+ d[4] = _mm_packus_epi16(d[4], d[5]);
+
+ _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
+ _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
+ _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]);
+ _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]);
+ _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]);
+ _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]);
+
+ s[0] = s[4];
+ s[1] = s[5];
+ s[2] = s[6];
+ s[3] = s[7];
+
+ dst += 6 * dst_stride;
+ y -= 6;
+ } while (y);
+ t -= stride_hor * 2 * height_ver / 3;
+ t += 16;
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s,
+ const __m128i *const f) {
+ __m128i ss[4], temp;
+
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
+ temp = convolve8_8_ssse3(ss, f);
+ return _mm_packus_epi16(temp, temp);
+}
+
+// Only calculate odd columns; even columns are just copies of the src pixels.
+static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst,
+ const int w, const __m128i *const f) {
+ int x = w;
+
+ do {
+ __m128i s[8], temp;
+ s[0] = _mm_loadl_epi64((const __m128i *)(src + 0));
+ s[1] = _mm_loadl_epi64((const __m128i *)(src + 1));
+ s[2] = _mm_loadl_epi64((const __m128i *)(src + 2));
+ s[3] = _mm_loadl_epi64((const __m128i *)(src + 3));
+ s[4] = _mm_loadl_epi64((const __m128i *)(src + 4));
+ s[5] = _mm_loadl_epi64((const __m128i *)(src + 5));
+ s[6] = _mm_loadl_epi64((const __m128i *)(src + 6));
+ s[7] = _mm_loadl_epi64((const __m128i *)(src + 7));
+ temp = scale_1_to_2_phase_0_kernel(s, f);
+ _mm_storel_epi64((__m128i *)dst, temp);
+ src += 8;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+static void scale_plane_1_to_2_phase_0(const uint8_t *src,
+ const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const int src_w, const int src_h,
+ const int16_t *const coef,
+ uint8_t *const temp_buffer) {
+ int max_width;
+ int y;
+ uint8_t *tmp[9];
+ __m128i f[4];
+
+ max_width = (src_w + 7) & ~7;
+ tmp[0] = temp_buffer + 0 * max_width;
+ tmp[1] = temp_buffer + 1 * max_width;
+ tmp[2] = temp_buffer + 2 * max_width;
+ tmp[3] = temp_buffer + 3 * max_width;
+ tmp[4] = temp_buffer + 4 * max_width;
+ tmp[5] = temp_buffer + 5 * max_width;
+ tmp[6] = temp_buffer + 6 * max_width;
+ tmp[7] = temp_buffer + 7 * max_width;
+
+ shuffle_filter_ssse3(coef, f);
+
+ scale_1_to_2_phase_0_row(src - 3 * src_stride - 3, tmp[0], max_width, f);
+ scale_1_to_2_phase_0_row(src - 2 * src_stride - 3, tmp[1], max_width, f);
+ scale_1_to_2_phase_0_row(src - 1 * src_stride - 3, tmp[2], max_width, f);
+ scale_1_to_2_phase_0_row(src + 0 * src_stride - 3, tmp[3], max_width, f);
+ scale_1_to_2_phase_0_row(src + 1 * src_stride - 3, tmp[4], max_width, f);
+ scale_1_to_2_phase_0_row(src + 2 * src_stride - 3, tmp[5], max_width, f);
+ scale_1_to_2_phase_0_row(src + 3 * src_stride - 3, tmp[6], max_width, f);
+
+ y = src_h;
+ do {
+ int x;
+ scale_1_to_2_phase_0_row(src + 4 * src_stride - 3, tmp[7], max_width, f);
+ for (x = 0; x < max_width; x += 8) {
+ __m128i s[8], C, D, CD;
+
+ // Even rows
+ const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x));
+ const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp[3] + x));
+ const __m128i ab = _mm_unpacklo_epi8(a, b);
+ _mm_storeu_si128((__m128i *)(dst + 2 * x), ab);
+
+ // Odd rows
+ // Even columns
+ load_8bit_8x8(src + x - 3 * src_stride, src_stride, s);
+ C = scale_1_to_2_phase_0_kernel(s, f);
+
+ // Odd columns
+ s[0] = _mm_loadl_epi64((const __m128i *)(tmp[0] + x));
+ s[1] = _mm_loadl_epi64((const __m128i *)(tmp[1] + x));
+ s[2] = _mm_loadl_epi64((const __m128i *)(tmp[2] + x));
+ s[3] = _mm_loadl_epi64((const __m128i *)(tmp[3] + x));
+ s[4] = _mm_loadl_epi64((const __m128i *)(tmp[4] + x));
+ s[5] = _mm_loadl_epi64((const __m128i *)(tmp[5] + x));
+ s[6] = _mm_loadl_epi64((const __m128i *)(tmp[6] + x));
+ s[7] = _mm_loadl_epi64((const __m128i *)(tmp[7] + x));
+ D = scale_1_to_2_phase_0_kernel(s, f);
+
+ CD = _mm_unpacklo_epi8(C, D);
+ _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * x), CD);
}
- }
+
+ src += src_stride;
+ dst += 2 * dst_stride;
+ tmp[8] = tmp[0];
+ tmp[0] = tmp[1];
+ tmp[1] = tmp[2];
+ tmp[2] = tmp[3];
+ tmp[3] = tmp[4];
+ tmp[4] = tmp[5];
+ tmp[5] = tmp[6];
+ tmp[6] = tmp[7];
+ tmp[7] = tmp[8];
+ } while (--y);
}
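[Editor's note: in the phase-0 1:2 upscale, even output pixels are verbatim copies of the source and odd pixels are an 8-tap half-pel interpolation; the same kernel is reused vertically via the tmp[] row ring buffer. A scalar model of one interpolated pixel, with src pointing at the left neighbor of the half-pel position and coef an 8-tap kernel summing to 128. The function name is hypothetical:]

  #include <stdint.h>

  /* Scalar model of the 8-tap half-pel interpolation: taps cover
   * src[-3] .. src[4], with rounding and a clamp, since 8-tap filters can
   * overshoot [0, 255]. The caller must guarantee the border pixels exist. */
  static uint8_t interp8_sketch(const uint8_t *src, const int16_t *coef) {
    int k, sum = 64; /* rounding */
    for (k = 0; k < 8; ++k) sum += coef[k] * src[k - 3];
    sum >>= 7;
    return (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
  }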
void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
@@ -177,30 +756,152 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
const int dst_h = dst->y_crop_height;
const int dst_uv_w = dst_w / 2;
const int dst_uv_h = dst_h / 2;
+ int scaled = 0;
- if (dst_w * 2 == src_w && dst_h * 2 == src_h && phase_scaler == 0) {
- downsample_2_to_1_ssse3(src->y_buffer, src->y_stride, dst->y_buffer,
- dst->y_stride, dst_w, dst_h);
- downsample_2_to_1_ssse3(src->u_buffer, src->uv_stride, dst->u_buffer,
- dst->uv_stride, dst_uv_w, dst_uv_h);
- downsample_2_to_1_ssse3(src->v_buffer, src->uv_stride, dst->v_buffer,
- dst->uv_stride, dst_uv_w, dst_uv_h);
- vpx_extend_frame_borders(dst);
- } else if (dst_w == src_w * 2 && dst_h == src_h * 2 && phase_scaler == 0) {
- // The upsample() supports widths up to 1920 * 2. If greater, fall back
- // to vp9_scale_and_extend_frame_c().
- if (dst_w / 2 <= 1920) {
- upsample_1_to_2_ssse3(src->y_buffer, src->y_stride, dst->y_buffer,
- dst->y_stride, dst_w, dst_h);
- upsample_1_to_2_ssse3(src->u_buffer, src->uv_stride, dst->u_buffer,
- dst->uv_stride, dst_uv_w, dst_uv_h);
- upsample_1_to_2_ssse3(src->v_buffer, src->uv_stride, dst->v_buffer,
- dst->uv_stride, dst_uv_w, dst_uv_h);
- vpx_extend_frame_borders(dst);
+ // phase_scaler is usually 0 or 8.
+ assert(phase_scaler >= 0 && phase_scaler < 16);
+
+ if (dst_w * 2 == src_w && dst_h * 2 == src_h) {
+ // 2 to 1
+ scaled = 1;
+
+ if (phase_scaler == 0) {
+ scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, dst_w, dst_h);
+ scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h);
+ scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h);
+ } else if (filter_type == BILINEAR) {
+ const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
+ const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
+ const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0
+ scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, dst_w, dst_h, c0c1);
+ scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
+ scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
+ } else {
+ const int buffer_stride = (dst_w + 3) & ~3;
+ const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
+ uint8_t *const temp_buffer =
+ (uint8_t *)malloc(buffer_stride * buffer_height);
+ if (temp_buffer) {
+ scale_plane_2_to_1_general(
+ src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+ dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
+ scale_plane_2_to_1_general(
+ src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+ dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+ temp_buffer);
+ scale_plane_2_to_1_general(
+ src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+ dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+ temp_buffer);
+ free(temp_buffer);
+ } else {
+ scaled = 0;
+ }
+ }
+ } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
+ // 4 to 1
+ scaled = 1;
+ if (phase_scaler == 0) {
+ scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, dst_w, dst_h);
+ scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h);
+ scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h);
+ } else if (filter_type == BILINEAR) {
+ const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
+ const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
+ const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0
+ scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, dst_w, dst_h, c0c1);
+ scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
+ scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
} else {
- vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler);
+ const int buffer_stride = (dst_w + 1) & ~1;
+ const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
+      // When dst_w is 1 or 2, we need extra padding to avoid a heap read overflow.
+ const int extra_padding = 16;
+ uint8_t *const temp_buffer =
+ (uint8_t *)malloc(buffer_stride * buffer_height + extra_padding);
+ if (temp_buffer) {
+ scale_plane_4_to_1_general(
+ src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+ dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
+ scale_plane_4_to_1_general(
+ src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+ dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+ temp_buffer);
+ scale_plane_4_to_1_general(
+ src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+ dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+ temp_buffer);
+ free(temp_buffer);
+ } else {
+ scaled = 0;
+ }
+ }
+ } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
+ // 4 to 3
+ const int buffer_stride_hor = (dst_w + 5) - ((dst_w + 5) % 6) + 2;
+ const int buffer_stride_ver = (dst_w + 7) & ~7;
+ const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+    // When the vertical filter reads more pixels per row than the horizontal
+    // filter generated, extra padding is needed to avoid a heap read
+    // overflow. For example, the horizontal filter generates 18 pixels per
+    // row while the vertical filter reads 24. The difference is multiplied
+    // by 2 since two rows are interlaced together in the optimization.
+ const int extra_padding = (buffer_stride_ver > buffer_stride_hor)
+ ? 2 * (buffer_stride_ver - buffer_stride_hor)
+ : 0;
+ const int buffer_size = buffer_stride_hor * buffer_height + extra_padding;
+ uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size);
+ if (temp_buffer) {
+ scaled = 1;
+ scale_plane_4_to_3_general(
+ src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+ dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer);
+ scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h,
+ vp9_filter_kernels[filter_type], phase_scaler,
+ temp_buffer);
+ scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h,
+ vp9_filter_kernels[filter_type], phase_scaler,
+ temp_buffer);
+ free(temp_buffer);
+ }
+ } else if (dst_w == src_w * 2 && dst_h == src_h * 2 && phase_scaler == 0) {
+ // 1 to 2
+ uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_w + 7) & ~7));
+ if (temp_buffer) {
+ scaled = 1;
+ scale_plane_1_to_2_phase_0(
+ src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src_w,
+ src_h, vp9_filter_kernels[filter_type][8], temp_buffer);
+ scale_plane_1_to_2_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, src_w / 2, src_h / 2,
+ vp9_filter_kernels[filter_type][8],
+ temp_buffer);
+ scale_plane_1_to_2_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, src_w / 2, src_h / 2,
+ vp9_filter_kernels[filter_type][8],
+ temp_buffer);
+ free(temp_buffer);
}
+ }
+
+ if (scaled) {
+ vpx_extend_frame_borders(dst);
} else {
+    // Call the C version for all other scaling ratios.
vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler);
}
}
diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
index 4a2581a34..ca0ad4407 100644
--- a/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
#include <emmintrin.h>
#include <xmmintrin.h>
@@ -25,8 +26,12 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
__m128i zero;
__m128i thr;
int16_t nzflag;
+ __m128i eob;
+ __m128i round, quant, dequant;
(void)scan_ptr;
+ (void)skip_block;
+ assert(!skip_block);
coeff_ptr += n_coeffs;
iscan_ptr += n_coeffs;
@@ -35,40 +40,106 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
n_coeffs = -n_coeffs;
zero = _mm_setzero_si128();
- if (!skip_block) {
- __m128i eob;
- __m128i round, quant, dequant;
+ {
+ __m128i coeff0, coeff1;
+
+ // Setup global values
{
- __m128i coeff0, coeff1;
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ }
- // Setup global values
- {
- round = _mm_load_si128((const __m128i *)round_ptr);
- quant = _mm_load_si128((const __m128i *)quant_ptr);
- dequant = _mm_load_si128((const __m128i *)dequant_ptr);
- }
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+ // Do DC and first 15 AC
+ coeff0 = load_tran_low(coeff_ptr + n_coeffs);
+ coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ round = _mm_unpackhi_epi64(round, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
+ store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
+ store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
+ }
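[Editor's note: the "poor man's sign extract" above is the classic branch-free absolute value: with s = x >> 15 (arithmetic shift, which _mm_srai_epi16 guarantees per lane), |x| = (x ^ s) - s, and the identical XOR/subtract pair restores the sign after quantization. A scalar model with hypothetical function names:]

  #include <stdint.h>

  /* Branch-free abs for one 16-bit lane. An arithmetic right shift is
   * assumed, as in the SIMD code. s is 0 for non-negative x, -1 (all ones)
   * for negative x, so (x ^ s) - s equals ~x + 1 = -x when x is negative. */
  static int16_t abs16_sketch(int16_t x) {
    const int16_t s = (int16_t)(x >> 15);
    return (int16_t)((x ^ s) - s);
  }

  /* The same XOR/subtract pair re-applies the saved sign mask. */
  static int16_t apply_sign16_sketch(int16_t magnitude, int16_t sign_mask) {
    return (int16_t)((magnitude ^ sign_mask) - sign_mask);
  }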
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
- // Do DC and first 15 AC
- coeff0 = load_tran_low(coeff_ptr + n_coeffs);
- coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
-
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+ {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob = _mm_max_epi16(eob, eob1);
+ }
+ n_coeffs += 8 * 2;
+ }
+ thr = _mm_srai_epi16(dequant, 1);
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ __m128i coeff0, coeff1;
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+
+ coeff0 = load_tran_low(coeff_ptr + n_coeffs);
+ coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+ if (nzflag) {
qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- round = _mm_unpackhi_epi64(round, round);
qcoeff1 = _mm_adds_epi16(qcoeff1, round);
qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- quant = _mm_unpackhi_epi64(quant, quant);
qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
// Reinsert signs
@@ -81,131 +152,51 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- dequant = _mm_unpackhi_epi64(dequant, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
- }
+ } else {
+ store_zero_tran_low(qcoeff_ptr + n_coeffs);
+ store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
- {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob = _mm_max_epi16(eob, eob1);
+ store_zero_tran_low(dqcoeff_ptr + n_coeffs);
+ store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
}
- n_coeffs += 8 * 2;
}
- thr = _mm_srai_epi16(dequant, 1);
-
- // AC only loop
- while (n_coeffs < 0) {
- __m128i coeff0, coeff1;
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
-
- coeff0 = load_tran_low(coeff_ptr + n_coeffs);
- coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
-
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
- _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
-
- if (nzflag) {
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
- store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
-
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
- store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
- store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
- } else {
- store_zero_tran_low(qcoeff_ptr + n_coeffs);
- store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
-
- store_zero_tran_low(dqcoeff_ptr + n_coeffs);
- store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
- }
- }
-
- if (nzflag) {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob0, eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob0 = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob0 = _mm_max_epi16(eob0, eob1);
- eob = _mm_max_epi16(eob, eob0);
- }
- n_coeffs += 8 * 2;
+ if (nzflag) {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob0, eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob0 = _mm_max_epi16(eob0, eob1);
+ eob = _mm_max_epi16(eob, eob0);
}
+ n_coeffs += 8 * 2;
+ }
- // Accumulate EOB
- {
- __m128i eob_shuffled;
- eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
- eob = _mm_max_epi16(eob, eob_shuffled);
- *eob_ptr = _mm_extract_epi16(eob, 1);
- }
- } else {
- do {
- store_zero_tran_low(qcoeff_ptr + n_coeffs);
- store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
-
- store_zero_tran_low(dqcoeff_ptr + n_coeffs);
- store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
- n_coeffs += 8 * 2;
- } while (n_coeffs < 0);
- *eob_ptr = 0;
+ // Accumulate EOB
+ {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ *eob_ptr = _mm_extract_epi16(eob, 1);
}
}
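The quantizer hunk above flattens the old `if (!skip_block)` wrapper now that skip_block is asserted to be zero; the arithmetic itself is unchanged, and the AC loop still fast-skips register pairs whose absolute coefficients all fall below half the dequant step (thr). For readers who do not speak SSE2, here is a hedged scalar model of what one pass computes over 16-lane batches; the names and types are simplified for illustration (libvpx uses tran_low_t and saturating adds) and are not part of the patch.

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar model of the fast-path quantizer the SSE2 kernel vectorizes.
     * round/quant/dequant hold {DC, AC} pairs, as in libvpx. */
    static uint16_t quantize_fp_model(const int16_t *coeff, size_t n,
                                      const int16_t round[2],
                                      const int16_t quant[2],
                                      const int16_t dequant[2],
                                      int16_t *qcoeff, int16_t *dqcoeff,
                                      const int16_t *iscan) {
      uint16_t eob = 0;
      size_t i;
      for (i = 0; i < n; ++i) {
        const int ac = (i != 0);
        /* "Poor man's sign extract": srai by 15 gives all-ones for
         * negatives, so (x ^ sign) - sign == abs(x). */
        const int16_t sign = (int16_t)(coeff[i] >> 15);
        const int16_t abs_c = (int16_t)((coeff[i] ^ sign) - sign);
        /* Round, multiply, keep the high 16 bits of the product,
         * mirroring _mm_adds_epi16 + _mm_mulhi_epi16. */
        const int16_t q =
            (int16_t)(((abs_c + round[ac]) * (int32_t)quant[ac]) >> 16);
        qcoeff[i] = (int16_t)((q ^ sign) - sign);
        dqcoeff[i] = (int16_t)(qcoeff[i] * dequant[ac]);
        /* iscan gives the position in scan order; +1 converts an index
         * to a count, and eob keeps the running maximum, as in the
         * "Scan for eob" blocks. */
        if (q && (uint16_t)(iscan[i] + 1) > eob) eob = (uint16_t)(iscan[i] + 1);
      }
      return eob;
    }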
diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
index 1b88863f6..5703aa3bb 100644
--- a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
@@ -22,8 +22,6 @@ SECTION .text
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, round, quant, \
qcoeff, dqcoeff, dequant, \
eob, scan, iscan
- cmp dword skipm, 0
- jne .blank
; actual quantize loop - setup pointers, rounders, etc.
movifnidn coeffq, coeffmp
@@ -171,28 +169,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, round, quant, \
pshuflw m7, m8, 0x1
pmaxsw m8, m7
pextrw r6, m8, 0
- mov [r2], r6
- RET
-
- ; skip-block, i.e. just write all zeroes
-.blank:
- mov r0, dqcoeffmp
- movifnidn ncoeffq, ncoeffmp
- mov r2, qcoeffmp
- mov r3, eobmp
-
- lea r0q, [r0q+ncoeffq*2]
- lea r2q, [r2q+ncoeffq*2]
- neg ncoeffq
- pxor m7, m7
-.blank_loop:
- STORE_ZERO_TRAN_LOW 7, r0q, ncoeffq
- STORE_ZERO_TRAN_LOW 7, r0q, ncoeffq + 8
- STORE_ZERO_TRAN_LOW 7, r2q, ncoeffq
- STORE_ZERO_TRAN_LOW 7, r2q, ncoeffq + 8
- add ncoeffq, mmsize
- jl .blank_loop
- mov word [r3q], 0
+ mov [r2], r6w
RET
%endmacro
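Besides dropping the dead .blank path, the assembly hunk narrows the final store from `mov [r2], r6` to `mov [r2], r6w`: eob_ptr addresses a 16-bit value, so a full-register store would write past it. A one-line C equivalent, with a hypothetical helper name:

    #include <stdint.h>

    /* Hypothetical helper: write only the low 16 bits of the computed
     * eob, matching "mov [r2], r6w" in the hunk above. */
    static void store_eob(uint16_t *eob_ptr, uint32_t eob_reg) {
      *eob_ptr = (uint16_t)eob_reg;
    }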
diff --git a/libvpx/vp9/vp9_cx_iface.c b/libvpx/vp9/vp9_cx_iface.c
index 25fc80a9a..881caae78 100644
--- a/libvpx/vp9/vp9_cx_iface.c
+++ b/libvpx/vp9/vp9_cx_iface.c
@@ -171,12 +171,17 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100);
RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100);
RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
+ RANGE_CHECK(cfg, rc_2pass_vbr_corpus_complexity, 0, 10000);
RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO);
RANGE_CHECK_BOOL(cfg, rc_resize_allowed);
RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100);
RANGE_CHECK_HI(cfg, rc_resize_up_thresh, 100);
RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100);
+#if CONFIG_REALTIME_ONLY
+ RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_ONE_PASS);
+#else
RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
+#endif
RANGE_CHECK(extra_cfg, min_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
RANGE_CHECK(extra_cfg, max_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
if (extra_cfg->max_gf_interval > 0) {
@@ -187,6 +192,13 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
(MAX_LAG_BUFFERS - 1));
}
+  // For the formation of valid ARF groups, lag_in_frames should be 0 or
+  // at least max_gf_interval + 2.
+ if (cfg->g_lag_in_frames > 0 && extra_cfg->max_gf_interval > 0 &&
+ cfg->g_lag_in_frames < extra_cfg->max_gf_interval + 2) {
+ ERROR("Set lag in frames to 0 (low delay) or >= (max-gf-interval + 2)");
+ }
+
if (cfg->rc_resize_allowed == 1) {
RANGE_CHECK(cfg, rc_scaled_width, 0, cfg->g_w);
RANGE_CHECK(cfg, rc_scaled_height, 0, cfg->g_h);
@@ -202,7 +214,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
level != LEVEL_4 && level != LEVEL_4_1 && level != LEVEL_5 &&
level != LEVEL_5_1 && level != LEVEL_5_2 && level != LEVEL_6 &&
level != LEVEL_6_1 && level != LEVEL_6_2 && level != LEVEL_UNKNOWN &&
- level != LEVEL_MAX)
+ level != LEVEL_AUTO && level != LEVEL_MAX)
ERROR("target_level is invalid");
}
@@ -269,6 +281,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
if (extra_cfg->tuning == VP8_TUNE_SSIM)
ERROR("Option --tune=ssim is not currently supported in VP9.");
+#if !CONFIG_REALTIME_ONLY
if (cfg->g_pass == VPX_RC_LAST_PASS) {
const size_t packet_sz = sizeof(FIRSTPASS_STATS);
const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
@@ -320,6 +333,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
ERROR("rc_twopass_stats_in missing EOS stats packet");
}
}
+#endif // !CONFIG_REALTIME_ONLY
#if !CONFIG_VP9_HIGHBITDEPTH
if (cfg->g_profile > (unsigned int)PROFILE_1) {
@@ -425,10 +439,20 @@ static void config_target_level(VP9EncoderConfig *oxcf) {
oxcf->worst_allowed_q = vp9_quantizer_to_qindex(63);
  // Adjust minimum alt-ref distance.
- if (oxcf->min_gf_interval <
- (int)vp9_level_defs[target_level_index].min_altref_distance)
+ // min_gf_interval should be no less than min_altref_distance + 1,
+  // as the encoder may produce a bitstream with an alt-ref distance of
+  // min_gf_interval - 1.
+ if (oxcf->min_gf_interval <=
+ (int)vp9_level_defs[target_level_index].min_altref_distance) {
oxcf->min_gf_interval =
- (int)vp9_level_defs[target_level_index].min_altref_distance;
+ (int)vp9_level_defs[target_level_index].min_altref_distance + 1;
+    // If oxcf->max_gf_interval == 0, it will be assigned a default value
+ // in vp9_rc_set_gf_interval_range().
+ if (oxcf->max_gf_interval != 0) {
+ oxcf->max_gf_interval =
+ VPXMAX(oxcf->max_gf_interval, oxcf->min_gf_interval);
+ }
+ }
// Adjust maximum column tiles.
if (vp9_level_defs[target_level_index].max_col_tiles <
@@ -503,6 +527,7 @@ static vpx_codec_err_t set_encoder_config(
oxcf->two_pass_vbrbias = cfg->rc_2pass_vbr_bias_pct;
oxcf->two_pass_vbrmin_section = cfg->rc_2pass_vbr_minsection_pct;
oxcf->two_pass_vbrmax_section = cfg->rc_2pass_vbr_maxsection_pct;
+ oxcf->vbr_corpus_complexity = cfg->rc_2pass_vbr_corpus_complexity;
oxcf->auto_key =
cfg->kf_mode == VPX_KF_AUTO && cfg->kf_min_dist != cfg->kf_max_dist;
@@ -613,6 +638,7 @@ static vpx_codec_err_t set_encoder_config(
printf("two_pass_vbrbias: %d\n", oxcf->two_pass_vbrbias);
printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);
printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);
+ printf("vbr_corpus_complexity: %d\n", oxcf->vbr_corpus_complexity);
printf("lag_in_frames: %d\n", oxcf->lag_in_frames);
printf("enable_auto_arf: %d\n", oxcf->enable_auto_arf);
printf("Version: %d\n", oxcf->Version);
@@ -888,12 +914,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
priv->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool));
if (priv->buffer_pool == NULL) return VPX_CODEC_MEM_ERROR;
-#if CONFIG_MULTITHREAD
- if (pthread_mutex_init(&priv->buffer_pool->pool_mutex, NULL)) {
- return VPX_CODEC_MEM_ERROR;
- }
-#endif
-
if (ctx->config.enc) {
// Update the reference to the config structure to an internal copy.
priv->cfg = *ctx->config.enc;
@@ -925,9 +945,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) {
free(ctx->cx_data);
vp9_remove_compressor(ctx->cpi);
-#if CONFIG_MULTITHREAD
- pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
-#endif
vpx_free(ctx->buffer_pool);
vpx_free(ctx);
return VPX_CODEC_OK;
@@ -938,6 +955,10 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
unsigned long deadline) {
MODE new_mode = BEST;
+#if CONFIG_REALTIME_ONLY
+ (void)duration;
+ deadline = VPX_DL_REALTIME;
+#else
switch (ctx->cfg.g_pass) {
case VPX_RC_ONE_PASS:
if (deadline > 0) {
@@ -958,6 +979,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
case VPX_RC_FIRST_PASS: break;
case VPX_RC_LAST_PASS: new_mode = deadline > 0 ? GOOD : BEST; break;
}
+#endif // CONFIG_REALTIME_ONLY
if (deadline == VPX_DL_REALTIME) {
ctx->oxcf.pass = 0;
@@ -1266,8 +1288,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
cx_data += size;
cx_data_sz -= size;
-#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION)
-#if CONFIG_SPATIAL_SVC
+#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC)
if (cpi->use_svc && !ctx->output_cx_pkt_cb.output_cx_pkt) {
vpx_codec_cx_pkt_t pkt_sizes, pkt_psnr;
int sl;
@@ -1288,7 +1309,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_psnr);
}
#endif
-#endif
if (is_one_pass_cbr_svc(cpi) &&
(cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
// Encoded all spatial layers; exit loop.
@@ -1679,6 +1699,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
50, // rc_two_pass_vbrbias
0, // rc_two_pass_vbrmin_section
2000, // rc_two_pass_vbrmax_section
+        0,    // rc_2pass_vbr_corpus_complexity (non-zero enables corpus VBR)
// keyframing settings (kf)
VPX_KF_AUTO, // g_kfmode
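Taken together, the vp9_cx_iface.c hunks tighten what validate_config() accepts. A minimal sketch of a two-pass setup that satisfies the new lag/GF-interval rule and touches the new corpus-complexity field; error handling is trimmed and the values are illustrative, not a recommendation:

    #include "vpx/vp8cx.h"
    #include "vpx/vpx_encoder.h"

    /* Sketch only: configure an encoder so the checks added above pass. */
    static vpx_codec_err_t init_vp9_two_pass(vpx_codec_ctx_t *codec,
                                             unsigned int max_gf_interval) {
      vpx_codec_enc_cfg_t cfg;
      vpx_codec_err_t res =
          vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0);
      if (res != VPX_CODEC_OK) return res;
      /* New field, range-checked to 0..10000; non-zero enables corpus VBR. */
      cfg.rc_2pass_vbr_corpus_complexity = 0;
      /* Must be 0 (low delay) or at least max_gf_interval + 2 so valid
       * ARF groups can be formed. */
      cfg.g_lag_in_frames = max_gf_interval + 2;
      res = vpx_codec_enc_init(codec, vpx_codec_vp9_cx(), &cfg, 0);
      if (res != VPX_CODEC_OK) return res;
      return vpx_codec_control(codec, VP9E_SET_MAX_GF_INTERVAL,
                               max_gf_interval);
    }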
diff --git a/libvpx/vp9/vp9_dx_iface.c b/libvpx/vp9/vp9_dx_iface.c
index 1da1794b7..657490f4b 100644
--- a/libvpx/vp9/vp9_dx_iface.c
+++ b/libvpx/vp9/vp9_dx_iface.c
@@ -47,9 +47,6 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx,
ctx->priv->init_flags = ctx->init_flags;
priv->si.sz = sizeof(priv->si);
priv->flushed = 0;
- // TODO(jzern): remnants of frame-level parallel decoding should be
- // removed. cf., https://bugs.chromium.org/p/webm/issues/detail?id=1395
- priv->frame_parallel_decode = 0;
if (ctx->config.dec) {
priv->cfg = *ctx->config.dec;
ctx->config.dec = &priv->cfg;
@@ -60,33 +57,8 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx,
}
static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) {
- if (ctx->frame_workers != NULL) {
- int i;
- // Shutdown all threads before reclaiming any memory. The frame-level
- // parallel decoder may access data from another worker.
- for (i = 0; i < ctx->num_frame_workers; ++i) {
- VPxWorker *const worker = &ctx->frame_workers[i];
- vpx_get_worker_interface()->end(worker);
- }
- for (i = 0; i < ctx->num_frame_workers; ++i) {
- VPxWorker *const worker = &ctx->frame_workers[i];
- FrameWorkerData *const frame_worker_data =
- (FrameWorkerData *)worker->data1;
- vp9_remove_common(&frame_worker_data->pbi->common);
-#if CONFIG_VP9_POSTPROC
- vp9_free_postproc_buffers(&frame_worker_data->pbi->common);
-#endif
- vp9_decoder_remove(frame_worker_data->pbi);
- vpx_free(frame_worker_data->scratch_buffer);
-#if CONFIG_MULTITHREAD
- pthread_mutex_destroy(&frame_worker_data->stats_mutex);
- pthread_cond_destroy(&frame_worker_data->stats_cond);
-#endif
- vpx_free(frame_worker_data);
- }
-#if CONFIG_MULTITHREAD
- pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
-#endif
+ if (ctx->pbi != NULL) {
+ vp9_decoder_remove(ctx->pbi);
}
if (ctx->buffer_pool) {
@@ -94,7 +66,6 @@ static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) {
vp9_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers);
}
- vpx_free(ctx->frame_workers);
vpx_free(ctx->buffer_pool);
vpx_free(ctx);
return VPX_CODEC_OK;
@@ -228,32 +199,26 @@ static vpx_codec_err_t update_error_state(
}
static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
- int i;
-
- for (i = 0; i < ctx->num_frame_workers; ++i) {
- VPxWorker *const worker = &ctx->frame_workers[i];
- FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
- VP9_COMMON *const cm = &frame_worker_data->pbi->common;
- BufferPool *const pool = cm->buffer_pool;
-
- cm->new_fb_idx = INVALID_IDX;
- cm->byte_alignment = ctx->byte_alignment;
- cm->skip_loop_filter = ctx->skip_loop_filter;
-
- if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
- pool->get_fb_cb = ctx->get_ext_fb_cb;
- pool->release_fb_cb = ctx->release_ext_fb_cb;
- pool->cb_priv = ctx->ext_priv;
- } else {
- pool->get_fb_cb = vp9_get_frame_buffer;
- pool->release_fb_cb = vp9_release_frame_buffer;
+ VP9_COMMON *const cm = &ctx->pbi->common;
+ BufferPool *const pool = cm->buffer_pool;
- if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers))
- vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
- "Failed to initialize internal frame buffers");
+ cm->new_fb_idx = INVALID_IDX;
+ cm->byte_alignment = ctx->byte_alignment;
+ cm->skip_loop_filter = ctx->skip_loop_filter;
- pool->cb_priv = &pool->int_frame_buffers;
- }
+ if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
+ pool->get_fb_cb = ctx->get_ext_fb_cb;
+ pool->release_fb_cb = ctx->release_ext_fb_cb;
+ pool->cb_priv = ctx->ext_priv;
+ } else {
+ pool->get_fb_cb = vp9_get_frame_buffer;
+ pool->release_fb_cb = vp9_release_frame_buffer;
+
+ if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers))
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to initialize internal frame buffers");
+
+ pool->cb_priv = &pool->int_frame_buffers;
}
}
@@ -270,124 +235,21 @@ static void set_ppflags(const vpx_codec_alg_priv_t *ctx, vp9_ppflags_t *flags) {
flags->noise_level = ctx->postproc_cfg.noise_level;
}
-static int frame_worker_hook(void *arg1, void *arg2) {
- FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1;
- const uint8_t *data = frame_worker_data->data;
- (void)arg2;
-
- frame_worker_data->result = vp9_receive_compressed_data(
- frame_worker_data->pbi, frame_worker_data->data_size, &data);
- frame_worker_data->data_end = data;
-
- if (frame_worker_data->pbi->frame_parallel_decode) {
- // In frame parallel decoding, a worker thread must successfully decode all
- // the compressed data.
- if (frame_worker_data->result != 0 ||
- frame_worker_data->data + frame_worker_data->data_size - 1 > data) {
- VPxWorker *const worker = frame_worker_data->pbi->frame_worker_owner;
- BufferPool *const pool = frame_worker_data->pbi->common.buffer_pool;
- // Signal all the other threads that are waiting for this frame.
- vp9_frameworker_lock_stats(worker);
- frame_worker_data->frame_context_ready = 1;
- lock_buffer_pool(pool);
- frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
- unlock_buffer_pool(pool);
- frame_worker_data->pbi->need_resync = 1;
- vp9_frameworker_signal_stats(worker);
- vp9_frameworker_unlock_stats(worker);
- return 0;
- }
- } else if (frame_worker_data->result != 0) {
- // Check decode result in serial decode.
- frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
- frame_worker_data->pbi->need_resync = 1;
- }
- return !frame_worker_data->result;
-}
-
static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
- int i;
- const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
-
ctx->last_show_frame = -1;
- ctx->next_submit_worker_id = 0;
- ctx->last_submit_worker_id = 0;
- ctx->next_output_worker_id = 0;
- ctx->frame_cache_read = 0;
- ctx->frame_cache_write = 0;
- ctx->num_cache_frames = 0;
ctx->need_resync = 1;
- ctx->num_frame_workers =
- (ctx->frame_parallel_decode == 1) ? ctx->cfg.threads : 1;
- if (ctx->num_frame_workers > MAX_DECODE_THREADS)
- ctx->num_frame_workers = MAX_DECODE_THREADS;
- ctx->available_threads = ctx->num_frame_workers;
ctx->flushed = 0;
ctx->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool));
if (ctx->buffer_pool == NULL) return VPX_CODEC_MEM_ERROR;
-#if CONFIG_MULTITHREAD
- if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) {
- set_error_detail(ctx, "Failed to allocate buffer pool mutex");
+ ctx->pbi = vp9_decoder_create(ctx->buffer_pool);
+ if (ctx->pbi == NULL) {
+ set_error_detail(ctx, "Failed to allocate decoder");
return VPX_CODEC_MEM_ERROR;
}
-#endif
-
- ctx->frame_workers = (VPxWorker *)vpx_malloc(ctx->num_frame_workers *
- sizeof(*ctx->frame_workers));
- if (ctx->frame_workers == NULL) {
- set_error_detail(ctx, "Failed to allocate frame_workers");
- return VPX_CODEC_MEM_ERROR;
- }
-
- for (i = 0; i < ctx->num_frame_workers; ++i) {
- VPxWorker *const worker = &ctx->frame_workers[i];
- FrameWorkerData *frame_worker_data = NULL;
- winterface->init(worker);
- worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData));
- if (worker->data1 == NULL) {
- set_error_detail(ctx, "Failed to allocate frame_worker_data");
- return VPX_CODEC_MEM_ERROR;
- }
- frame_worker_data = (FrameWorkerData *)worker->data1;
- frame_worker_data->pbi = vp9_decoder_create(ctx->buffer_pool);
- if (frame_worker_data->pbi == NULL) {
- set_error_detail(ctx, "Failed to allocate frame_worker_data");
- return VPX_CODEC_MEM_ERROR;
- }
- frame_worker_data->pbi->frame_worker_owner = worker;
- frame_worker_data->worker_id = i;
- frame_worker_data->scratch_buffer = NULL;
- frame_worker_data->scratch_buffer_size = 0;
- frame_worker_data->frame_context_ready = 0;
- frame_worker_data->received_frame = 0;
-#if CONFIG_MULTITHREAD
- if (pthread_mutex_init(&frame_worker_data->stats_mutex, NULL)) {
- set_error_detail(ctx, "Failed to allocate frame_worker_data mutex");
- return VPX_CODEC_MEM_ERROR;
- }
-
- if (pthread_cond_init(&frame_worker_data->stats_cond, NULL)) {
- set_error_detail(ctx, "Failed to allocate frame_worker_data cond");
- return VPX_CODEC_MEM_ERROR;
- }
-#endif
- // If decoding in serial mode, FrameWorker thread could create tile worker
- // thread or loopfilter thread.
- frame_worker_data->pbi->max_threads =
- (ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0;
-
- frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
- frame_worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
- frame_worker_data->pbi->common.frame_parallel_decode =
- ctx->frame_parallel_decode;
- worker->hook = (VPxWorkerHook)frame_worker_hook;
- if (!winterface->reset(worker)) {
- set_error_detail(ctx, "Frame Worker thread creation failed");
- return VPX_CODEC_MEM_ERROR;
- }
- }
+ ctx->pbi->max_threads = ctx->cfg.threads;
+ ctx->pbi->inv_tile_order = ctx->invert_tile_order;
// If postprocessing was enabled by the application and a
// configuration has not been provided, default it.
@@ -401,7 +263,7 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx,
const VP9Decoder *const pbi) {
- // Clear resync flag if worker got a key frame or intra only frame.
+  // Clear the resync flag if the decoder got a key frame or intra-only frame.
if (ctx->need_resync == 1 && pbi->need_resync == 0 &&
(pbi->common.intra_only || pbi->common.frame_type == KEY_FRAME))
ctx->need_resync = 0;
@@ -410,7 +272,6 @@ static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx,
static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
const uint8_t **data, unsigned int data_sz,
void *user_priv, int64_t deadline) {
- const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
(void)deadline;
// Determine the stream parameters. Note that we rely on peek_si to
@@ -426,101 +287,23 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
if (!ctx->si.is_kf && !is_intra_only) return VPX_CODEC_ERROR;
}
- if (!ctx->frame_parallel_decode) {
- VPxWorker *const worker = ctx->frame_workers;
- FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
- frame_worker_data->data = *data;
- frame_worker_data->data_size = data_sz;
- frame_worker_data->user_priv = user_priv;
- frame_worker_data->received_frame = 1;
-
- // Set these even if already initialized. The caller may have changed the
- // decrypt config between frames.
- frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb;
- frame_worker_data->pbi->decrypt_state = ctx->decrypt_state;
-
- worker->had_error = 0;
- winterface->execute(worker);
+ ctx->user_priv = user_priv;
- // Update data pointer after decode.
- *data = frame_worker_data->data_end;
+ // Set these even if already initialized. The caller may have changed the
+ // decrypt config between frames.
+ ctx->pbi->decrypt_cb = ctx->decrypt_cb;
+ ctx->pbi->decrypt_state = ctx->decrypt_state;
- if (worker->had_error)
- return update_error_state(ctx, &frame_worker_data->pbi->common.error);
-
- check_resync(ctx, frame_worker_data->pbi);
- } else {
- VPxWorker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id];
- FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
- // Copy context from last worker thread to next worker thread.
- if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
- vp9_frameworker_copy_context(
- &ctx->frame_workers[ctx->next_submit_worker_id],
- &ctx->frame_workers[ctx->last_submit_worker_id]);
-
- frame_worker_data->pbi->ready_for_new_data = 0;
- // Copy the compressed data into worker's internal buffer.
- // TODO(hkuang): Will all the workers allocate the same size
- // as the size of the first intra frame be better? This will
- // avoid too many deallocate and allocate.
- if (frame_worker_data->scratch_buffer_size < data_sz) {
- vpx_free(frame_worker_data->scratch_buffer);
- frame_worker_data->scratch_buffer = (uint8_t *)vpx_malloc(data_sz);
- if (frame_worker_data->scratch_buffer == NULL) {
- set_error_detail(ctx, "Failed to reallocate scratch buffer");
- return VPX_CODEC_MEM_ERROR;
- }
- frame_worker_data->scratch_buffer_size = data_sz;
- }
- frame_worker_data->data_size = data_sz;
- memcpy(frame_worker_data->scratch_buffer, *data, data_sz);
-
- frame_worker_data->frame_decoded = 0;
- frame_worker_data->frame_context_ready = 0;
- frame_worker_data->received_frame = 1;
- frame_worker_data->data = frame_worker_data->scratch_buffer;
- frame_worker_data->user_priv = user_priv;
-
- if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
- ctx->last_submit_worker_id =
- (ctx->last_submit_worker_id + 1) % ctx->num_frame_workers;
-
- ctx->next_submit_worker_id =
- (ctx->next_submit_worker_id + 1) % ctx->num_frame_workers;
- --ctx->available_threads;
- worker->had_error = 0;
- winterface->launch(worker);
+ if (vp9_receive_compressed_data(ctx->pbi, data_sz, data)) {
+ ctx->pbi->cur_buf->buf.corrupted = 1;
+ ctx->pbi->need_resync = 1;
+ ctx->need_resync = 1;
+ return update_error_state(ctx, &ctx->pbi->common.error);
}
- return VPX_CODEC_OK;
-}
+ check_resync(ctx, ctx->pbi);
-static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) {
- YV12_BUFFER_CONFIG sd;
- vp9_ppflags_t flags = { 0, 0, 0 };
- const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
- VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
- FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
- ctx->next_output_worker_id =
- (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
- // TODO(hkuang): Add worker error handling here.
- winterface->sync(worker);
- frame_worker_data->received_frame = 0;
- ++ctx->available_threads;
-
- check_resync(ctx, frame_worker_data->pbi);
-
- if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
- VP9_COMMON *const cm = &frame_worker_data->pbi->common;
- RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
- ctx->frame_cache[ctx->frame_cache_write].fb_idx = cm->new_fb_idx;
- yuvconfig2image(&ctx->frame_cache[ctx->frame_cache_write].img, &sd,
- frame_worker_data->user_priv);
- ctx->frame_cache[ctx->frame_cache_write].img.fb_priv =
- frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
- ctx->frame_cache_write = (ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE;
- ++ctx->num_cache_frames;
- }
+ return VPX_CODEC_OK;
}
static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
@@ -540,8 +323,8 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
// Reset flushed when receiving a valid frame.
ctx->flushed = 0;
- // Initialize the decoder workers on the first frame.
- if (ctx->frame_workers == NULL) {
+ // Initialize the decoder on the first frame.
+ if (ctx->pbi == NULL) {
const vpx_codec_err_t res = init_decoder(ctx);
if (res != VPX_CODEC_OK) return res;
}
@@ -553,91 +336,37 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
if (ctx->svc_decoding && ctx->svc_spatial_layer < frame_count - 1)
frame_count = ctx->svc_spatial_layer + 1;
- if (ctx->frame_parallel_decode) {
- // Decode in frame parallel mode. When decoding in this mode, the frame
- // passed to the decoder must be either a normal frame or a superframe with
- // superframe index so the decoder could get each frame's start position
- // in the superframe.
- if (frame_count > 0) {
- int i;
-
- for (i = 0; i < frame_count; ++i) {
- const uint8_t *data_start_copy = data_start;
- const uint32_t frame_size = frame_sizes[i];
- if (data_start < data ||
- frame_size > (uint32_t)(data_end - data_start)) {
- set_error_detail(ctx, "Invalid frame size in index");
- return VPX_CODEC_CORRUPT_FRAME;
- }
-
- if (ctx->available_threads == 0) {
- // No more threads for decoding. Wait until the next output worker
- // finishes decoding. Then copy the decoded frame into cache.
- if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
- wait_worker_and_cache_frame(ctx);
- } else {
- // TODO(hkuang): Add unit test to test this path.
- set_error_detail(ctx, "Frame output cache is full.");
- return VPX_CODEC_ERROR;
- }
- }
+ // Decode in serial mode.
+ if (frame_count > 0) {
+ int i;
- res =
- decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline);
- if (res != VPX_CODEC_OK) return res;
- data_start += frame_size;
- }
- } else {
- if (ctx->available_threads == 0) {
- // No more threads for decoding. Wait until the next output worker
- // finishes decoding. Then copy the decoded frame into cache.
- if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
- wait_worker_and_cache_frame(ctx);
- } else {
- // TODO(hkuang): Add unit test to test this path.
- set_error_detail(ctx, "Frame output cache is full.");
- return VPX_CODEC_ERROR;
- }
+ for (i = 0; i < frame_count; ++i) {
+ const uint8_t *data_start_copy = data_start;
+ const uint32_t frame_size = frame_sizes[i];
+ vpx_codec_err_t res;
+ if (data_start < data || frame_size > (uint32_t)(data_end - data_start)) {
+ set_error_detail(ctx, "Invalid frame size in index");
+ return VPX_CODEC_CORRUPT_FRAME;
}
- res = decode_one(ctx, &data, data_sz, user_priv, deadline);
+ res = decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline);
if (res != VPX_CODEC_OK) return res;
+
+ data_start += frame_size;
}
} else {
- // Decode in serial mode.
- if (frame_count > 0) {
- int i;
-
- for (i = 0; i < frame_count; ++i) {
- const uint8_t *data_start_copy = data_start;
- const uint32_t frame_size = frame_sizes[i];
- vpx_codec_err_t res;
- if (data_start < data ||
- frame_size > (uint32_t)(data_end - data_start)) {
- set_error_detail(ctx, "Invalid frame size in index");
- return VPX_CODEC_CORRUPT_FRAME;
- }
-
- res =
- decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline);
- if (res != VPX_CODEC_OK) return res;
+ while (data_start < data_end) {
+ const uint32_t frame_size = (uint32_t)(data_end - data_start);
+ const vpx_codec_err_t res =
+ decode_one(ctx, &data_start, frame_size, user_priv, deadline);
+ if (res != VPX_CODEC_OK) return res;
- data_start += frame_size;
- }
- } else {
+ // Account for suboptimal termination by the encoder.
while (data_start < data_end) {
- const uint32_t frame_size = (uint32_t)(data_end - data_start);
- const vpx_codec_err_t res =
- decode_one(ctx, &data_start, frame_size, user_priv, deadline);
- if (res != VPX_CODEC_OK) return res;
-
- // Account for suboptimal termination by the encoder.
- while (data_start < data_end) {
- const uint8_t marker =
- read_marker(ctx->decrypt_cb, ctx->decrypt_state, data_start);
- if (marker) break;
- ++data_start;
- }
+ const uint8_t marker =
+ read_marker(ctx->decrypt_cb, ctx->decrypt_state, data_start);
+ if (marker) break;
+ ++data_start;
}
}
}
@@ -645,80 +374,28 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
return res;
}
-static void release_last_output_frame(vpx_codec_alg_priv_t *ctx) {
- RefCntBuffer *const frame_bufs = ctx->buffer_pool->frame_bufs;
- // Decrease reference count of last output frame in frame parallel mode.
- if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) {
- BufferPool *const pool = ctx->buffer_pool;
- lock_buffer_pool(pool);
- decrease_ref_count(ctx->last_show_frame, frame_bufs, pool);
- unlock_buffer_pool(pool);
- }
-}
-
static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,
vpx_codec_iter_t *iter) {
vpx_image_t *img = NULL;
- // Only return frame when all the cpu are busy or
- // application fluhsed the decoder in frame parallel decode.
- if (ctx->frame_parallel_decode && ctx->available_threads > 0 &&
- !ctx->flushed) {
- return NULL;
- }
+ // Legacy parameter carried over from VP8. Has no effect for VP9 since we
+ // always return only 1 frame per decode call.
+ (void)iter;
- // Output the frames in the cache first.
- if (ctx->num_cache_frames > 0) {
- release_last_output_frame(ctx);
- ctx->last_show_frame = ctx->frame_cache[ctx->frame_cache_read].fb_idx;
- if (ctx->need_resync) return NULL;
- img = &ctx->frame_cache[ctx->frame_cache_read].img;
- ctx->frame_cache_read = (ctx->frame_cache_read + 1) % FRAME_CACHE_SIZE;
- --ctx->num_cache_frames;
- return img;
- }
-
- // iter acts as a flip flop, so an image is only returned on the first
- // call to get_frame.
- if (*iter == NULL && ctx->frame_workers != NULL) {
- do {
- YV12_BUFFER_CONFIG sd;
- vp9_ppflags_t flags = { 0, 0, 0 };
- const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
- VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
- FrameWorkerData *const frame_worker_data =
- (FrameWorkerData *)worker->data1;
- ctx->next_output_worker_id =
- (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
- if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
- set_ppflags(ctx, &flags);
- // Wait for the frame from worker thread.
- if (winterface->sync(worker)) {
- // Check if worker has received any frames.
- if (frame_worker_data->received_frame == 1) {
- ++ctx->available_threads;
- frame_worker_data->received_frame = 0;
- check_resync(ctx, frame_worker_data->pbi);
- }
- if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
- VP9_COMMON *const cm = &frame_worker_data->pbi->common;
- RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
- release_last_output_frame(ctx);
- ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx;
- if (ctx->need_resync) return NULL;
- yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv);
- ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
- img = &ctx->img;
- return img;
- }
- } else {
- // Decoding failed. Release the worker thread.
- frame_worker_data->received_frame = 0;
- ++ctx->available_threads;
- ctx->need_resync = 1;
- if (ctx->flushed != 1) return NULL;
- }
- } while (ctx->next_output_worker_id != ctx->next_submit_worker_id);
+ if (ctx->pbi != NULL) {
+ YV12_BUFFER_CONFIG sd;
+ vp9_ppflags_t flags = { 0, 0, 0 };
+ if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) set_ppflags(ctx, &flags);
+ if (vp9_get_raw_frame(ctx->pbi, &sd, &flags) == 0) {
+ VP9_COMMON *const cm = &ctx->pbi->common;
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ ctx->last_show_frame = ctx->pbi->common.new_fb_idx;
+ if (ctx->need_resync) return NULL;
+ yuvconfig2image(&ctx->img, &sd, ctx->user_priv);
+ ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
+ img = &ctx->img;
+ return img;
+ }
}
return NULL;
}
@@ -728,7 +405,7 @@ static vpx_codec_err_t decoder_set_fb_fn(
vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
if (cb_get == NULL || cb_release == NULL) {
return VPX_CODEC_INVALID_PARAM;
- } else if (ctx->frame_workers == NULL) {
+ } else if (ctx->pbi == NULL) {
// If the decoder has already been initialized, do not accept changes to
// the frame buffer functions.
ctx->get_ext_fb_cb = cb_get;
@@ -744,21 +421,12 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx,
va_list args) {
vpx_ref_frame_t *const data = va_arg(args, vpx_ref_frame_t *);
- // Only support this function in serial decode.
- if (ctx->frame_parallel_decode) {
- set_error_detail(ctx, "Not supported in frame parallel decode");
- return VPX_CODEC_INCAPABLE;
- }
-
if (data) {
vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data;
YV12_BUFFER_CONFIG sd;
- VPxWorker *const worker = ctx->frame_workers;
- FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
image2yuvconfig(&frame->img, &sd);
- return vp9_set_reference_dec(&frame_worker_data->pbi->common,
- ref_frame_to_vp9_reframe(frame->frame_type),
- &sd);
+ return vp9_set_reference_dec(
+ &ctx->pbi->common, ref_frame_to_vp9_reframe(frame->frame_type), &sd);
} else {
return VPX_CODEC_INVALID_PARAM;
}
@@ -768,20 +436,12 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx,
va_list args) {
vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
- // Only support this function in serial decode.
- if (ctx->frame_parallel_decode) {
- set_error_detail(ctx, "Not supported in frame parallel decode");
- return VPX_CODEC_INCAPABLE;
- }
-
if (data) {
vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
YV12_BUFFER_CONFIG sd;
- VPxWorker *const worker = ctx->frame_workers;
- FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
image2yuvconfig(&frame->img, &sd);
- return vp9_copy_reference_dec(frame_worker_data->pbi,
- (VP9_REFFRAME)frame->frame_type, &sd);
+ return vp9_copy_reference_dec(ctx->pbi, (VP9_REFFRAME)frame->frame_type,
+ &sd);
} else {
return VPX_CODEC_INVALID_PARAM;
}
@@ -791,17 +451,9 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
va_list args) {
vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *);
- // Only support this function in serial decode.
- if (ctx->frame_parallel_decode) {
- set_error_detail(ctx, "Not supported in frame parallel decode");
- return VPX_CODEC_INCAPABLE;
- }
-
if (data) {
YV12_BUFFER_CONFIG *fb;
- VPxWorker *const worker = ctx->frame_workers;
- FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
- fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx);
+ fb = get_ref_frame(&ctx->pbi->common, data->idx);
if (fb == NULL) return VPX_CODEC_ERROR;
yuvconfig2image(&data->img, fb, NULL);
return VPX_CODEC_OK;
@@ -832,9 +484,8 @@ static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx,
static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx,
va_list args) {
int *const arg = va_arg(args, int *);
- if (arg == NULL) return VPX_CODEC_INVALID_PARAM;
- *arg =
- ((FrameWorkerData *)ctx->frame_workers[0].data1)->pbi->common.base_qindex;
+ if (arg == NULL || ctx->pbi == NULL) return VPX_CODEC_INVALID_PARAM;
+ *arg = ctx->pbi->common.base_qindex;
return VPX_CODEC_OK;
}
@@ -842,18 +493,9 @@ static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
va_list args) {
int *const update_info = va_arg(args, int *);
- // Only support this function in serial decode.
- if (ctx->frame_parallel_decode) {
- set_error_detail(ctx, "Not supported in frame parallel decode");
- return VPX_CODEC_INCAPABLE;
- }
-
if (update_info) {
- if (ctx->frame_workers) {
- VPxWorker *const worker = ctx->frame_workers;
- FrameWorkerData *const frame_worker_data =
- (FrameWorkerData *)worker->data1;
- *update_info = frame_worker_data->pbi->refresh_frame_flags;
+ if (ctx->pbi != NULL) {
+ *update_info = ctx->pbi->refresh_frame_flags;
return VPX_CODEC_OK;
} else {
return VPX_CODEC_ERROR;
@@ -868,14 +510,9 @@ static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
int *corrupted = va_arg(args, int *);
if (corrupted) {
- if (ctx->frame_workers) {
- VPxWorker *const worker = ctx->frame_workers;
- FrameWorkerData *const frame_worker_data =
- (FrameWorkerData *)worker->data1;
- RefCntBuffer *const frame_bufs =
- frame_worker_data->pbi->common.buffer_pool->frame_bufs;
- if (frame_worker_data->pbi->common.frame_to_show == NULL)
- return VPX_CODEC_ERROR;
+ if (ctx->pbi != NULL) {
+ RefCntBuffer *const frame_bufs = ctx->pbi->common.buffer_pool->frame_bufs;
+ if (ctx->pbi->common.frame_to_show == NULL) return VPX_CODEC_ERROR;
if (ctx->last_show_frame >= 0)
*corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted;
return VPX_CODEC_OK;
@@ -891,18 +528,9 @@ static vpx_codec_err_t ctrl_get_frame_size(vpx_codec_alg_priv_t *ctx,
va_list args) {
int *const frame_size = va_arg(args, int *);
- // Only support this function in serial decode.
- if (ctx->frame_parallel_decode) {
- set_error_detail(ctx, "Not supported in frame parallel decode");
- return VPX_CODEC_INCAPABLE;
- }
-
if (frame_size) {
- if (ctx->frame_workers) {
- VPxWorker *const worker = ctx->frame_workers;
- FrameWorkerData *const frame_worker_data =
- (FrameWorkerData *)worker->data1;
- const VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+ if (ctx->pbi != NULL) {
+ const VP9_COMMON *const cm = &ctx->pbi->common;
frame_size[0] = cm->width;
frame_size[1] = cm->height;
return VPX_CODEC_OK;
@@ -918,18 +546,9 @@ static vpx_codec_err_t ctrl_get_render_size(vpx_codec_alg_priv_t *ctx,
va_list args) {
int *const render_size = va_arg(args, int *);
- // Only support this function in serial decode.
- if (ctx->frame_parallel_decode) {
- set_error_detail(ctx, "Not supported in frame parallel decode");
- return VPX_CODEC_INCAPABLE;
- }
-
if (render_size) {
- if (ctx->frame_workers) {
- VPxWorker *const worker = ctx->frame_workers;
- FrameWorkerData *const frame_worker_data =
- (FrameWorkerData *)worker->data1;
- const VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+ if (ctx->pbi != NULL) {
+ const VP9_COMMON *const cm = &ctx->pbi->common;
render_size[0] = cm->render_width;
render_size[1] = cm->render_height;
return VPX_CODEC_OK;
@@ -944,13 +563,10 @@ static vpx_codec_err_t ctrl_get_render_size(vpx_codec_alg_priv_t *ctx,
static vpx_codec_err_t ctrl_get_bit_depth(vpx_codec_alg_priv_t *ctx,
va_list args) {
unsigned int *const bit_depth = va_arg(args, unsigned int *);
- VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
if (bit_depth) {
- if (worker) {
- FrameWorkerData *const frame_worker_data =
- (FrameWorkerData *)worker->data1;
- const VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+ if (ctx->pbi != NULL) {
+ const VP9_COMMON *const cm = &ctx->pbi->common;
*bit_depth = cm->bit_depth;
return VPX_CODEC_OK;
} else {
@@ -989,10 +605,8 @@ static vpx_codec_err_t ctrl_set_byte_alignment(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_INVALID_PARAM;
ctx->byte_alignment = byte_alignment;
- if (ctx->frame_workers) {
- VPxWorker *const worker = ctx->frame_workers;
- FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
- frame_worker_data->pbi->common.byte_alignment = byte_alignment;
+ if (ctx->pbi != NULL) {
+ ctx->pbi->common.byte_alignment = byte_alignment;
}
return VPX_CODEC_OK;
}
@@ -1001,10 +615,8 @@ static vpx_codec_err_t ctrl_set_skip_loop_filter(vpx_codec_alg_priv_t *ctx,
va_list args) {
ctx->skip_loop_filter = va_arg(args, int);
- if (ctx->frame_workers) {
- VPxWorker *const worker = ctx->frame_workers;
- FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
- frame_worker_data->pbi->common.skip_loop_filter = ctx->skip_loop_filter;
+ if (ctx->pbi != NULL) {
+ ctx->pbi->common.skip_loop_filter = ctx->skip_loop_filter;
}
return VPX_CODEC_OK;
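With the frame-parallel machinery removed, the decode path is strictly serial: one vpx_codec_decode() call yields at most one displayable frame, and the iterator survives only for API compatibility. A minimal caller-side sketch under those assumptions; the helper name is illustrative:

    #include "vpx/vpx_decoder.h"

    /* Sketch only: decode one compressed frame and drain the (single)
     * output image. */
    static vpx_codec_err_t decode_and_drain(vpx_codec_ctx_t *decoder,
                                            const uint8_t *buf,
                                            unsigned int len) {
      vpx_codec_iter_t iter = NULL;
      vpx_image_t *img;
      const vpx_codec_err_t res = vpx_codec_decode(decoder, buf, len, NULL, 0);
      if (res != VPX_CODEC_OK) return res;
      while ((img = vpx_codec_get_frame(decoder, &iter)) != NULL) {
        /* Consume img->planes[] / img->stride[] here. */
      }
      return VPX_CODEC_OK;
    }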
diff --git a/libvpx/vp9/vp9_dx_iface.h b/libvpx/vp9/vp9_dx_iface.h
index c1559599b..18bc7ab0d 100644
--- a/libvpx/vp9/vp9_dx_iface.h
+++ b/libvpx/vp9/vp9_dx_iface.h
@@ -15,19 +15,12 @@
typedef vpx_codec_stream_info_t vp9_stream_info_t;
-// This limit is due to framebuffer numbers.
-// TODO(hkuang): Remove this limit after implementing ondemand framebuffers.
-#define FRAME_CACHE_SIZE 6 // Cache maximum 6 decoded frames.
-
-typedef struct cache_frame {
- int fb_idx;
- vpx_image_t img;
-} cache_frame;
-
struct vpx_codec_alg_priv {
vpx_codec_priv_t base;
vpx_codec_dec_cfg_t cfg;
vp9_stream_info_t si;
+ VP9Decoder *pbi;
+ void *user_priv;
int postproc_cfg_set;
vp8_postproc_cfg_t postproc_cfg;
vpx_decrypt_cb decrypt_cb;
@@ -40,20 +33,8 @@ struct vpx_codec_alg_priv {
int byte_alignment;
int skip_loop_filter;
- // Frame parallel related.
- int frame_parallel_decode; // frame-based threading.
- VPxWorker *frame_workers;
- int num_frame_workers;
- int next_submit_worker_id;
- int last_submit_worker_id;
- int next_output_worker_id;
- int available_threads;
- cache_frame frame_cache[FRAME_CACHE_SIZE];
- int frame_cache_write;
- int frame_cache_read;
- int num_cache_frames;
int need_resync; // wait for key/intra-only frame
- // BufferPool that holds all reference frames. Shared by all the FrameWorkers.
+ // BufferPool that holds all reference frames.
BufferPool *buffer_pool;
// External frame buffer info to save for VP9 common.
diff --git a/libvpx/vp9/vp9cx.mk b/libvpx/vp9/vp9cx.mk
index 47846c941..d633ed142 100644
--- a/libvpx/vp9/vp9cx.mk
+++ b/libvpx/vp9/vp9cx.mk
@@ -130,6 +130,7 @@ ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c
endif
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c
@@ -138,4 +139,10 @@ VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
+# Strip unnecessary files with CONFIG_REALTIME_ONLY
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_firstpass.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_mbgraph.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_sse4.c
+
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/libvpx/vp9/vp9dx.mk b/libvpx/vp9/vp9dx.mk
index 4c6fd0071..59f612b94 100644
--- a/libvpx/vp9/vp9dx.mk
+++ b/libvpx/vp9/vp9dx.mk
@@ -24,8 +24,6 @@ VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h
VP9_DX_SRCS-yes += decoder/vp9_detokenize.c
VP9_DX_SRCS-yes += decoder/vp9_decodemv.h
VP9_DX_SRCS-yes += decoder/vp9_detokenize.h
-VP9_DX_SRCS-yes += decoder/vp9_dthread.c
-VP9_DX_SRCS-yes += decoder/vp9_dthread.h
VP9_DX_SRCS-yes += decoder/vp9_decoder.c
VP9_DX_SRCS-yes += decoder/vp9_decoder.h
VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c
diff --git a/libvpx/vpx/src/svc_encodeframe.c b/libvpx/vpx/src/svc_encodeframe.c
index c774abb34..f633600c7 100644
--- a/libvpx/vpx/src/svc_encodeframe.c
+++ b/libvpx/vpx/src/svc_encodeframe.c
@@ -131,9 +131,9 @@ static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level, const char *fmt,
static vpx_codec_err_t extract_option(LAYER_OPTION_TYPE type, char *input,
int *value0, int *value1) {
if (type == SCALE_FACTOR) {
- *value0 = strtol(input, &input, 10);
+ *value0 = (int)strtol(input, &input, 10);
if (*input++ != '/') return VPX_CODEC_INVALID_PARAM;
- *value1 = strtol(input, &input, 10);
+ *value1 = (int)strtol(input, &input, 10);
if (*value0 < option_min_values[SCALE_FACTOR] ||
*value1 < option_min_values[SCALE_FACTOR] ||
@@ -559,8 +559,7 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
iter = NULL;
while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) {
switch (cx_pkt->kind) {
-#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION)
-#if CONFIG_SPATIAL_SVC
+#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC)
case VPX_CODEC_SPATIAL_SVC_LAYER_PSNR: {
int i;
for (i = 0; i < svc_ctx->spatial_layers; ++i) {
@@ -595,9 +594,8 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
break;
}
#endif
-#endif
case VPX_CODEC_PSNR_PKT: {
-#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION)
+#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC)
int j;
svc_log(svc_ctx, SVC_LOG_DEBUG,
"frame: %d, layer: %d, PSNR(Total/Y/U/V): "
diff --git a/libvpx/vpx/src/vpx_encoder.c b/libvpx/vpx/src/vpx_encoder.c
index 4390cf7c8..1cf2dca69 100644
--- a/libvpx/vpx/src/vpx_encoder.c
+++ b/libvpx/vpx/src/vpx_encoder.c
@@ -12,8 +12,11 @@
* \brief Provides the high level interface to wrap encoder algorithms.
*
*/
+#include <assert.h>
#include <limits.h>
+#include <stdlib.h>
#include <string.h>
+#include "vp8/common/blockd.h"
#include "vpx_config.h"
#include "vpx/internal/vpx_codec_internal.h"
@@ -81,6 +84,8 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(
int i;
void *mem_loc = NULL;
+ if (iface->enc.mr_get_mem_loc == NULL) return VPX_CODEC_INCAPABLE;
+
if (!(res = iface->enc.mr_get_mem_loc(cfg, &mem_loc))) {
for (i = 0; i < num_enc; i++) {
vpx_codec_priv_enc_mr_cfg_t mr_cfg;
@@ -89,28 +94,27 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(
if (dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 ||
dsf->den > dsf->num) {
res = VPX_CODEC_INVALID_PARAM;
- break;
+ } else {
+ mr_cfg.mr_low_res_mode_info = mem_loc;
+ mr_cfg.mr_total_resolutions = num_enc;
+ mr_cfg.mr_encoder_id = num_enc - 1 - i;
+ mr_cfg.mr_down_sampling_factor.num = dsf->num;
+ mr_cfg.mr_down_sampling_factor.den = dsf->den;
+
+        /* Force key-frame synchronization. Namely, encoders at higher
+         * resolutions always use the same frame_type chosen by the
+         * lowest-resolution encoder.
+ */
+ if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED;
+
+ ctx->iface = iface;
+ ctx->name = iface->name;
+ ctx->priv = NULL;
+ ctx->init_flags = flags;
+ ctx->config.enc = cfg;
+ res = ctx->iface->init(ctx, &mr_cfg);
}
- mr_cfg.mr_low_res_mode_info = mem_loc;
- mr_cfg.mr_total_resolutions = num_enc;
- mr_cfg.mr_encoder_id = num_enc - 1 - i;
- mr_cfg.mr_down_sampling_factor.num = dsf->num;
- mr_cfg.mr_down_sampling_factor.den = dsf->den;
-
- /* Force Key-frame synchronization. Namely, encoder at higher
- * resolution always use the same frame_type chosen by the
- * lowest-resolution encoder.
- */
- if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED;
-
- ctx->iface = iface;
- ctx->name = iface->name;
- ctx->priv = NULL;
- ctx->init_flags = flags;
- ctx->config.enc = cfg;
- res = ctx->iface->init(ctx, &mr_cfg);
-
if (res) {
const char *error_detail = ctx->priv ? ctx->priv->err_detail : NULL;
/* Destroy current ctx */
@@ -124,10 +128,14 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(
vpx_codec_destroy(ctx);
i--;
}
+#if CONFIG_MULTI_RES_ENCODING
+ assert(mem_loc);
+ free(((LOWER_RES_FRAME_INFO *)mem_loc)->mb_info);
+ free(mem_loc);
+#endif
+ return SAVE_STATUS(ctx, res);
}
- if (res) break;
-
ctx++;
cfg++;
dsf++;
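The restructured failure path above now returns immediately after destroying the partially initialized contexts, and it also frees the shared low-resolution mode-info block that mr_get_mem_loc() allocated. Factored out as a sketch (the helper name is illustrative; the types come from the hunk itself):

    #include <stdlib.h>
    #include "vpx_config.h"
    #include "vp8/common/blockd.h" /* LOWER_RES_FRAME_INFO */

    #if CONFIG_MULTI_RES_ENCODING
    /* Release the shared multi-resolution state on init failure, as the
     * hunk above does inline. */
    static void free_mr_mem_loc(void *mem_loc) {
      free(((LOWER_RES_FRAME_INFO *)mem_loc)->mb_info);
      free(mem_loc);
    }
    #endif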
diff --git a/libvpx/vpx/vp8cx.h b/libvpx/vpx/vp8cx.h
index ee6be4a24..c21b8b60d 100644
--- a/libvpx/vpx/vp8cx.h
+++ b/libvpx/vpx/vp8cx.h
@@ -333,11 +333,12 @@ enum vp8e_enc_control_id {
* 2 = 4 tile columns
* .....
* n = 2**n tile columns
- * The requested tile columns will be capped by encoder based on image size
- * limitation (The minimum width of a tile column is 256 pixel, the maximum
- * is 4096).
+ * The requested tile columns will be capped by the encoder based on image
+ * size limitations (The minimum width of a tile column is 256 pixels, the
+ * maximum is 4096).
*
- * By default, the value is 0, i.e. one single column tile for entire image.
+ * By default, the value is 6, i.e., the maximum number of tiles supported by
+ * the resolution.
*
* Supported in codecs: VP9
*/
@@ -368,10 +369,10 @@ enum vp8e_enc_control_id {
* VP9 has a bitstream feature to reduce decoding dependency between frames
* by turning off backward update of probability context used in encoding
* and decoding. This allows staged parallel processing of more than one
- * video frames in the decoder. This control function provides a mean to
+ * video frame in the decoder. This control function provides a means to
 * turn this feature on or off for bitstreams produced by the encoder.
*
- * By default, this feature is off.
+ * By default, this feature is on.
*
* Supported in codecs: VP9
*/
@@ -407,7 +408,7 @@ enum vp8e_enc_control_id {
/*!\brief Codec control function to set noise sensitivity.
*
- * 0: off, 1: On(YOnly)
+   * 0: off, 1: On (Y only), 2: for SVC only, on the top two spatial layers
+   * (Y only)
*
* Supported in codecs: VP9
*/
@@ -443,6 +444,7 @@ enum vp8e_enc_control_id {
* \note Valid parameter range:
* VP9E_CONTENT_DEFAULT = Regular video content (Default)
* VP9E_CONTENT_SCREEN = Screen capture content
+ * VP9E_CONTENT_FILM = Film content: improves grain retention
*
* Supported in codecs: VP9
*/
@@ -695,6 +697,7 @@ typedef enum {
typedef enum {
VP9E_CONTENT_DEFAULT,
VP9E_CONTENT_SCREEN,
+ VP9E_CONTENT_FILM,
VP9E_CONTENT_INVALID
} vp9e_tune_content;
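The new enum value is reachable through the existing VP9E_SET_TUNE_CONTENT control; a minimal sketch for an already-initialized encoder:

    #include "vpx/vp8cx.h"

    /* Sketch only: opt a VP9 encoder into the film tuning added above. */
    static vpx_codec_err_t tune_for_film(vpx_codec_ctx_t *codec) {
      return vpx_codec_control(codec, VP9E_SET_TUNE_CONTENT,
                               (int)VP9E_CONTENT_FILM);
    }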
diff --git a/libvpx/vpx/vp8dx.h b/libvpx/vpx/vp8dx.h
index 41c53e48d..398c67022 100644
--- a/libvpx/vpx/vp8dx.h
+++ b/libvpx/vpx/vp8dx.h
@@ -179,6 +179,8 @@ VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
#define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER
#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER
VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int)
+#define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER
+VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int)
/*!\endcond */
/*! @} - end defgroup vp8_decoder */
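The two new lines expose the typed wrapper for VP9_SET_SKIP_LOOP_FILTER, which the decoder side of this patch already honors (ctx->skip_loop_filter is forwarded to pbi->common.skip_loop_filter). Usage sketch:

    #include "vpx/vp8dx.h"

    /* Sketch only: trade reconstruction quality for speed by skipping
     * the loop filter on an initialized VP9 decoder. */
    static vpx_codec_err_t set_skip_loop_filter(vpx_codec_ctx_t *decoder,
                                                int skip) {
      return vpx_codec_control(decoder, VP9_SET_SKIP_LOOP_FILTER, skip);
    }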
diff --git a/libvpx/vpx/vpx_codec.h b/libvpx/vpx/vpx_codec.h
index e91cd9e0d..ad05f4c74 100644
--- a/libvpx/vpx/vpx_codec.h
+++ b/libvpx/vpx/vpx_codec.h
@@ -46,34 +46,35 @@ extern "C" {
#include "./vpx_integer.h"
/*!\brief Decorator indicating a function is deprecated */
-#ifndef DEPRECATED
+#ifndef VPX_DEPRECATED
#if defined(__GNUC__) && __GNUC__
-#define DEPRECATED __attribute__((deprecated))
+#define VPX_DEPRECATED __attribute__((deprecated))
#elif defined(_MSC_VER)
-#define DEPRECATED
+#define VPX_DEPRECATED
#else
-#define DEPRECATED
+#define VPX_DEPRECATED
#endif
-#endif /* DEPRECATED */
+#endif /* VPX_DEPRECATED */
-#ifndef DECLSPEC_DEPRECATED
+#ifndef VPX_DECLSPEC_DEPRECATED
#if defined(__GNUC__) && __GNUC__
-#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */
+#define VPX_DECLSPEC_DEPRECATED /**< \copydoc #VPX_DEPRECATED */
#elif defined(_MSC_VER)
-/*!\brief \copydoc #DEPRECATED */
-#define DECLSPEC_DEPRECATED __declspec(deprecated)
+/*!\brief \copydoc #VPX_DEPRECATED */
+#define VPX_DECLSPEC_DEPRECATED __declspec(deprecated)
#else
-#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */
+#define VPX_DECLSPEC_DEPRECATED /**< \copydoc #VPX_DEPRECATED */
#endif
-#endif /* DECLSPEC_DEPRECATED */
+#endif /* VPX_DECLSPEC_DEPRECATED */
/*!\brief Decorator indicating a function is potentially unused */
-#ifdef UNUSED
-#elif defined(__GNUC__) || defined(__clang__)
-#define UNUSED __attribute__((unused))
+#ifndef VPX_UNUSED
+#if defined(__GNUC__) || defined(__clang__)
+#define VPX_UNUSED __attribute__((unused))
#else
-#define UNUSED
+#define VPX_UNUSED
#endif
+#endif /* VPX_UNUSED */
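The point of the VPX_ prefixes is collision avoidance: DEPRECATED and UNUSED are common application-side macro names. An illustrative sketch of the situation the rename fixes:

    /* Before the rename, an application macro leaked into the header: */
    #define UNUSED              /* application's own (empty) macro */
    #include "vpx/vpx_codec.h"  /* old header kept the app's UNUSED, so the
                                 * static control wrappers lost their
                                 * __attribute__((unused)) and warned. */
    /* With the VPX_UNUSED spelling the two namespaces no longer overlap. */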
/*!\brief Current ABI version number
*
@@ -413,7 +414,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...);
*/
#define VPX_CTRL_USE_TYPE(id, typ) \
static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int, typ) \
- UNUSED; \
+ VPX_UNUSED; \
\
static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx, \
int ctrl_id, typ data) { \
@@ -430,13 +431,13 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...);
* It defines a static function with the correctly typed arguments as a
* wrapper to the type-unsafe internal function.
*/
-#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \
- DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \
- vpx_codec_ctx_t *, int, typ) DEPRECATED UNUSED; \
- \
- DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \
- vpx_codec_ctx_t *ctx, int ctrl_id, typ data) { \
- return vpx_codec_control_(ctx, ctrl_id, data); \
+#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \
+ VPX_DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \
+ vpx_codec_ctx_t *, int, typ) VPX_DEPRECATED VPX_UNUSED; \
+ \
+ VPX_DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \
+ vpx_codec_ctx_t *ctx, int ctrl_id, typ data) { \
+ return vpx_codec_control_(ctx, ctrl_id, data); \
} /**<\hideinitializer*/
/*!\brief vpx_codec_control void type definition macro
@@ -451,7 +452,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...);
*/
#define VPX_CTRL_VOID(id) \
static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int) \
- UNUSED; \
+ VPX_UNUSED; \
\
static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx, \
int ctrl_id) { \
diff --git a/libvpx/vpx/vpx_encoder.h b/libvpx/vpx/vpx_encoder.h
index c915ed671..464bc408c 100644
--- a/libvpx/vpx/vpx_encoder.h
+++ b/libvpx/vpx/vpx_encoder.h
@@ -63,7 +63,7 @@ extern "C" {
* fields to structures
*/
#define VPX_ENCODER_ABI_VERSION \
- (5 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
+ (6 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
/*! \brief Encoder capabilities bitfield
*
@@ -154,9 +154,8 @@ enum vpx_codec_cx_pkt_kind {
VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */
VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */
VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */
-// Spatial SVC is still experimental and may be removed before the next ABI
-// bump.
-#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION)
+// Spatial SVC is still experimental and may be removed.
+#if defined(VPX_TEST_SPATIAL_SVC)
VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/
VPX_CODEC_SPATIAL_SVC_LAYER_PSNR, /**< PSNR for each layer in this frame*/
#endif
@@ -192,9 +191,8 @@ typedef struct vpx_codec_cx_pkt {
double psnr[4]; /**< PSNR, total/y/u/v */
} psnr; /**< data for PSNR packet */
vpx_fixed_buf_t raw; /**< data for arbitrary packets */
-// Spatial SVC is still experimental and may be removed before the next
-// ABI bump.
-#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION)
+// Spatial SVC is still experimental and may be removed.
+#if defined(VPX_TEST_SPATIAL_SVC)
size_t layer_sizes[VPX_SS_MAX_LAYERS];
struct vpx_psnr_pkt layer_psnr[VPX_SS_MAX_LAYERS];
#endif
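The experimental spatial-SVC packets are now gated on an explicit compile-time opt-in rather than the ABI-version comparison, which the bump to 6 above would otherwise have silently turned on. A hedged sketch of consuming them when everything is built with -DVPX_TEST_SPATIAL_SVC (`pkt` is an assumed const vpx_codec_cx_pkt_t *):

    #if defined(VPX_TEST_SPATIAL_SVC)
    if (pkt->kind == VPX_CODEC_SPATIAL_SVC_LAYER_SIZES) {
      size_t sl0 = pkt->data.layer_sizes[0];  /* bytes in spatial layer 0 */
      (void)sl0;
    }
    #endif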
@@ -508,25 +506,31 @@ typedef struct vpx_codec_enc_cfg {
/*!\brief Rate control adaptation undershoot control
*
- * This value, expressed as a percentage of the target bitrate,
+ * VP8: Expressed as a percentage of the target bitrate,
* controls the maximum allowed adaptation speed of the codec.
* This factor controls the maximum amount of bits that can
* be subtracted from the target bitrate in order to compensate
* for prior overshoot.
- *
- * Valid values in the range 0-1000.
+ * VP9: Expressed as a percentage of the target bitrate, a threshold
+ * undershoot level (current rate vs target) beyond which more aggressive
+ * corrective measures are taken.
+ *
+ * Valid values are in the range 0-1000 for VP8 and 0-100 for VP9.
*/
unsigned int rc_undershoot_pct;
/*!\brief Rate control adaptation overshoot control
*
- * This value, expressed as a percentage of the target bitrate,
+ * VP8: Expressed as a percentage of the target bitrate,
* controls the maximum allowed adaptation speed of the codec.
* This factor controls the maximum amount of bits that can
* be added to the target bitrate in order to compensate for
* prior undershoot.
+ * VP9: Expressed as a percentage of the target bitrate, a threshold
+ * overshoot level (current rate vs target) beyond which more aggressive
+ * corrective measures are taken.
*
- * Valid values in the range 0-1000.
+ * Valid values are in the range 0-1000 for VP8 and 0-100 for VP9.
*/
unsigned int rc_overshoot_pct;
@@ -591,6 +595,13 @@ typedef struct vpx_codec_enc_cfg {
*/
unsigned int rc_2pass_vbr_maxsection_pct;
+ /*!\brief Two-pass corpus vbr mode complexity control
+ * Used only in VP9: A value representing the corpus midpoint complexity
+ * for corpus vbr mode. This value defaults to 0, which disables corpus vbr
+ * mode in favour of normal vbr mode.
+ */
+ unsigned int rc_2pass_vbr_corpus_complexity;
+
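Configuration sketch for the rate-control fields above (illustrative; `cfg` is an assumed vpx_codec_enc_cfg_t filled in by vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0)):

    cfg.rc_undershoot_pct = 50;  /* VP9 range is 0-100; VP8 allows 0-1000 */
    cfg.rc_overshoot_pct = 50;   /* same VP9/VP8 split as above */
    /* Nonzero enables corpus VBR in two-pass mode; 0 keeps normal VBR. */
    cfg.rc_2pass_vbr_corpus_complexity = 0;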
/*
* keyframing settings (kf)
*/
diff --git a/libvpx/vpx_dsp/add_noise.c b/libvpx/vpx_dsp/add_noise.c
index a2b4c9010..cda6ae881 100644
--- a/libvpx/vpx_dsp/add_noise.c
+++ b/libvpx/vpx_dsp/add_noise.c
@@ -15,6 +15,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/postproc.h"
#include "vpx_ports/mem.h"
void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp,
diff --git a/libvpx/vpx_dsp/arm/avg_neon.c b/libvpx/vpx_dsp/arm/avg_neon.c
index 257e8ffee..fa7dd0960 100644
--- a/libvpx/vpx_dsp/arm/avg_neon.c
+++ b/libvpx/vpx_dsp/arm/avg_neon.c
@@ -17,51 +17,35 @@
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
-static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
- const uint32x4_t a = vpaddlq_u16(v_16x8);
- const uint64x2_t b = vpaddlq_u32(a);
- const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
- vreinterpret_u32_u64(vget_high_u64(b)));
- return vget_lane_u32(c, 0);
+uint32_t vpx_avg_4x4_neon(const uint8_t *a, int a_stride) {
+ const uint8x16_t b = load_unaligned_u8q(a, a_stride);
+ const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b));
+ const uint32x2_t d = horizontal_add_uint16x8(c);
+ return vget_lane_u32(vrshr_n_u32(d, 4), 0);
}
-unsigned int vpx_avg_4x4_neon(const uint8_t *s, int p) {
- uint16x8_t v_sum;
- uint32x2_t v_s0 = vdup_n_u32(0);
- uint32x2_t v_s1 = vdup_n_u32(0);
- v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0);
- v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1);
- v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0);
- v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1);
- v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1));
- return (horizontal_add_u16x8(v_sum) + 8) >> 4;
-}
-
-unsigned int vpx_avg_8x8_neon(const uint8_t *s, int p) {
- uint8x8_t v_s0 = vld1_u8(s);
- const uint8x8_t v_s1 = vld1_u8(s + p);
- uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);
-
- v_s0 = vld1_u8(s + 2 * p);
- v_sum = vaddw_u8(v_sum, v_s0);
-
- v_s0 = vld1_u8(s + 3 * p);
- v_sum = vaddw_u8(v_sum, v_s0);
-
- v_s0 = vld1_u8(s + 4 * p);
- v_sum = vaddw_u8(v_sum, v_s0);
-
- v_s0 = vld1_u8(s + 5 * p);
- v_sum = vaddw_u8(v_sum, v_s0);
-
- v_s0 = vld1_u8(s + 6 * p);
- v_sum = vaddw_u8(v_sum, v_s0);
+uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) {
+ int i;
+ uint8x8_t b, c;
+ uint16x8_t sum;
+ uint32x2_t d;
+ b = vld1_u8(a);
+ a += a_stride;
+ c = vld1_u8(a);
+ a += a_stride;
+ sum = vaddl_u8(b, c);
+
+ for (i = 0; i < 6; ++i) {
+ const uint8x8_t d = vld1_u8(a);
+ a += a_stride;
+ sum = vaddw_u8(sum, d);
+ }
- v_s0 = vld1_u8(s + 7 * p);
- v_sum = vaddw_u8(v_sum, v_s0);
+ d = horizontal_add_uint16x8(sum);
- return (horizontal_add_u16x8(v_sum) + 32) >> 6;
+ return vget_lane_u32(vrshr_n_u32(d, 6), 0);
}
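For reference (not part of the patch), the rounding in the rewritten NEON averages matches the scalar definition; vrshr_n_u32(d, 6) computes (sum + 32) >> 6. A scalar sketch of the 8x8 case:

    #include <stdint.h>

    /* Rounded mean of an 8x8 block, the value vpx_avg_8x8_neon returns. */
    static unsigned int avg_8x8_scalar(const uint8_t *a, int a_stride) {
      int r, c;
      unsigned int sum = 0;
      for (r = 0; r < 8; ++r, a += a_stride)
        for (c = 0; c < 8; ++c) sum += a[c];
      return (sum + 32) >> 6;
    }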
// coeff: 16 bits, dynamic range [-32640, 32640].
@@ -155,7 +139,8 @@ int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
ref += 16;
}
- return horizontal_add_u16x8(vec_sum);
+ return vget_lane_s16(vreinterpret_s16_u32(horizontal_add_uint16x8(vec_sum)),
+ 0);
}
// ref, src = [0, 510] - max diff = 16-bits
@@ -185,7 +170,7 @@ int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
{
// Note: 'total''s pairwise addition could be implemented similarly to
- // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired
+ // horizontal_add_uint16x8(), but one less vpaddl with 'total' when paired
// with the summation of 'sse' performed better on a Cortex-A15.
const int32x4_t t0 = vpaddlq_s16(total); // cascading summation of 'total'
const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0));
diff --git a/libvpx/vpx_dsp/arm/avg_pred_neon.c b/libvpx/vpx_dsp/arm/avg_pred_neon.c
new file mode 100644
index 000000000..1370ec2d2
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/avg_pred_neon.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+
+void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ if (width > 8) {
+ int x, y;
+ for (y = 0; y < height; ++y) {
+ for (x = 0; x < width; x += 16) {
+ const uint8x16_t p = vld1q_u8(pred + x);
+ const uint8x16_t r = vld1q_u8(ref + x);
+ const uint8x16_t avg = vrhaddq_u8(p, r);
+ vst1q_u8(comp + x, avg);
+ }
+ comp += width;
+ pred += width;
+ ref += ref_stride;
+ }
+ } else {
+ int i;
+ for (i = 0; i < width * height; i += 16) {
+ const uint8x16_t p = vld1q_u8(pred);
+ uint8x16_t r;
+
+ if (width == 4) {
+ r = load_unaligned_u8q(ref, ref_stride);
+ ref += 4 * ref_stride;
+ } else {
+ const uint8x8_t r_0 = vld1_u8(ref);
+ const uint8x8_t r_1 = vld1_u8(ref + ref_stride);
+ assert(width == 8);
+ r = vcombine_u8(r_0, r_1);
+ ref += 2 * ref_stride;
+ }
+ r = vrhaddq_u8(r, p);
+ vst1q_u8(comp, r);
+
+ pred += 16;
+ comp += 16;
+ }
+ }
+}
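vrhaddq_u8 is a rounding halving add, (a + b + 1) >> 1 per lane, so the function above is the vector form of this scalar sketch (illustrative, not from the patch):

    #include <stdint.h>

    /* comp[i] = (pred[i] + ref[i] + 1) >> 1, row by row. */
    static void comp_avg_pred_scalar(uint8_t *comp, const uint8_t *pred,
                                     int width, int height,
                                     const uint8_t *ref, int ref_stride) {
      int x, y;
      for (y = 0; y < height; ++y) {
        for (x = 0; x < width; ++x) comp[x] = (pred[x] + ref[x] + 1) >> 1;
        comp += width;
        pred += width;
        ref += ref_stride;
      }
    }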
diff --git a/libvpx/vpx_dsp/arm/fdct16x16_neon.c b/libvpx/vpx_dsp/arm/fdct16x16_neon.c
new file mode 100644
index 000000000..6b2bebd09
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/fdct16x16_neon.c
@@ -0,0 +1,387 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+// Some builds of gcc 4.9.2 and 4.9.3 have trouble with some of the inline
+// functions.
+#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \
+ __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4
+
+void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
+ vpx_fdct16x16_c(input, output, stride);
+}
+
+#else
+
+static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) {
+ b[0] = vld1q_s16(a);
+ a += stride;
+ b[1] = vld1q_s16(a);
+ a += stride;
+ b[2] = vld1q_s16(a);
+ a += stride;
+ b[3] = vld1q_s16(a);
+ a += stride;
+ b[4] = vld1q_s16(a);
+ a += stride;
+ b[5] = vld1q_s16(a);
+ a += stride;
+ b[6] = vld1q_s16(a);
+ a += stride;
+ b[7] = vld1q_s16(a);
+ a += stride;
+ b[8] = vld1q_s16(a);
+ a += stride;
+ b[9] = vld1q_s16(a);
+ a += stride;
+ b[10] = vld1q_s16(a);
+ a += stride;
+ b[11] = vld1q_s16(a);
+ a += stride;
+ b[12] = vld1q_s16(a);
+ a += stride;
+ b[13] = vld1q_s16(a);
+ a += stride;
+ b[14] = vld1q_s16(a);
+ a += stride;
+ b[15] = vld1q_s16(a);
+}
+
+// Store 8 16x8 values, assuming stride == 16.
+static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) {
+ store_s16q_to_tran_low(a, b[0]);
+ a += 16;
+ store_s16q_to_tran_low(a, b[1]);
+ a += 16;
+ store_s16q_to_tran_low(a, b[2]);
+ a += 16;
+ store_s16q_to_tran_low(a, b[3]);
+ a += 16;
+ store_s16q_to_tran_low(a, b[4]);
+ a += 16;
+ store_s16q_to_tran_low(a, b[5]);
+ a += 16;
+ store_s16q_to_tran_low(a, b[6]);
+ a += 16;
+ store_s16q_to_tran_low(a, b[7]);
+}
+
+// Load step of each pass. Add and subtract clear across the input, requiring
+// all 16 values to be loaded. For the first pass it also multiplies by 4.
+
+// To maybe reduce register usage this could be combined with the load() step to
+// get the first 4 and last 4 values, cross those, then load the middle 8 values
+// and cross them.
+static INLINE void cross_input(const int16x8_t *a /*[16]*/,
+ int16x8_t *b /*[16]*/, const int pass) {
+ if (pass == 0) {
+ b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2);
+ b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2);
+ b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2);
+ b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2);
+ b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2);
+ b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2);
+ b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2);
+ b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2);
+
+ b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2);
+ b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2);
+ b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2);
+ b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2);
+ b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2);
+ b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2);
+ b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2);
+ b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2);
+ } else {
+ b[0] = vaddq_s16(a[0], a[15]);
+ b[1] = vaddq_s16(a[1], a[14]);
+ b[2] = vaddq_s16(a[2], a[13]);
+ b[3] = vaddq_s16(a[3], a[12]);
+ b[4] = vaddq_s16(a[4], a[11]);
+ b[5] = vaddq_s16(a[5], a[10]);
+ b[6] = vaddq_s16(a[6], a[9]);
+ b[7] = vaddq_s16(a[7], a[8]);
+
+ b[8] = vsubq_s16(a[7], a[8]);
+ b[9] = vsubq_s16(a[6], a[9]);
+ b[10] = vsubq_s16(a[5], a[10]);
+ b[11] = vsubq_s16(a[4], a[11]);
+ b[12] = vsubq_s16(a[3], a[12]);
+ b[13] = vsubq_s16(a[2], a[13]);
+ b[14] = vsubq_s16(a[1], a[14]);
+ b[15] = vsubq_s16(a[0], a[15]);
+ }
+}
+
+// Quarter round at the beginning of the second pass. Can't use vrshr (rounding)
+// because this only adds 1, not 1 << 2.
+static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) {
+ const int16x8_t one = vdupq_n_s16(1);
+ a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2);
+ a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2);
+ a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2);
+ a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2);
+ a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2);
+ a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2);
+ a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2);
+ a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2);
+ a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2);
+ a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2);
+ a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2);
+ a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2);
+ a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2);
+ a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2);
+ a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2);
+ a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2);
+}
+
+// fdct_round_shift((a +/- b) * c)
+static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
+ const tran_high_t c, int16x8_t *add,
+ int16x8_t *sub) {
+ const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c);
+ const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c);
+ const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c);
+ const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c);
+ const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c);
+ const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c);
+ const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
+ const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
+ const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
+ const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
+ *add = vcombine_s16(rounded0, rounded1);
+ *sub = vcombine_s16(rounded2, rounded3);
+}
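The vqrshrn_n_s32(x, 14) steps implement fdct_round_shift for DCT_CONST_BITS == 14, with the saturating narrow to int16_t supplied by the instruction itself. A scalar model (illustrative):

    #include <stdint.h>

    /* fdct_round_shift(x) == ROUND_POWER_OF_TWO(x, 14). */
    static int16_t fdct_round_shift_scalar(int32_t x) {
      return (int16_t)((x + (1 << 13)) >> 14);
    }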
+
+// fdct_round_shift(a * c0 +/- b * c1)
+static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
+ const tran_coef_t c0,
+ const tran_coef_t c1, int16x8_t *add,
+ int16x8_t *sub) {
+ const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0);
+ const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0);
+ const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1);
+ const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1);
+ const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0);
+ const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0);
+ const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1);
+ const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1);
+ const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
+ const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
+ const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
+ const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
+ *add = vcombine_s16(rounded0, rounded1);
+ *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
+// are all in-place.
+static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/,
+ int16x8_t *b /*[8]*/) {
+ // Swap 16 bit elements.
+ const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
+ const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
+ const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
+ const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
+
+ // Swap 32 bit elements.
+ const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
+ vreinterpretq_s32_s16(c1.val[0]));
+ const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
+ vreinterpretq_s32_s16(c1.val[1]));
+ const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
+ vreinterpretq_s32_s16(c3.val[0]));
+ const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
+ vreinterpretq_s32_s16(c3.val[1]));
+
+ // Swap 64 bit elements
+ const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
+ const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
+ const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
+ const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
+
+ b[0] = e0.val[0];
+ b[1] = e1.val[0];
+ b[2] = e2.val[0];
+ b[3] = e3.val[0];
+ b[4] = e0.val[1];
+ b[5] = e1.val[1];
+ b[6] = e2.val[1];
+ b[7] = e3.val[1];
+}
+
+// Main body of fdct16x16.
+static void dct_body(const int16x8_t *in /*[16]*/, int16x8_t *out /*[16]*/) {
+ int16x8_t s[8];
+ int16x8_t x[4];
+ int16x8_t step[8];
+
+ // stage 1
+  // From fwd_txfm.c: "Work on the first eight values; fdct8(input,
+  // even_results);"
+ s[0] = vaddq_s16(in[0], in[7]);
+ s[1] = vaddq_s16(in[1], in[6]);
+ s[2] = vaddq_s16(in[2], in[5]);
+ s[3] = vaddq_s16(in[3], in[4]);
+ s[4] = vsubq_s16(in[3], in[4]);
+ s[5] = vsubq_s16(in[2], in[5]);
+ s[6] = vsubq_s16(in[1], in[6]);
+ s[7] = vsubq_s16(in[0], in[7]);
+
+ // fdct4(step, step);
+ x[0] = vaddq_s16(s[0], s[3]);
+ x[1] = vaddq_s16(s[1], s[2]);
+ x[2] = vsubq_s16(s[1], s[2]);
+ x[3] = vsubq_s16(s[0], s[3]);
+
+ // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]);
+ // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
+ // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
+ butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]);
+
+ // Stage 2
+ // Re-using source s5/s6
+ // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
+ // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
+ butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]);
+
+ // Stage 3
+ x[0] = vaddq_s16(s[4], s[5]);
+ x[1] = vsubq_s16(s[4], s[5]);
+ x[2] = vsubq_s16(s[7], s[6]);
+ x[3] = vaddq_s16(s[7], s[6]);
+
+ // Stage 4
+ // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]);
+ // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]);
+
+ // step 2
+  // From fwd_txfm.c: "Work on the next eight values; step1 -> odd_results"
+  // That file distinguished between "in_high" and "step1", but the only
+  // difference is that "in_high" is the first 8 values and "step1" is the
+  // second. Here, since they are all in one array, "step1" values are += 8.
+
+ // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64)
+ // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
+ // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
+ // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
+ butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]);
+ butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]);
+
+ // step 3
+ s[0] = vaddq_s16(in[8], s[3]);
+ s[1] = vaddq_s16(in[9], s[2]);
+ x[0] = vsubq_s16(in[9], s[2]);
+ x[1] = vsubq_s16(in[8], s[3]);
+ x[2] = vsubq_s16(in[15], s[4]);
+ x[3] = vsubq_s16(in[14], s[5]);
+ s[6] = vaddq_s16(in[14], s[5]);
+ s[7] = vaddq_s16(in[15], s[4]);
+
+ // step 4
+ // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64)
+ // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64)
+ butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]);
+
+ // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
+ // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64)
+ butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]);
+
+ // step 5
+ step[0] = vaddq_s16(s[0], s[1]);
+ step[1] = vsubq_s16(s[0], s[1]);
+ step[2] = vaddq_s16(x[1], s[2]);
+ step[3] = vsubq_s16(x[1], s[2]);
+ step[4] = vsubq_s16(x[2], s[5]);
+ step[5] = vaddq_s16(x[2], s[5]);
+ step[6] = vsubq_s16(s[7], s[6]);
+ step[7] = vaddq_s16(s[7], s[6]);
+
+ // step 6
+ // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64)
+ // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64)
+ // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64)
+ // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64)
+ // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64)
+ // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] *
+ // cospi_22_64)
+ // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64)
+ // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64)
+ butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9],
+ &out[7]);
+ butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1],
+ &out[15]);
+ butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13],
+ &out[3]);
+ butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5],
+ &out[11]);
+}
+
+void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
+ int16x8_t temp0[16];
+ int16x8_t temp1[16];
+ int16x8_t temp2[16];
+ int16x8_t temp3[16];
+
+ // Left half.
+ load(input, stride, temp0);
+ cross_input(temp0, temp1, 0);
+ dct_body(temp1, temp0);
+
+ // Right half.
+ load(input + 8, stride, temp1);
+ cross_input(temp1, temp2, 0);
+ dct_body(temp2, temp1);
+
+  // Transpose the top-left and top-right quarters into one contiguous
+  // location to process as the top half.
+ transpose_8x8(&temp0[0], &temp2[0]);
+ transpose_8x8(&temp1[0], &temp2[8]);
+ partial_round_shift(temp2);
+ cross_input(temp2, temp3, 1);
+ dct_body(temp3, temp2);
+ transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4],
+ &temp2[5], &temp2[6], &temp2[7]);
+ transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12],
+ &temp2[13], &temp2[14], &temp2[15]);
+ store(output, temp2);
+ store(output + 8, temp2 + 8);
+ output += 8 * 16;
+
+  // Transpose the bottom-left and bottom-right quarters into one contiguous
+  // location to process as the bottom half.
+ transpose_8x8(&temp0[8], &temp1[0]);
+ transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
+ &temp1[13], &temp1[14], &temp1[15]);
+ partial_round_shift(temp1);
+ cross_input(temp1, temp0, 1);
+ dct_body(temp0, temp1);
+ transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4],
+ &temp1[5], &temp1[6], &temp1[7]);
+ transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
+ &temp1[13], &temp1[14], &temp1[15]);
+ store(output, temp1);
+ store(output + 8, temp1 + 8);
+}
+#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
+ // __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4
diff --git a/libvpx/vpx_dsp/arm/fdct32x32_neon.c b/libvpx/vpx_dsp/arm/fdct32x32_neon.c
new file mode 100644
index 000000000..e9cd34904
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/fdct32x32_neon.c
@@ -0,0 +1,1507 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+// Most gcc 4.9 distributions outside of Android do not generate correct code
+// for this function.
+#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \
+ __GNUC__ == 4 && __GNUC_MINOR__ <= 9
+
+void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
+ vpx_fdct32x32_c(input, output, stride);
+}
+
+void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vpx_fdct32x32_rd_c(input, output, stride);
+}
+
+#else
+
+#define LOAD_INCREMENT(src, stride, dest, index) \
+ do { \
+ dest[index] = vld1q_s16(src); \
+ src += stride; \
+ } while (0)
+
+#define ADD_S16(src, index0, index1, dest, index3) \
+ do { \
+ dest[index3] = vaddq_s16(src[index0], src[index1]); \
+ } while (0)
+
+#define ADD_SHIFT_S16(src, index0, index1) \
+ do { \
+ src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \
+ } while (0)
+
+// Load, cross, and multiply by 4. Load the first 8 and last 8, then the
+// middle 16. Doing sets of 16 at a time. Maybe sets of 8 would be better?
+static INLINE void load(const int16_t *a, int stride, int16x8_t *b) {
+ const int16_t *a_end = a + 24 * stride;
+ int16x8_t c[8];
+
+ LOAD_INCREMENT(a, stride, b, 0);
+ LOAD_INCREMENT(a, stride, b, 1);
+ LOAD_INCREMENT(a, stride, b, 2);
+ LOAD_INCREMENT(a, stride, b, 3);
+ LOAD_INCREMENT(a, stride, b, 4);
+ LOAD_INCREMENT(a, stride, b, 5);
+ LOAD_INCREMENT(a, stride, b, 6);
+ LOAD_INCREMENT(a, stride, b, 7);
+
+ LOAD_INCREMENT(a_end, stride, b, 24);
+ LOAD_INCREMENT(a_end, stride, b, 25);
+ LOAD_INCREMENT(a_end, stride, b, 26);
+ LOAD_INCREMENT(a_end, stride, b, 27);
+ LOAD_INCREMENT(a_end, stride, b, 28);
+ LOAD_INCREMENT(a_end, stride, b, 29);
+ LOAD_INCREMENT(a_end, stride, b, 30);
+ LOAD_INCREMENT(a_end, stride, b, 31);
+
+ ADD_S16(b, 0, 31, c, 0);
+ ADD_S16(b, 1, 30, c, 1);
+ ADD_S16(b, 2, 29, c, 2);
+ ADD_S16(b, 3, 28, c, 3);
+ ADD_S16(b, 4, 27, c, 4);
+ ADD_S16(b, 5, 26, c, 5);
+ ADD_S16(b, 6, 25, c, 6);
+ ADD_S16(b, 7, 24, c, 7);
+
+ ADD_SHIFT_S16(b, 7, 24);
+ ADD_SHIFT_S16(b, 6, 25);
+ ADD_SHIFT_S16(b, 5, 26);
+ ADD_SHIFT_S16(b, 4, 27);
+ ADD_SHIFT_S16(b, 3, 28);
+ ADD_SHIFT_S16(b, 2, 29);
+ ADD_SHIFT_S16(b, 1, 30);
+ ADD_SHIFT_S16(b, 0, 31);
+
+ b[0] = vshlq_n_s16(c[0], 2);
+ b[1] = vshlq_n_s16(c[1], 2);
+ b[2] = vshlq_n_s16(c[2], 2);
+ b[3] = vshlq_n_s16(c[3], 2);
+ b[4] = vshlq_n_s16(c[4], 2);
+ b[5] = vshlq_n_s16(c[5], 2);
+ b[6] = vshlq_n_s16(c[6], 2);
+ b[7] = vshlq_n_s16(c[7], 2);
+
+ LOAD_INCREMENT(a, stride, b, 8);
+ LOAD_INCREMENT(a, stride, b, 9);
+ LOAD_INCREMENT(a, stride, b, 10);
+ LOAD_INCREMENT(a, stride, b, 11);
+ LOAD_INCREMENT(a, stride, b, 12);
+ LOAD_INCREMENT(a, stride, b, 13);
+ LOAD_INCREMENT(a, stride, b, 14);
+ LOAD_INCREMENT(a, stride, b, 15);
+ LOAD_INCREMENT(a, stride, b, 16);
+ LOAD_INCREMENT(a, stride, b, 17);
+ LOAD_INCREMENT(a, stride, b, 18);
+ LOAD_INCREMENT(a, stride, b, 19);
+ LOAD_INCREMENT(a, stride, b, 20);
+ LOAD_INCREMENT(a, stride, b, 21);
+ LOAD_INCREMENT(a, stride, b, 22);
+ LOAD_INCREMENT(a, stride, b, 23);
+
+ ADD_S16(b, 8, 23, c, 0);
+ ADD_S16(b, 9, 22, c, 1);
+ ADD_S16(b, 10, 21, c, 2);
+ ADD_S16(b, 11, 20, c, 3);
+ ADD_S16(b, 12, 19, c, 4);
+ ADD_S16(b, 13, 18, c, 5);
+ ADD_S16(b, 14, 17, c, 6);
+ ADD_S16(b, 15, 16, c, 7);
+
+ ADD_SHIFT_S16(b, 15, 16);
+ ADD_SHIFT_S16(b, 14, 17);
+ ADD_SHIFT_S16(b, 13, 18);
+ ADD_SHIFT_S16(b, 12, 19);
+ ADD_SHIFT_S16(b, 11, 20);
+ ADD_SHIFT_S16(b, 10, 21);
+ ADD_SHIFT_S16(b, 9, 22);
+ ADD_SHIFT_S16(b, 8, 23);
+
+ b[8] = vshlq_n_s16(c[0], 2);
+ b[9] = vshlq_n_s16(c[1], 2);
+ b[10] = vshlq_n_s16(c[2], 2);
+ b[11] = vshlq_n_s16(c[3], 2);
+ b[12] = vshlq_n_s16(c[4], 2);
+ b[13] = vshlq_n_s16(c[5], 2);
+ b[14] = vshlq_n_s16(c[6], 2);
+ b[15] = vshlq_n_s16(c[7], 2);
+}
+
+#undef LOAD_INCREMENT
+#undef ADD_S16
+#undef ADD_SHIFT_S16
+
+#define STORE_S16(src, index, dest) \
+ do { \
+ store_s16q_to_tran_low(dest, src[index]); \
+ dest += 8; \
+  } while (0)
+
+// Store 32 16x8 values, assuming stride == 32.
+// Slight twist: store horizontally in blocks of 8.
+static INLINE void store(tran_low_t *a, const int16x8_t *b) {
+ STORE_S16(b, 0, a);
+ STORE_S16(b, 8, a);
+ STORE_S16(b, 16, a);
+ STORE_S16(b, 24, a);
+ STORE_S16(b, 1, a);
+ STORE_S16(b, 9, a);
+ STORE_S16(b, 17, a);
+ STORE_S16(b, 25, a);
+ STORE_S16(b, 2, a);
+ STORE_S16(b, 10, a);
+ STORE_S16(b, 18, a);
+ STORE_S16(b, 26, a);
+ STORE_S16(b, 3, a);
+ STORE_S16(b, 11, a);
+ STORE_S16(b, 19, a);
+ STORE_S16(b, 27, a);
+ STORE_S16(b, 4, a);
+ STORE_S16(b, 12, a);
+ STORE_S16(b, 20, a);
+ STORE_S16(b, 28, a);
+ STORE_S16(b, 5, a);
+ STORE_S16(b, 13, a);
+ STORE_S16(b, 21, a);
+ STORE_S16(b, 29, a);
+ STORE_S16(b, 6, a);
+ STORE_S16(b, 14, a);
+ STORE_S16(b, 22, a);
+ STORE_S16(b, 30, a);
+ STORE_S16(b, 7, a);
+ STORE_S16(b, 15, a);
+ STORE_S16(b, 23, a);
+ STORE_S16(b, 31, a);
+}
+
+#undef STORE_S16
+
+// fdct_round_shift((a +/- b) * c)
+static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
+ const tran_high_t constant,
+ int16x8_t *add, int16x8_t *sub) {
+ const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
+ const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
+ const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
+ const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
+ const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
+ const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
+ const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
+ const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
+ const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
+ const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
+ *add = vcombine_s16(rounded0, rounded1);
+ *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// fdct_round_shift(a * c0 +/- b * c1)
+static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
+ const tran_coef_t constant0,
+ const tran_coef_t constant1,
+ int16x8_t *add, int16x8_t *sub) {
+ const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0);
+ const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0);
+ const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1);
+ const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1);
+ const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0);
+ const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0);
+ const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1);
+ const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1);
+ const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
+ const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
+ const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
+ const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
+ *add = vcombine_s16(rounded0, rounded1);
+ *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// Add 2 if positive, 1 if negative, and shift by 2.
+// In practice, subtract the sign bit, then shift with rounding.
+static INLINE int16x8_t sub_round_shift(const int16x8_t a) {
+ const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+ const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+ const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+ return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
+}
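The sign-bit trick above folds the asymmetric rounding into a single vrshrq; add_round_shift_s16/_s32 further down use the mirrored variant. In scalar terms (illustrative):

    #include <stdint.h>

    /* Add 2 if a >= 0, 1 if a < 0, then shift by 2. Subtracting the sign
     * bit first makes one rounding shift (add 2, then >> 2) equivalent. */
    static int16_t sub_round_shift_scalar(int16_t a) {
      return (int16_t)((a + 2 - (a < 0 ? 1 : 0)) >> 2);
    }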
+
+static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
+ int16x8_t a[32];
+ int16x8_t b[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+ // Mini cross. X the first 16 values and the middle 8 of the second half.
+ a[0] = vaddq_s16(in[0], in[15]);
+ a[1] = vaddq_s16(in[1], in[14]);
+ a[2] = vaddq_s16(in[2], in[13]);
+ a[3] = vaddq_s16(in[3], in[12]);
+ a[4] = vaddq_s16(in[4], in[11]);
+ a[5] = vaddq_s16(in[5], in[10]);
+ a[6] = vaddq_s16(in[6], in[9]);
+ a[7] = vaddq_s16(in[7], in[8]);
+
+ a[8] = vsubq_s16(in[7], in[8]);
+ a[9] = vsubq_s16(in[6], in[9]);
+ a[10] = vsubq_s16(in[5], in[10]);
+ a[11] = vsubq_s16(in[4], in[11]);
+ a[12] = vsubq_s16(in[3], in[12]);
+ a[13] = vsubq_s16(in[2], in[13]);
+ a[14] = vsubq_s16(in[1], in[14]);
+ a[15] = vsubq_s16(in[0], in[15]);
+
+ a[16] = in[16];
+ a[17] = in[17];
+ a[18] = in[18];
+ a[19] = in[19];
+
+ butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]);
+ butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]);
+ butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]);
+ butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]);
+
+ a[28] = in[28];
+ a[29] = in[29];
+ a[30] = in[30];
+ a[31] = in[31];
+
+ // Stage 3.
+ b[0] = vaddq_s16(a[0], a[7]);
+ b[1] = vaddq_s16(a[1], a[6]);
+ b[2] = vaddq_s16(a[2], a[5]);
+ b[3] = vaddq_s16(a[3], a[4]);
+
+ b[4] = vsubq_s16(a[3], a[4]);
+ b[5] = vsubq_s16(a[2], a[5]);
+ b[6] = vsubq_s16(a[1], a[6]);
+ b[7] = vsubq_s16(a[0], a[7]);
+
+ b[8] = a[8];
+ b[9] = a[9];
+
+ butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]);
+ butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]);
+
+ b[14] = a[14];
+ b[15] = a[15];
+
+ b[16] = vaddq_s16(in[16], a[23]);
+ b[17] = vaddq_s16(in[17], a[22]);
+ b[18] = vaddq_s16(in[18], a[21]);
+ b[19] = vaddq_s16(in[19], a[20]);
+
+ b[20] = vsubq_s16(in[19], a[20]);
+ b[21] = vsubq_s16(in[18], a[21]);
+ b[22] = vsubq_s16(in[17], a[22]);
+ b[23] = vsubq_s16(in[16], a[23]);
+
+ b[24] = vsubq_s16(in[31], a[24]);
+ b[25] = vsubq_s16(in[30], a[25]);
+ b[26] = vsubq_s16(in[29], a[26]);
+ b[27] = vsubq_s16(in[28], a[27]);
+
+ b[28] = vaddq_s16(in[28], a[27]);
+ b[29] = vaddq_s16(in[29], a[26]);
+ b[30] = vaddq_s16(in[30], a[25]);
+ b[31] = vaddq_s16(in[31], a[24]);
+
+ // Stage 4.
+ a[0] = vaddq_s16(b[0], b[3]);
+ a[1] = vaddq_s16(b[1], b[2]);
+ a[2] = vsubq_s16(b[1], b[2]);
+ a[3] = vsubq_s16(b[0], b[3]);
+
+ a[4] = b[4];
+
+ butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]);
+
+ a[7] = b[7];
+
+ a[8] = vaddq_s16(b[8], b[11]);
+ a[9] = vaddq_s16(b[9], b[10]);
+ a[10] = vsubq_s16(b[9], b[10]);
+ a[11] = vsubq_s16(b[8], b[11]);
+ a[12] = vsubq_s16(b[15], b[12]);
+ a[13] = vsubq_s16(b[14], b[13]);
+ a[14] = vaddq_s16(b[14], b[13]);
+ a[15] = vaddq_s16(b[15], b[12]);
+
+ a[16] = b[16];
+ a[17] = b[17];
+
+ butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]);
+ butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]);
+ butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]);
+ butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]);
+
+ a[22] = b[22];
+ a[23] = b[23];
+ a[24] = b[24];
+ a[25] = b[25];
+
+ a[30] = b[30];
+ a[31] = b[31];
+
+ // Stage 5.
+ butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]);
+ butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]);
+
+ b[4] = vaddq_s16(a[4], a[5]);
+ b[5] = vsubq_s16(a[4], a[5]);
+ b[6] = vsubq_s16(a[7], a[6]);
+ b[7] = vaddq_s16(a[7], a[6]);
+
+ b[8] = a[8];
+
+ butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]);
+ butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]);
+
+ b[11] = a[11];
+ b[12] = a[12];
+
+ b[15] = a[15];
+
+ b[16] = vaddq_s16(a[19], a[16]);
+ b[17] = vaddq_s16(a[18], a[17]);
+ b[18] = vsubq_s16(a[17], a[18]);
+ b[19] = vsubq_s16(a[16], a[19]);
+ b[20] = vsubq_s16(a[23], a[20]);
+ b[21] = vsubq_s16(a[22], a[21]);
+ b[22] = vaddq_s16(a[21], a[22]);
+ b[23] = vaddq_s16(a[20], a[23]);
+ b[24] = vaddq_s16(a[27], a[24]);
+ b[25] = vaddq_s16(a[26], a[25]);
+ b[26] = vsubq_s16(a[25], a[26]);
+ b[27] = vsubq_s16(a[24], a[27]);
+ b[28] = vsubq_s16(a[31], a[28]);
+ b[29] = vsubq_s16(a[30], a[29]);
+ b[30] = vaddq_s16(a[29], a[30]);
+ b[31] = vaddq_s16(a[28], a[31]);
+
+ // Stage 6.
+ a[0] = b[0];
+ a[1] = b[1];
+ a[2] = b[2];
+ a[3] = b[3];
+
+ butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]);
+ butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]);
+
+ a[8] = vaddq_s16(b[8], b[9]);
+ a[9] = vsubq_s16(b[8], b[9]);
+ a[10] = vsubq_s16(b[11], b[10]);
+ a[11] = vaddq_s16(b[11], b[10]);
+ a[12] = vaddq_s16(b[12], b[13]);
+ a[13] = vsubq_s16(b[12], b[13]);
+ a[14] = vsubq_s16(b[15], b[14]);
+ a[15] = vaddq_s16(b[15], b[14]);
+
+ a[16] = b[16];
+ a[19] = b[19];
+ a[20] = b[20];
+ a[23] = b[23];
+ a[24] = b[24];
+ a[27] = b[27];
+ a[28] = b[28];
+ a[31] = b[31];
+
+ butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]);
+ butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]);
+
+ butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]);
+ butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]);
+
+ // Stage 7.
+ b[0] = a[0];
+ b[1] = a[1];
+ b[2] = a[2];
+ b[3] = a[3];
+ b[4] = a[4];
+ b[5] = a[5];
+ b[6] = a[6];
+ b[7] = a[7];
+
+ butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]);
+ butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]);
+ butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]);
+ butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]);
+
+ b[16] = vaddq_s16(a[16], a[17]);
+ b[17] = vsubq_s16(a[16], a[17]);
+ b[18] = vsubq_s16(a[19], a[18]);
+ b[19] = vaddq_s16(a[19], a[18]);
+ b[20] = vaddq_s16(a[20], a[21]);
+ b[21] = vsubq_s16(a[20], a[21]);
+ b[22] = vsubq_s16(a[23], a[22]);
+ b[23] = vaddq_s16(a[23], a[22]);
+ b[24] = vaddq_s16(a[24], a[25]);
+ b[25] = vsubq_s16(a[24], a[25]);
+ b[26] = vsubq_s16(a[27], a[26]);
+ b[27] = vaddq_s16(a[27], a[26]);
+ b[28] = vaddq_s16(a[28], a[29]);
+ b[29] = vsubq_s16(a[28], a[29]);
+ b[30] = vsubq_s16(a[31], a[30]);
+ b[31] = vaddq_s16(a[31], a[30]);
+
+ // Final stage.
+ // Also compute partial rounding shift:
+ // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ out[0] = sub_round_shift(b[0]);
+ out[16] = sub_round_shift(b[1]);
+ out[8] = sub_round_shift(b[2]);
+ out[24] = sub_round_shift(b[3]);
+ out[4] = sub_round_shift(b[4]);
+ out[20] = sub_round_shift(b[5]);
+ out[12] = sub_round_shift(b[6]);
+ out[28] = sub_round_shift(b[7]);
+ out[2] = sub_round_shift(b[8]);
+ out[18] = sub_round_shift(b[9]);
+ out[10] = sub_round_shift(b[10]);
+ out[26] = sub_round_shift(b[11]);
+ out[6] = sub_round_shift(b[12]);
+ out[22] = sub_round_shift(b[13]);
+ out[14] = sub_round_shift(b[14]);
+ out[30] = sub_round_shift(b[15]);
+
+ butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]);
+ out[1] = sub_round_shift(a[1]);
+ out[31] = sub_round_shift(a[31]);
+
+ butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]);
+ out[17] = sub_round_shift(a[17]);
+ out[15] = sub_round_shift(a[15]);
+
+ butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]);
+ out[9] = sub_round_shift(a[9]);
+ out[23] = sub_round_shift(a[23]);
+
+ butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]);
+ out[25] = sub_round_shift(a[25]);
+ out[7] = sub_round_shift(a[7]);
+
+ butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]);
+ out[5] = sub_round_shift(a[5]);
+ out[27] = sub_round_shift(a[27]);
+
+ butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]);
+ out[21] = sub_round_shift(a[21]);
+ out[11] = sub_round_shift(a[11]);
+
+ butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]);
+ out[13] = sub_round_shift(a[13]);
+ out[19] = sub_round_shift(a[19]);
+
+ butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]);
+ out[29] = sub_round_shift(a[29]);
+ out[3] = sub_round_shift(a[3]);
+}
+
+#define PASS_THROUGH(src, dst, element) \
+ do { \
+ dst##_lo[element] = src##_lo[element]; \
+ dst##_hi[element] = src##_hi[element]; \
+ } while (0)
+
+#define ADD_S16_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = \
+ vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+ b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \
+ vget_high_s16(a[right_index])); \
+ } while (0)
+
+#define SUB_S16_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = \
+ vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+ b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \
+ vget_high_s16(a[right_index])); \
+ } while (0)
+
+#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \
+ do { \
+ c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \
+ c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
+ } while (0)
+
+#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
+ do { \
+ temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \
+ temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \
+ c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \
+ c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \
+ } while (0)
+
+#define ADD_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
+ b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
+ } while (0)
+
+#define SUB_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
+ b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
+ } while (0)
+
+// Like butterfly_one_coeff, but don't narrow results.
+static INLINE void butterfly_one_coeff_s16_s32(
+ const int16x8_t a, const int16x8_t b, const tran_high_t constant,
+ int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+ int32x4_t *sub_hi) {
+ const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
+ const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
+ const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
+ const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
+ const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
+ const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
+ *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+ *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+ *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+ *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \
+ add_index, sub_index) \
+ do { \
+ butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
+ &b##_lo[add_index], &b##_hi[add_index], \
+ &b##_lo[sub_index], &b##_hi[sub_index]); \
+ } while (0)
+
+// Like butterfly_one_coeff, but with s32.
+static INLINE void butterfly_one_coeff_s32(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo,
+ int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant);
+ const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant);
+ const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant);
+ const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant);
+ const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant);
+ const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant);
+ *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+ *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+ *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+ *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
+ sub_index) \
+ do { \
+ butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index], \
+ a##_lo[right_index], a##_hi[right_index], \
+ constant, &b##_lo[add_index], &b##_hi[add_index], \
+ &b##_lo[sub_index], &b##_hi[sub_index]); \
+ } while (0)
+
+// Like butterfly_two_coeff, but with s32.
+static INLINE void butterfly_two_coeff_s32(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const int32_t constant0, const int32_t constant1,
+ int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+ int32x4_t *sub_hi) {
+ const int32x4_t a0 = vmulq_n_s32(a_lo, constant0);
+ const int32x4_t a1 = vmulq_n_s32(a_hi, constant0);
+ const int32x4_t a2 = vmulq_n_s32(a_lo, constant1);
+ const int32x4_t a3 = vmulq_n_s32(a_hi, constant1);
+ const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0);
+ const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0);
+ const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1);
+ const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1);
+ *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+ *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+ *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+ *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \
+ right_constant, b, add_index, sub_index) \
+ do { \
+ butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \
+ a##_lo[right_index], a##_hi[right_index], \
+ left_constant, right_constant, &b##_lo[add_index], \
+ &b##_hi[add_index], &b##_lo[sub_index], \
+ &b##_hi[sub_index]); \
+ } while (0)
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo,
+ const int32x4_t a_hi) {
+ const int32x4_t one = vdupq_n_s32(1);
+ const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
+ const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
+ const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32);
+ const int16x4_t b_lo =
+ vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2);
+ const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi);
+ const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31);
+ const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32);
+ const int16x4_t b_hi =
+ vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2);
+ return vcombine_s16(b_lo, b_hi);
+}
+
+static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
+ int16x8_t a[32];
+ int16x8_t b[32];
+ int32x4_t c_lo[32];
+ int32x4_t c_hi[32];
+ int32x4_t d_lo[32];
+ int32x4_t d_hi[32];
+
+ // Stage 1. Done as part of the load for the first pass.
+ a[0] = vaddq_s16(in[0], in[31]);
+ a[1] = vaddq_s16(in[1], in[30]);
+ a[2] = vaddq_s16(in[2], in[29]);
+ a[3] = vaddq_s16(in[3], in[28]);
+ a[4] = vaddq_s16(in[4], in[27]);
+ a[5] = vaddq_s16(in[5], in[26]);
+ a[6] = vaddq_s16(in[6], in[25]);
+ a[7] = vaddq_s16(in[7], in[24]);
+ a[8] = vaddq_s16(in[8], in[23]);
+ a[9] = vaddq_s16(in[9], in[22]);
+ a[10] = vaddq_s16(in[10], in[21]);
+ a[11] = vaddq_s16(in[11], in[20]);
+ a[12] = vaddq_s16(in[12], in[19]);
+ a[13] = vaddq_s16(in[13], in[18]);
+ a[14] = vaddq_s16(in[14], in[17]);
+ a[15] = vaddq_s16(in[15], in[16]);
+ a[16] = vsubq_s16(in[15], in[16]);
+ a[17] = vsubq_s16(in[14], in[17]);
+ a[18] = vsubq_s16(in[13], in[18]);
+ a[19] = vsubq_s16(in[12], in[19]);
+ a[20] = vsubq_s16(in[11], in[20]);
+ a[21] = vsubq_s16(in[10], in[21]);
+ a[22] = vsubq_s16(in[9], in[22]);
+ a[23] = vsubq_s16(in[8], in[23]);
+ a[24] = vsubq_s16(in[7], in[24]);
+ a[25] = vsubq_s16(in[6], in[25]);
+ a[26] = vsubq_s16(in[5], in[26]);
+ a[27] = vsubq_s16(in[4], in[27]);
+ a[28] = vsubq_s16(in[3], in[28]);
+ a[29] = vsubq_s16(in[2], in[29]);
+ a[30] = vsubq_s16(in[1], in[30]);
+ a[31] = vsubq_s16(in[0], in[31]);
+
+ // Stage 2.
+ b[0] = vaddq_s16(a[0], a[15]);
+ b[1] = vaddq_s16(a[1], a[14]);
+ b[2] = vaddq_s16(a[2], a[13]);
+ b[3] = vaddq_s16(a[3], a[12]);
+ b[4] = vaddq_s16(a[4], a[11]);
+ b[5] = vaddq_s16(a[5], a[10]);
+ b[6] = vaddq_s16(a[6], a[9]);
+ b[7] = vaddq_s16(a[7], a[8]);
+
+ b[8] = vsubq_s16(a[7], a[8]);
+ b[9] = vsubq_s16(a[6], a[9]);
+ b[10] = vsubq_s16(a[5], a[10]);
+ b[11] = vsubq_s16(a[4], a[11]);
+ b[12] = vsubq_s16(a[3], a[12]);
+ b[13] = vsubq_s16(a[2], a[13]);
+ b[14] = vsubq_s16(a[1], a[14]);
+ b[15] = vsubq_s16(a[0], a[15]);
+
+ b[16] = a[16];
+ b[17] = a[17];
+ b[18] = a[18];
+ b[19] = a[19];
+
+ butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+ butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+ butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+ butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+
+ b[28] = a[28];
+ b[29] = a[29];
+ b[30] = a[30];
+ b[31] = a[31];
+
+  // Stage 3. With extreme input values, this calculation overflows int16_t.
+ // The sources for b[0] get added multiple times and, through testing, have
+ // been shown to overflow starting here.
+ ADD_S16_S32(b, 0, 7, c, 0);
+ ADD_S16_S32(b, 1, 6, c, 1);
+ ADD_S16_S32(b, 2, 5, c, 2);
+ ADD_S16_S32(b, 3, 4, c, 3);
+ SUB_S16_S32(b, 3, 4, c, 4);
+ SUB_S16_S32(b, 2, 5, c, 5);
+ SUB_S16_S32(b, 1, 6, c, 6);
+ SUB_S16_S32(b, 0, 7, c, 7);
+
+ a[8] = b[8];
+ a[9] = b[9];
+
+ BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
+ BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);
+
+ a[14] = b[14];
+ a[15] = b[15];
+
+ ADD_S16_S32(b, 16, 23, c, 16);
+ ADD_S16_S32(b, 17, 22, c, 17);
+ ADD_S16_S32(b, 18, 21, c, 18);
+ ADD_S16_S32(b, 19, 20, c, 19);
+ SUB_S16_S32(b, 19, 20, c, 20);
+ SUB_S16_S32(b, 18, 21, c, 21);
+ SUB_S16_S32(b, 17, 22, c, 22);
+ SUB_S16_S32(b, 16, 23, c, 23);
+ SUB_S16_S32(b, 31, 24, c, 24);
+ SUB_S16_S32(b, 30, 25, c, 25);
+ SUB_S16_S32(b, 29, 26, c, 26);
+ SUB_S16_S32(b, 28, 27, c, 27);
+ ADD_S16_S32(b, 28, 27, c, 28);
+ ADD_S16_S32(b, 29, 26, c, 29);
+ ADD_S16_S32(b, 30, 25, c, 30);
+ ADD_S16_S32(b, 31, 24, c, 31);
+
+ // Stage 4.
+ ADD_S32(c, 0, 3, d, 0);
+ ADD_S32(c, 1, 2, d, 1);
+ SUB_S32(c, 1, 2, d, 2);
+ SUB_S32(c, 0, 3, d, 3);
+
+ PASS_THROUGH(c, d, 4);
+
+ BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);
+
+ PASS_THROUGH(c, d, 7);
+
+ ADDW_S16_S32(c, 11, a, 8, d, 8);
+ ADDW_S16_S32(c, 10, a, 9, d, 9);
+ SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
+ SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
+ SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
+ SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
+ ADDW_S16_S32(c, 13, b, 14, d, 14);
+ ADDW_S16_S32(c, 12, b, 15, d, 15);
+
+ PASS_THROUGH(c, d, 16);
+ PASS_THROUGH(c, d, 17);
+
+ BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18);
+ BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19);
+ BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20);
+ BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21);
+
+ PASS_THROUGH(c, d, 22);
+ PASS_THROUGH(c, d, 23);
+ PASS_THROUGH(c, d, 24);
+ PASS_THROUGH(c, d, 25);
+
+ PASS_THROUGH(c, d, 30);
+ PASS_THROUGH(c, d, 31);
+
+ // Stage 5.
+ BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
+ BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3);
+
+ ADD_S32(d, 4, 5, c, 4);
+ SUB_S32(d, 4, 5, c, 5);
+ SUB_S32(d, 7, 6, c, 6);
+ ADD_S32(d, 7, 6, c, 7);
+
+ PASS_THROUGH(d, c, 8);
+
+ BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9);
+ BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10);
+
+ PASS_THROUGH(d, c, 11);
+ PASS_THROUGH(d, c, 12);
+ PASS_THROUGH(d, c, 15);
+
+ ADD_S32(d, 16, 19, c, 16);
+ ADD_S32(d, 17, 18, c, 17);
+ SUB_S32(d, 17, 18, c, 18);
+ SUB_S32(d, 16, 19, c, 19);
+ SUB_S32(d, 23, 20, c, 20);
+ SUB_S32(d, 22, 21, c, 21);
+ ADD_S32(d, 22, 21, c, 22);
+ ADD_S32(d, 23, 20, c, 23);
+ ADD_S32(d, 24, 27, c, 24);
+ ADD_S32(d, 25, 26, c, 25);
+ SUB_S32(d, 25, 26, c, 26);
+ SUB_S32(d, 24, 27, c, 27);
+ SUB_S32(d, 31, 28, c, 28);
+ SUB_S32(d, 30, 29, c, 29);
+ ADD_S32(d, 30, 29, c, 30);
+ ADD_S32(d, 31, 28, c, 31);
+
+ // Stage 6.
+ PASS_THROUGH(c, d, 0);
+ PASS_THROUGH(c, d, 1);
+ PASS_THROUGH(c, d, 2);
+ PASS_THROUGH(c, d, 3);
+
+ BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7);
+ BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6);
+
+ ADD_S32(c, 8, 9, d, 8);
+ SUB_S32(c, 8, 9, d, 9);
+ SUB_S32(c, 11, 10, d, 10);
+ ADD_S32(c, 11, 10, d, 11);
+ ADD_S32(c, 12, 13, d, 12);
+ SUB_S32(c, 12, 13, d, 13);
+ SUB_S32(c, 15, 14, d, 14);
+ ADD_S32(c, 15, 14, d, 15);
+
+ PASS_THROUGH(c, d, 16);
+ PASS_THROUGH(c, d, 19);
+ PASS_THROUGH(c, d, 20);
+ PASS_THROUGH(c, d, 23);
+ PASS_THROUGH(c, d, 24);
+ PASS_THROUGH(c, d, 27);
+ PASS_THROUGH(c, d, 28);
+ PASS_THROUGH(c, d, 31);
+
+ BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17);
+ BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18);
+ BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21);
+ BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22);
+
+ // Stage 7.
+ PASS_THROUGH(d, c, 0);
+ PASS_THROUGH(d, c, 1);
+ PASS_THROUGH(d, c, 2);
+ PASS_THROUGH(d, c, 3);
+ PASS_THROUGH(d, c, 4);
+ PASS_THROUGH(d, c, 5);
+ PASS_THROUGH(d, c, 6);
+ PASS_THROUGH(d, c, 7);
+
+ BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15);
+ BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14);
+ BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13);
+ BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12);
+
+ ADD_S32(d, 16, 17, c, 16);
+ SUB_S32(d, 16, 17, c, 17);
+ SUB_S32(d, 19, 18, c, 18);
+ ADD_S32(d, 19, 18, c, 19);
+ ADD_S32(d, 20, 21, c, 20);
+ SUB_S32(d, 20, 21, c, 21);
+ SUB_S32(d, 23, 22, c, 22);
+ ADD_S32(d, 23, 22, c, 23);
+ ADD_S32(d, 24, 25, c, 24);
+ SUB_S32(d, 24, 25, c, 25);
+ SUB_S32(d, 27, 26, c, 26);
+ ADD_S32(d, 27, 26, c, 27);
+ ADD_S32(d, 28, 29, c, 28);
+ SUB_S32(d, 28, 29, c, 29);
+ SUB_S32(d, 31, 30, c, 30);
+ ADD_S32(d, 31, 30, c, 31);
+
+ // Final stage.
+ // Roll rounding into this function so we can pass back int16x8.
+
+ out[0] = add_round_shift_s32(c_lo[0], c_hi[0]);
+ out[16] = add_round_shift_s32(c_lo[1], c_hi[1]);
+
+ out[8] = add_round_shift_s32(c_lo[2], c_hi[2]);
+ out[24] = add_round_shift_s32(c_lo[3], c_hi[3]);
+ out[4] = add_round_shift_s32(c_lo[4], c_hi[4]);
+ out[20] = add_round_shift_s32(c_lo[5], c_hi[5]);
+ out[12] = add_round_shift_s32(c_lo[6], c_hi[6]);
+
+ out[28] = add_round_shift_s32(c_lo[7], c_hi[7]);
+ out[2] = add_round_shift_s32(c_lo[8], c_hi[8]);
+ out[18] = add_round_shift_s32(c_lo[9], c_hi[9]);
+ out[10] = add_round_shift_s32(c_lo[10], c_hi[10]);
+
+ out[26] = add_round_shift_s32(c_lo[11], c_hi[11]);
+ out[6] = add_round_shift_s32(c_lo[12], c_hi[12]);
+ out[22] = add_round_shift_s32(c_lo[13], c_hi[13]);
+ out[14] = add_round_shift_s32(c_lo[14], c_hi[14]);
+ out[30] = add_round_shift_s32(c_lo[15], c_hi[15]);
+
+ BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31);
+ out[1] = add_round_shift_s32(d_lo[1], d_hi[1]);
+ out[31] = add_round_shift_s32(d_lo[31], d_hi[31]);
+
+ BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15);
+ out[17] = add_round_shift_s32(d_lo[17], d_hi[17]);
+ out[15] = add_round_shift_s32(d_lo[15], d_hi[15]);
+
+ BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23);
+ out[9] = add_round_shift_s32(d_lo[9], d_hi[9]);
+ out[23] = add_round_shift_s32(d_lo[23], d_hi[23]);
+
+ BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7);
+ out[25] = add_round_shift_s32(d_lo[25], d_hi[25]);
+ out[7] = add_round_shift_s32(d_lo[7], d_hi[7]);
+
+ BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27);
+ out[5] = add_round_shift_s32(d_lo[5], d_hi[5]);
+ out[27] = add_round_shift_s32(d_lo[27], d_hi[27]);
+
+ BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11);
+ out[21] = add_round_shift_s32(d_lo[21], d_hi[21]);
+ out[11] = add_round_shift_s32(d_lo[11], d_hi[11]);
+
+ BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19);
+ out[13] = add_round_shift_s32(d_lo[13], d_hi[13]);
+ out[19] = add_round_shift_s32(d_lo[19], d_hi[19]);
+
+ BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3);
+ out[29] = add_round_shift_s32(d_lo[29], d_hi[29]);
+ out[3] = add_round_shift_s32(d_lo[3], d_hi[3]);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
+ const int16x8_t one = vdupq_n_s16(1);
+ const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+ const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+ const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+ return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2);
+}
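
A scalar model of this trick may help (a sketch, not part of the patch; it
assumes arithmetic right shift, which is what vshrq_n_s16 performs):

static int16_t add_round_shift_s16_scalar(const int16_t a) {
  // Add 1, then add the sign bit, then shift: positive values round as
  // (a + 1) >> 2, negative values as (a + 2) >> 2.
  return (int16_t)((a + 1 + (a < 0 ? 1 : 0)) >> 2);
}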
+
+static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) {
+ int16x8_t a[32];
+ int16x8_t b[32];
+
+ // Stage 1. The first pass folds this into the load; here it is explicit.
+ a[0] = vaddq_s16(in[0], in[31]);
+ a[1] = vaddq_s16(in[1], in[30]);
+ a[2] = vaddq_s16(in[2], in[29]);
+ a[3] = vaddq_s16(in[3], in[28]);
+ a[4] = vaddq_s16(in[4], in[27]);
+ a[5] = vaddq_s16(in[5], in[26]);
+ a[6] = vaddq_s16(in[6], in[25]);
+ a[7] = vaddq_s16(in[7], in[24]);
+ a[8] = vaddq_s16(in[8], in[23]);
+ a[9] = vaddq_s16(in[9], in[22]);
+ a[10] = vaddq_s16(in[10], in[21]);
+ a[11] = vaddq_s16(in[11], in[20]);
+ a[12] = vaddq_s16(in[12], in[19]);
+ a[13] = vaddq_s16(in[13], in[18]);
+ a[14] = vaddq_s16(in[14], in[17]);
+ a[15] = vaddq_s16(in[15], in[16]);
+ a[16] = vsubq_s16(in[15], in[16]);
+ a[17] = vsubq_s16(in[14], in[17]);
+ a[18] = vsubq_s16(in[13], in[18]);
+ a[19] = vsubq_s16(in[12], in[19]);
+ a[20] = vsubq_s16(in[11], in[20]);
+ a[21] = vsubq_s16(in[10], in[21]);
+ a[22] = vsubq_s16(in[9], in[22]);
+ a[23] = vsubq_s16(in[8], in[23]);
+ a[24] = vsubq_s16(in[7], in[24]);
+ a[25] = vsubq_s16(in[6], in[25]);
+ a[26] = vsubq_s16(in[5], in[26]);
+ a[27] = vsubq_s16(in[4], in[27]);
+ a[28] = vsubq_s16(in[3], in[28]);
+ a[29] = vsubq_s16(in[2], in[29]);
+ a[30] = vsubq_s16(in[1], in[30]);
+ a[31] = vsubq_s16(in[0], in[31]);
+
+ // Stage 2.
+ // For the "rd" version, everything is rounded down after stage 2 so the
+ // values stay within 16 bits.
+ b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
+ b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
+ b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
+ b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
+ b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
+ b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
+ b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
+ b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));
+
+ b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
+ b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
+ b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
+ b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
+ b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
+ b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
+ b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
+ b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));
+
+ b[16] = add_round_shift_s16(a[16]);
+ b[17] = add_round_shift_s16(a[17]);
+ b[18] = add_round_shift_s16(a[18]);
+ b[19] = add_round_shift_s16(a[19]);
+
+ butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+ butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+ butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+ butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+ b[20] = add_round_shift_s16(b[20]);
+ b[21] = add_round_shift_s16(b[21]);
+ b[22] = add_round_shift_s16(b[22]);
+ b[23] = add_round_shift_s16(b[23]);
+ b[24] = add_round_shift_s16(b[24]);
+ b[25] = add_round_shift_s16(b[25]);
+ b[26] = add_round_shift_s16(b[26]);
+ b[27] = add_round_shift_s16(b[27]);
+
+ b[28] = add_round_shift_s16(a[28]);
+ b[29] = add_round_shift_s16(a[29]);
+ b[30] = add_round_shift_s16(a[30]);
+ b[31] = add_round_shift_s16(a[31]);
+
+ // Stage 3.
+ a[0] = vaddq_s16(b[0], b[7]);
+ a[1] = vaddq_s16(b[1], b[6]);
+ a[2] = vaddq_s16(b[2], b[5]);
+ a[3] = vaddq_s16(b[3], b[4]);
+
+ a[4] = vsubq_s16(b[3], b[4]);
+ a[5] = vsubq_s16(b[2], b[5]);
+ a[6] = vsubq_s16(b[1], b[6]);
+ a[7] = vsubq_s16(b[0], b[7]);
+
+ a[8] = b[8];
+ a[9] = b[9];
+
+ butterfly_one_coeff(b[13], b[10], cospi_16_64, &a[13], &a[10]);
+ butterfly_one_coeff(b[12], b[11], cospi_16_64, &a[12], &a[11]);
+
+ a[14] = b[14];
+ a[15] = b[15];
+
+ a[16] = vaddq_s16(b[16], b[23]);
+ a[17] = vaddq_s16(b[17], b[22]);
+ a[18] = vaddq_s16(b[18], b[21]);
+ a[19] = vaddq_s16(b[19], b[20]);
+
+ a[20] = vsubq_s16(b[19], b[20]);
+ a[21] = vsubq_s16(b[18], b[21]);
+ a[22] = vsubq_s16(b[17], b[22]);
+ a[23] = vsubq_s16(b[16], b[23]);
+
+ a[24] = vsubq_s16(b[31], b[24]);
+ a[25] = vsubq_s16(b[30], b[25]);
+ a[26] = vsubq_s16(b[29], b[26]);
+ a[27] = vsubq_s16(b[28], b[27]);
+
+ a[28] = vaddq_s16(b[28], b[27]);
+ a[29] = vaddq_s16(b[29], b[26]);
+ a[30] = vaddq_s16(b[30], b[25]);
+ a[31] = vaddq_s16(b[31], b[24]);
+
+ // Stage 4.
+ b[0] = vaddq_s16(a[0], a[3]);
+ b[1] = vaddq_s16(a[1], a[2]);
+ b[2] = vsubq_s16(a[1], a[2]);
+ b[3] = vsubq_s16(a[0], a[3]);
+
+ b[4] = a[4];
+
+ butterfly_one_coeff(a[6], a[5], cospi_16_64, &b[6], &b[5]);
+
+ b[7] = a[7];
+
+ b[8] = vaddq_s16(a[8], a[11]);
+ b[9] = vaddq_s16(a[9], a[10]);
+ b[10] = vsubq_s16(a[9], a[10]);
+ b[11] = vsubq_s16(a[8], a[11]);
+ b[12] = vsubq_s16(a[15], a[12]);
+ b[13] = vsubq_s16(a[14], a[13]);
+ b[14] = vaddq_s16(a[14], a[13]);
+ b[15] = vaddq_s16(a[15], a[12]);
+
+ b[16] = a[16];
+ b[17] = a[17];
+
+ butterfly_two_coeff(a[29], a[18], cospi_24_64, cospi_8_64, &b[29], &b[18]);
+ butterfly_two_coeff(a[28], a[19], cospi_24_64, cospi_8_64, &b[28], &b[19]);
+ butterfly_two_coeff(a[27], a[20], -cospi_8_64, cospi_24_64, &b[27], &b[20]);
+ butterfly_two_coeff(a[26], a[21], -cospi_8_64, cospi_24_64, &b[26], &b[21]);
+
+ b[22] = a[22];
+ b[23] = a[23];
+ b[24] = a[24];
+ b[25] = a[25];
+
+ b[30] = a[30];
+ b[31] = a[31];
+
+ // Stage 5.
+ butterfly_one_coeff(b[0], b[1], cospi_16_64, &a[0], &a[1]);
+ butterfly_two_coeff(b[3], b[2], cospi_24_64, cospi_8_64, &a[2], &a[3]);
+
+ a[4] = vaddq_s16(b[4], b[5]);
+ a[5] = vsubq_s16(b[4], b[5]);
+ a[6] = vsubq_s16(b[7], b[6]);
+ a[7] = vaddq_s16(b[7], b[6]);
+
+ a[8] = b[8];
+
+ butterfly_two_coeff(b[14], b[9], cospi_24_64, cospi_8_64, &a[14], &a[9]);
+ butterfly_two_coeff(b[13], b[10], -cospi_8_64, cospi_24_64, &a[13], &a[10]);
+
+ a[11] = b[11];
+ a[12] = b[12];
+
+ a[15] = b[15];
+
+ a[16] = vaddq_s16(b[19], b[16]);
+ a[17] = vaddq_s16(b[18], b[17]);
+ a[18] = vsubq_s16(b[17], b[18]);
+ a[19] = vsubq_s16(b[16], b[19]);
+ a[20] = vsubq_s16(b[23], b[20]);
+ a[21] = vsubq_s16(b[22], b[21]);
+ a[22] = vaddq_s16(b[21], b[22]);
+ a[23] = vaddq_s16(b[20], b[23]);
+ a[24] = vaddq_s16(b[27], b[24]);
+ a[25] = vaddq_s16(b[26], b[25]);
+ a[26] = vsubq_s16(b[25], b[26]);
+ a[27] = vsubq_s16(b[24], b[27]);
+ a[28] = vsubq_s16(b[31], b[28]);
+ a[29] = vsubq_s16(b[30], b[29]);
+ a[30] = vaddq_s16(b[29], b[30]);
+ a[31] = vaddq_s16(b[28], b[31]);
+
+ // Stage 6.
+ b[0] = a[0];
+ b[1] = a[1];
+ b[2] = a[2];
+ b[3] = a[3];
+
+ butterfly_two_coeff(a[7], a[4], cospi_28_64, cospi_4_64, &b[4], &b[7]);
+ butterfly_two_coeff(a[6], a[5], cospi_12_64, cospi_20_64, &b[5], &b[6]);
+
+ b[8] = vaddq_s16(a[8], a[9]);
+ b[9] = vsubq_s16(a[8], a[9]);
+ b[10] = vsubq_s16(a[11], a[10]);
+ b[11] = vaddq_s16(a[11], a[10]);
+ b[12] = vaddq_s16(a[12], a[13]);
+ b[13] = vsubq_s16(a[12], a[13]);
+ b[14] = vsubq_s16(a[15], a[14]);
+ b[15] = vaddq_s16(a[15], a[14]);
+
+ b[16] = a[16];
+ b[19] = a[19];
+ b[20] = a[20];
+ b[23] = a[23];
+ b[24] = a[24];
+ b[27] = a[27];
+ b[28] = a[28];
+ b[31] = a[31];
+
+ butterfly_two_coeff(a[30], a[17], cospi_28_64, cospi_4_64, &b[30], &b[17]);
+ butterfly_two_coeff(a[29], a[18], -cospi_4_64, cospi_28_64, &b[29], &b[18]);
+
+ butterfly_two_coeff(a[26], a[21], cospi_12_64, cospi_20_64, &b[26], &b[21]);
+ butterfly_two_coeff(a[25], a[22], -cospi_20_64, cospi_12_64, &b[25], &b[22]);
+
+ // Stage 7.
+ a[0] = b[0];
+ a[1] = b[1];
+ a[2] = b[2];
+ a[3] = b[3];
+ a[4] = b[4];
+ a[5] = b[5];
+ a[6] = b[6];
+ a[7] = b[7];
+
+ butterfly_two_coeff(b[15], b[8], cospi_30_64, cospi_2_64, &a[8], &a[15]);
+ butterfly_two_coeff(b[14], b[9], cospi_14_64, cospi_18_64, &a[9], &a[14]);
+ butterfly_two_coeff(b[13], b[10], cospi_22_64, cospi_10_64, &a[10], &a[13]);
+ butterfly_two_coeff(b[12], b[11], cospi_6_64, cospi_26_64, &a[11], &a[12]);
+
+ a[16] = vaddq_s16(b[16], b[17]);
+ a[17] = vsubq_s16(b[16], b[17]);
+ a[18] = vsubq_s16(b[19], b[18]);
+ a[19] = vaddq_s16(b[19], b[18]);
+ a[20] = vaddq_s16(b[20], b[21]);
+ a[21] = vsubq_s16(b[20], b[21]);
+ a[22] = vsubq_s16(b[23], b[22]);
+ a[23] = vaddq_s16(b[23], b[22]);
+ a[24] = vaddq_s16(b[24], b[25]);
+ a[25] = vsubq_s16(b[24], b[25]);
+ a[26] = vsubq_s16(b[27], b[26]);
+ a[27] = vaddq_s16(b[27], b[26]);
+ a[28] = vaddq_s16(b[28], b[29]);
+ a[29] = vsubq_s16(b[28], b[29]);
+ a[30] = vsubq_s16(b[31], b[30]);
+ a[31] = vaddq_s16(b[31], b[30]);
+
+ // Final stage.
+ out[0] = a[0];
+ out[16] = a[1];
+ out[8] = a[2];
+ out[24] = a[3];
+ out[4] = a[4];
+ out[20] = a[5];
+ out[12] = a[6];
+ out[28] = a[7];
+ out[2] = a[8];
+ out[18] = a[9];
+ out[10] = a[10];
+ out[26] = a[11];
+ out[6] = a[12];
+ out[22] = a[13];
+ out[14] = a[14];
+ out[30] = a[15];
+
+ butterfly_two_coeff(a[31], a[16], cospi_31_64, cospi_1_64, &out[1], &out[31]);
+ butterfly_two_coeff(a[30], a[17], cospi_15_64, cospi_17_64, &out[17],
+ &out[15]);
+ butterfly_two_coeff(a[29], a[18], cospi_23_64, cospi_9_64, &out[9], &out[23]);
+ butterfly_two_coeff(a[28], a[19], cospi_7_64, cospi_25_64, &out[25], &out[7]);
+ butterfly_two_coeff(a[27], a[20], cospi_27_64, cospi_5_64, &out[5], &out[27]);
+ butterfly_two_coeff(a[26], a[21], cospi_11_64, cospi_21_64, &out[21],
+ &out[11]);
+ butterfly_two_coeff(a[25], a[22], cospi_19_64, cospi_13_64, &out[13],
+ &out[19]);
+ butterfly_two_coeff(a[24], a[23], cospi_3_64, cospi_29_64, &out[29], &out[3]);
+}
+
+#undef PASS_THROUGH
+#undef ADD_S16_S32
+#undef SUB_S16_S32
+#undef ADDW_S16_S32
+#undef SUBW_S16_S32
+#undef ADD_S32
+#undef SUB_S32
+#undef BUTTERFLY_ONE_S16_S32
+#undef BUTTERFLY_ONE_S32
+#undef BUTTERFLY_TWO_S32
+
+// Transpose 8x8 to a new location. Don't use transpose_neon.h because its
+// transposes are all in-place.
+// TODO(johannkoenig): share with other fdcts.
+static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) {
+ // Swap 16 bit elements.
+ const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
+ const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
+ const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
+ const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
+
+ // Swap 32 bit elements.
+ const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
+ vreinterpretq_s32_s16(c1.val[0]));
+ const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
+ vreinterpretq_s32_s16(c1.val[1]));
+ const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
+ vreinterpretq_s32_s16(c3.val[0]));
+ const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
+ vreinterpretq_s32_s16(c3.val[1]));
+
+ // Swap 64 bit elements
+ const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
+ const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
+ const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
+ const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
+
+ b[0] = e0.val[0];
+ b[1] = e1.val[0];
+ b[2] = e2.val[0];
+ b[3] = e3.val[0];
+ b[4] = e0.val[1];
+ b[5] = e1.val[1];
+ b[6] = e2.val[1];
+ b[7] = e3.val[1];
+}
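
For reference, the vtrn cascade above is equivalent to this scalar
out-of-place transpose (a sketch with illustrative names, treating each
int16x8_t as eight int16_t lanes):

static void transpose_8x8_scalar(const int16_t a[8][8], int16_t b[8][8]) {
  int r, c;
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) b[r][c] = a[c][r];  // write to a new location
  }
}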
+
+void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
+ int16x8_t temp0[32];
+ int16x8_t temp1[32];
+ int16x8_t temp2[32];
+ int16x8_t temp3[32];
+ int16x8_t temp4[32];
+ int16x8_t temp5[32];
+
+ // Process in 8x32 columns.
+ load(input, stride, temp0);
+ dct_body_first_pass(temp0, temp1);
+
+ load(input + 8, stride, temp0);
+ dct_body_first_pass(temp0, temp2);
+
+ load(input + 16, stride, temp0);
+ dct_body_first_pass(temp0, temp3);
+
+ load(input + 24, stride, temp0);
+ dct_body_first_pass(temp0, temp4);
+
+ // Generate the top row by transposing together the first set of 8 from
+ // each column block.
+ transpose_8x8(&temp1[0], &temp0[0]);
+ transpose_8x8(&temp2[0], &temp0[8]);
+ transpose_8x8(&temp3[0], &temp0[16]);
+ transpose_8x8(&temp4[0], &temp0[24]);
+
+ dct_body_second_pass(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output, temp5);
+
+ // Second row of 8x32.
+ transpose_8x8(&temp1[8], &temp0[0]);
+ transpose_8x8(&temp2[8], &temp0[8]);
+ transpose_8x8(&temp3[8], &temp0[16]);
+ transpose_8x8(&temp4[8], &temp0[24]);
+
+ dct_body_second_pass(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output + 8 * 32, temp5);
+
+ // Third row of 8x32.
+ transpose_8x8(&temp1[16], &temp0[0]);
+ transpose_8x8(&temp2[16], &temp0[8]);
+ transpose_8x8(&temp3[16], &temp0[16]);
+ transpose_8x8(&temp4[16], &temp0[24]);
+
+ dct_body_second_pass(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output + 16 * 32, temp5);
+
+ // Final row of 8x32.
+ transpose_8x8(&temp1[24], &temp0[0]);
+ transpose_8x8(&temp2[24], &temp0[8]);
+ transpose_8x8(&temp3[24], &temp0[16]);
+ transpose_8x8(&temp4[24], &temp0[24]);
+
+ dct_body_second_pass(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output + 24 * 32, temp5);
+}
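
Minimal usage sketch (buffer names are illustrative): the function reads a
32x32 block of residuals at the given stride and writes 32x32 coefficients
in row-major order.

  int16_t residual[32 * 32];   // residual block, filled by the caller
  tran_low_t coeff[32 * 32];   // transform coefficients
  vpx_fdct32x32_neon(residual, coeff, 32 /* stride, in int16_t units */);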
+
+void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int16x8_t temp0[32];
+ int16x8_t temp1[32];
+ int16x8_t temp2[32];
+ int16x8_t temp3[32];
+ int16x8_t temp4[32];
+ int16x8_t temp5[32];
+
+ // Process in 8x32 columns.
+ load(input, stride, temp0);
+ dct_body_first_pass(temp0, temp1);
+
+ load(input + 8, stride, temp0);
+ dct_body_first_pass(temp0, temp2);
+
+ load(input + 16, stride, temp0);
+ dct_body_first_pass(temp0, temp3);
+
+ load(input + 24, stride, temp0);
+ dct_body_first_pass(temp0, temp4);
+
+ // Generate the top row by transposing together the first set of 8 from
+ // each column block.
+ transpose_8x8(&temp1[0], &temp0[0]);
+ transpose_8x8(&temp2[0], &temp0[8]);
+ transpose_8x8(&temp3[0], &temp0[16]);
+ transpose_8x8(&temp4[0], &temp0[24]);
+
+ dct_body_second_pass_rd(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output, temp5);
+
+ // Second row of 8x32.
+ transpose_8x8(&temp1[8], &temp0[0]);
+ transpose_8x8(&temp2[8], &temp0[8]);
+ transpose_8x8(&temp3[8], &temp0[16]);
+ transpose_8x8(&temp4[8], &temp0[24]);
+
+ dct_body_second_pass_rd(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output + 8 * 32, temp5);
+
+ // Third row of 8x32.
+ transpose_8x8(&temp1[16], &temp0[0]);
+ transpose_8x8(&temp2[16], &temp0[8]);
+ transpose_8x8(&temp3[16], &temp0[16]);
+ transpose_8x8(&temp4[16], &temp0[24]);
+
+ dct_body_second_pass_rd(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output + 16 * 32, temp5);
+
+ // Final row of 8x32.
+ transpose_8x8(&temp1[24], &temp0[0]);
+ transpose_8x8(&temp2[24], &temp0[8]);
+ transpose_8x8(&temp3[24], &temp0[16]);
+ transpose_8x8(&temp4[24], &temp0[24]);
+
+ dct_body_second_pass_rd(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output + 24 * 32, temp5);
+}
+#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
+ // __GNUC__ == 4 && __GNUC_MINOR__ <= 9
diff --git a/libvpx/vpx_dsp/arm/fdct_neon.c b/libvpx/vpx_dsp/arm/fdct_neon.c
index fe78f3f51..04646ed2e 100644
--- a/libvpx/vpx_dsp/arm/fdct_neon.c
+++ b/libvpx/vpx_dsp/arm/fdct_neon.c
@@ -50,8 +50,8 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
// Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
- const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, (int16_t)cospi_16_64);
- const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, (int16_t)cospi_16_64);
+ const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
+ const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
// fdct_round_shift
int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
@@ -59,13 +59,11 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
// s_3 * cospi_8_64 + s_2 * cospi_24_64
// s_3 * cospi_24_64 - s_2 * cospi_8_64
- const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, (int16_t)cospi_8_64);
- const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, (int16_t)cospi_24_64);
+ const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
+ const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
- const int32x4_t temp3 =
- vmlal_n_s16(s_3_cospi_8_64, s_2, (int16_t)cospi_24_64);
- const int32x4_t temp4 =
- vmlsl_n_s16(s_3_cospi_24_64, s_2, (int16_t)cospi_8_64);
+ const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
+ const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
// fdct_round_shift
int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
diff --git a/libvpx/vpx_dsp/arm/fdct_partial_neon.c b/libvpx/vpx_dsp/arm/fdct_partial_neon.c
new file mode 100644
index 000000000..e73de41d7
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/fdct_partial_neon.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE tran_low_t get_lane(const int32x2_t a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ return vget_lane_s32(a, 0);
+#else
+ return vget_lane_s16(vreinterpret_s16_s32(a), 0);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
+void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) {
+ int16x4_t a0, a1, a2, a3;
+ int16x8_t b0, b1;
+ int16x8_t c;
+ int32x2_t d;
+
+ a0 = vld1_s16(input);
+ input += stride;
+ a1 = vld1_s16(input);
+ input += stride;
+ a2 = vld1_s16(input);
+ input += stride;
+ a3 = vld1_s16(input);
+
+ b0 = vcombine_s16(a0, a1);
+ b1 = vcombine_s16(a2, a3);
+
+ c = vaddq_s16(b0, b1);
+
+ d = horizontal_add_int16x8(c);
+
+ output[0] = get_lane(vshl_n_s32(d, 1));
+ output[1] = 0;
+}
+
+void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) {
+ int r;
+ int16x8_t sum = vld1q_s16(&input[0]);
+
+ for (r = 1; r < 8; ++r) {
+ const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
+ sum = vaddq_s16(sum, input_00);
+ }
+
+ output[0] = get_lane(horizontal_add_int16x8(sum));
+ output[1] = 0;
+}
+
+void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int r;
+ int16x8_t left = vld1q_s16(input);
+ int16x8_t right = vld1q_s16(input + 8);
+ int32x2_t sum;
+ input += stride;
+
+ for (r = 1; r < 16; ++r) {
+ const int16x8_t a = vld1q_s16(input);
+ const int16x8_t b = vld1q_s16(input + 8);
+ input += stride;
+ left = vaddq_s16(left, a);
+ right = vaddq_s16(right, b);
+ }
+
+ sum = vadd_s32(horizontal_add_int16x8(left), horizontal_add_int16x8(right));
+
+ output[0] = get_lane(vshr_n_s32(sum, 1));
+ output[1] = 0;
+}
+
+void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int r;
+ int16x8_t a0 = vld1q_s16(input);
+ int16x8_t a1 = vld1q_s16(input + 8);
+ int16x8_t a2 = vld1q_s16(input + 16);
+ int16x8_t a3 = vld1q_s16(input + 24);
+ int32x2_t sum;
+ input += stride;
+
+ for (r = 1; r < 32; ++r) {
+ const int16x8_t b0 = vld1q_s16(input);
+ const int16x8_t b1 = vld1q_s16(input + 8);
+ const int16x8_t b2 = vld1q_s16(input + 16);
+ const int16x8_t b3 = vld1q_s16(input + 24);
+ input += stride;
+ a0 = vaddq_s16(a0, b0);
+ a1 = vaddq_s16(a1, b1);
+ a2 = vaddq_s16(a2, b2);
+ a3 = vaddq_s16(a3, b3);
+ }
+
+ sum = vadd_s32(horizontal_add_int16x8(a0), horizontal_add_int16x8(a1));
+ sum = vadd_s32(sum, horizontal_add_int16x8(a2));
+ sum = vadd_s32(sum, horizontal_add_int16x8(a3));
+ output[0] = get_lane(vshr_n_s32(sum, 3));
+ output[1] = 0;
+}
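
All four _1 kernels above compute only the DC coefficient: sum every input
sample, then apply a size-dependent scale (read off the code: 4x4 shifts
left by 1, 8x8 applies no shift, 16x16 shifts right by 1, 32x32 by 3). A
scalar model, as a sketch:

static tran_low_t fdct_dc_scalar(const int16_t *input, int stride, int size) {
  int r, c;
  int32_t sum = 0;  // 32 * 32 * 32767 still fits comfortably in 32 bits
  for (r = 0; r < size; ++r)
    for (c = 0; c < size; ++c) sum += input[r * stride + c];
  if (size == 4) return (tran_low_t)(sum << 1);
  if (size == 8) return (tran_low_t)sum;
  if (size == 16) return (tran_low_t)(sum >> 1);
  return (tran_low_t)(sum >> 3);  // size == 32
}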
diff --git a/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
index c449b4660..8049277b1 100644
--- a/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
+++ b/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
@@ -48,18 +48,18 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
- int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
- int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
- int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
- int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
- v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
- v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
- v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
- v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
- v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64);
- v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64);
- v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64);
- v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64);
+ int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64);
+ int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64);
+ int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64);
+ int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64);
+ v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64);
+ v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64);
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64);
+ v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
+ v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
+ v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
+ v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
{
const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
@@ -77,10 +77,10 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
// Stage 2
v_x0 = vsubq_s16(v_s6, v_s5);
v_x1 = vaddq_s16(v_s6, v_s5);
- v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
- v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
- v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
- v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
+ v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64);
+ v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64);
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64);
{
const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
@@ -95,22 +95,22 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
v_x3 = vaddq_s16(v_s7, cd);
}
// Stage 4
- v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
- v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
- v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
- v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
- v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
- v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
- v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
- v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
- v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
- v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
- v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
- v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
- v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
- v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
- v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
- v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
+ v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64);
+ v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64);
+ v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64);
+ v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64);
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64);
+ v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64);
+ v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64);
+ v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64);
+ v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64);
+ v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64);
+ v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64);
+ v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64);
+ v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64);
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64);
{
const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
@@ -207,24 +207,3 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
store_s16q_to_tran_low(final_output + 7 * 8, input_7);
}
}
-
-void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) {
- int r;
- int16x8_t sum = vld1q_s16(&input[0]);
- for (r = 1; r < 8; ++r) {
- const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
- sum = vaddq_s16(sum, input_00);
- }
- {
- const int32x4_t a = vpaddlq_s16(sum);
- const int64x2_t b = vpaddlq_s32(a);
- const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
- vreinterpret_s32_s64(vget_high_s64(b)));
-#if CONFIG_VP9_HIGHBITDEPTH
- output[0] = vget_lane_s32(c, 0);
-#else
- output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);
-#endif
- output[1] = 0;
- }
-}
diff --git a/libvpx/vpx_dsp/arm/hadamard_neon.c b/libvpx/vpx_dsp/arm/hadamard_neon.c
index 79bedd848..523a63c6f 100644
--- a/libvpx/vpx_dsp/arm/hadamard_neon.c
+++ b/libvpx/vpx_dsp/arm/hadamard_neon.c
@@ -47,7 +47,7 @@ static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
*a7 = vaddq_s16(c1, c5);
}
-void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
+void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
tran_low_t *coeff) {
int16x8_t a0 = vld1q_s16(src_diff);
int16x8_t a1 = vld1q_s16(src_diff + src_stride);
@@ -76,7 +76,7 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
store_s16q_to_tran_low(coeff + 56, a7);
}
-void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
+void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
tran_low_t *coeff) {
int i;
diff --git a/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c
index 98e42cd25..5358839b5 100644
--- a/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c
+++ b/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c
@@ -1410,10 +1410,10 @@ static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest,
void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- const tran_low_t out0 =
- HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
- const tran_low_t out1 =
- HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
+ const tran_low_t out0 = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+ const tran_low_t out1 = HIGHBD_WRAPLOW(
+ dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
const int16x8_t dc = vdupq_n_s16(a1);
int i;
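
The widening cast matters because cospi_16_64 is 11585 (2^14 * cos(pi/4),
rounded) and high-bit-depth coefficients are wide: a DC value near 2^17
already gives a product of roughly 1.5e9, close to INT32_MAX, and larger
inputs overflow. Promoting one operand to tran_high_t (64-bit in
high-bit-depth builds) keeps the multiply well defined before
dct_const_round_shift rounds it back down; the 32x32, 4x4 and 8x8 variants
below receive the same treatment.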
diff --git a/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c b/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c
index 63eb49678..c1354c0c1 100644
--- a/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c
+++ b/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c
@@ -61,10 +61,10 @@ static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest,
void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- const tran_low_t out0 =
- HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
- const tran_low_t out1 =
- HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
+ const tran_low_t out0 = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+ const tran_low_t out1 = HIGHBD_WRAPLOW(
+ dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
const int16x8_t dc = vdupq_n_s16(a1);
int i;
diff --git a/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c
index 20b09f683..1418a75a1 100644
--- a/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c
+++ b/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c
@@ -54,10 +54,10 @@ static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest,
void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
- const tran_low_t out0 =
- HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
- const tran_low_t out1 =
- HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
+ const tran_low_t out0 = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+ const tran_low_t out1 = HIGHBD_WRAPLOW(
+ dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
const int16x8_t dc = vdupq_n_s16(a1);
diff --git a/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c
index 6687e7649..dd90134a6 100644
--- a/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c
+++ b/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c
@@ -38,10 +38,10 @@ static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest,
void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- const tran_low_t out0 =
- HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
- const tran_low_t out1 =
- HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
+ const tran_low_t out0 = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+ const tran_low_t out1 = HIGHBD_WRAPLOW(
+ dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);
const int16x8_t dc = vdupq_n_s16(a1);
diff --git a/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
index 74345e1fa..c46c01631 100644
--- a/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
+++ b/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
@@ -17,8 +17,9 @@
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_ports/mem.h"
-static INLINE void load_4x4(const int16_t *s, ptrdiff_t p, int16x4_t *s0,
- int16x4_t *s1, int16x4_t *s2, int16x4_t *s3) {
+static INLINE void load_4x4(const int16_t *s, const ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3) {
*s0 = vld1_s16(s);
s += p;
*s1 = vld1_s16(s);
@@ -28,8 +29,9 @@ static INLINE void load_4x4(const int16_t *s, ptrdiff_t p, int16x4_t *s0,
*s3 = vld1_s16(s);
}
-static INLINE void load_8x4(const uint16_t *s, ptrdiff_t p, uint16x8_t *s0,
- uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3) {
+static INLINE void load_8x4(const uint16_t *s, const ptrdiff_t p,
+ uint16x8_t *const s0, uint16x8_t *const s1,
+ uint16x8_t *const s2, uint16x8_t *const s3) {
*s0 = vld1q_u16(s);
s += p;
*s1 = vld1q_u16(s);
@@ -39,10 +41,11 @@ static INLINE void load_8x4(const uint16_t *s, ptrdiff_t p, uint16x8_t *s0,
*s3 = vld1q_u16(s);
}
-static INLINE void load_8x8(const int16_t *s, ptrdiff_t p, int16x8_t *s0,
- int16x8_t *s1, int16x8_t *s2, int16x8_t *s3,
- int16x8_t *s4, int16x8_t *s5, int16x8_t *s6,
- int16x8_t *s7) {
+static INLINE void load_8x8(const int16_t *s, const ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4, int16x8_t *const s5,
+ int16x8_t *const s6, int16x8_t *const s7) {
*s0 = vld1q_s16(s);
s += p;
*s1 = vld1q_s16(s);
@@ -60,11 +63,11 @@ static INLINE void load_8x8(const int16_t *s, ptrdiff_t p, int16x8_t *s0,
*s7 = vld1q_s16(s);
}
-static INLINE void store_8x8(uint16_t *s, ptrdiff_t p, const uint16x8_t s0,
- const uint16x8_t s1, const uint16x8_t s2,
- const uint16x8_t s3, const uint16x8_t s4,
- const uint16x8_t s5, const uint16x8_t s6,
- const uint16x8_t s7) {
+static INLINE void store_8x8(uint16_t *s, const ptrdiff_t p,
+ const uint16x8_t s0, const uint16x8_t s1,
+ const uint16x8_t s2, const uint16x8_t s3,
+ const uint16x8_t s4, const uint16x8_t s5,
+ const uint16x8_t s6, const uint16x8_t s7) {
vst1q_u16(s, s0);
s += p;
vst1q_u16(s, s1);
@@ -82,16 +85,15 @@ static INLINE void store_8x8(uint16_t *s, ptrdiff_t p, const uint16x8_t s0,
vst1q_u16(s, s7);
}
-static INLINE int32x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
- const int16x4_t s2, const int16x4_t s3,
- const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7,
- const int16x8_t filters) {
+static INLINE int32x4_t highbd_convolve8_4(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t filters) {
const int16x4_t filters_lo = vget_low_s16(filters);
const int16x4_t filters_hi = vget_high_s16(filters);
- int32x4_t sum = vdupq_n_s32(0);
+ int32x4_t sum;
- sum = vmlal_lane_s16(sum, s0, filters_lo, 0);
+ sum = vmull_lane_s16(s0, filters_lo, 0);
sum = vmlal_lane_s16(sum, s1, filters_lo, 1);
sum = vmlal_lane_s16(sum, s2, filters_lo, 2);
sum = vmlal_lane_s16(sum, s3, filters_lo, 3);
@@ -102,19 +104,17 @@ static INLINE int32x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
return sum;
}
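
Beyond the rename, note the accumulator change in these helpers: the first
tap now uses a plain widening multiply instead of a multiply-accumulate into
a zeroed vector, dropping the vdupq_n_s32(0) setup (the same pattern recurs
in highbd_convolve8_8 below). The equivalence, as a sketch:

#include <arm_neon.h>

// vmull_lane_s16(s0, taps, 0) produces exactly what
// vmlal_lane_s16(vdupq_n_s32(0), s0, taps, 0) would: s0 * taps[0],
// widened to 32 bits, with no accumulator to initialize first.
static inline int32x4_t first_tap(const int16x4_t s0, const int16x4_t taps) {
  return vmull_lane_s16(s0, taps, 0);
}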
-static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
- const int16x8_t s2, const int16x8_t s3,
- const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7,
- const int16x8_t filters,
- const uint16x8_t max) {
+static INLINE uint16x8_t
+highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t filters, const uint16x8_t max) {
const int16x4_t filters_lo = vget_low_s16(filters);
const int16x4_t filters_hi = vget_high_s16(filters);
- int32x4_t sum0 = vdupq_n_s32(0);
- int32x4_t sum1 = vdupq_n_s32(0);
+ int32x4_t sum0, sum1;
uint16x8_t d;
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filters_lo, 0);
+ sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3);
@@ -122,7 +122,7 @@ static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filters_lo, 0);
+ sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3);
@@ -137,15 +137,14 @@ static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, // unused
- int y_step_q4, // unused
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
int w, int h, int bd) {
if (x_step_q4 != 16) {
- vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h, bd);
+ vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
} else {
- const int16x8_t filters = vld1q_s16(filter_x);
+ const int16x8_t filters = vld1q_s16(filter[x0_q4]);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
uint16x8_t t0, t1, t2, t3;
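
The new signature threads the whole kernel table through instead of a single
filter pointer. InterpKernel (from vpx_dsp/vpx_filter.h) is

typedef int16_t InterpKernel[SUBPEL_TAPS];  // SUBPEL_TAPS == 8

so filter[x0_q4] selects the eight taps for the starting subpixel phase, and
a single vld1q_s16 loads them all, as in the load above.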
@@ -182,10 +181,10 @@ void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
transpose_s16_4x4d(&s7, &s8, &s9, &s10);
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
@@ -241,10 +240,11 @@ void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 5 * src_stride);
__builtin_prefetch(src + 6 * src_stride);
__builtin_prefetch(src + 7 * src_stride);
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+ d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ d3 =
+ highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
transpose_u16_8x4(&d0, &d1, &d2, &d3);
vst1_u16(dst, vget_low_u16(d0));
@@ -302,14 +302,22 @@ void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
&s12, &s13, &s14);
transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
- d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, max);
- d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, max);
- d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, max);
- d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, max);
+ d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
+ max);
+ d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
+ max);
+ d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
+ max);
+ d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
+ max);
+ d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
+ max);
+ d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
+ max);
+ d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
+ max);
+ d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
+ filters, max);
transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
@@ -337,15 +345,15 @@ void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, // unused
- int y_step_q4, // unused
- int w, int h, int bd) {
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4,
+ int y_step_q4, int w, int h, int bd) {
if (x_step_q4 != 16) {
- vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h, bd);
+ vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+ bd);
} else {
- const int16x8_t filters = vld1q_s16(filter_x);
+ const int16x8_t filters = vld1q_s16(filter[x0_q4]);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
uint16x8_t t0, t1, t2, t3;
@@ -382,10 +390,10 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
transpose_s16_4x4d(&s7, &s8, &s9, &s10);
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
@@ -448,10 +456,11 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
__builtin_prefetch(src + 5 * src_stride);
__builtin_prefetch(src + 6 * src_stride);
__builtin_prefetch(src + 7 * src_stride);
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+ t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ t3 =
+ highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
transpose_u16_8x4(&t0, &t1, &t2, &t3);
d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
@@ -522,14 +531,22 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
&s12, &s13, &s14);
transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
- d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, max);
- d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, max);
- d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, max);
- d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, max);
+ d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
+ max);
+ d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
+ max);
+ d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
+ max);
+ d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
+ max);
+ d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
+ max);
+ d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
+ max);
+ d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
+ max);
+ d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
+ filters, max);
transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
@@ -566,15 +583,14 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, // unused
- int x_step_q4, // unused
- const int16_t *filter_y, int y_step_q4,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
int w, int h, int bd) {
if (y_step_q4 != 16) {
- vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h, bd);
+ vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h, bd);
} else {
- const int16x8_t filters = vld1q_s16(filter_y);
+ const int16x8_t filters = vld1q_s16(filter[y0_q4]);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
assert(!((intptr_t)dst & 3));
@@ -620,10 +636,10 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 1 * src_stride);
__builtin_prefetch(src + 2 * src_stride);
__builtin_prefetch(src + 3 * src_stride);
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
@@ -698,10 +714,11 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
__builtin_prefetch(s + 1 * src_stride);
__builtin_prefetch(s + 2 * src_stride);
__builtin_prefetch(s + 3 * src_stride);
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+ d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ d3 =
+ highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
vst1q_u16(d, d0);
d += dst_stride;
@@ -732,15 +749,15 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride,
- const int16_t *filter_x, // unused
- int x_step_q4, // unused
- const int16_t *filter_y, int y_step_q4,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
int w, int h, int bd) {
if (y_step_q4 != 16) {
- vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h, bd);
+ vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+ bd);
} else {
- const int16x8_t filters = vld1q_s16(filter_y);
+ const int16x8_t filters = vld1q_s16(filter[y0_q4]);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
assert(!((intptr_t)dst & 3));
@@ -786,10 +803,10 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
__builtin_prefetch(src + 1 * src_stride);
__builtin_prefetch(src + 2 * src_stride);
__builtin_prefetch(src + 3 * src_stride);
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
@@ -872,10 +889,11 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
__builtin_prefetch(s + 1 * src_stride);
__builtin_prefetch(s + 2 * src_stride);
__builtin_prefetch(s + 3 * src_stride);
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+ t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ t3 =
+ highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
d0 = vld1q_u16(d + 0 * dst_stride);
d1 = vld1q_u16(d + 1 * dst_stride);
diff --git a/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c b/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c
index 4ff3dea08..765a054f8 100644
--- a/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c
+++ b/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c
@@ -15,13 +15,14 @@
void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
int w, int h, int bd) {
- (void)filter_x;
- (void)filter_x_stride;
- (void)filter_y;
- (void)filter_y_stride;
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
(void)bd;
if (w < 8) { // avg4
diff --git a/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
index 61712d48e..9d2752e09 100644
--- a/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
+++ b/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
@@ -15,13 +15,14 @@
void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
int w, int h, int bd) {
- (void)filter_x;
- (void)filter_x_stride;
- (void)filter_y;
- (void)filter_y_stride;
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
(void)bd;
if (w < 8) { // copy4
diff --git a/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
index f769620a4..414ade353 100644
--- a/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
+++ b/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
@@ -15,12 +15,11 @@
void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
int h, int bd) {
- const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
// + 1 to make it divisible by 4
- DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]);
+ uint16_t temp[64 * 136];
const int intermediate_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
@@ -29,22 +28,21 @@ void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
* buffer which has lots of extra room and is subsequently discarded this is
* safe if somewhat less than ideal. */
vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
- filter_x, x_step_q4, filter_y, y_step_q4, w,
+ filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
intermediate_height, bd);
/* Step into the temp buffer 3 lines to get the actual frame data */
- vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h, bd);
+ vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
}
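
A quick check of the intermediate-height formula (a worked example, not from
the patch): with h = 64, y_step_q4 = 16 and y0_q4 = 0,
((64 - 1) * 16 + 0) >> SUBPEL_BITS gives 63, and adding SUBPEL_TAPS (8)
yields 71 rows. The worst case with the largest step and phase comes to 135
rows; the "+ 1" noted in the comment rounds that to 136 so the row count is
divisible by 4, hence temp[64 * 136].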
void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
int w, int h, int bd) {
- const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
  // + 1 row to make the temp buffer height divisible by 4
- DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]);
+ uint16_t temp[64 * 136];
const int intermediate_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
@@ -52,8 +50,9 @@ void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
* to average the values after both passes.
*/
vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
- filter_x, x_step_q4, filter_y, y_step_q4, w,
+ filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
intermediate_height, bd);
- vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h, bd);
+ vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+ bd);
}
diff --git a/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c b/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
index 968bc5cc3..bf5192a68 100644
--- a/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
+++ b/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
@@ -32,7 +32,8 @@ static INLINE void idct16x16_1_add_neg_kernel(uint8_t **dest, const int stride,
void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest,
int stride) {
- const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ const int16_t out0 =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
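+// The (int16_t) cast above narrows input[0] before the multiply; tran_low_t
+// widens to int32_t in high-bitdepth builds, and the narrowing presumably
+// keeps the arithmetic consistent with the 16-bit wrapping that WRAPLOW
+// applies in the C reference.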
diff --git a/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c b/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
index 604d82abd..8920b9336 100644
--- a/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
+++ b/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
@@ -39,7 +39,8 @@ static INLINE void idct32x32_1_add_neg_kernel(uint8_t **dest, const int stride,
void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest,
int stride) {
int i;
- const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ const int16_t out0 =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
diff --git a/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
index 21d21b033..a14b89543 100644
--- a/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
+++ b/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
@@ -32,7 +32,8 @@ static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride,
void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest,
int stride) {
- const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ const int16_t out0 =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
const int16x8_t dc = vdupq_n_s16(a1);
diff --git a/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c b/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
index 7bcce913b..ce9b45958 100644
--- a/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
+++ b/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
@@ -36,7 +36,8 @@ static INLINE void idct8x8_1_add_neg_kernel(uint8_t **dest, const int stride,
void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest,
int stride) {
- const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ const int16_t out0 =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);
diff --git a/libvpx/vpx_dsp/arm/idct_neon.h b/libvpx/vpx_dsp/arm/idct_neon.h
index 0fc1de8e4..6ed02af5a 100644
--- a/libvpx/vpx_dsp/arm/idct_neon.h
+++ b/libvpx/vpx_dsp/arm/idct_neon.h
@@ -18,7 +18,7 @@
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/vpx_dsp_common.h"
-DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = {
+static const int16_t kCospi[16] = {
16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */,
11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */,
@@ -29,7 +29,7 @@ DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = {
12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */
};
-DECLARE_ALIGNED(16, static const int32_t, kCospi32[16]) = {
+static const int32_t kCospi32[16] = {
16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */,
11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */,
diff --git a/libvpx/vpx_dsp/arm/mem_neon.h b/libvpx/vpx_dsp/arm/mem_neon.h
index 37b89b276..4efad5333 100644
--- a/libvpx/vpx_dsp/arm/mem_neon.h
+++ b/libvpx/vpx_dsp/arm/mem_neon.h
@@ -79,6 +79,32 @@ static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) {
memcpy(buf, &a, 4);
}
+// Load 2 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
+ uint32_t a;
+ uint32x2_t a_u32 = vdup_n_u32(0);
+ if (stride == 4) return vld1_u8(buf);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vld1_lane_u32(&a, a_u32, 0);
+ memcpy(&a, buf, 4);
+ a_u32 = vld1_lane_u32(&a, a_u32, 1);
+ return vreinterpret_u8_u32(a_u32);
+}
+
+// Store 2 sets of 4 bytes when alignment is not guaranteed.
+static INLINE void store_unaligned_u8(uint8_t *buf, int stride,
+ const uint8x8_t a) {
+ const uint32x2_t a_u32 = vreinterpret_u32_u8(a);
+ if (stride == 4) {
+ vst1_u8(buf, a);
+ return;
+ }
+ uint32_to_mem(buf, vget_lane_u32(a_u32, 0));
+ buf += stride;
+ uint32_to_mem(buf, vget_lane_u32(a_u32, 1));
+}
+
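+// Usage sketch (illustrative only; copy_4x2 is a hypothetical helper, not
+// part of this patch): the pair above can move two 4-byte rows through one
+// 64-bit register regardless of row pitch.
+//   static INLINE void copy_4x2(uint8_t *dst, const uint8_t *src, int pitch) {
+//     store_unaligned_u8(dst, pitch, load_unaligned_u8(src, pitch));
+//   }
+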
// Load 4 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
uint32_t a;
diff --git a/libvpx/vpx_dsp/arm/quantize_neon.c b/libvpx/vpx_dsp/arm/quantize_neon.c
new file mode 100644
index 000000000..a0a1e6dd5
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/quantize_neon.c
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+
+void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan_ptr,
+ const int16_t *iscan_ptr) {
+ const int16x8_t one = vdupq_n_s16(1);
+ const int16x8_t neg_one = vdupq_n_s16(-1);
+ uint16x8_t eob_max;
+ (void)scan_ptr;
+ (void)skip_block;
+ assert(!skip_block);
+
+  // Process the first 8 values, which include the DC coefficient.
+ {
+ // Only the first element of each vector is DC.
+ const int16x8_t zbin = vld1q_s16(zbin_ptr);
+ const int16x8_t round = vld1q_s16(round_ptr);
+ const int16x8_t quant = vld1q_s16(quant_ptr);
+ const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
+ const int16x8_t dequant = vld1q_s16(dequant_ptr);
+ // Add one because the eob does not index from 0.
+ const uint16x8_t iscan =
+ vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+
+ const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+ const int16x8_t coeff_abs = vabsq_s16(coeff);
+
+ const int16x8_t zbin_mask =
+ vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+
+ const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+
+ // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
+ int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+
+ qcoeff = vaddq_s16(qcoeff, rounded);
+
+ // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
+ qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);
+
+ // Restore the sign bit.
+ qcoeff = veorq_s16(qcoeff, coeff_sign);
+ qcoeff = vsubq_s16(qcoeff, coeff_sign);
+
+ qcoeff = vandq_s16(qcoeff, zbin_mask);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan);
+
+ coeff_ptr += 8;
+ iscan_ptr += 8;
+
+ store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+ qcoeff_ptr += 8;
+
+ qcoeff = vmulq_s16(qcoeff, dequant);
+
+ store_s16q_to_tran_low(dqcoeff_ptr, qcoeff);
+ dqcoeff_ptr += 8;
+ }
+
+ n_coeffs -= 8;
+
+ {
+ const int16x8_t zbin = vdupq_n_s16(zbin_ptr[1]);
+ const int16x8_t round = vdupq_n_s16(round_ptr[1]);
+ const int16x8_t quant = vdupq_n_s16(quant_ptr[1]);
+ const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+ const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]);
+
+ do {
+      // Add one because the eob does not index from 0.
+ const uint16x8_t iscan =
+ vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+
+ const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+ const int16x8_t coeff_abs = vabsq_s16(coeff);
+
+ const int16x8_t zbin_mask =
+ vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+
+ const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+
+ // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
+ int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+
+ qcoeff = vaddq_s16(qcoeff, rounded);
+
+ // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
+ qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);
+
+ // Restore the sign bit.
+ qcoeff = veorq_s16(qcoeff, coeff_sign);
+ qcoeff = vsubq_s16(qcoeff, coeff_sign);
+
+ qcoeff = vandq_s16(qcoeff, zbin_mask);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max =
+ vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan));
+
+ coeff_ptr += 8;
+ iscan_ptr += 8;
+
+ store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+ qcoeff_ptr += 8;
+
+ qcoeff = vmulq_s16(qcoeff, dequant);
+
+ store_s16q_to_tran_low(dqcoeff_ptr, qcoeff);
+ dqcoeff_ptr += 8;
+
+ n_coeffs -= 8;
+ } while (n_coeffs > 0);
+ }
+
+ {
+ const uint16x4_t eob_max_0 =
+ vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+ const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+ const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+ vst1_lane_u16(eob_ptr, eob_max_2, 0);
+ }
+}
+
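+// Worked example of the vqdmulh trick used above (illustrative values): with
+// rounded = 1000 and quant = 20000, vqdmulhq_s16 computes
+// (2 * 1000 * 20000) >> 16 = 610 and the final >> 1 gives 305, matching the
+// C reference (1000 * 20000) >> 16 = 305 without a full 32-bit multiply.
+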
+static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
+ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
+}
+
+// The main difference from vpx_quantize_b_neon is that the zbin values are
+// halved before comparison and the dqcoeff values are divided by 2. zbin is
+// rounded; dqcoeff is not.
+void vpx_quantize_b_32x32_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ const int16x8_t one = vdupq_n_s16(1);
+ const int16x8_t neg_one = vdupq_n_s16(-1);
+ uint16x8_t eob_max;
+ int i;
+ (void)scan_ptr;
+ (void)n_coeffs; // Because we will always calculate 32*32.
+ (void)skip_block;
+ assert(!skip_block);
+
+  // Process the first 8 values, which include the DC coefficient.
+ {
+ // Only the first element of each vector is DC.
+ const int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1);
+ const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
+ const int16x8_t quant = vld1q_s16(quant_ptr);
+ const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
+ const int16x8_t dequant = vld1q_s16(dequant_ptr);
+ // Add one because the eob does not index from 0.
+ const uint16x8_t iscan =
+ vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+
+ const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+ const int16x8_t coeff_abs = vabsq_s16(coeff);
+
+ const int16x8_t zbin_mask =
+ vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+
+ const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+
+ // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
+ int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+ int16x8_t dqcoeff;
+ int32x4_t dqcoeff_0, dqcoeff_1;
+
+ qcoeff = vaddq_s16(qcoeff, rounded);
+
+ // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
+ qcoeff = vqdmulhq_s16(qcoeff, quant_shift);
+
+ // Restore the sign bit.
+ qcoeff = veorq_s16(qcoeff, coeff_sign);
+ qcoeff = vsubq_s16(qcoeff, coeff_sign);
+
+ qcoeff = vandq_s16(qcoeff, zbin_mask);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan);
+
+ coeff_ptr += 8;
+ iscan_ptr += 8;
+
+ store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+ qcoeff_ptr += 8;
+
+ dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
+ dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
+
+    // Add 1 if negative to round towards zero, because the C reference uses
+    // division.
+ dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+ dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
+ dqcoeff =
+ vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1));
+
+ store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
+ dqcoeff_ptr += 8;
+ }
+
+ {
+ const int16x8_t zbin = vrshrq_n_s16(vdupq_n_s16(zbin_ptr[1]), 1);
+ const int16x8_t round = vrshrq_n_s16(vdupq_n_s16(round_ptr[1]), 1);
+ const int16x8_t quant = vdupq_n_s16(quant_ptr[1]);
+ const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+ const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]);
+
+ for (i = 1; i < 32 * 32 / 8; ++i) {
+      // Add one because the eob does not index from 0.
+ const uint16x8_t iscan =
+ vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+
+ const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+ const int16x8_t coeff_abs = vabsq_s16(coeff);
+
+ const int16x8_t zbin_mask =
+ vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+
+ const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+
+ // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
+ int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+ int16x8_t dqcoeff;
+ int32x4_t dqcoeff_0, dqcoeff_1;
+
+ qcoeff = vaddq_s16(qcoeff, rounded);
+
+ // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
+ qcoeff = vqdmulhq_s16(qcoeff, quant_shift);
+
+ // Restore the sign bit.
+ qcoeff = veorq_s16(qcoeff, coeff_sign);
+ qcoeff = vsubq_s16(qcoeff, coeff_sign);
+
+ qcoeff = vandq_s16(qcoeff, zbin_mask);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max =
+ vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan));
+
+ coeff_ptr += 8;
+ iscan_ptr += 8;
+
+ store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+ qcoeff_ptr += 8;
+
+ dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
+ dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
+
+ dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+ dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
+ dqcoeff =
+ vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1));
+
+ store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
+ dqcoeff_ptr += 8;
+ }
+ }
+
+ {
+ const uint16x4_t eob_max_0 =
+ vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+ const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+ const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+ vst1_lane_u16(eob_ptr, eob_max_2, 0);
+ }
+}
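+
+// Worked example of the dqcoeff halving above (illustrative values): for
+// qcoeff = -3 and dequant = 5 the product is -15. The C reference computes
+// -15 / 2 = -7 (truncating towards zero), whereas -15 >> 1 alone would give
+// -8; adding the extracted sign bit first yields (-15 + 1) >> 1 = -7.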
diff --git a/libvpx/vpx_dsp/arm/sad4d_neon.c b/libvpx/vpx_dsp/arm/sad4d_neon.c
index dc2039800..b04de3aff 100644
--- a/libvpx/vpx_dsp/arm/sad4d_neon.c
+++ b/libvpx/vpx_dsp/arm/sad4d_neon.c
@@ -13,212 +13,230 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
-static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
- const uint16x8_t vec_hi) {
- const uint32x4_t vec_l_lo =
- vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
- const uint32x4_t vec_l_hi =
- vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
- const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
- const uint64x2_t b = vpaddlq_u32(a);
- const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
- vreinterpret_u32_u64(vget_high_u64(b)));
- return vget_lane_u32(c, 0);
-}
-
-// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16,
-// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo
-// and vec_sum_ref_hi.
-static void sad_neon_64(const uint8x16_t vec_src_00,
- const uint8x16_t vec_src_16,
- const uint8x16_t vec_src_32,
- const uint8x16_t vec_src_48, const uint8_t *ref,
- uint16x8_t *vec_sum_ref_lo,
- uint16x8_t *vec_sum_ref_hi) {
- const uint8x16_t vec_ref_00 = vld1q_u8(ref);
- const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
- const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
- const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
-
- *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
- vget_low_u8(vec_ref_00));
- *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
- vget_high_u8(vec_ref_00));
- *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
- vget_low_u8(vec_ref_16));
- *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
- vget_high_u8(vec_ref_16));
- *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32),
- vget_low_u8(vec_ref_32));
- *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32),
- vget_high_u8(vec_ref_32));
- *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48),
- vget_low_u8(vec_ref_48));
- *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48),
- vget_high_u8(vec_ref_48));
-}
-
-// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
-// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
-static void sad_neon_32(const uint8x16_t vec_src_00,
- const uint8x16_t vec_src_16, const uint8_t *ref,
- uint16x8_t *vec_sum_ref_lo,
- uint16x8_t *vec_sum_ref_hi) {
- const uint8x16_t vec_ref_00 = vld1q_u8(ref);
- const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
-
- *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
- vget_low_u8(vec_ref_00));
- *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
- vget_high_u8(vec_ref_00));
- *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
- vget_low_u8(vec_ref_16));
- *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
- vget_high_u8(vec_ref_16));
+void vpx_sad4x4x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t *res) {
+ int i;
+ const uint8x16_t src_u8 = load_unaligned_u8q(src, src_stride);
+ for (i = 0; i < 4; ++i) {
+ const uint8x16_t ref_u8 = load_unaligned_u8q(ref[i], ref_stride);
+ uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8));
+ abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8));
+ res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0);
+ }
}
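+
+// Note: load_unaligned_u8q packs four 4-byte rows into a single 128-bit
+// register, so each ref above needs only one vabdl/vabal pair plus a
+// horizontal add to produce its SAD.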
-void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
+void vpx_sad4x8x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t *res) {
+ int i;
+ const uint8x16_t src_0 = load_unaligned_u8q(src, src_stride);
+ const uint8x16_t src_1 = load_unaligned_u8q(src + 4 * src_stride, src_stride);
+ for (i = 0; i < 4; ++i) {
+ const uint8x16_t ref_0 = load_unaligned_u8q(ref[i], ref_stride);
+ const uint8x16_t ref_1 =
+ load_unaligned_u8q(ref[i] + 4 * ref_stride, ref_stride);
+ uint16x8_t abs = vabdl_u8(vget_low_u8(src_0), vget_low_u8(ref_0));
+ abs = vabal_u8(abs, vget_high_u8(src_0), vget_high_u8(ref_0));
+ abs = vabal_u8(abs, vget_low_u8(src_1), vget_low_u8(ref_1));
+ abs = vabal_u8(abs, vget_high_u8(src_1), vget_high_u8(ref_1));
+ res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0);
+ }
+}
+
+static INLINE void sad8x_4d(const uint8_t *a, int a_stride,
+ const uint8_t *const b[4], int b_stride,
+ uint32_t *result, const int height) {
+ int i, j;
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] };
+
+ for (i = 0; i < height; ++i) {
+ const uint8x8_t a_u8 = vld1_u8(a);
+ a += a_stride;
+ for (j = 0; j < 4; ++j) {
+ const uint8x8_t b_u8 = vld1_u8(b_loop[j]);
+ b_loop[j] += b_stride;
+ sum[j] = vabal_u8(sum[j], a_u8, b_u8);
+ }
+ }
+
+ for (j = 0; j < 4; ++j) {
+ result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0);
+ }
+}
+
+void vpx_sad8x4x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t *res) {
+ sad8x_4d(src, src_stride, ref, ref_stride, res, 4);
+}
+
+void vpx_sad8x8x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t *res) {
+ sad8x_4d(src, src_stride, ref, ref_stride, res, 8);
+}
+
+void vpx_sad8x16x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t *res) {
+ sad8x_4d(src, src_stride, ref, ref_stride, res, 16);
+}
+
+static INLINE void sad16x_4d(const uint8_t *a, int a_stride,
+ const uint8_t *const b[4], int b_stride,
+ uint32_t *result, const int height) {
+ int i, j;
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] };
+
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_u8 = vld1q_u8(a);
+ a += a_stride;
+ for (j = 0; j < 4; ++j) {
+ const uint8x16_t b_u8 = vld1q_u8(b_loop[j]);
+ b_loop[j] += b_stride;
+ sum[j] = vabal_u8(sum[j], vget_low_u8(a_u8), vget_low_u8(b_u8));
+ sum[j] = vabal_u8(sum[j], vget_high_u8(a_u8), vget_high_u8(b_u8));
+ }
+ }
+
+ for (j = 0; j < 4; ++j) {
+ result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0);
+ }
+}
+
+void vpx_sad16x8x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t *res) {
+ sad16x_4d(src, src_stride, ref, ref_stride, res, 8);
+}
+
+void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t *res) {
- int i;
- uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
- const uint8_t *ref0, *ref1, *ref2, *ref3;
- ref0 = ref[0];
- ref1 = ref[1];
- ref2 = ref[2];
- ref3 = ref[3];
-
- for (i = 0; i < 64; ++i) {
- const uint8x16_t vec_src_00 = vld1q_u8(src);
- const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
- const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
- const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
-
- sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,
- &vec_sum_ref0_lo, &vec_sum_ref0_hi);
- sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
- &vec_sum_ref1_lo, &vec_sum_ref1_hi);
- sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
- &vec_sum_ref2_lo, &vec_sum_ref2_hi);
- sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
- &vec_sum_ref3_lo, &vec_sum_ref3_hi);
-
- src += src_stride;
- ref0 += ref_stride;
- ref1 += ref_stride;
- ref2 += ref_stride;
- ref3 += ref_stride;
+ sad16x_4d(src, src_stride, ref, ref_stride, res, 16);
+}
+
+void vpx_sad16x32x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t *res) {
+ sad16x_4d(src, src_stride, ref, ref_stride, res, 32);
+}
+
+static INLINE void sad32x_4d(const uint8_t *a, int a_stride,
+ const uint8_t *const b[4], int b_stride,
+ uint32_t *result, const int height) {
+ int i, j;
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] };
+
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_0 = vld1q_u8(a);
+ const uint8x16_t a_1 = vld1q_u8(a + 16);
+ a += a_stride;
+ for (j = 0; j < 4; ++j) {
+ const uint8x16_t b_0 = vld1q_u8(b_loop[j]);
+ const uint8x16_t b_1 = vld1q_u8(b_loop[j] + 16);
+ b_loop[j] += b_stride;
+ sum[j] = vabal_u8(sum[j], vget_low_u8(a_0), vget_low_u8(b_0));
+ sum[j] = vabal_u8(sum[j], vget_high_u8(a_0), vget_high_u8(b_0));
+ sum[j] = vabal_u8(sum[j], vget_low_u8(a_1), vget_low_u8(b_1));
+ sum[j] = vabal_u8(sum[j], vget_high_u8(a_1), vget_high_u8(b_1));
+ }
}
- res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
- res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
- res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
- res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+ for (j = 0; j < 4; ++j) {
+ result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0);
+ }
+}
+
+void vpx_sad32x16x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t *res) {
+ sad32x_4d(src, src_stride, ref, ref_stride, res, 16);
}
void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t *res) {
+ sad32x_4d(src, src_stride, ref, ref_stride, res, 32);
+}
+
+void vpx_sad32x64x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t *res) {
+ sad32x_4d(src, src_stride, ref, ref_stride, res, 64);
+}
+
+static INLINE void sum64x(const uint8x16_t a_0, const uint8x16_t a_1,
+ const uint8x16_t b_0, const uint8x16_t b_1,
+ uint16x8_t *sum) {
+ *sum = vabal_u8(*sum, vget_low_u8(a_0), vget_low_u8(b_0));
+ *sum = vabal_u8(*sum, vget_high_u8(a_0), vget_high_u8(b_0));
+ *sum = vabal_u8(*sum, vget_low_u8(a_1), vget_low_u8(b_1));
+ *sum = vabal_u8(*sum, vget_high_u8(a_1), vget_high_u8(b_1));
+}
+
+static INLINE void sad64x_4d(const uint8_t *a, int a_stride,
+ const uint8_t *const b[4], int b_stride,
+ uint32_t *result, const int height) {
int i;
- uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
- const uint8_t *ref0, *ref1, *ref2, *ref3;
- ref0 = ref[0];
- ref1 = ref[1];
- ref2 = ref[2];
- ref3 = ref[3];
-
- for (i = 0; i < 32; ++i) {
- const uint8x16_t vec_src_00 = vld1q_u8(src);
- const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
-
- sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo,
- &vec_sum_ref0_hi);
- sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo,
- &vec_sum_ref1_hi);
- sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo,
- &vec_sum_ref2_hi);
- sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo,
- &vec_sum_ref3_hi);
-
- src += src_stride;
- ref0 += ref_stride;
- ref1 += ref_stride;
- ref2 += ref_stride;
- ref3 += ref_stride;
+ uint16x8_t sum_0 = vdupq_n_u16(0);
+ uint16x8_t sum_1 = vdupq_n_u16(0);
+ uint16x8_t sum_2 = vdupq_n_u16(0);
+ uint16x8_t sum_3 = vdupq_n_u16(0);
+ uint16x8_t sum_4 = vdupq_n_u16(0);
+ uint16x8_t sum_5 = vdupq_n_u16(0);
+ uint16x8_t sum_6 = vdupq_n_u16(0);
+ uint16x8_t sum_7 = vdupq_n_u16(0);
+ const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] };
+
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_0 = vld1q_u8(a);
+ const uint8x16_t a_1 = vld1q_u8(a + 16);
+ const uint8x16_t a_2 = vld1q_u8(a + 32);
+ const uint8x16_t a_3 = vld1q_u8(a + 48);
+ a += a_stride;
+ sum64x(a_0, a_1, vld1q_u8(b_loop[0]), vld1q_u8(b_loop[0] + 16), &sum_0);
+ sum64x(a_2, a_3, vld1q_u8(b_loop[0] + 32), vld1q_u8(b_loop[0] + 48),
+ &sum_1);
+ b_loop[0] += b_stride;
+ sum64x(a_0, a_1, vld1q_u8(b_loop[1]), vld1q_u8(b_loop[1] + 16), &sum_2);
+ sum64x(a_2, a_3, vld1q_u8(b_loop[1] + 32), vld1q_u8(b_loop[1] + 48),
+ &sum_3);
+ b_loop[1] += b_stride;
+ sum64x(a_0, a_1, vld1q_u8(b_loop[2]), vld1q_u8(b_loop[2] + 16), &sum_4);
+ sum64x(a_2, a_3, vld1q_u8(b_loop[2] + 32), vld1q_u8(b_loop[2] + 48),
+ &sum_5);
+ b_loop[2] += b_stride;
+ sum64x(a_0, a_1, vld1q_u8(b_loop[3]), vld1q_u8(b_loop[3] + 16), &sum_6);
+ sum64x(a_2, a_3, vld1q_u8(b_loop[3] + 32), vld1q_u8(b_loop[3] + 48),
+ &sum_7);
+ b_loop[3] += b_stride;
}
- res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
- res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
- res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
- res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+ result[0] = vget_lane_u32(horizontal_add_long_uint16x8(sum_0, sum_1), 0);
+ result[1] = vget_lane_u32(horizontal_add_long_uint16x8(sum_2, sum_3), 0);
+ result[2] = vget_lane_u32(horizontal_add_long_uint16x8(sum_4, sum_5), 0);
+ result[3] = vget_lane_u32(horizontal_add_long_uint16x8(sum_6, sum_7), 0);
}
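+
+// Two uint16x8_t accumulators per ref keep the lanes from overflowing: each
+// lane absorbs 4 absolute differences per row (half of the 64 bytes), so
+// after 64 rows it holds at most 64 * 4 * 255 = 65280, which still fits in
+// 16 bits before the widening horizontal add.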
-void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
+void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t *res) {
- int i;
- uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
- const uint8_t *ref0, *ref1, *ref2, *ref3;
- ref0 = ref[0];
- ref1 = ref[1];
- ref2 = ref[2];
- ref3 = ref[3];
-
- for (i = 0; i < 16; ++i) {
- const uint8x16_t vec_src = vld1q_u8(src);
- const uint8x16_t vec_ref0 = vld1q_u8(ref0);
- const uint8x16_t vec_ref1 = vld1q_u8(ref1);
- const uint8x16_t vec_ref2 = vld1q_u8(ref2);
- const uint8x16_t vec_ref3 = vld1q_u8(ref3);
-
- vec_sum_ref0_lo =
- vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0));
- vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
- vget_high_u8(vec_ref0));
- vec_sum_ref1_lo =
- vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1));
- vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
- vget_high_u8(vec_ref1));
- vec_sum_ref2_lo =
- vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2));
- vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
- vget_high_u8(vec_ref2));
- vec_sum_ref3_lo =
- vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3));
- vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
- vget_high_u8(vec_ref3));
-
- src += src_stride;
- ref0 += ref_stride;
- ref1 += ref_stride;
- ref2 += ref_stride;
- ref3 += ref_stride;
- }
+ sad64x_4d(src, src_stride, ref, ref_stride, res, 32);
+}
- res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
- res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
- res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
- res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t *res) {
+ sad64x_4d(src, src_stride, ref, ref_stride, res, 64);
}
diff --git a/libvpx/vpx_dsp/arm/sad_neon.c b/libvpx/vpx_dsp/arm/sad_neon.c
index ff3228768..9518a166b 100644
--- a/libvpx/vpx_dsp/arm/sad_neon.c
+++ b/libvpx/vpx_dsp/arm/sad_neon.c
@@ -13,211 +13,332 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
-unsigned int vpx_sad8x16_neon(unsigned char *src_ptr, int src_stride,
- unsigned char *ref_ptr, int ref_stride) {
- uint8x8_t d0, d8;
- uint16x8_t q12;
- uint32x4_t q1;
- uint64x2_t q3;
- uint32x2_t d5;
+uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
+ const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
+ uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8));
+ abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8));
+ return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
+}
+
+uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred) {
+ const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
+ const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
+ const uint8x16_t second_pred_u8 = vld1q_u8(second_pred);
+ const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8);
+ uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg));
+ abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg));
+ return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
+}
+
+uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
int i;
+ uint16x8_t abs = vdupq_n_u16(0);
+ for (i = 0; i < 8; i += 4) {
+ const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
+ const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
+ src_ptr += 4 * src_stride;
+ ref_ptr += 4 * ref_stride;
+ abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(ref_u8));
+ abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8));
+ }
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabdl_u8(d0, d8);
+ return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
+}
- for (i = 0; i < 15; i++) {
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabal_u8(q12, d0, d8);
+uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred) {
+ int i;
+ uint16x8_t abs = vdupq_n_u16(0);
+ for (i = 0; i < 8; i += 4) {
+ const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
+ const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
+ const uint8x16_t second_pred_u8 = vld1q_u8(second_pred);
+ const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8);
+ src_ptr += 4 * src_stride;
+ ref_ptr += 4 * ref_stride;
+ second_pred += 16;
+ abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(avg));
+ abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg));
}
- q1 = vpaddlq_u16(q12);
- q3 = vpaddlq_u32(q1);
- d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
- vreinterpret_u32_u64(vget_high_u64(q3)));
+ return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
+}
+
+static INLINE uint16x8_t sad8x(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, const int height) {
+ int i;
+ uint16x8_t abs = vdupq_n_u16(0);
- return vget_lane_u32(d5, 0);
+ for (i = 0; i < height; ++i) {
+ const uint8x8_t a_u8 = vld1_u8(a);
+ const uint8x8_t b_u8 = vld1_u8(b);
+ a += a_stride;
+ b += b_stride;
+ abs = vabal_u8(abs, a_u8, b_u8);
+ }
+ return abs;
}
-unsigned int vpx_sad4x4_neon(unsigned char *src_ptr, int src_stride,
- unsigned char *ref_ptr, int ref_stride) {
- uint8x8_t d0, d8;
- uint16x8_t q12;
- uint32x2_t d1;
- uint64x1_t d3;
+static INLINE uint16x8_t sad8x_avg(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *c, const int height) {
int i;
+ uint16x8_t abs = vdupq_n_u16(0);
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabdl_u8(d0, d8);
+ for (i = 0; i < height; ++i) {
+ const uint8x8_t a_u8 = vld1_u8(a);
+ const uint8x8_t b_u8 = vld1_u8(b);
+ const uint8x8_t c_u8 = vld1_u8(c);
+ const uint8x8_t avg = vrhadd_u8(b_u8, c_u8);
+ a += a_stride;
+ b += b_stride;
+ c += 8;
+ abs = vabal_u8(abs, a_u8, avg);
+ }
+ return abs;
+}
- for (i = 0; i < 3; i++) {
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabal_u8(q12, d0, d8);
+#define sad8xN(n) \
+ uint32_t vpx_sad8x##n##_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ const uint16x8_t abs = sad8x(src, src_stride, ref, ref_stride, n); \
+ return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
+ } \
+ \
+ uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ const uint16x8_t abs = \
+ sad8x_avg(src, src_stride, ref, ref_stride, second_pred, n); \
+ return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
}
- d1 = vpaddl_u16(vget_low_u16(q12));
- d3 = vpaddl_u32(d1);
+sad8xN(4);
+sad8xN(8);
+sad8xN(16);
+
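+// Each sad8xN(n) instantiation above expands to both vpx_sad8xn_neon and
+// vpx_sad8xn_avg_neon, so the 8x4, 8x8 and 8x16 block sizes share a single
+// code path.
+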
+static INLINE uint16x8_t sad16x(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const int height) {
+ int i;
+ uint16x8_t abs = vdupq_n_u16(0);
- return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_u8 = vld1q_u8(a);
+ const uint8x16_t b_u8 = vld1q_u8(b);
+ a += a_stride;
+ b += b_stride;
+ abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8));
+ abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8));
+ }
+ return abs;
}
-unsigned int vpx_sad16x8_neon(unsigned char *src_ptr, int src_stride,
- unsigned char *ref_ptr, int ref_stride) {
- uint8x16_t q0, q4;
- uint16x8_t q12, q13;
- uint32x4_t q1;
- uint64x2_t q3;
- uint32x2_t d5;
+static INLINE uint16x8_t sad16x_avg(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *c, const int height) {
int i;
+ uint16x8_t abs = vdupq_n_u16(0);
- q0 = vld1q_u8(src_ptr);
- src_ptr += src_stride;
- q4 = vld1q_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
- q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
-
- for (i = 0; i < 7; i++) {
- q0 = vld1q_u8(src_ptr);
- src_ptr += src_stride;
- q4 = vld1q_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
- q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
- }
-
- q12 = vaddq_u16(q12, q13);
- q1 = vpaddlq_u16(q12);
- q3 = vpaddlq_u32(q1);
- d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
- vreinterpret_u32_u64(vget_high_u64(q3)));
-
- return vget_lane_u32(d5, 0);
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_u8 = vld1q_u8(a);
+ const uint8x16_t b_u8 = vld1q_u8(b);
+ const uint8x16_t c_u8 = vld1q_u8(c);
+ const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8);
+ a += a_stride;
+ b += b_stride;
+ c += 16;
+ abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(avg));
+ abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(avg));
+ }
+ return abs;
}
-static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
- const uint16x8_t vec_hi) {
- const uint32x4_t vec_l_lo =
- vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
- const uint32x4_t vec_l_hi =
- vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
- const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
- const uint64x2_t b = vpaddlq_u32(a);
- const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
- vreinterpret_u32_u64(vget_high_u64(b)));
- return vget_lane_u32(c, 0);
-}
-static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
- const uint32x4_t a = vpaddlq_u16(vec_16x8);
- const uint64x2_t b = vpaddlq_u32(a);
- const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
- vreinterpret_u32_u64(vget_high_u64(b)));
- return vget_lane_u32(c, 0);
-}
+#define sad16xN(n) \
+ uint32_t vpx_sad16x##n##_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ const uint16x8_t abs = sad16x(src, src_stride, ref, ref_stride, n); \
+ return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
+ } \
+ \
+ uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ const uint16x8_t abs = \
+ sad16x_avg(src, src_stride, ref, ref_stride, second_pred, n); \
+ return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
+ }
+
+sad16xN(8);
+sad16xN(16);
+sad16xN(32);
-unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
+static INLINE uint16x8_t sad32x(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const int height) {
int i;
- uint16x8_t vec_accum_lo = vdupq_n_u16(0);
- uint16x8_t vec_accum_hi = vdupq_n_u16(0);
- for (i = 0; i < 64; ++i) {
- const uint8x16_t vec_src_00 = vld1q_u8(src);
- const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
- const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
- const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
- const uint8x16_t vec_ref_00 = vld1q_u8(ref);
- const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
- const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
- const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
- src += src_stride;
- ref += ref_stride;
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
- vget_low_u8(vec_ref_00));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
- vget_high_u8(vec_ref_00));
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
- vget_low_u8(vec_ref_16));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
- vget_high_u8(vec_ref_16));
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
- vget_low_u8(vec_ref_32));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
- vget_high_u8(vec_ref_32));
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
- vget_low_u8(vec_ref_48));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
- vget_high_u8(vec_ref_48));
- }
- return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
+ uint16x8_t abs = vdupq_n_u16(0);
+
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_lo = vld1q_u8(a);
+ const uint8x16_t a_hi = vld1q_u8(a + 16);
+ const uint8x16_t b_lo = vld1q_u8(b);
+ const uint8x16_t b_hi = vld1q_u8(b + 16);
+ a += a_stride;
+ b += b_stride;
+ abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo));
+ abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo));
+ abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(b_hi));
+ abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(b_hi));
+ }
+ return abs;
}
-unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
+static INLINE uint16x8_t sad32x_avg(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *c, const int height) {
int i;
- uint16x8_t vec_accum_lo = vdupq_n_u16(0);
- uint16x8_t vec_accum_hi = vdupq_n_u16(0);
-
- for (i = 0; i < 32; ++i) {
- const uint8x16_t vec_src_00 = vld1q_u8(src);
- const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
- const uint8x16_t vec_ref_00 = vld1q_u8(ref);
- const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
- src += src_stride;
- ref += ref_stride;
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
- vget_low_u8(vec_ref_00));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
- vget_high_u8(vec_ref_00));
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
- vget_low_u8(vec_ref_16));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
- vget_high_u8(vec_ref_16));
- }
- return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
+ uint16x8_t abs = vdupq_n_u16(0);
+
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_lo = vld1q_u8(a);
+ const uint8x16_t a_hi = vld1q_u8(a + 16);
+ const uint8x16_t b_lo = vld1q_u8(b);
+ const uint8x16_t b_hi = vld1q_u8(b + 16);
+ const uint8x16_t c_lo = vld1q_u8(c);
+ const uint8x16_t c_hi = vld1q_u8(c + 16);
+ const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo);
+ const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi);
+ a += a_stride;
+ b += b_stride;
+ c += 32;
+ abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(avg_lo));
+ abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(avg_lo));
+ abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(avg_hi));
+ abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(avg_hi));
+ }
+ return abs;
}
-unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
+#define sad32xN(n) \
+ uint32_t vpx_sad32x##n##_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ const uint16x8_t abs = sad32x(src, src_stride, ref, ref_stride, n); \
+ return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
+ } \
+ \
+ uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ const uint16x8_t abs = \
+ sad32x_avg(src, src_stride, ref, ref_stride, second_pred, n); \
+ return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
+ }
+
+sad32xN(16);
+sad32xN(32);
+sad32xN(64);
+
+static INLINE uint32x4_t sad64x(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const int height) {
int i;
- uint16x8_t vec_accum_lo = vdupq_n_u16(0);
- uint16x8_t vec_accum_hi = vdupq_n_u16(0);
-
- for (i = 0; i < 16; ++i) {
- const uint8x16_t vec_src = vld1q_u8(src);
- const uint8x16_t vec_ref = vld1q_u8(ref);
- src += src_stride;
- ref += ref_stride;
- vec_accum_lo =
- vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref));
- vec_accum_hi =
- vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref));
- }
- return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
+ uint16x8_t abs_0 = vdupq_n_u16(0);
+ uint16x8_t abs_1 = vdupq_n_u16(0);
+
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_0 = vld1q_u8(a);
+ const uint8x16_t a_1 = vld1q_u8(a + 16);
+ const uint8x16_t a_2 = vld1q_u8(a + 32);
+ const uint8x16_t a_3 = vld1q_u8(a + 48);
+ const uint8x16_t b_0 = vld1q_u8(b);
+ const uint8x16_t b_1 = vld1q_u8(b + 16);
+ const uint8x16_t b_2 = vld1q_u8(b + 32);
+ const uint8x16_t b_3 = vld1q_u8(b + 48);
+ a += a_stride;
+ b += b_stride;
+ abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0));
+ abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0));
+ abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1));
+ abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(b_1));
+ abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(b_2));
+ abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(b_2));
+ abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(b_3));
+ abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(b_3));
+ }
+
+ {
+ const uint32x4_t sum = vpaddlq_u16(abs_0);
+ return vpadalq_u16(sum, abs_1);
+ }
}
-unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
+static INLINE uint32x4_t sad64x_avg(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *c, const int height) {
int i;
- uint16x8_t vec_accum = vdupq_n_u16(0);
+ uint16x8_t abs_0 = vdupq_n_u16(0);
+ uint16x8_t abs_1 = vdupq_n_u16(0);
- for (i = 0; i < 8; ++i) {
- const uint8x8_t vec_src = vld1_u8(src);
- const uint8x8_t vec_ref = vld1_u8(ref);
- src += src_stride;
- ref += ref_stride;
- vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_0 = vld1q_u8(a);
+ const uint8x16_t a_1 = vld1q_u8(a + 16);
+ const uint8x16_t a_2 = vld1q_u8(a + 32);
+ const uint8x16_t a_3 = vld1q_u8(a + 48);
+ const uint8x16_t b_0 = vld1q_u8(b);
+ const uint8x16_t b_1 = vld1q_u8(b + 16);
+ const uint8x16_t b_2 = vld1q_u8(b + 32);
+ const uint8x16_t b_3 = vld1q_u8(b + 48);
+ const uint8x16_t c_0 = vld1q_u8(c);
+ const uint8x16_t c_1 = vld1q_u8(c + 16);
+ const uint8x16_t c_2 = vld1q_u8(c + 32);
+ const uint8x16_t c_3 = vld1q_u8(c + 48);
+ const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0);
+ const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1);
+ const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2);
+ const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3);
+ a += a_stride;
+ b += b_stride;
+ c += 64;
+ abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(avg_0));
+ abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(avg_0));
+ abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(avg_1));
+ abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(avg_1));
+ abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(avg_2));
+ abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(avg_2));
+ abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(avg_3));
+ abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(avg_3));
+ }
+
+ {
+ const uint32x4_t sum = vpaddlq_u16(abs_0);
+ return vpadalq_u16(sum, abs_1);
}
- return horizontal_add_16x8(vec_accum);
}
+
+#define sad64xN(n) \
+ uint32_t vpx_sad64x##n##_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ const uint32x4_t abs = sad64x(src, src_stride, ref, ref_stride, n); \
+ return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \
+ } \
+ \
+ uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ const uint32x4_t abs = \
+ sad64x_avg(src, src_stride, ref, ref_stride, second_pred, n); \
+ return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \
+ }
+
+sad64xN(32);
+sad64xN(64);
diff --git a/libvpx/vpx_dsp/arm/subpel_variance_neon.c b/libvpx/vpx_dsp/arm/subpel_variance_neon.c
index 9b1622ff0..4f58a7832 100644
--- a/libvpx/vpx_dsp/arm/subpel_variance_neon.c
+++ b/libvpx/vpx_dsp/arm/subpel_variance_neon.c
@@ -12,16 +12,39 @@
#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
-#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/variance.h"
+#include "vpx_dsp/arm/mem_neon.h"
static const uint8_t bilinear_filters[8][2] = {
{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
{ 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
};
+// Process a block exactly 4 wide and a multiple of 2 high.
+static void var_filter_block2d_bil_w4(const uint8_t *src_ptr,
+ uint8_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ const uint8_t *filter) {
+ const uint8x8_t f0 = vdup_n_u8(filter[0]);
+ const uint8x8_t f1 = vdup_n_u8(filter[1]);
+ unsigned int i;
+ for (i = 0; i < output_height; i += 2) {
+ const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line);
+ const uint8x8_t src_1 =
+ load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line);
+ const uint16x8_t a = vmull_u8(src_0, f0);
+ const uint16x8_t b = vmlal_u8(a, src_1, f1);
+ const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
+ vst1_u8(output_ptr, out);
+ src_ptr += 2 * src_pixels_per_line;
+ output_ptr += 8;
+ }
+}
+
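+// Each output pixel above is the rounded blend
+//   out = (src_0 * filter[0] + src_1 * filter[1] + 64) >> FILTER_BITS
+// where FILTER_BITS is 7 and filter[0] + filter[1] == 128, so e.g. offset 4
+// ({ 64, 64 }) is a plain rounded average of the two source pixels.
+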
// Process a block exactly 8 wide and any height.
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
uint8_t *output_ptr,
@@ -29,8 +52,8 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
int pixel_step,
unsigned int output_height,
const uint8_t *filter) {
- const uint8x8_t f0 = vmov_n_u8(filter[0]);
- const uint8x8_t f1 = vmov_n_u8(filter[1]);
+ const uint8x8_t f0 = vdup_n_u8(filter[0]);
+ const uint8x8_t f1 = vdup_n_u8(filter[1]);
unsigned int i;
for (i = 0; i < output_height; ++i) {
const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
@@ -38,8 +61,7 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
const uint16x8_t a = vmull_u8(src_0, f0);
const uint16x8_t b = vmlal_u8(a, src_1, f1);
const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
- vst1_u8(&output_ptr[0], out);
- // Next row...
+ vst1_u8(output_ptr, out);
src_ptr += src_pixels_per_line;
output_ptr += 8;
}
@@ -53,8 +75,8 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
unsigned int output_height,
unsigned int output_width,
const uint8_t *filter) {
- const uint8x8_t f0 = vmov_n_u8(filter[0]);
- const uint8x8_t f1 = vmov_n_u8(filter[1]);
+ const uint8x8_t f0 = vdup_n_u8(filter[0]);
+ const uint8x8_t f1 = vdup_n_u8(filter[1]);
unsigned int i, j;
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; j += 16) {
@@ -66,36 +88,43 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
- vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
+ vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi));
}
- // Next row...
src_ptr += src_pixels_per_line;
output_ptr += output_width;
}
}
-// TODO(johannkoenig): support 4xM block sizes.
-#define sub_pixel_varianceNxM(n, m) \
- unsigned int vpx_sub_pixel_variance##n##x##m##_neon( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, unsigned int *sse) { \
- DECLARE_ALIGNED(16, uint8_t, fdata3[n * (m + 1)]); \
- DECLARE_ALIGNED(16, uint8_t, temp2[n * m]); \
- \
- if (n == 8) { \
- var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, (m + 1), \
- bilinear_filters[xoffset]); \
- var_filter_block2d_bil_w8(fdata3, temp2, n, n, m, \
- bilinear_filters[yoffset]); \
- } else { \
- var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, (m + 1), n, \
- bilinear_filters[xoffset]); \
- var_filter_block2d_bil_w16(fdata3, temp2, n, n, m, n, \
- bilinear_filters[yoffset]); \
- } \
- return vpx_variance##n##x##m(temp2, n, dst, dst_stride, sse); \
+// The 4xM filter writes an extra row to temp0 because it processes two rows
+// at a time.
+#define sub_pixel_varianceNxM(n, m) \
+ uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse) { \
+ uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \
+ uint8_t temp1[n * m]; \
+ \
+ if (n == 4) { \
+ var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2), \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \
+ bilinear_filters[yoffset]); \
+ } else if (n == 8) { \
+ var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1), \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \
+ bilinear_filters[yoffset]); \
+ } else { \
+ var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \
+ bilinear_filters[yoffset]); \
+ } \
+ return vpx_variance##n##x##m(temp1, n, b, b_stride, sse); \
}
+sub_pixel_varianceNxM(4, 4);
+sub_pixel_varianceNxM(4, 8);
sub_pixel_varianceNxM(8, 4);
sub_pixel_varianceNxM(8, 8);
sub_pixel_varianceNxM(8, 16);
@@ -107,3 +136,49 @@ sub_pixel_varianceNxM(32, 32);
sub_pixel_varianceNxM(32, 64);
sub_pixel_varianceNxM(64, 32);
sub_pixel_varianceNxM(64, 64);
+
+// The 4xM filter writes an extra row to temp0 because it processes two rows
+// at a time.
+#define sub_pixel_avg_varianceNxM(n, m) \
+ uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \
+ uint8_t temp1[n * m]; \
+ \
+ if (n == 4) { \
+ var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2), \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \
+ bilinear_filters[yoffset]); \
+ } else if (n == 8) { \
+ var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1), \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \
+ bilinear_filters[yoffset]); \
+ } else { \
+ var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \
+ bilinear_filters[yoffset]); \
+ } \
+ \
+ vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \
+ \
+ return vpx_variance##n##x##m(temp0, n, b, b_stride, sse); \
+ }
+
+sub_pixel_avg_varianceNxM(4, 4);
+sub_pixel_avg_varianceNxM(4, 8);
+sub_pixel_avg_varianceNxM(8, 4);
+sub_pixel_avg_varianceNxM(8, 8);
+sub_pixel_avg_varianceNxM(8, 16);
+sub_pixel_avg_varianceNxM(16, 8);
+sub_pixel_avg_varianceNxM(16, 16);
+sub_pixel_avg_varianceNxM(16, 32);
+sub_pixel_avg_varianceNxM(32, 16);
+sub_pixel_avg_varianceNxM(32, 32);
+sub_pixel_avg_varianceNxM(32, 64);
+sub_pixel_avg_varianceNxM(64, 32);
+sub_pixel_avg_varianceNxM(64, 64);
diff --git a/libvpx/vpx_dsp/arm/sum_neon.h b/libvpx/vpx_dsp/arm/sum_neon.h
new file mode 100644
index 000000000..d74fe0cde
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/sum_neon.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_ARM_SUM_NEON_H_
+#define VPX_DSP_ARM_SUM_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+static INLINE int32x2_t horizontal_add_int16x8(const int16x8_t a) {
+ const int32x4_t b = vpaddlq_s16(a);
+ const int64x2_t c = vpaddlq_s32(b);
+ return vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)),
+ vreinterpret_s32_s64(vget_high_s64(c)));
+}
+
+static INLINE uint32x2_t horizontal_add_uint16x8(const uint16x8_t a) {
+ const uint32x4_t b = vpaddlq_u16(a);
+ const uint64x2_t c = vpaddlq_u32(b);
+ return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+ vreinterpret_u32_u64(vget_high_u64(c)));
+}
+
+static INLINE uint32x2_t horizontal_add_long_uint16x8(const uint16x8_t a,
+ const uint16x8_t b) {
+ const uint32x4_t c = vpaddlq_u16(a);
+ const uint32x4_t d = vpadalq_u16(c, b);
+ const uint64x2_t e = vpaddlq_u32(d);
+ return vadd_u32(vreinterpret_u32_u64(vget_low_u64(e)),
+ vreinterpret_u32_u64(vget_high_u64(e)));
+}
+
+static INLINE uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) {
+ const uint64x2_t b = vpaddlq_u32(a);
+ return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+}
+#endif // VPX_DSP_ARM_SUM_NEON_H_
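
A scalar model of these reductions may help: each vpaddlq widens and adds adjacent pairs, so two or three pairwise steps collapse the whole vector, and the final vadd_u32 leaves the total in lane 0 of the returned vector, where callers read it with vget_lane. A reference sketch, not part of the header:

#include <stdint.h>

// Scalar reference for horizontal_add_uint16x8: the NEON sequence above
// produces this same total in lane 0 of its uint32x2_t result.
static uint32_t horizontal_add_u16x8_ref(const uint16_t v[8]) {
  uint32_t sum = 0;
  int i;
  for (i = 0; i < 8; ++i) sum += v[i];
  return sum;
}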
diff --git a/libvpx/vpx_dsp/arm/variance_neon.c b/libvpx/vpx_dsp/arm/variance_neon.c
index a6b2c53b7..61c2c16a7 100644
--- a/libvpx/vpx_dsp/arm/variance_neon.c
+++ b/libvpx/vpx_dsp/arm/variance_neon.c
@@ -16,23 +16,9 @@
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
#include "vpx_ports/mem.h"
-static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
- const int32x4_t a = vpaddlq_s16(v_16x8);
- const int64x2_t b = vpaddlq_s32(a);
- const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
- vreinterpret_s32_s64(vget_high_s64(b)));
- return vget_lane_s32(c, 0);
-}
-
-static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
- const int64x2_t b = vpaddlq_s32(v_32x4);
- const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
- vreinterpret_s32_s64(vget_high_s64(b)));
- return vget_lane_s32(c, 0);
-}
-
// The variance helper functions use int16_t for sum. 8 values are accumulated
// and then added (at which point they expand up to int32_t). To avoid overflow,
// there can be no more than 32767 / 255 ~= 128 values accumulated in each
@@ -79,8 +65,10 @@ static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b,
b += 4 * b_stride;
}
- *sum = horizontal_add_s16x8(sum_s16);
- *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32));
+ *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0);
+ *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32(
+ vaddq_s32(sse_lo_s32, sse_hi_s32))),
+ 0);
}
// Process a block of any size where the width is divisible by 16.
@@ -126,8 +114,10 @@ static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b,
b += b_stride;
}
- *sum = horizontal_add_s16x8(sum_s16);
- *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32));
+ *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0);
+ *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32(
+ vaddq_s32(sse_lo_s32, sse_hi_s32))),
+ 0);
}
// Process a block of width 8 two rows at a time.
@@ -165,8 +155,10 @@ static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b,
i += 2;
} while (i < h);
- *sum = horizontal_add_s16x8(sum_s16);
- *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32));
+ *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0);
+ *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32(
+ vaddq_s32(sse_lo_s32, sse_hi_s32))),
+ 0);
}
void vpx_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
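
For orientation, the helpers in this file only produce the row sum and the sum of squared differences; the public variance functions combine the two. A scalar model of that contract (variance_ref is a hypothetical name, a sketch rather than the library implementation):

#include <stdint.h>

// variance = sse - sum^2 / (w * h). The NEON helpers bound their loop counts
// so the int16_t partial sums (|diff| <= 255, at most 128 adds per lane, per
// the comment above) cannot overflow.
static uint32_t variance_ref(const uint8_t *a, int a_stride, const uint8_t *b,
                             int b_stride, int w, int h, uint32_t *sse) {
  int64_t sum = 0;
  uint32_t s = 0;
  int r, c;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      const int diff = a[c] - b[c];
      sum += diff;
      s += (uint32_t)(diff * diff);
    }
    a += a_stride;
    b += b_stride;
  }
  *sse = s;
  return s - (uint32_t)((sum * sum) / (w * h));
}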
diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm b/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm
index e279d570f..1c2ee5063 100644
--- a/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm
+++ b/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm
@@ -42,10 +42,11 @@
; r1 int src_stride
; r2 uint8_t *dst
; r3 int dst_stride
-; sp[]const int16_t *filter_x
-; sp[]int x_step_q4
-; sp[]const int16_t *filter_y ; unused
-; sp[]int y_step_q4 ; unused
+; sp[]const int16_t *filter
+; sp[]int x0_q4
+; sp[]int x_step_q4 ; unused
+; sp[]int y0_q4
+; sp[]int y_step_q4 ; unused
; sp[]int w
; sp[]int h
@@ -54,11 +55,11 @@
sub r0, r0, #3 ; adjust for taps
- ldr r5, [sp, #32] ; filter_x
- ldr r6, [sp, #48] ; w
- ldr r7, [sp, #52] ; h
+ ldrd r4, r5, [sp, #32] ; filter, x0_q4
+    add r4, r5, lsl #4              ; filter += x0_q4 * 16 (8 x int16 taps)
+ ldrd r6, r7, [sp, #52] ; w, h
- vld1.s16 {q0}, [r5] ; filter_x
+ vld1.s16 {q0}, [r4] ; filter
sub r8, r1, r1, lsl #2 ; -src_stride * 3
add r8, r8, #4 ; -src_stride * 3 + 4
@@ -127,7 +128,7 @@ vpx_convolve8_avg_loop_horiz
sub r2, r2, r3, lsl #2 ; reset for store
- ; src[] * filter_x
+ ; src[] * filter
MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24
MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26
MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27
@@ -184,11 +185,13 @@ vpx_convolve8_avg_loop_horiz
sub r0, r0, r1
sub r0, r0, r1, lsl #1
- ldr r4, [sp, #32] ; filter_y
- ldr r6, [sp, #40] ; w
- ldr lr, [sp, #44] ; h
+ ldr r4, [sp, #24] ; filter
+ ldr r5, [sp, #36] ; y0_q4
+    add r4, r5, lsl #4              ; filter += y0_q4 * 16 (8 x int16 taps)
+ ldr r6, [sp, #44] ; w
+ ldr lr, [sp, #48] ; h
- vld1.s16 {q0}, [r4] ; filter_y
+ vld1.s16 {q0}, [r4] ; filter
lsl r1, r1, #1
lsl r3, r3, #1
@@ -232,7 +235,7 @@ vpx_convolve8_avg_loop_vert
pld [r7]
pld [r4]
- ; src[] * filter_y
+ ; src[] * filter
MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24
pld [r7, r1]
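
The new prologues above fetch the kernel table base and the q4 phase, then advance the base with "add r4, r5, lsl #4" (the two-operand form of add r4, r4, r5, lsl #4). In C terms, using the libvpx InterpKernel typedef of eight int16_t taps, the lookup is a 16-byte-stride index; a minimal sketch:

#include <stdint.h>

typedef int16_t InterpKernel[8];  // mirrors the libvpx typedef

// Each kernel row is 8 * sizeof(int16_t) = 16 bytes, hence the << 4 above.
static const int16_t *select_kernel(const InterpKernel *filter, int phase_q4) {
  return filter[phase_q4];
}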
diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
index 1386838ee..08ae17dba 100644
--- a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
+++ b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -15,6 +15,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_ports/mem.h"
// Note:
@@ -29,43 +30,11 @@
// instructions. This optimization is much faster in the speed unit test, but
// slowed down the whole decoder by 5%.
-static INLINE void load_8x4(const uint8_t *s, ptrdiff_t p, uint8x8_t *s0,
- uint8x8_t *s1, uint8x8_t *s2, uint8x8_t *s3) {
- *s0 = vld1_u8(s);
- s += p;
- *s1 = vld1_u8(s);
- s += p;
- *s2 = vld1_u8(s);
- s += p;
- *s3 = vld1_u8(s);
-}
-
-static INLINE void load_8x8(const uint8_t *s, ptrdiff_t p, uint8x8_t *s0,
- uint8x8_t *s1, uint8x8_t *s2, uint8x8_t *s3,
- uint8x8_t *s4, uint8x8_t *s5, uint8x8_t *s6,
- uint8x8_t *s7) {
- *s0 = vld1_u8(s);
- s += p;
- *s1 = vld1_u8(s);
- s += p;
- *s2 = vld1_u8(s);
- s += p;
- *s3 = vld1_u8(s);
- s += p;
- *s4 = vld1_u8(s);
- s += p;
- *s5 = vld1_u8(s);
- s += p;
- *s6 = vld1_u8(s);
- s += p;
- *s7 = vld1_u8(s);
-}
-
-static INLINE void store_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
- const uint8x8_t s1, const uint8x8_t s2,
- const uint8x8_t s3, const uint8x8_t s4,
- const uint8x8_t s5, const uint8x8_t s6,
- const uint8x8_t s7) {
+static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
+ const uint8x8_t s0, const uint8x8_t s1,
+ const uint8x8_t s2, const uint8x8_t s3,
+ const uint8x8_t s4, const uint8x8_t s5,
+ const uint8x8_t s6, const uint8x8_t s7) {
vst1_u8(s, s0);
s += p;
vst1_u8(s, s1);
@@ -83,53 +52,12 @@ static INLINE void store_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
vst1_u8(s, s7);
}
-static INLINE int16x4_t convolve8_4(int16x4_t s0, int16x4_t s1, int16x4_t s2,
- int16x4_t s3, int16x4_t s4, int16x4_t s5,
- int16x4_t s6, int16x4_t s7,
- int16x8_t filters, int16x4_t filter3,
- int16x4_t filter4) {
- const int16x4_t filters_lo = vget_low_s16(filters);
- const int16x4_t filters_hi = vget_high_s16(filters);
- int16x4_t sum = vdup_n_s16(0);
-
- sum = vmla_lane_s16(sum, s0, filters_lo, 0);
- sum = vmla_lane_s16(sum, s1, filters_lo, 1);
- sum = vmla_lane_s16(sum, s2, filters_lo, 2);
- sum = vmla_lane_s16(sum, s5, filters_hi, 1);
- sum = vmla_lane_s16(sum, s6, filters_hi, 2);
- sum = vmla_lane_s16(sum, s7, filters_hi, 3);
- sum = vqadd_s16(sum, vmul_s16(s3, filter3));
- sum = vqadd_s16(sum, vmul_s16(s4, filter4));
- return sum;
-}
-
-static INLINE int16x8_t convolve8_8(int16x8_t s0, int16x8_t s1, int16x8_t s2,
- int16x8_t s3, int16x8_t s4, int16x8_t s5,
- int16x8_t s6, int16x8_t s7,
- int16x8_t filters, int16x8_t filter3,
- int16x8_t filter4) {
- const int16x4_t filters_lo = vget_low_s16(filters);
- const int16x4_t filters_hi = vget_high_s16(filters);
- int16x8_t sum = vdupq_n_s16(0);
-
- sum = vmlaq_lane_s16(sum, s0, filters_lo, 0);
- sum = vmlaq_lane_s16(sum, s1, filters_lo, 1);
- sum = vmlaq_lane_s16(sum, s2, filters_lo, 2);
- sum = vmlaq_lane_s16(sum, s5, filters_hi, 1);
- sum = vmlaq_lane_s16(sum, s6, filters_hi, 2);
- sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
- sum = vqaddq_s16(sum, vmulq_s16(s3, filter3));
- sum = vqaddq_s16(sum, vmulq_s16(s4, filter4));
- return sum;
-}
-
void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, // unused
- int y_step_q4, // unused
- int w, int h) {
- const int16x8_t filters = vld1q_s16(filter_x);
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int16x8_t filters = vld1q_s16(filter[x0_q4]);
uint8x8_t t0, t1, t2, t3;
assert(!((intptr_t)dst & 3));
@@ -137,8 +65,8 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
assert(x_step_q4 == 16);
(void)x_step_q4;
+ (void)y0_q4;
(void)y_step_q4;
- (void)filter_y;
src -= 3;
@@ -154,7 +82,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 3 * src_stride);
filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
- load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
@@ -174,7 +102,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
src += 7;
do {
- load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
@@ -224,11 +152,11 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
int width;
const uint8_t *s;
uint8x8_t t4, t5, t6, t7;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
if (w == 4) {
do {
- load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
@@ -238,7 +166,8 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
- load_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
+ &t7);
src += 8 * src_stride;
__builtin_prefetch(dst + 0 * dst_stride);
__builtin_prefetch(dst + 1 * dst_stride);
@@ -248,7 +177,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(dst + 5 * dst_stride);
__builtin_prefetch(dst + 6 * dst_stride);
__builtin_prefetch(dst + 7 * dst_stride);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
@@ -262,19 +191,15 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 5 * src_stride);
__builtin_prefetch(src + 6 * src_stride);
__builtin_prefetch(src + 7 * src_stride);
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
filter4);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
filter4);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
filter4);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
filter4);
- t0 = vqrshrun_n_s16(d0, 7);
- t1 = vqrshrun_n_s16(d1, 7);
- t2 = vqrshrun_n_s16(d2, 7);
- t3 = vqrshrun_n_s16(d3, 7);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0);
dst += dst_stride;
@@ -296,7 +221,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
} while (h > 0);
} else {
uint8_t *d;
- int16x8_t s11, s12, s13, s14, d4, d5, d6, d7;
+ int16x8_t s11, s12, s13, s14;
do {
__builtin_prefetch(src + 0 * src_stride);
@@ -307,7 +232,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 5 * src_stride);
__builtin_prefetch(src + 6 * src_stride);
__builtin_prefetch(src + 7 * src_stride);
- load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
@@ -330,7 +255,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(dst + 7 * dst_stride);
do {
- load_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
@@ -341,33 +266,25 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
filter4);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
filter4);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
filter4);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
filter4);
- d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
+ t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
filter4);
- d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
+ t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
filter4);
- d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
+ t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
filter4);
- d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
+ t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
filter3, filter4);
- t0 = vqrshrun_n_s16(d0, 7);
- t1 = vqrshrun_n_s16(d1, 7);
- t2 = vqrshrun_n_s16(d2, 7);
- t3 = vqrshrun_n_s16(d3, 7);
- t4 = vqrshrun_n_s16(d4, 7);
- t5 = vqrshrun_n_s16(d5, 7);
- t6 = vqrshrun_n_s16(d6, 7);
- t7 = vqrshrun_n_s16(d7, 7);
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- store_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
+ store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
s0 = s8;
s1 = s9;
@@ -390,11 +307,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, // unused
- int y_step_q4, // unused
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- const int16x8_t filters = vld1q_s16(filter_x);
+ const int16x8_t filters = vld1q_s16(filter[x0_q4]);
uint8x8_t t0, t1, t2, t3;
assert(!((intptr_t)dst & 3));
@@ -402,8 +318,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
assert(x_step_q4 == 16);
(void)x_step_q4;
+ (void)y0_q4;
(void)y_step_q4;
- (void)filter_y;
src -= 3;
@@ -420,7 +336,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 3 * src_stride);
filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
- load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
@@ -440,7 +356,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
src += 7;
do {
- load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
@@ -493,13 +409,13 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
int width;
const uint8_t *s;
uint8x8_t t4, t5, t6, t7;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
if (w == 4) {
uint32x4_t d0415 = vdupq_n_u32(0);
uint32x4_t d2637 = vdupq_n_u32(0);
do {
- load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
@@ -509,7 +425,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
- load_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
+ &t7);
src += 8 * src_stride;
__builtin_prefetch(dst + 0 * dst_stride);
__builtin_prefetch(dst + 1 * dst_stride);
@@ -519,7 +436,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(dst + 5 * dst_stride);
__builtin_prefetch(dst + 6 * dst_stride);
__builtin_prefetch(dst + 7 * dst_stride);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
@@ -533,19 +450,15 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 5 * src_stride);
__builtin_prefetch(src + 6 * src_stride);
__builtin_prefetch(src + 7 * src_stride);
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
filter4);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
filter4);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
filter4);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
filter4);
- t0 = vqrshrun_n_s16(d0, 7);
- t1 = vqrshrun_n_s16(d1, 7);
- t2 = vqrshrun_n_s16(d2, 7);
- t3 = vqrshrun_n_s16(d3, 7);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0);
@@ -581,7 +494,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
} while (h > 0);
} else {
uint8_t *d;
- int16x8_t s11, s12, s13, s14, d4, d5, d6, d7;
+ int16x8_t s11, s12, s13, s14;
uint8x16_t d01, d23, d45, d67;
do {
@@ -593,7 +506,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 5 * src_stride);
__builtin_prefetch(src + 6 * src_stride);
__builtin_prefetch(src + 7 * src_stride);
- load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
@@ -616,7 +529,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(dst + 7 * dst_stride);
do {
- load_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
@@ -627,31 +540,23 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
filter4);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
filter4);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
filter4);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
filter4);
- d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
+ t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
filter4);
- d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
+ t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
filter4);
- d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
+ t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
filter4);
- d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
+ t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
filter3, filter4);
- t0 = vqrshrun_n_s16(d0, 7);
- t1 = vqrshrun_n_s16(d1, 7);
- t2 = vqrshrun_n_s16(d2, 7);
- t3 = vqrshrun_n_s16(d3, 7);
- t4 = vqrshrun_n_s16(d4, 7);
- t5 = vqrshrun_n_s16(d5, 7);
- t6 = vqrshrun_n_s16(d6, 7);
- t7 = vqrshrun_n_s16(d7, 7);
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
@@ -667,9 +572,9 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5));
d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7));
- store_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01),
- vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45),
- vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67));
+ store_u8_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01),
+ vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45),
+ vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67));
s0 = s8;
s1 = s9;
@@ -692,19 +597,18 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, // unused
- int x_step_q4, // unused
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
- const int16x8_t filters = vld1q_s16(filter_y);
+ const int16x8_t filters = vld1q_s16(filter[y0_q4]);
assert(!((intptr_t)dst & 3));
assert(!(dst_stride & 3));
assert(y_step_q4 == 16);
+ (void)x0_q4;
(void)x_step_q4;
(void)y_step_q4;
- (void)filter_x;
src -= 3 * src_stride;
@@ -782,7 +686,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
int height;
const uint8_t *s;
uint8_t *d;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+ uint8x8_t t0, t1, t2, t3;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
do {
__builtin_prefetch(src + 0 * src_stride);
@@ -828,22 +733,22 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(s + 1 * src_stride);
__builtin_prefetch(s + 2 * src_stride);
__builtin_prefetch(s + 3 * src_stride);
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
filter4);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
filter4);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
filter4);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
filter4);
- vst1_u8(d, vqrshrun_n_s16(d0, 7));
+ vst1_u8(d, t0);
d += dst_stride;
- vst1_u8(d, vqrshrun_n_s16(d1, 7));
+ vst1_u8(d, t1);
d += dst_stride;
- vst1_u8(d, vqrshrun_n_s16(d2, 7));
+ vst1_u8(d, t2);
d += dst_stride;
- vst1_u8(d, vqrshrun_n_s16(d3, 7));
+ vst1_u8(d, t3);
d += dst_stride;
s0 = s4;
@@ -864,19 +769,18 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, // unused
- int x_step_q4, // unused
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
- const int16x8_t filters = vld1q_s16(filter_y);
+ const int16x8_t filters = vld1q_s16(filter[y0_q4]);
assert(!((intptr_t)dst & 3));
assert(!(dst_stride & 3));
assert(y_step_q4 == 16);
+ (void)x0_q4;
(void)x_step_q4;
(void)y_step_q4;
- (void)filter_x;
src -= 3 * src_stride;
@@ -963,8 +867,9 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
int height;
const uint8_t *s;
uint8_t *d;
+ uint8x8_t t0, t1, t2, t3;
uint8x16_t d01, d23, dd01, dd23;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
do {
__builtin_prefetch(src + 0 * src_stride);
@@ -1010,17 +915,17 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(s + 1 * src_stride);
__builtin_prefetch(s + 2 * src_stride);
__builtin_prefetch(s + 3 * src_stride);
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
filter4);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
filter4);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
filter4);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
filter4);
- d01 = vcombine_u8(vqrshrun_n_s16(d0, 7), vqrshrun_n_s16(d1, 7));
- d23 = vcombine_u8(vqrshrun_n_s16(d2, 7), vqrshrun_n_s16(d3, 7));
+ d01 = vcombine_u8(t0, t1);
+ d23 = vcombine_u8(t2, t3);
dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
vld1_u8(d + 1 * dst_stride));
dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
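
All of these loops share one shape: keep an eight-row (or eight-column) window of source samples in registers, emit a batch of outputs, then slide the window (s0 = s8, s1 = s9, ...) so nothing is reloaded. A scalar model of the vertical case, assuming an 8-tap kernel whose taps sum to 128; convolve8_vert_ref is a hypothetical helper name:

#include <stdint.h>

static void convolve8_vert_ref(const uint8_t *src, int src_stride,
                               uint8_t *dst, int dst_stride,
                               const int16_t f[8], int w, int h) {
  int r, c, k;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      int sum = 0;
      // src points at the first of the 8 window rows, i.e. the caller has
      // already stepped back 3 * src_stride, as the NEON code does.
      for (k = 0; k < 8; ++k) sum += src[(r + k) * src_stride + c] * f[k];
      sum = (sum + 64) >> 7;  // rounding shift by FILTER_BITS
      dst[r * dst_stride + c] =
          (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
  }
}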
diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
new file mode 100644
index 000000000..c1634ed55
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+}
+
+static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3,
+ uint8x8_t *const s4, uint8x8_t *const s5,
+ uint8x8_t *const s6, uint8x8_t *const s7) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+ s += p;
+ *s4 = vld1_u8(s);
+ s += p;
+ *s5 = vld1_u8(s);
+ s += p;
+ *s6 = vld1_u8(s);
+ s += p;
+ *s7 = vld1_u8(s);
+}
+
+static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
+ uint8x16_t *const s0, uint8x16_t *const s1,
+ uint8x16_t *const s2, uint8x16_t *const s3,
+ uint8x16_t *const s4, uint8x16_t *const s5,
+ uint8x16_t *const s6, uint8x16_t *const s7) {
+ *s0 = vld1q_u8(s);
+ s += p;
+ *s1 = vld1q_u8(s);
+ s += p;
+ *s2 = vld1q_u8(s);
+ s += p;
+ *s3 = vld1q_u8(s);
+ s += p;
+ *s4 = vld1q_u8(s);
+ s += p;
+ *s5 = vld1q_u8(s);
+ s += p;
+ *s6 = vld1q_u8(s);
+ s += p;
+ *s7 = vld1q_u8(s);
+}
+
+static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16x8_t filters,
+ const int16x4_t filter3,
+ const int16x4_t filter4) {
+ const int16x4_t filters_lo = vget_low_s16(filters);
+ const int16x4_t filters_hi = vget_high_s16(filters);
+ int16x4_t sum;
+
+ sum = vmul_lane_s16(s0, filters_lo, 0);
+ sum = vmla_lane_s16(sum, s1, filters_lo, 1);
+ sum = vmla_lane_s16(sum, s2, filters_lo, 2);
+ sum = vmla_lane_s16(sum, s5, filters_hi, 1);
+ sum = vmla_lane_s16(sum, s6, filters_hi, 2);
+ sum = vmla_lane_s16(sum, s7, filters_hi, 3);
+ sum = vqadd_s16(sum, vmul_s16(s3, filter3));
+ sum = vqadd_s16(sum, vmul_s16(s4, filter4));
+ return sum;
+}
+
+static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t filters,
+ const int16x8_t filter3,
+ const int16x8_t filter4) {
+ const int16x4_t filters_lo = vget_low_s16(filters);
+ const int16x4_t filters_hi = vget_high_s16(filters);
+ int16x8_t sum;
+
+ sum = vmulq_lane_s16(s0, filters_lo, 0);
+ sum = vmlaq_lane_s16(sum, s1, filters_lo, 1);
+ sum = vmlaq_lane_s16(sum, s2, filters_lo, 2);
+ sum = vmlaq_lane_s16(sum, s5, filters_hi, 1);
+ sum = vmlaq_lane_s16(sum, s6, filters_hi, 2);
+ sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
+ sum = vqaddq_s16(sum, vmulq_s16(s3, filter3));
+ sum = vqaddq_s16(sum, vmulq_s16(s4, filter4));
+ return vqrshrun_n_s16(sum, 7);
+}
+
+static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
+ const int16x8_t filters) {
+ const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
+ const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
+ int16x8_t ss[8];
+
+ ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
+ ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
+ ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
+ ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
+ ss[4] = vreinterpretq_s16_u16(vmovl_u8(s[4]));
+ ss[5] = vreinterpretq_s16_u16(vmovl_u8(s[5]));
+ ss[6] = vreinterpretq_s16_u16(vmovl_u8(s[6]));
+ ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7]));
+
+ return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7],
+ filters, filter3, filter4);
+}
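
scale_filter_8 bundles the widen, convolve, and narrow steps for one 8-pixel run, so the scaled-convolve code introduced later can stay terse. A usage sketch under the same assumptions as this header (the packed 8-column source layout here is hypothetical):

#include <arm_neon.h>
#include <stdint.h>

// Filter one 8-wide output row from rows src[0..7] of a packed 8-column
// buffer; `filters` holds the 8 taps for the selected sub-pixel phase.
static uint8x8_t filter_one_row(const uint8_t *src, const int16x8_t filters) {
  uint8x8_t s[8];
  int i;
  for (i = 0; i < 8; ++i) s[i] = vld1_u8(src + 8 * i);
  return scale_filter_8(s, filters);
}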
diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm b/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm
index 2d0f2ae06..5eee15664 100644
--- a/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm
+++ b/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm
@@ -42,10 +42,11 @@
; r1 int src_stride
; r2 uint8_t *dst
; r3 int dst_stride
-; sp[]const int16_t *filter_x
-; sp[]int x_step_q4
-; sp[]const int16_t *filter_y ; unused
-; sp[]int y_step_q4 ; unused
+; sp[]const int16_t *filter
+; sp[]int x0_q4
+; sp[]int x_step_q4 ; unused
+; sp[]int y0_q4
+; sp[]int y_step_q4 ; unused
; sp[]int w
; sp[]int h
@@ -54,11 +55,11 @@
sub r0, r0, #3 ; adjust for taps
- ldr r5, [sp, #32] ; filter_x
- ldr r6, [sp, #48] ; w
- ldr r7, [sp, #52] ; h
+ ldrd r4, r5, [sp, #32] ; filter, x0_q4
+ add r4, r5, lsl #4
+ ldrd r6, r7, [sp, #52] ; w, h
- vld1.s16 {q0}, [r5] ; filter_x
+ vld1.s16 {q0}, [r4] ; filter
sub r8, r1, r1, lsl #2 ; -src_stride * 3
add r8, r8, #4 ; -src_stride * 3 + 4
@@ -119,7 +120,7 @@ vpx_convolve8_loop_horiz
pld [r5, r1, lsl #1]
- ; src[] * filter_x
+ ; src[] * filter
MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24
MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26
MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27
@@ -173,11 +174,13 @@ vpx_convolve8_loop_horiz
sub r0, r0, r1
sub r0, r0, r1, lsl #1
- ldr r4, [sp, #32] ; filter_y
- ldr r6, [sp, #40] ; w
- ldr lr, [sp, #44] ; h
+ ldr r4, [sp, #24] ; filter
+ ldr r5, [sp, #36] ; y0_q4
+ add r4, r5, lsl #4
+ ldr r6, [sp, #44] ; w
+ ldr lr, [sp, #48] ; h
- vld1.s16 {q0}, [r4] ; filter_y
+ vld1.s16 {q0}, [r4] ; filter
lsl r1, r1, #1
lsl r3, r3, #1
@@ -216,7 +219,7 @@ vpx_convolve8_loop_vert
pld [r5]
pld [r8]
- ; src[] * filter_y
+ ; src[] * filter
MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24
pld [r5, r3]
diff --git a/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c b/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c
index 04cb835fa..07349d03a 100644
--- a/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c
+++ b/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c
@@ -15,13 +15,13 @@
void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride, int w,
- int h) {
- (void)filter_x;
- (void)filter_x_stride;
- (void)filter_y;
- (void)filter_y_stride;
+ const InterpKernel *filter, int x0_q4, int x_step_q4,
+ int y0_q4, int y_step_q4, int w, int h) {
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
if (w < 8) { // avg4
uint8x8_t s0, s1;
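
With the filter arguments voided, vpx_convolve_copy and vpx_convolve_avg are pure block operations; avg is a per-pixel rounding average into dst, the scalar equivalent of the vrhadd-based loops. A sketch, not the library code:

#include <stdint.h>

static void convolve_avg_ref(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h) {
  int r, c;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) dst[c] = (uint8_t)((dst[c] + src[c] + 1) >> 1);
    src += src_stride;
    dst += dst_stride;
  }
}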
diff --git a/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm b/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm
index 97e6189fd..efd6574f1 100644
--- a/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm
+++ b/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm
@@ -17,7 +17,7 @@
|vpx_convolve_avg_neon| PROC
push {r4-r6, lr}
- ldrd r4, r5, [sp, #32]
+ ldrd r4, r5, [sp, #36]
mov r6, r2
cmp r4, #32
diff --git a/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c b/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c
index a8f690acd..7abed67a4 100644
--- a/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c
+++ b/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c
@@ -15,13 +15,14 @@
void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride, int w,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
- (void)filter_x;
- (void)filter_x_stride;
- (void)filter_y;
- (void)filter_y_stride;
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
if (w < 8) { // copy4
do {
diff --git a/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm b/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm
index 89164ad48..7a66e3ce2 100644
--- a/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm
+++ b/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm
@@ -17,7 +17,7 @@
|vpx_convolve_copy_neon| PROC
push {r4-r5, lr}
- ldrd r4, r5, [sp, #28]
+ ldrd r4, r5, [sp, #32]
cmp r4, #32
bgt copy64
diff --git a/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
index 6ca0e501b..2bf2d890b 100644
--- a/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
+++ b/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
@@ -15,13 +15,13 @@
#include "vpx_ports/mem.h"
void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x,
- int x_step_q4, const int16_t *filter_y, int y_step_q4,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
/* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
* maximum buffer size to 64 * (64 + 7) (+ 1 row to make the height divisible
* by 4).
*/
- DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
+ uint8_t temp[64 * 72];
// Account for the vertical phase needing 3 lines prior and 4 lines post
const int intermediate_height = h + 7;
@@ -33,21 +33,21 @@ void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
* height and filter a multiple of 4 lines. Since this goes into the temp
* buffer, which has lots of extra room and is subsequently discarded, this is
* safe if somewhat less than ideal. */
- vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x,
- x_step_q4, filter_y, y_step_q4, w,
+ vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w,
intermediate_height);
/* Step into the temp buffer 3 lines to get the actual frame data */
- vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
+ vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
}
void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
- DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
+ uint8_t temp[64 * 72];
const int intermediate_height = h + 7;
assert(y_step_q4 == 16);
@@ -56,9 +56,9 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
/* This implementation has the same issues as above. In addition, we only want
* to average the values after both passes.
*/
- vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x,
- x_step_q4, filter_y, y_step_q4, w,
+ vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w,
intermediate_height);
- vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
}
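
The 64 * 72 figure follows directly from the comments above; a compile-time restatement of the arithmetic, as a sketch rather than part of the file:

// Vertical pass needs h + 7 input rows (3 before, 4 after), at most 71 for
// h = 64; over-filtering up to a multiple of 4 rows adds at most one more.
typedef char temp_rows_check[(64 + 7 + 1 == 72) ? 1 : -1];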
diff --git a/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
new file mode 100644
index 000000000..8edf8a66e
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
@@ -0,0 +1,324 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_ports/mem.h"
+
+static INLINE void scaledconvolve_horiz_w4(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
+ const int x0_q4, const int x_step_q4, const int w, const int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
+ int x, y, z;
+
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ y = h;
+ do {
+ int x_q4 = x0_q4;
+ x = 0;
+ do {
+ // process 4 src_x steps
+ for (z = 0; z < 4; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ if (x_q4 & SUBPEL_MASK) {
+ const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
+ const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
+ const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
+ uint8x8_t s[8], d;
+ int16x8_t ss[4];
+ int16x4_t t[8], tt;
+
+ load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
+ transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]);
+
+ ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
+ ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
+ ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
+ ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
+ t[0] = vget_low_s16(ss[0]);
+ t[1] = vget_low_s16(ss[1]);
+ t[2] = vget_low_s16(ss[2]);
+ t[3] = vget_low_s16(ss[3]);
+ t[4] = vget_high_s16(ss[0]);
+ t[5] = vget_high_s16(ss[1]);
+ t[6] = vget_high_s16(ss[2]);
+ t[7] = vget_high_s16(ss[3]);
+
+ tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
+ filters, filter3, filter4);
+ d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
+ vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
+ } else {
+ int i;
+ for (i = 0; i < 4; ++i) {
+ temp[z * 4 + i] = src_x[i * src_stride + 3];
+ }
+ }
+ x_q4 += x_step_q4;
+ }
+
+      // transpose the 4x4 filter values back to dst
+ {
+ const uint8x8x4_t d4 = vld4_u8(temp);
+ vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride],
+ vreinterpret_u32_u8(d4.val[0]), 0);
+ vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride],
+ vreinterpret_u32_u8(d4.val[1]), 0);
+ vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride],
+ vreinterpret_u32_u8(d4.val[2]), 0);
+ vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride],
+ vreinterpret_u32_u8(d4.val[3]), 0);
+ }
+ x += 4;
+ } while (x < w);
+
+ src += src_stride * 4;
+ dst += dst_stride * 4;
+ y -= 4;
+ } while (y > 0);
+}
+
+static INLINE void scaledconvolve_horiz_w8(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
+ const int x0_q4, const int x_step_q4, const int w, const int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+ int x, y, z;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ // This function processes 8x8 areas. The intermediate height is not always
+ // a multiple of 8, so force it to be a multiple of 8 here.
+ y = (h + 7) & ~7;
+
+ do {
+ int x_q4 = x0_q4;
+ x = 0;
+ do {
+ uint8x8_t d[8];
+ // process 8 src_x steps
+ for (z = 0; z < 8; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+
+ if (x_q4 & SUBPEL_MASK) {
+ const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
+ uint8x8_t s[8];
+ load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4],
+ &s[5], &s[6], &s[7]);
+ transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+ &s[7]);
+ d[0] = scale_filter_8(s, filters);
+ vst1_u8(&temp[8 * z], d[0]);
+ } else {
+ int i;
+ for (i = 0; i < 8; ++i) {
+ temp[z * 8 + i] = src_x[i * src_stride + 3];
+ }
+ }
+ x_q4 += x_step_q4;
+ }
+
+      // transpose the 8x8 filter values back to dst
+ load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
+ &d[7]);
+ transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+ vst1_u8(&dst[x + 0 * dst_stride], d[0]);
+ vst1_u8(&dst[x + 1 * dst_stride], d[1]);
+ vst1_u8(&dst[x + 2 * dst_stride], d[2]);
+ vst1_u8(&dst[x + 3 * dst_stride], d[3]);
+ vst1_u8(&dst[x + 4 * dst_stride], d[4]);
+ vst1_u8(&dst[x + 5 * dst_stride], d[5]);
+ vst1_u8(&dst[x + 6 * dst_stride], d[6]);
+ vst1_u8(&dst[x + 7 * dst_stride], d[7]);
+ x += 8;
+ } while (x < w);
+
+ src += src_stride * 8;
+ dst += dst_stride * 8;
+ } while (y -= 8);
+}
+
+static INLINE void scaledconvolve_vert_w4(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ y = h;
+ do {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+
+ if (y_q4 & SUBPEL_MASK) {
+ const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
+ const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
+ const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
+ uint8x8_t s[8], d;
+ int16x4_t t[8], tt;
+
+ load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+ t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0])));
+ t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1])));
+ t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2])));
+ t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3])));
+ t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4])));
+ t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5])));
+ t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
+ t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
+
+ tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters,
+ filter3, filter4);
+ d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
+ } else {
+ memcpy(dst, &src_y[3 * src_stride], w);
+ }
+
+ dst += dst_stride;
+ y_q4 += y_step_q4;
+ } while (--y);
+}
+
+static INLINE void scaledconvolve_vert_w8(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ y = h;
+ do {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ if (y_q4 & SUBPEL_MASK) {
+ const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
+ uint8x8_t s[8], d;
+ load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+ d = scale_filter_8(s, filters);
+ vst1_u8(dst, d);
+ } else {
+ memcpy(dst, &src_y[3 * src_stride], w);
+ }
+ dst += dst_stride;
+ y_q4 += y_step_q4;
+ } while (--y);
+}
+
+static INLINE void scaledconvolve_vert_w16(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
+ int x, y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ y = h;
+ do {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ if (y_q4 & SUBPEL_MASK) {
+ x = 0;
+ do {
+ const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
+ uint8x16_t ss[8];
+ uint8x8_t s[8], d[2];
+ load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4],
+ &ss[5], &ss[6], &ss[7]);
+ s[0] = vget_low_u8(ss[0]);
+ s[1] = vget_low_u8(ss[1]);
+ s[2] = vget_low_u8(ss[2]);
+ s[3] = vget_low_u8(ss[3]);
+ s[4] = vget_low_u8(ss[4]);
+ s[5] = vget_low_u8(ss[5]);
+ s[6] = vget_low_u8(ss[6]);
+ s[7] = vget_low_u8(ss[7]);
+ d[0] = scale_filter_8(s, filters);
+
+ s[0] = vget_high_u8(ss[0]);
+ s[1] = vget_high_u8(ss[1]);
+ s[2] = vget_high_u8(ss[2]);
+ s[3] = vget_high_u8(ss[3]);
+ s[4] = vget_high_u8(ss[4]);
+ s[5] = vget_high_u8(ss[5]);
+ s[6] = vget_high_u8(ss[6]);
+ s[7] = vget_high_u8(ss[7]);
+ d[1] = scale_filter_8(s, filters);
+ vst1q_u8(&dst[x], vcombine_u8(d[0], d[1]));
+ src_y += 16;
+ x += 16;
+ } while (x < w);
+ } else {
+ memcpy(dst, &src_y[3 * src_stride], w);
+ }
+ dst += dst_stride;
+ y_q4 += y_step_q4;
+ } while (--y);
+}
+
+void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --ceil(((64 - 1) * 32 + 15) / 16) + 8 = 127 + 8 = 135.
+ // --Require an additional 8 rows for the horiz_w8 transpose tail.
+  // When called from the frame scaling function, the smallest scaling factor
+  // is x1/4 ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer
+  // is still big enough.
+ DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+ assert(x_step_q4 <= 64);
+
+ if (w >= 8) {
+ scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+ intermediate_height);
+ } else {
+ scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+ intermediate_height);
+ }
+
+ if (w >= 16) {
+ scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ } else if (w == 8) {
+ scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ } else {
+ scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ }
+}
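
Plugging the extremes into intermediate_height above: with h = 64, y_step_q4 = 32 and y0_q4 = 15, ((64 - 1) * 32 + 15) >> 4 + 8 = 126 + 8 = 134, inside the 135-row budget (temp reserves 8 further rows for the horiz_w8 transpose tail). A restatement of the formula for checking such cases, sketch only:

static int scaled_intermediate_height(int h, int y_step_q4, int y0_q4) {
  return (((h - 1) * y_step_q4 + y0_q4) >> 4) + 8;  // SUBPEL_BITS, SUBPEL_TAPS
}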
diff --git a/libvpx/vpx_dsp/avg.c b/libvpx/vpx_dsp/avg.c
index e4cd6cca7..a7ac6d953 100644
--- a/libvpx/vpx_dsp/avg.c
+++ b/libvpx/vpx_dsp/avg.c
@@ -34,7 +34,7 @@ unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) {
// src_diff: first pass, 9 bit, dynamic range [-255, 255]
// second pass, 12 bit, dynamic range [-2040, 2040]
-static void hadamard_col8(const int16_t *src_diff, int src_stride,
+static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride,
int16_t *coeff) {
int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
@@ -66,7 +66,7 @@ static void hadamard_col8(const int16_t *src_diff, int src_stride,
// The order of the output coeff of the hadamard is not important. For
// optimization purposes the final transpose may be skipped.
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
tran_low_t *coeff) {
int idx;
int16_t buffer[64];
@@ -92,7 +92,7 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
}
// In place 16x16 2D Hadamard transform
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
tran_low_t *coeff) {
int idx;
for (idx = 0; idx < 4; ++idx) {
diff --git a/libvpx/vpx_dsp/deblock.c b/libvpx/vpx_dsp/deblock.c
index 3734ac251..235e85793 100644
--- a/libvpx/vpx_dsp/deblock.c
+++ b/libvpx/vpx_dsp/deblock.c
@@ -9,6 +9,7 @@
*/
#include <assert.h>
#include <stdlib.h>
+#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
DECLARE_PROTECTED(const int16_t vpx_rv[]) = {
diff --git a/libvpx/vpx_dsp/fwd_txfm.c b/libvpx/vpx_dsp/fwd_txfm.c
index aa5960109..6dcb3ba66 100644
--- a/libvpx/vpx_dsp/fwd_txfm.c
+++ b/libvpx/vpx_dsp/fwd_txfm.c
@@ -84,7 +84,7 @@ void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
for (r = 0; r < 4; ++r)
for (c = 0; c < 4; ++c) sum += input[r * stride + c];
- output[0] = sum << 1;
+ output[0] = sum * 2;
}
void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
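
The sum << 1 to sum * 2 change above sidesteps undefined behavior: left-shifting a negative signed value is undefined in C, while signed multiplication is fully defined and compiles to the same shift when the operand is non-negative. Minimal illustration with a hypothetical helper:

static int double_sum(int sum) {
  return sum * 2;  // well defined for negative sums; sum << 1 is not
}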
diff --git a/libvpx/vpx_dsp/intrapred.c b/libvpx/vpx_dsp/intrapred.c
index 9e2048ebf..400e632e9 100644
--- a/libvpx/vpx_dsp/intrapred.c
+++ b/libvpx/vpx_dsp/intrapred.c
@@ -489,30 +489,39 @@ static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
const uint16_t *above,
const uint16_t *left, int bd) {
int r, c;
+ int size;
(void)left;
(void)bd;
- for (r = 0; r < bs; ++r) {
- for (c = 0; c < bs; ++c) {
- dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1],
- above[(r >> 1) + c + 2])
- : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]);
- }
- dst += stride;
+ for (c = 0; c < bs; ++c) {
+ dst[c] = AVG2(above[c], above[c + 1]);
+ dst[stride + c] = AVG3(above[c], above[c + 1], above[c + 2]);
+ }
+ for (r = 2, size = bs - 2; r < bs; r += 2, --size) {
+ memcpy(dst + (r + 0) * stride, dst + (r >> 1), size * sizeof(*dst));
+ vpx_memset16(dst + (r + 0) * stride + size, above[bs - 1], bs - size);
+ memcpy(dst + (r + 1) * stride, dst + stride + (r >> 1),
+ size * sizeof(*dst));
+ vpx_memset16(dst + (r + 1) * stride + size, above[bs - 1], bs - size);
}
}
static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
const uint16_t *above,
const uint16_t *left, int bd) {
- int r, c;
+ const uint16_t above_right = above[bs - 1];
+ const uint16_t *const dst_row0 = dst;
+ int x, size;
(void)left;
(void)bd;
- for (r = 0; r < bs; ++r) {
- for (c = 0; c < bs; ++c) {
- dst[c] = r + c + 2 < bs * 2
- ? AVG3(above[r + c], above[r + c + 1], above[r + c + 2])
- : above[bs * 2 - 1];
- }
+
+ for (x = 0; x < bs - 1; ++x) {
+ dst[x] = AVG3(above[x], above[x + 1], above[x + 2]);
+ }
+ dst[bs - 1] = above_right;
+ dst += stride;
+ for (x = 1, size = bs - 2; x < bs; ++x, --size) {
+ memcpy(dst, dst_row0 + x, size * sizeof(*dst));
+ vpx_memset16(dst + size, above_right, x + 1);
dst += stride;
}
}
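
Both predictor rewrites above exploit the same shift invariance: for D45 the predicted value depends only on r + c, so each row equals the previous row advanced by one sample, padded with the replicated top-right pixel, and D63 does the same two rows at a time. A sketch of the D45 fill, assuming row 0 has already been computed; d45_fill_rows is a hypothetical helper (the actual code copies straight from row 0 with offset x, which is equivalent):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void d45_fill_rows(uint16_t *dst, ptrdiff_t stride, int bs,
                          uint16_t above_right) {
  int r;
  for (r = 1; r < bs; ++r) {
    memcpy(dst + r * stride, dst + (r - 1) * stride + 1,
           (bs - 1) * sizeof(*dst));
    dst[r * stride + bs - 1] = above_right;
  }
}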
diff --git a/libvpx/vpx_dsp/inv_txfm.c b/libvpx/vpx_dsp/inv_txfm.c
index 29323d1b8..0194aa1e1 100644
--- a/libvpx/vpx_dsp/inv_txfm.c
+++ b/libvpx/vpx_dsp/inv_txfm.c
@@ -105,6 +105,7 @@ void iadst4_c(const tran_low_t *input, tran_low_t *output) {
return;
}
+ // 32-bit result is enough for the following multiplications.
s0 = sinpi_1_9 * x0;
s1 = sinpi_2_9 * x0;
s2 = sinpi_3_9 * x1;
@@ -130,16 +131,16 @@ void iadst4_c(const tran_low_t *input, tran_low_t *output) {
}
void idct4_c(const tran_low_t *input, tran_low_t *output) {
- tran_low_t step[4];
+ int16_t step[4];
tran_high_t temp1, temp2;
// stage 1
- temp1 = (input[0] + input[2]) * cospi_16_64;
- temp2 = (input[0] - input[2]) * cospi_16_64;
+ temp1 = ((int16_t)input[0] + (int16_t)input[2]) * cospi_16_64;
+ temp2 = ((int16_t)input[0] - (int16_t)input[2]) * cospi_16_64;
step[0] = WRAPLOW(dct_const_round_shift(temp1));
step[1] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
- temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ temp1 = (int16_t)input[1] * cospi_24_64 - (int16_t)input[3] * cospi_8_64;
+ temp2 = (int16_t)input[1] * cospi_8_64 + (int16_t)input[3] * cospi_24_64;
step[2] = WRAPLOW(dct_const_round_shift(temp1));
step[3] = WRAPLOW(dct_const_round_shift(temp2));
@@ -177,7 +178,8 @@ void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i;
tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ tran_low_t out =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
a1 = ROUND_POWER_OF_TWO(out, 4);
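
A quick bound shows why the (int16_t) casts in these hunks are safe: the inputs are 16-bit residuals and every cospi_*_64 constant is below 2^14 (cospi_16_64 is 11585), so each product stays under 2^30, consistent with the file's own note that a 32-bit result is enough for these multiplications. A worked check, with the constants assumed from vpx_dsp/txfm_common.h:

#include <assert.h>
#include <stdint.h>

static void check_idct_product_bound(void) {
  const int64_t max_input = (int64_t)1 << 15;  // |(int16_t)x| <= 32768
  const int64_t max_cospi = (int64_t)1 << 14;  // all cospi_*_64 < 16384
  assert(max_input * max_cospi < (int64_t)1 << 31);
}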
@@ -267,20 +269,20 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) {
}
void idct8_c(const tran_low_t *input, tran_low_t *output) {
- tran_low_t step1[8], step2[8];
+ int16_t step1[8], step2[8];
tran_high_t temp1, temp2;
// stage 1
- step1[0] = input[0];
- step1[2] = input[4];
- step1[1] = input[2];
- step1[3] = input[6];
- temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
- temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+ step1[0] = (int16_t)input[0];
+ step1[2] = (int16_t)input[4];
+ step1[1] = (int16_t)input[2];
+ step1[3] = (int16_t)input[6];
+ temp1 = (int16_t)input[1] * cospi_28_64 - (int16_t)input[7] * cospi_4_64;
+ temp2 = (int16_t)input[1] * cospi_4_64 + (int16_t)input[7] * cospi_28_64;
step1[4] = WRAPLOW(dct_const_round_shift(temp1));
step1[7] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
- temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+ temp1 = (int16_t)input[5] * cospi_12_64 - (int16_t)input[3] * cospi_20_64;
+ temp2 = (int16_t)input[5] * cospi_20_64 + (int16_t)input[3] * cospi_12_64;
step1[5] = WRAPLOW(dct_const_round_shift(temp1));
step1[6] = WRAPLOW(dct_const_round_shift(temp2));
@@ -373,7 +375,8 @@ void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i, j;
tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ tran_low_t out =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
a1 = ROUND_POWER_OF_TWO(out, 5);
@@ -552,26 +555,26 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
}
void idct16_c(const tran_low_t *input, tran_low_t *output) {
- tran_low_t step1[16], step2[16];
+ int16_t step1[16], step2[16];
tran_high_t temp1, temp2;
// stage 1
- step1[0] = input[0 / 2];
- step1[1] = input[16 / 2];
- step1[2] = input[8 / 2];
- step1[3] = input[24 / 2];
- step1[4] = input[4 / 2];
- step1[5] = input[20 / 2];
- step1[6] = input[12 / 2];
- step1[7] = input[28 / 2];
- step1[8] = input[2 / 2];
- step1[9] = input[18 / 2];
- step1[10] = input[10 / 2];
- step1[11] = input[26 / 2];
- step1[12] = input[6 / 2];
- step1[13] = input[22 / 2];
- step1[14] = input[14 / 2];
- step1[15] = input[30 / 2];
+ step1[0] = (int16_t)input[0 / 2];
+ step1[1] = (int16_t)input[16 / 2];
+ step1[2] = (int16_t)input[8 / 2];
+ step1[3] = (int16_t)input[24 / 2];
+ step1[4] = (int16_t)input[4 / 2];
+ step1[5] = (int16_t)input[20 / 2];
+ step1[6] = (int16_t)input[12 / 2];
+ step1[7] = (int16_t)input[28 / 2];
+ step1[8] = (int16_t)input[2 / 2];
+ step1[9] = (int16_t)input[18 / 2];
+ step1[10] = (int16_t)input[10 / 2];
+ step1[11] = (int16_t)input[26 / 2];
+ step1[12] = (int16_t)input[6 / 2];
+ step1[13] = (int16_t)input[22 / 2];
+ step1[14] = (int16_t)input[14 / 2];
+ step1[15] = (int16_t)input[30 / 2];
// stage 2
step2[0] = step1[0];
@@ -796,7 +799,8 @@ void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i, j;
tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ tran_low_t out =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
a1 = ROUND_POWER_OF_TWO(out, 6);
@@ -807,64 +811,64 @@ void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
}
void idct32_c(const tran_low_t *input, tran_low_t *output) {
- tran_low_t step1[32], step2[32];
+ int16_t step1[32], step2[32];
tran_high_t temp1, temp2;
// stage 1
- step1[0] = input[0];
- step1[1] = input[16];
- step1[2] = input[8];
- step1[3] = input[24];
- step1[4] = input[4];
- step1[5] = input[20];
- step1[6] = input[12];
- step1[7] = input[28];
- step1[8] = input[2];
- step1[9] = input[18];
- step1[10] = input[10];
- step1[11] = input[26];
- step1[12] = input[6];
- step1[13] = input[22];
- step1[14] = input[14];
- step1[15] = input[30];
-
- temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
- temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+ step1[0] = (int16_t)input[0];
+ step1[1] = (int16_t)input[16];
+ step1[2] = (int16_t)input[8];
+ step1[3] = (int16_t)input[24];
+ step1[4] = (int16_t)input[4];
+ step1[5] = (int16_t)input[20];
+ step1[6] = (int16_t)input[12];
+ step1[7] = (int16_t)input[28];
+ step1[8] = (int16_t)input[2];
+ step1[9] = (int16_t)input[18];
+ step1[10] = (int16_t)input[10];
+ step1[11] = (int16_t)input[26];
+ step1[12] = (int16_t)input[6];
+ step1[13] = (int16_t)input[22];
+ step1[14] = (int16_t)input[14];
+ step1[15] = (int16_t)input[30];
+
+ temp1 = (int16_t)input[1] * cospi_31_64 - (int16_t)input[31] * cospi_1_64;
+ temp2 = (int16_t)input[1] * cospi_1_64 + (int16_t)input[31] * cospi_31_64;
step1[16] = WRAPLOW(dct_const_round_shift(temp1));
step1[31] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
- temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+ temp1 = (int16_t)input[17] * cospi_15_64 - (int16_t)input[15] * cospi_17_64;
+ temp2 = (int16_t)input[17] * cospi_17_64 + (int16_t)input[15] * cospi_15_64;
step1[17] = WRAPLOW(dct_const_round_shift(temp1));
step1[30] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
- temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+ temp1 = (int16_t)input[9] * cospi_23_64 - (int16_t)input[23] * cospi_9_64;
+ temp2 = (int16_t)input[9] * cospi_9_64 + (int16_t)input[23] * cospi_23_64;
step1[18] = WRAPLOW(dct_const_round_shift(temp1));
step1[29] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
- temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+ temp1 = (int16_t)input[25] * cospi_7_64 - (int16_t)input[7] * cospi_25_64;
+ temp2 = (int16_t)input[25] * cospi_25_64 + (int16_t)input[7] * cospi_7_64;
step1[19] = WRAPLOW(dct_const_round_shift(temp1));
step1[28] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
- temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+ temp1 = (int16_t)input[5] * cospi_27_64 - (int16_t)input[27] * cospi_5_64;
+ temp2 = (int16_t)input[5] * cospi_5_64 + (int16_t)input[27] * cospi_27_64;
step1[20] = WRAPLOW(dct_const_round_shift(temp1));
step1[27] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
- temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+ temp1 = (int16_t)input[21] * cospi_11_64 - (int16_t)input[11] * cospi_21_64;
+ temp2 = (int16_t)input[21] * cospi_21_64 + (int16_t)input[11] * cospi_11_64;
step1[21] = WRAPLOW(dct_const_round_shift(temp1));
step1[26] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
- temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+ temp1 = (int16_t)input[13] * cospi_19_64 - (int16_t)input[19] * cospi_13_64;
+ temp2 = (int16_t)input[13] * cospi_13_64 + (int16_t)input[19] * cospi_19_64;
step1[22] = WRAPLOW(dct_const_round_shift(temp1));
step1[25] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
- temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+ temp1 = (int16_t)input[29] * cospi_3_64 - (int16_t)input[3] * cospi_29_64;
+ temp2 = (int16_t)input[29] * cospi_29_64 + (int16_t)input[3] * cospi_3_64;
step1[23] = WRAPLOW(dct_const_round_shift(temp1));
step1[24] = WRAPLOW(dct_const_round_shift(temp2));
@@ -1259,7 +1263,8 @@ void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i, j;
tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ tran_low_t out =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
a1 = ROUND_POWER_OF_TWO(out, 6);
@@ -1390,13 +1395,13 @@ void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
return;
}
- s0 = sinpi_1_9 * x0;
- s1 = sinpi_2_9 * x0;
- s2 = sinpi_3_9 * x1;
- s3 = sinpi_4_9 * x2;
- s4 = sinpi_1_9 * x2;
- s5 = sinpi_2_9 * x3;
- s6 = sinpi_4_9 * x3;
+ s0 = (tran_high_t)sinpi_1_9 * x0;
+ s1 = (tran_high_t)sinpi_2_9 * x0;
+ s2 = (tran_high_t)sinpi_3_9 * x1;
+ s3 = (tran_high_t)sinpi_4_9 * x2;
+ s4 = (tran_high_t)sinpi_1_9 * x2;
+ s5 = (tran_high_t)sinpi_2_9 * x3;
+ s6 = (tran_high_t)sinpi_4_9 * x3;
s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
s0 = s0 + s3 + s5;
@@ -1428,12 +1433,14 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
}
// stage 1
- temp1 = (input[0] + input[2]) * cospi_16_64;
- temp2 = (input[0] - input[2]) * cospi_16_64;
+ temp1 = (input[0] + input[2]) * (tran_high_t)cospi_16_64;
+ temp2 = (input[0] - input[2]) * (tran_high_t)cospi_16_64;
step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
- temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ temp1 =
+ input[1] * (tran_high_t)cospi_24_64 - input[3] * (tran_high_t)cospi_8_64;
+ temp2 =
+ input[1] * (tran_high_t)cospi_8_64 + input[3] * (tran_high_t)cospi_24_64;
step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
@@ -1473,10 +1480,11 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i;
tran_high_t a1;
- tran_low_t out =
- HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ tran_low_t out = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);

- out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ out =
+ HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 4);
for (i = 0; i < 4; i++) {
@@ -1514,14 +1522,14 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
}
// stage 1
- s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
- s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
- s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
- s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
- s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
- s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
- s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
- s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+ s0 = (tran_high_t)cospi_2_64 * x0 + (tran_high_t)cospi_30_64 * x1;
+ s1 = (tran_high_t)cospi_30_64 * x0 - (tran_high_t)cospi_2_64 * x1;
+ s2 = (tran_high_t)cospi_10_64 * x2 + (tran_high_t)cospi_22_64 * x3;
+ s3 = (tran_high_t)cospi_22_64 * x2 - (tran_high_t)cospi_10_64 * x3;
+ s4 = (tran_high_t)cospi_18_64 * x4 + (tran_high_t)cospi_14_64 * x5;
+ s5 = (tran_high_t)cospi_14_64 * x4 - (tran_high_t)cospi_18_64 * x5;
+ s6 = (tran_high_t)cospi_26_64 * x6 + (tran_high_t)cospi_6_64 * x7;
+ s7 = (tran_high_t)cospi_6_64 * x6 - (tran_high_t)cospi_26_64 * x7;
x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
@@ -1537,10 +1545,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
s1 = x1;
s2 = x2;
s3 = x3;
- s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
- s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
- s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
- s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+ s4 = (tran_high_t)cospi_8_64 * x4 + (tran_high_t)cospi_24_64 * x5;
+ s5 = (tran_high_t)cospi_24_64 * x4 - (tran_high_t)cospi_8_64 * x5;
+ s6 = (tran_high_t)(-cospi_24_64) * x6 + (tran_high_t)cospi_8_64 * x7;
+ s7 = (tran_high_t)cospi_8_64 * x6 + (tran_high_t)cospi_24_64 * x7;
x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
@@ -1552,10 +1560,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
// stage 3
- s2 = cospi_16_64 * (x2 + x3);
- s3 = cospi_16_64 * (x2 - x3);
- s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (x6 - x7);
+ s2 = (tran_high_t)cospi_16_64 * (x2 + x3);
+ s3 = (tran_high_t)cospi_16_64 * (x2 - x3);
+ s6 = (tran_high_t)cospi_16_64 * (x6 + x7);
+ s7 = (tran_high_t)cospi_16_64 * (x6 - x7);
x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
@@ -1589,12 +1597,16 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
step1[2] = input[4];
step1[1] = input[2];
step1[3] = input[6];
- temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
- temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+ temp1 =
+ input[1] * (tran_high_t)cospi_28_64 - input[7] * (tran_high_t)cospi_4_64;
+ temp2 =
+ input[1] * (tran_high_t)cospi_4_64 + input[7] * (tran_high_t)cospi_28_64;
step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
- temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+ temp1 =
+ input[5] * (tran_high_t)cospi_12_64 - input[3] * (tran_high_t)cospi_20_64;
+ temp2 =
+ input[5] * (tran_high_t)cospi_20_64 + input[3] * (tran_high_t)cospi_12_64;
step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
@@ -1609,8 +1621,8 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
// stage 3 - odd half
step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[7] = step2[7];
@@ -1681,10 +1693,11 @@ void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_high_t a1;
- tran_low_t out =
- HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ tran_low_t out = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);

- out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ out =
+ HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 5);
for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
@@ -1728,22 +1741,22 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
}
// stage 1
- s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
- s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
- s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
- s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
- s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
- s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
- s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
- s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
- s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
- s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
- s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
- s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
- s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
- s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
- s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
- s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+ s0 = x0 * (tran_high_t)cospi_1_64 + x1 * (tran_high_t)cospi_31_64;
+ s1 = x0 * (tran_high_t)cospi_31_64 - x1 * (tran_high_t)cospi_1_64;
+ s2 = x2 * (tran_high_t)cospi_5_64 + x3 * (tran_high_t)cospi_27_64;
+ s3 = x2 * (tran_high_t)cospi_27_64 - x3 * (tran_high_t)cospi_5_64;
+ s4 = x4 * (tran_high_t)cospi_9_64 + x5 * (tran_high_t)cospi_23_64;
+ s5 = x4 * (tran_high_t)cospi_23_64 - x5 * (tran_high_t)cospi_9_64;
+ s6 = x6 * (tran_high_t)cospi_13_64 + x7 * (tran_high_t)cospi_19_64;
+ s7 = x6 * (tran_high_t)cospi_19_64 - x7 * (tran_high_t)cospi_13_64;
+ s8 = x8 * (tran_high_t)cospi_17_64 + x9 * (tran_high_t)cospi_15_64;
+ s9 = x8 * (tran_high_t)cospi_15_64 - x9 * (tran_high_t)cospi_17_64;
+ s10 = x10 * (tran_high_t)cospi_21_64 + x11 * (tran_high_t)cospi_11_64;
+ s11 = x10 * (tran_high_t)cospi_11_64 - x11 * (tran_high_t)cospi_21_64;
+ s12 = x12 * (tran_high_t)cospi_25_64 + x13 * (tran_high_t)cospi_7_64;
+ s13 = x12 * (tran_high_t)cospi_7_64 - x13 * (tran_high_t)cospi_25_64;
+ s14 = x14 * (tran_high_t)cospi_29_64 + x15 * (tran_high_t)cospi_3_64;
+ s15 = x14 * (tran_high_t)cospi_3_64 - x15 * (tran_high_t)cospi_29_64;
x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
@@ -1771,14 +1784,14 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
s5 = x5;
s6 = x6;
s7 = x7;
- s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
- s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
- s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
- s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
- s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
- s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
- s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
- s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+ s8 = x8 * (tran_high_t)cospi_4_64 + x9 * (tran_high_t)cospi_28_64;
+ s9 = x8 * (tran_high_t)cospi_28_64 - x9 * (tran_high_t)cospi_4_64;
+ s10 = x10 * (tran_high_t)cospi_20_64 + x11 * (tran_high_t)cospi_12_64;
+ s11 = x10 * (tran_high_t)cospi_12_64 - x11 * (tran_high_t)cospi_20_64;
+ s12 = -x12 * (tran_high_t)cospi_28_64 + x13 * (tran_high_t)cospi_4_64;
+ s13 = x12 * (tran_high_t)cospi_4_64 + x13 * (tran_high_t)cospi_28_64;
+ s14 = -x14 * (tran_high_t)cospi_12_64 + x15 * (tran_high_t)cospi_20_64;
+ s15 = x14 * (tran_high_t)cospi_20_64 + x15 * (tran_high_t)cospi_12_64;
x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
@@ -1802,18 +1815,18 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
s1 = x1;
s2 = x2;
s3 = x3;
- s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
- s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
- s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
- s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s4 = x4 * (tran_high_t)cospi_8_64 + x5 * (tran_high_t)cospi_24_64;
+ s5 = x4 * (tran_high_t)cospi_24_64 - x5 * (tran_high_t)cospi_8_64;
+ s6 = -x6 * (tran_high_t)cospi_24_64 + x7 * (tran_high_t)cospi_8_64;
+ s7 = x6 * (tran_high_t)cospi_8_64 + x7 * (tran_high_t)cospi_24_64;
s8 = x8;
s9 = x9;
s10 = x10;
s11 = x11;
- s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
- s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
- s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
- s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+ s12 = x12 * (tran_high_t)cospi_8_64 + x13 * (tran_high_t)cospi_24_64;
+ s13 = x12 * (tran_high_t)cospi_24_64 - x13 * (tran_high_t)cospi_8_64;
+ s14 = -x14 * (tran_high_t)cospi_24_64 + x15 * (tran_high_t)cospi_8_64;
+ s15 = x14 * (tran_high_t)cospi_8_64 + x15 * (tran_high_t)cospi_24_64;
x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
@@ -1833,14 +1846,14 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);
// stage 4
- s2 = (-cospi_16_64) * (x2 + x3);
- s3 = cospi_16_64 * (x2 - x3);
- s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (-x6 + x7);
- s10 = cospi_16_64 * (x10 + x11);
- s11 = cospi_16_64 * (-x10 + x11);
- s14 = (-cospi_16_64) * (x14 + x15);
- s15 = cospi_16_64 * (x14 - x15);
+ s2 = (tran_high_t)(-cospi_16_64) * (x2 + x3);
+ s3 = (tran_high_t)cospi_16_64 * (x2 - x3);
+ s6 = (tran_high_t)cospi_16_64 * (x6 + x7);
+ s7 = (tran_high_t)cospi_16_64 * (-x6 + x7);
+ s10 = (tran_high_t)cospi_16_64 * (x10 + x11);
+ s11 = (tran_high_t)cospi_16_64 * (-x10 + x11);
+ s14 = (tran_high_t)(-cospi_16_64) * (x14 + x15);
+ s15 = (tran_high_t)cospi_16_64 * (x14 - x15);
x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
@@ -1910,23 +1923,31 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
step2[6] = step1[6];
step2[7] = step1[7];
- temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
- temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+ temp1 =
+ step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64;
+ temp2 =
+ step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64;
step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
- temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+ temp1 = step1[9] * (tran_high_t)cospi_14_64 -
+ step1[14] * (tran_high_t)cospi_18_64;
+ temp2 = step1[9] * (tran_high_t)cospi_18_64 +
+ step1[14] * (tran_high_t)cospi_14_64;
step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
- temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+ temp1 = step1[10] * (tran_high_t)cospi_22_64 -
+ step1[13] * (tran_high_t)cospi_10_64;
+ temp2 = step1[10] * (tran_high_t)cospi_10_64 +
+ step1[13] * (tran_high_t)cospi_22_64;
step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
- temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+ temp1 = step1[11] * (tran_high_t)cospi_6_64 -
+ step1[12] * (tran_high_t)cospi_26_64;
+ temp2 = step1[11] * (tran_high_t)cospi_26_64 +
+ step1[12] * (tran_high_t)cospi_6_64;
step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
@@ -1936,12 +1957,16 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
step1[2] = step2[2];
step1[3] = step2[3];
- temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
- temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+ temp1 =
+ step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64;
+ temp2 =
+ step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64;
step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
- temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+ temp1 =
+ step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64;
+ temp2 =
+ step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64;
step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
@@ -1955,12 +1980,14 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
// stage 4
- temp1 = (step1[0] + step1[1]) * cospi_16_64;
- temp2 = (step1[0] - step1[1]) * cospi_16_64;
+ temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64;
+ temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64;
step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+ temp1 =
+ step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64;
+ temp2 =
+ step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64;
step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
@@ -1970,12 +1997,16 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
step2[8] = step1[8];
step2[15] = step1[15];
- temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
- temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+ temp1 = -step1[9] * (tran_high_t)cospi_8_64 +
+ step1[14] * (tran_high_t)cospi_24_64;
+ temp2 =
+ step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64;
step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
- temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+ temp1 = -step1[10] * (tran_high_t)cospi_24_64 -
+ step1[13] * (tran_high_t)cospi_8_64;
+ temp2 = -step1[10] * (tran_high_t)cospi_8_64 +
+ step1[13] * (tran_high_t)cospi_24_64;
step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step2[11] = step1[11];
@@ -1987,8 +2018,8 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[7] = step2[7];
@@ -2013,12 +2044,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
step2[8] = step1[8];
step2[9] = step1[9];
- temp1 = (-step1[10] + step1[13]) * cospi_16_64;
- temp2 = (step1[10] + step1[13]) * cospi_16_64;
+ temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
+ temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = (-step1[11] + step1[12]) * cospi_16_64;
- temp2 = (step1[11] + step1[12]) * cospi_16_64;
+ temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
+ temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step2[14] = step1[14];
@@ -2126,10 +2157,11 @@ void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_high_t a1;
- tran_low_t out =
- HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ tran_low_t out = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);

- out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ out =
+ HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 16; ++j) {
for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
@@ -2169,43 +2201,59 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step1[14] = input[14];
step1[15] = input[30];
- temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
- temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+ temp1 =
+ input[1] * (tran_high_t)cospi_31_64 - input[31] * (tran_high_t)cospi_1_64;
+ temp2 =
+ input[1] * (tran_high_t)cospi_1_64 + input[31] * (tran_high_t)cospi_31_64;
step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
- temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+ temp1 = input[17] * (tran_high_t)cospi_15_64 -
+ input[15] * (tran_high_t)cospi_17_64;
+ temp2 = input[17] * (tran_high_t)cospi_17_64 +
+ input[15] * (tran_high_t)cospi_15_64;
step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
- temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+ temp1 =
+ input[9] * (tran_high_t)cospi_23_64 - input[23] * (tran_high_t)cospi_9_64;
+ temp2 =
+ input[9] * (tran_high_t)cospi_9_64 + input[23] * (tran_high_t)cospi_23_64;
step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
- temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+ temp1 =
+ input[25] * (tran_high_t)cospi_7_64 - input[7] * (tran_high_t)cospi_25_64;
+ temp2 =
+ input[25] * (tran_high_t)cospi_25_64 + input[7] * (tran_high_t)cospi_7_64;
step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
- temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+ temp1 =
+ input[5] * (tran_high_t)cospi_27_64 - input[27] * (tran_high_t)cospi_5_64;
+ temp2 =
+ input[5] * (tran_high_t)cospi_5_64 + input[27] * (tran_high_t)cospi_27_64;
step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
- temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+ temp1 = input[21] * (tran_high_t)cospi_11_64 -
+ input[11] * (tran_high_t)cospi_21_64;
+ temp2 = input[21] * (tran_high_t)cospi_21_64 +
+ input[11] * (tran_high_t)cospi_11_64;
step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
- temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+ temp1 = input[13] * (tran_high_t)cospi_19_64 -
+ input[19] * (tran_high_t)cospi_13_64;
+ temp2 = input[13] * (tran_high_t)cospi_13_64 +
+ input[19] * (tran_high_t)cospi_19_64;
step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
- temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+ temp1 =
+ input[29] * (tran_high_t)cospi_3_64 - input[3] * (tran_high_t)cospi_29_64;
+ temp2 =
+ input[29] * (tran_high_t)cospi_29_64 + input[3] * (tran_high_t)cospi_3_64;
step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
@@ -2219,23 +2267,31 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step2[6] = step1[6];
step2[7] = step1[7];
- temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
- temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+ temp1 =
+ step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64;
+ temp2 =
+ step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64;
step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
- temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+ temp1 = step1[9] * (tran_high_t)cospi_14_64 -
+ step1[14] * (tran_high_t)cospi_18_64;
+ temp2 = step1[9] * (tran_high_t)cospi_18_64 +
+ step1[14] * (tran_high_t)cospi_14_64;
step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
- temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+ temp1 = step1[10] * (tran_high_t)cospi_22_64 -
+ step1[13] * (tran_high_t)cospi_10_64;
+ temp2 = step1[10] * (tran_high_t)cospi_10_64 +
+ step1[13] * (tran_high_t)cospi_22_64;
step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
- temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+ temp1 = step1[11] * (tran_high_t)cospi_6_64 -
+ step1[12] * (tran_high_t)cospi_26_64;
+ temp2 = step1[11] * (tran_high_t)cospi_26_64 +
+ step1[12] * (tran_high_t)cospi_6_64;
step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
@@ -2262,12 +2318,16 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step1[2] = step2[2];
step1[3] = step2[3];
- temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
- temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+ temp1 =
+ step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64;
+ temp2 =
+ step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64;
step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
- temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+ temp1 =
+ step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64;
+ temp2 =
+ step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64;
step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
@@ -2282,22 +2342,30 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step1[16] = step2[16];
step1[31] = step2[31];
- temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
- temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+ temp1 = -step2[17] * (tran_high_t)cospi_4_64 +
+ step2[30] * (tran_high_t)cospi_28_64;
+ temp2 = step2[17] * (tran_high_t)cospi_28_64 +
+ step2[30] * (tran_high_t)cospi_4_64;
step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
- temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+ temp1 = -step2[18] * (tran_high_t)cospi_28_64 -
+ step2[29] * (tran_high_t)cospi_4_64;
+ temp2 = -step2[18] * (tran_high_t)cospi_4_64 +
+ step2[29] * (tran_high_t)cospi_28_64;
step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[19] = step2[19];
step1[20] = step2[20];
- temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
- temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+ temp1 = -step2[21] * (tran_high_t)cospi_20_64 +
+ step2[26] * (tran_high_t)cospi_12_64;
+ temp2 = step2[21] * (tran_high_t)cospi_12_64 +
+ step2[26] * (tran_high_t)cospi_20_64;
step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
- temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+ temp1 = -step2[22] * (tran_high_t)cospi_12_64 -
+ step2[25] * (tran_high_t)cospi_20_64;
+ temp2 = -step2[22] * (tran_high_t)cospi_20_64 +
+ step2[25] * (tran_high_t)cospi_12_64;
step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[23] = step2[23];
@@ -2306,12 +2374,14 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step1[28] = step2[28];
// stage 4
- temp1 = (step1[0] + step1[1]) * cospi_16_64;
- temp2 = (step1[0] - step1[1]) * cospi_16_64;
+ temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64;
+ temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64;
step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+ temp1 =
+ step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64;
+ temp2 =
+ step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64;
step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
@@ -2321,12 +2391,16 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step2[8] = step1[8];
step2[15] = step1[15];
- temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
- temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+ temp1 = -step1[9] * (tran_high_t)cospi_8_64 +
+ step1[14] * (tran_high_t)cospi_24_64;
+ temp2 =
+ step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64;
step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
- temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+ temp1 = -step1[10] * (tran_high_t)cospi_24_64 -
+ step1[13] * (tran_high_t)cospi_8_64;
+ temp2 = -step1[10] * (tran_high_t)cospi_8_64 +
+ step1[13] * (tran_high_t)cospi_24_64;
step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step2[11] = step1[11];
@@ -2356,8 +2430,8 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[7] = step2[7];
@@ -2373,20 +2447,28 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step1[16] = step2[16];
step1[17] = step2[17];
- temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
- temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+ temp1 = -step2[18] * (tran_high_t)cospi_8_64 +
+ step2[29] * (tran_high_t)cospi_24_64;
+ temp2 = step2[18] * (tran_high_t)cospi_24_64 +
+ step2[29] * (tran_high_t)cospi_8_64;
step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
- temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+ temp1 = -step2[19] * (tran_high_t)cospi_8_64 +
+ step2[28] * (tran_high_t)cospi_24_64;
+ temp2 = step2[19] * (tran_high_t)cospi_24_64 +
+ step2[28] * (tran_high_t)cospi_8_64;
step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
- temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+ temp1 = -step2[20] * (tran_high_t)cospi_24_64 -
+ step2[27] * (tran_high_t)cospi_8_64;
+ temp2 = -step2[20] * (tran_high_t)cospi_8_64 +
+ step2[27] * (tran_high_t)cospi_24_64;
step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
- temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+ temp1 = -step2[21] * (tran_high_t)cospi_24_64 -
+ step2[26] * (tran_high_t)cospi_8_64;
+ temp2 = -step2[21] * (tran_high_t)cospi_8_64 +
+ step2[26] * (tran_high_t)cospi_24_64;
step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[22] = step2[22];
@@ -2407,12 +2489,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
step2[8] = step1[8];
step2[9] = step1[9];
- temp1 = (-step1[10] + step1[13]) * cospi_16_64;
- temp2 = (step1[10] + step1[13]) * cospi_16_64;
+ temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
+ temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = (-step1[11] + step1[12]) * cospi_16_64;
- temp2 = (step1[11] + step1[12]) * cospi_16_64;
+ temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
+ temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step2[14] = step1[14];
@@ -2458,20 +2540,20 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step1[17] = step2[17];
step1[18] = step2[18];
step1[19] = step2[19];
- temp1 = (-step2[20] + step2[27]) * cospi_16_64;
- temp2 = (step2[20] + step2[27]) * cospi_16_64;
+ temp1 = (-step2[20] + step2[27]) * (tran_high_t)cospi_16_64;
+ temp2 = (step2[20] + step2[27]) * (tran_high_t)cospi_16_64;
step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = (-step2[21] + step2[26]) * cospi_16_64;
- temp2 = (step2[21] + step2[26]) * cospi_16_64;
+ temp1 = (-step2[21] + step2[26]) * (tran_high_t)cospi_16_64;
+ temp2 = (step2[21] + step2[26]) * (tran_high_t)cospi_16_64;
step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = (-step2[22] + step2[25]) * cospi_16_64;
- temp2 = (step2[22] + step2[25]) * cospi_16_64;
+ temp1 = (-step2[22] + step2[25]) * (tran_high_t)cospi_16_64;
+ temp2 = (step2[22] + step2[25]) * (tran_high_t)cospi_16_64;
step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
- temp1 = (-step2[23] + step2[24]) * cospi_16_64;
- temp2 = (step2[23] + step2[24]) * cospi_16_64;
+ temp1 = (-step2[23] + step2[24]) * (tran_high_t)cospi_16_64;
+ temp2 = (step2[23] + step2[24]) * (tran_high_t)cospi_16_64;
step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[28] = step2[28];
@@ -2603,10 +2685,11 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
int a1;
- tran_low_t out =
- HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ tran_low_t out = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);

- out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ out =
+ HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 32; ++j) {
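
The (tran_high_t) casts threaded through the vpx_highbd_* functions above serve the opposite purpose from the (int16_t) ones: high-bitdepth coefficients can exceed 16 bits, so one operand of each multiply is widened to the 64-bit tran_high_t before the product is formed. A sketch of the plain-int overflow this avoids, assuming tran_high_t is int64_t and cospi_1_64 == 16364 as in txfm_common.h:

#include <stdint.h>
#include <stdio.h>

typedef int64_t tran_high_t;

int main(void) {
  const int16_t cospi_1_64 = 16364; /* round(16384 * cos(pi/64)) */
  const int32_t coeff = 1 << 18;    /* plausible 12-bit-depth magnitude */
  /* In plain int arithmetic, 262144 * 16364 (~4.3e9) would overflow
   * INT32_MAX; widening one operand first keeps it well-defined: */
  const tran_high_t product = coeff * (tran_high_t)cospi_1_64;
  printf("%lld\n", (long long)product);
  return 0;
}
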
diff --git a/libvpx/vpx_dsp/mips/avg_msa.c b/libvpx/vpx_dsp/mips/avg_msa.c
index 48b841969..d0ac7b8e2 100644
--- a/libvpx/vpx_dsp/mips/avg_msa.c
+++ b/libvpx/vpx_dsp/mips/avg_msa.c
@@ -56,7 +56,8 @@ uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
return sum_out;
}
-void vpx_hadamard_8x8_msa(const int16_t *src, int src_stride, int16_t *dst) {
+void vpx_hadamard_8x8_msa(const int16_t *src, ptrdiff_t src_stride,
+ int16_t *dst) {
v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
@@ -80,7 +81,8 @@ void vpx_hadamard_8x8_msa(const int16_t *src, int src_stride, int16_t *dst) {
ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst, 8);
}
-void vpx_hadamard_16x16_msa(const int16_t *src, int src_stride, int16_t *dst) {
+void vpx_hadamard_16x16_msa(const int16_t *src, ptrdiff_t src_stride,
+ int16_t *dst) {
v8i16 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v8i16 src11, src12, src13, src14, src15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
v8i16 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
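
The MSA Hadamard kernels change only their signatures here, tracking the rtcd prototype update to a ptrdiff_t stride. A pointer-sized stride cannot be narrowed when callers derive it from pointer arithmetic; a hypothetical caller sketch (run() and its arguments are mine, only the vpx_hadamard_8x8_msa prototype comes from the hunk above):

#include <stddef.h>
#include <stdint.h>

void vpx_hadamard_8x8_msa(const int16_t *src, ptrdiff_t src_stride,
                          int16_t *dst);

/* With an int stride, the pointer difference below would be silently
 * truncated on LP64 targets if it ever exceeded INT_MAX elements. */
static void run(const int16_t *block, const int16_t *row0,
                const int16_t *row1, int16_t coeff[64]) {
  const ptrdiff_t stride = row1 - row0; /* already a ptrdiff_t */
  vpx_hadamard_8x8_msa(block, stride, coeff);
}
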
diff --git a/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c b/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c
index ae88eddfd..18e7d5375 100644
--- a/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c
+++ b/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c
@@ -219,9 +219,10 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ const int16_t *const filter_y = filter[y0_q4];
uint32_t pos = 38;
assert(y_step_q4 == 16);
@@ -247,8 +248,8 @@ void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
h);
break;
default:
- vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
}
diff --git a/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c b/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c
index e944207b6..7dcb662d7 100644
--- a/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c
+++ b/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c
@@ -751,9 +751,10 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
+ const int16_t *const filter_x = filter[x0_q4];
uint32_t pos = 38;
assert(x_step_q4 == 16);
@@ -793,8 +794,8 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
h);
break;
default:
- vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
}
diff --git a/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c b/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c
index 5cc06b5f2..9e65a8f50 100644
--- a/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c
+++ b/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c
@@ -628,9 +628,10 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ const int16_t *const filter_x = filter[x0_q4];
uint32_t pos = 38;
assert(x_step_q4 == 16);
@@ -672,8 +673,8 @@ void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
(int32_t)dst_stride, filter_x, (int32_t)h);
break;
default:
- vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
}
diff --git a/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c b/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c
index eb1975e44..a3e967b40 100644
--- a/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c
+++ b/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c
@@ -201,9 +201,10 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ const int16_t *const filter_y = filter[y0_q4];
uint32_t pos = 38;
assert(y_step_q4 == 16);
@@ -228,8 +229,8 @@ void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
break;
default:
- vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
}
diff --git a/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c b/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c
index b4ed6ee85..d9c2bef69 100644
--- a/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c
+++ b/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c
@@ -334,15 +334,16 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ const int16_t *const filter_y = filter[y0_q4];
assert(y_step_q4 == 16);
assert(((const int32_t *)filter_y)[1] != 0x800000);
if (((const int32_t *)filter_y)[0] == 0) {
- vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
} else {
uint32_t pos = 38;
@@ -367,8 +368,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
h);
break;
default:
- vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
}
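
A note on the entry-point pattern repeated in this and the following dspr2 files: the wrapper reads the selected 8-tap kernel as two 32-bit words. If the first word (taps 0 and 1 on the little-endian targets this code assumes) is zero, the kernel is treated as the bilinear {0, 0, 0, a, b, 0, 0, 0} shape and routed to the 2-tap vpx_convolve2_* path, and the assert rejects the identity kernel (taps 2-3 equal to {0, 128}), which callers are expected to send to vpx_convolve_copy instead. A sketch with my own helper name; the kernel layout is an assumption from vpx_dsp/vpx_filter.h:

#include <assert.h>
#include <stdint.h>

/* Mirrors the test at the top of vpx_convolve8_avg_vert_dspr2 above. */
static int use_two_tap_path(const int16_t kernel[8]) {
  assert(((const int32_t *)kernel)[1] != 0x800000); /* not the copy kernel */
  return ((const int32_t *)kernel)[0] == 0;         /* taps 0 and 1 zero */
}
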
@@ -376,8 +377,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
/* Fixed size intermediate buffer places limits on parameters. */
DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
@@ -390,24 +391,26 @@ void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
if (intermediate_height < h) intermediate_height = h;
- vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x,
- x_step_q4, filter_y, y_step_q4, w, intermediate_height);
+ vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+ intermediate_height);
- vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
}
void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride, int w,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
int x, y;
uint32_t tp1, tp2, tn1, tp3, tp4, tn2;
- (void)filter_x;
- (void)filter_x_stride;
- (void)filter_y;
- (void)filter_y_stride;
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
/* prefetch data to cache memory */
prefetch_load(src);
diff --git a/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c b/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
index 9a9bab25a..fb68ad881 100644
--- a/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
+++ b/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
@@ -938,15 +938,16 @@ static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
+ const int16_t *const filter_x = filter[x0_q4];
assert(x_step_q4 == 16);
assert(((const int32_t *)filter_x)[1] != 0x800000);
if (((const int32_t *)filter_x)[0] == 0) {
- vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
} else {
uint32_t pos = 38;
@@ -987,9 +988,8 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
h);
break;
default:
- vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4, w,
- h);
+ vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
}
diff --git a/libvpx/vpx_dsp/mips/convolve8_dspr2.c b/libvpx/vpx_dsp/mips/convolve8_dspr2.c
index 8d35b6394..89f0f4196 100644
--- a/libvpx/vpx_dsp/mips/convolve8_dspr2.c
+++ b/libvpx/vpx_dsp/mips/convolve8_dspr2.c
@@ -1296,9 +1296,11 @@ void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
}
void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x,
- int x_step_q4, const int16_t *filter_y, int y_step_q4,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int32_t x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
+ const int16_t *const filter_x = filter[x0_q4];
+ const int16_t *const filter_y = filter[y0_q4];
DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
uint32_t pos = 38;
@@ -1395,14 +1397,15 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h) {
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
int x, y;
- (void)filter_x;
- (void)filter_x_stride;
- (void)filter_y;
- (void)filter_y_stride;
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
/* prefetch data to cache memory */
prefetch_load(src);
diff --git a/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c b/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c
index 196a0a2f0..77e95c844 100644
--- a/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c
+++ b/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c
@@ -818,15 +818,16 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride,
void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
+ const int16_t *const filter_x = filter[x0_q4];
assert(x_step_q4 == 16);
assert(((const int32_t *)filter_x)[1] != 0x800000);
if (((const int32_t *)filter_x)[0] == 0) {
- vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
} else {
uint32_t pos = 38;
@@ -868,8 +869,8 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
(int32_t)dst_stride, filter_x, (int32_t)h);
break;
default:
- vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
}
diff --git a/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c b/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c
index ad107d5c4..c329f71cc 100644
--- a/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c
+++ b/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c
@@ -318,15 +318,16 @@ static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
+ const int16_t *const filter_y = filter[y0_q4];
assert(y_step_q4 == 16);
assert(((const int32_t *)filter_y)[1] != 0x800000);
if (((const int32_t *)filter_y)[0] == 0) {
- vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
} else {
uint32_t pos = 38;
@@ -349,8 +350,8 @@ void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
break;
default:
- vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
}
diff --git a/libvpx/vpx_dsp/mips/convolve_common_dspr2.h b/libvpx/vpx_dsp/mips/convolve_common_dspr2.h
index 4eee3bd5e..48e440d73 100644
--- a/libvpx/vpx_dsp/mips/convolve_common_dspr2.h
+++ b/libvpx/vpx_dsp/mips/convolve_common_dspr2.h
@@ -24,21 +24,21 @@ extern "C" {
#if HAVE_DSPR2
void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h);
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h);
void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
int w, int h);
void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h);
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h);
void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter, int w,
@@ -46,9 +46,9 @@ void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h);
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h);
#endif // #if HAVE_DSPR2
#ifdef __cplusplus
diff --git a/libvpx/vpx_dsp/mips/fwd_txfm_msa.c b/libvpx/vpx_dsp/mips/fwd_txfm_msa.c
index f786664bb..5a6dfcef2 100644
--- a/libvpx/vpx_dsp/mips/fwd_txfm_msa.c
+++ b/libvpx/vpx_dsp/mips/fwd_txfm_msa.c
@@ -8,8 +8,23 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/fwd_txfm_msa.h"
+void vpx_fdct8x8_1_msa(const int16_t *input, tran_low_t *out, int32_t stride) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v4i32 vec_w;
+
+ LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7);
+ ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
+ ADD2(in0, in2, in4, in6, in0, in4);
+ vec_w = __msa_hadd_s_w(in0, in0);
+ vec_w += __msa_hadd_s_w(in4, in4);
+ out[0] = HADD_SW_S32(vec_w);
+ out[1] = 0;
+}
+
+#if !CONFIG_VP9_HIGHBITDEPTH
void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
int32_t src_stride) {
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
@@ -215,19 +230,6 @@ void vpx_fdct8x8_msa(const int16_t *input, int16_t *output,
ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
}
-void vpx_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
- v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
- v4i32 vec_w;
-
- LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7);
- ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
- ADD2(in0, in2, in4, in6, in0, in4);
- vec_w = __msa_hadd_s_w(in0, in0);
- vec_w += __msa_hadd_s_w(in4, in4);
- out[0] = HADD_SW_S32(vec_w);
- out[1] = 0;
-}
-
void vpx_fdct16x16_msa(const int16_t *input, int16_t *output,
int32_t src_stride) {
int32_t i;
@@ -267,3 +269,4 @@ void vpx_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
sum = HADD_SW_S32(vec_w);
out[0] = (int16_t)(sum >> 1);
}
+#endif // !CONFIG_VP9_HIGHBITDEPTH
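Note: relocating vpx_fdct8x8_1_msa above the !CONFIG_VP9_HIGHBITDEPTH guard
(and widening its output to tran_low_t) keeps the DC-only transform
available in high-bitdepth builds. A scalar sketch of what it computes,
which should mirror vpx_fdct8x8_1_c:

void fdct8x8_1_ref(const int16_t *input, tran_low_t *out, int stride) {
  int r, c, sum = 0;
  /* Sum all 64 residuals; only the DC term of the transform is needed. */
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c) sum += input[r * stride + c];
  out[0] = (tran_low_t)sum; /* DC term */
  out[1] = 0;               /* matches the MSA code above */
}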
diff --git a/libvpx/vpx_dsp/mips/itrans4_dspr2.c b/libvpx/vpx_dsp/mips/itrans4_dspr2.c
index 3f985b847..e214b538d 100644
--- a/libvpx/vpx_dsp/mips/itrans4_dspr2.c
+++ b/libvpx/vpx_dsp/mips/itrans4_dspr2.c
@@ -343,6 +343,7 @@ void iadst4_dspr2(const int16_t *input, int16_t *output) {
return;
}
+ // A 32-bit result is enough for the following multiplications.
s0 = sinpi_1_9 * x0;
s1 = sinpi_2_9 * x0;
s2 = sinpi_3_9 * x1;
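Note on the new comment: assuming the usual libvpx sinpi_*_9 constants
(5283, 9929, 13377, 15212 -- all below 2^14) and inputs well within 16
bits, each product satisfies |sinpi_k_9 * x| < 2^14 * 2^16 = 2^30, which
leaves headroom in a signed 32-bit int for the few additions that follow.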
diff --git a/libvpx/vpx_dsp/mips/loopfilter_16_msa.c b/libvpx/vpx_dsp/mips/loopfilter_16_msa.c
index b73d56bd5..b1731f234 100644
--- a/libvpx/vpx_dsp/mips/loopfilter_16_msa.c
+++ b/libvpx/vpx_dsp/mips/loopfilter_16_msa.c
@@ -8,13 +8,15 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_ports/mem.h"
+#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/loopfilter_msa.h"
+#include "vpx_ports/mem.h"
-int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
+static int32_t hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
+ uint8_t *filter48,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
v16u8 flat, mask, hev, thresh, b_limit, limit;
@@ -77,7 +79,7 @@ int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48,
}
}
-void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
+static void hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
v16u8 flat, flat2, filter8;
v16i8 zero = { 0 };
v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
@@ -413,11 +415,11 @@ static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch,
(void)count;
- early_exit = vpx_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
- limit_ptr, thresh_ptr);
+ early_exit = hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
+ limit_ptr, thresh_ptr);
if (0 == early_exit) {
- vpx_hz_lpf_t16_16w(src, pitch, filter48);
+ hz_lpf_t16_16w(src, pitch, filter48);
}
}
@@ -753,11 +755,11 @@ static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}
-int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
- uint8_t *src_org, int32_t pitch_org,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
+static int32_t vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
+ uint8_t *src_org, int32_t pitch_org,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
v16u8 flat, mask, hev, thresh, b_limit, limit;
@@ -820,8 +822,8 @@ int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
}
}
-int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
- uint8_t *filter48) {
+static int32_t vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+ uint8_t *filter48) {
v16i8 zero = { 0 };
v16u8 filter8, flat, flat2;
v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
@@ -1051,12 +1053,12 @@ void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
early_exit =
- vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src,
- pitch, b_limit_ptr, limit_ptr, thresh_ptr);
+ vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, pitch,
+ b_limit_ptr, limit_ptr, thresh_ptr);
if (0 == early_exit) {
- early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
- &filter48[0]);
+ early_exit =
+ vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, &filter48[0]);
if (0 == early_exit) {
transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
@@ -1064,11 +1066,11 @@ void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
}
}
-int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
- uint8_t *src_org, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
+static int32_t vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
+ uint8_t *src_org, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
v16u8 flat, mask, hev, thresh, b_limit, limit;
@@ -1141,8 +1143,8 @@ int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
}
}
-int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
- uint8_t *filter48) {
+static int32_t vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+ uint8_t *filter48) {
v16u8 flat, flat2, filter8;
v16i8 zero = { 0 };
v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
@@ -1473,12 +1475,12 @@ void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
early_exit =
- vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
- pitch, b_limit_ptr, limit_ptr, thresh_ptr);
+ vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
+ pitch, b_limit_ptr, limit_ptr, thresh_ptr);
if (0 == early_exit) {
- early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
- &filter48[0]);
+ early_exit =
+ vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]);
if (0 == early_exit) {
transpose_16x16(transposed_input, 16, (src - 8), pitch);
diff --git a/libvpx/vpx_dsp/mips/loopfilter_4_msa.c b/libvpx/vpx_dsp/mips/loopfilter_4_msa.c
index 9500cd2fd..0eff2b6ca 100644
--- a/libvpx/vpx_dsp/mips/loopfilter_4_msa.c
+++ b/libvpx/vpx_dsp/mips/loopfilter_4_msa.c
@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/loopfilter_msa.h"
void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
diff --git a/libvpx/vpx_dsp/mips/loopfilter_8_msa.c b/libvpx/vpx_dsp/mips/loopfilter_8_msa.c
index a22c62bb3..703fcce8a 100644
--- a/libvpx/vpx_dsp/mips/loopfilter_8_msa.c
+++ b/libvpx/vpx_dsp/mips/loopfilter_8_msa.c
@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/loopfilter_msa.h"
void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
diff --git a/libvpx/vpx_dsp/mips/macros_msa.h b/libvpx/vpx_dsp/mips/macros_msa.h
index 27b38865a..f9a446e7b 100644
--- a/libvpx/vpx_dsp/mips/macros_msa.h
+++ b/libvpx/vpx_dsp/mips/macros_msa.h
@@ -16,207 +16,149 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
-#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
-#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
-
-#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
-#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
-
-#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
-
-#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
-#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
-
-#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
-
-#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
+#define LD_V(RTYPE, psrc) *((const RTYPE *)(psrc))
+#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
+#define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
+#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
+#define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
+#define LD_SW(...) LD_V(v4i32, __VA_ARGS__)
+
+#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
+#define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
+#define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
+#define ST_SW(...) ST_V(v4i32, __VA_ARGS__)
#if (__mips_isa_rev >= 6)
-#define LH(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint16_t val_m; \
- \
- __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r"(val_m) \
- : [psrc_m] "m"(*psrc_m)); \
- \
- val_m; \
+#define LH(psrc) \
+ ({ \
+ uint16_t val_lh_m = *(const uint16_t *)(psrc); \
+ val_lh_m; \
})
-#define LW(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint32_t val_m; \
- \
- __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r"(val_m) \
- : [psrc_m] "m"(*psrc_m)); \
- \
- val_m; \
+#define LW(psrc) \
+ ({ \
+ uint32_t val_lw_m = *(const uint32_t *)(psrc); \
+ val_lw_m; \
})
#if (__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint64_t val_m = 0; \
- \
- __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r"(val_m) \
- : [psrc_m] "m"(*psrc_m)); \
- \
- val_m; \
+#define LD(psrc) \
+ ({ \
+ uint64_t val_ld_m = *(const uint64_t *)(psrc); \
+ val_ld_m; \
})
#else // !(__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint32_t val0_m, val1_m; \
- uint64_t val_m = 0; \
- \
- val0_m = LW(psrc_m); \
- val1_m = LW(psrc_m + 4); \
- \
- val_m = (uint64_t)(val1_m); \
- val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
- val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
- \
- val_m; \
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \
+ uint32_t val0_ld_m, val1_ld_m; \
+ uint64_t val_ld_m = 0; \
+ \
+ val0_ld_m = LW(psrc_ld_m); \
+ val1_ld_m = LW(psrc_ld_m + 4); \
+ \
+ val_ld_m = (uint64_t)(val1_ld_m); \
+ val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \
+ val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m); \
+ \
+ val_ld_m; \
})
#endif // (__mips == 64)
-#define SH(val, pdst) \
- { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint16_t val_m = (val); \
- \
- __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m"(*pdst_m) \
- : [val_m] "r"(val_m)); \
- }
-
-#define SW(val, pdst) \
- { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint32_t val_m = (val); \
- \
- __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m"(*pdst_m) \
- : [val_m] "r"(val_m)); \
- }
-
-#define SD(val, pdst) \
- { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint64_t val_m = (val); \
- \
- __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m"(*pdst_m) \
- : [val_m] "r"(val_m)); \
- }
+#define SH(val, pdst) *(uint16_t *)(pdst) = (val);
+#define SW(val, pdst) *(uint32_t *)(pdst) = (val);
+#define SD(val, pdst) *(uint64_t *)(pdst) = (val);
#else // !(__mips_isa_rev >= 6)
-#define LH(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint16_t val_m; \
- \
- __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r"(val_m) \
- : [psrc_m] "m"(*psrc_m)); \
- \
- val_m; \
+#define LH(psrc) \
+ ({ \
+ const uint8_t *psrc_lh_m = (const uint8_t *)(psrc); \
+ uint16_t val_lh_m; \
+ \
+ __asm__ __volatile__("ulh %[val_lh_m], %[psrc_lh_m] \n\t" \
+ \
+ : [val_lh_m] "=r"(val_lh_m) \
+ : [psrc_lh_m] "m"(*psrc_lh_m)); \
+ \
+ val_lh_m; \
})
-#define LW(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint32_t val_m; \
- \
- __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r"(val_m) \
- : [psrc_m] "m"(*psrc_m)); \
- \
- val_m; \
+#define LW(psrc) \
+ ({ \
+ const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \
+ uint32_t val_lw_m; \
+ \
+ __asm__ __volatile__("ulw %[val_lw_m], %[psrc_lw_m] \n\t" \
+ \
+ : [val_lw_m] "=r"(val_lw_m) \
+ : [psrc_lw_m] "m"(*psrc_lw_m)); \
+ \
+ val_lw_m; \
})
#if (__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint64_t val_m = 0; \
- \
- __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r"(val_m) \
- : [psrc_m] "m"(*psrc_m)); \
- \
- val_m; \
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \
+ uint64_t val_ld_m = 0; \
+ \
+ __asm__ __volatile__("uld %[val_ld_m], %[psrc_ld_m] \n\t" \
+ \
+ : [val_ld_m] "=r"(val_ld_m) \
+ : [psrc_ld_m] "m"(*psrc_ld_m)); \
+ \
+ val_ld_m; \
})
#else // !(__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \
- uint32_t val0_m, val1_m; \
- uint64_t val_m_combined = 0; \
- \
- val0_m = LW(psrc_m1); \
- val1_m = LW(psrc_m1 + 4); \
- \
- val_m_combined = (uint64_t)(val1_m); \
- val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \
- val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \
- \
- val_m_combined; \
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \
+ uint32_t val0_ld_m, val1_ld_m; \
+ uint64_t val_ld_m = 0; \
+ \
+ val0_ld_m = LW(psrc_ld_m); \
+ val1_ld_m = LW(psrc_ld_m + 4); \
+ \
+ val_ld_m = (uint64_t)(val1_ld_m); \
+ val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \
+ val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m); \
+ \
+ val_ld_m; \
})
#endif // (__mips == 64)
-#define SH(val, pdst) \
- { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint16_t val_m = (val); \
- \
- __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m"(*pdst_m) \
- : [val_m] "r"(val_m)); \
+#define SH(val, pdst) \
+ { \
+ uint8_t *pdst_sh_m = (uint8_t *)(pdst); \
+ const uint16_t val_sh_m = (val); \
+ \
+ __asm__ __volatile__("ush %[val_sh_m], %[pdst_sh_m] \n\t" \
+ \
+ : [pdst_sh_m] "=m"(*pdst_sh_m) \
+ : [val_sh_m] "r"(val_sh_m)); \
}
-#define SW(val, pdst) \
- { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint32_t val_m = (val); \
- \
- __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m"(*pdst_m) \
- : [val_m] "r"(val_m)); \
+#define SW(val, pdst) \
+ { \
+ uint8_t *pdst_sw_m = (uint8_t *)(pdst); \
+ const uint32_t val_sw_m = (val); \
+ \
+ __asm__ __volatile__("usw %[val_sw_m], %[pdst_sw_m] \n\t" \
+ \
+ : [pdst_sw_m] "=m"(*pdst_sw_m) \
+ : [val_sw_m] "r"(val_sw_m)); \
}
-#define SD(val, pdst) \
- { \
- uint8_t *pdst_m1 = (uint8_t *)(pdst); \
- uint32_t val0_m, val1_m; \
- \
- val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
- val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
- \
- SW(val0_m, pdst_m1); \
- SW(val1_m, pdst_m1 + 4); \
+#define SD(val, pdst) \
+ { \
+ uint8_t *pdst_sd_m = (uint8_t *)(pdst); \
+ uint32_t val0_sd_m, val1_sd_m; \
+ \
+ val0_sd_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
+ val1_sd_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+ \
+ SW(val0_sd_m, pdst_sd_m); \
+ SW(val1_sd_m, pdst_sd_m + 4); \
}
#endif // (__mips_isa_rev >= 6)
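Note: MIPS release 6 requires unaligned accesses to work with the ordinary
load/store instructions, so the r6 branch of these macros reduces to plain
dereferences; pre-r6 keeps the ulh/ulw/uld/ush/usw inline assembly. A usage
sketch (hypothetical buffer; both ISA branches accept the misaligned
addresses):

uint8_t buf[16] = { 0 };
uint32_t w = LW(buf + 1); /* unaligned 32-bit load  */
SW(w, buf + 5);           /* unaligned 32-bit store */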
@@ -283,97 +225,73 @@
SD(in3, (pdst) + 3 * stride); \
}
-/* Description : Load vectors with 16 byte elements with stride
+/* Description : Load vector elements with stride
Arguments : Inputs - psrc, stride
Outputs - out0, out1
Return Type - as per RTYPE
Details : Load 16 byte elements in 'out0' from (psrc)
Load 16 byte elements in 'out1' from (psrc + stride)
*/
-#define LD_B2(RTYPE, psrc, stride, out0, out1) \
+#define LD_V2(RTYPE, psrc, stride, out0, out1) \
{ \
- out0 = LD_B(RTYPE, (psrc)); \
- out1 = LD_B(RTYPE, (psrc) + stride); \
+ out0 = LD_V(RTYPE, (psrc)); \
+ out1 = LD_V(RTYPE, (psrc) + stride); \
}
-#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
-#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
+#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
+#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
+#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
+#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)
-#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
+#define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \
{ \
- LD_B2(RTYPE, (psrc), stride, out0, out1); \
- out2 = LD_B(RTYPE, (psrc) + 2 * stride); \
+ LD_V2(RTYPE, (psrc), stride, out0, out1); \
+ out2 = LD_V(RTYPE, (psrc) + 2 * stride); \
}
-#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
+#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
-#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
+#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \
{ \
- LD_B2(RTYPE, (psrc), stride, out0, out1); \
- LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
+ LD_V2(RTYPE, (psrc), stride, out0, out1); \
+ LD_V2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
}
-#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
-#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
+#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
+#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
+#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)
-#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
+#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
{ \
- LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
- out4 = LD_B(RTYPE, (psrc) + 4 * stride); \
+ LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
+ out4 = LD_V(RTYPE, (psrc) + 4 * stride); \
}
-#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
-#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
+#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
+#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)
-#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \
+#define LD_V7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \
{ \
- LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \
- LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \
+ LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \
+ LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \
}
-#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
+#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)
-#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
+#define LD_V8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
out7) \
{ \
- LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
- LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
- }
-#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
-#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
-
-/* Description : Load vectors with 8 halfword elements with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1
- Details : Load 8 halfword elements in 'out0' from (psrc)
- Load 8 halfword elements in 'out1' from (psrc + stride)
-*/
-#define LD_H2(RTYPE, psrc, stride, out0, out1) \
- { \
- out0 = LD_H(RTYPE, (psrc)); \
- out1 = LD_H(RTYPE, (psrc) + (stride)); \
- }
-#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
-
-#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \
- { \
- LD_H2(RTYPE, (psrc), stride, out0, out1); \
- LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
+ LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
+ LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
}
-#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
+#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
+#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
+#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)
-#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
- out7) \
- { \
- LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
- LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
- }
-#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
-
-#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
+#define LD_V16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
out7, out8, out9, out10, out11, out12, out13, out14, out15) \
{ \
- LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \
+ LD_V8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \
out7); \
- LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \
+ LD_V8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \
out13, out14, out15); \
}
-#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
+#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)
/* Description : Load 4x4 block of signed halfword elements from 1D source
data into 4 vectors (Each vector with 4 signed halfwords)
@@ -388,79 +306,35 @@
out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
}
-/* Description : Load 2 vectors of signed word elements with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1
- Return Type - signed word
-*/
-#define LD_SW2(psrc, stride, out0, out1) \
- { \
- out0 = LD_SW((psrc)); \
- out1 = LD_SW((psrc) + stride); \
- }
-
-/* Description : Store vectors of 16 byte elements with stride
+/* Description : Store vectors with stride
Arguments : Inputs - in0, in1, pdst, stride
Details : Store 16 byte elements from 'in0' to (pdst)
Store 16 byte elements from 'in1' to (pdst + stride)
*/
-#define ST_B2(RTYPE, in0, in1, pdst, stride) \
- { \
- ST_B(RTYPE, in0, (pdst)); \
- ST_B(RTYPE, in1, (pdst) + stride); \
- }
-#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
-
-#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
- { \
- ST_B2(RTYPE, in0, in1, (pdst), stride); \
- ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
- }
-#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
-
-#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
- { \
- ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \
- ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
- }
-#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
-
-/* Description : Store vectors of 8 halfword elements with stride
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Store 8 halfword elements from 'in0' to (pdst)
- Store 8 halfword elements from 'in1' to (pdst + stride)
-*/
-#define ST_H2(RTYPE, in0, in1, pdst, stride) \
+#define ST_V2(RTYPE, in0, in1, pdst, stride) \
{ \
- ST_H(RTYPE, in0, (pdst)); \
- ST_H(RTYPE, in1, (pdst) + stride); \
+ ST_V(RTYPE, in0, (pdst)); \
+ ST_V(RTYPE, in1, (pdst) + stride); \
}
-#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
+#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
+#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
+#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)
-#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \
+#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \
{ \
- ST_H2(RTYPE, in0, in1, (pdst), stride); \
- ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
+ ST_V2(RTYPE, in0, in1, (pdst), stride); \
+ ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
}
-#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
+#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
+#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
-#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
+#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
{ \
- ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
- ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
- }
-#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
-
-/* Description : Store vectors of word elements with stride
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Store 4 word elements from 'in0' to (pdst)
- Store 4 word elements from 'in1' to (pdst + stride)
-*/
-#define ST_SW2(in0, in1, pdst, stride) \
- { \
- ST_SW(in0, (pdst)); \
- ST_SW(in1, (pdst) + stride); \
+ ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride); \
+ ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
}
+#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
+#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
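Note: the per-element-size load/store families (LD_B*/LD_H*, ST_B*/ST_H*)
collapse into size-agnostic LD_V*/ST_V* because the element type is carried
entirely by the RTYPE cast. Expansion sketch, with coeffs a hypothetical
int16_t buffer:

v8i16 a, b;
LD_SH2(coeffs, 8, a, b);  /* a = *(const v8i16 *)(coeffs);
                             b = *(const v8i16 *)(coeffs + 8); */
ST_SH2(a, b, coeffs, 8);  /* the mirror-image ST_V2(v8i16, ...) stores */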
/* Description : Store 2x4 byte block to destination memory from input vector
Arguments : Inputs - in, stidx, pdst, stride
@@ -681,6 +555,7 @@
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
+#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)
#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \
out3) \
@@ -1308,6 +1183,7 @@
out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
}
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
+#define ILVRL_W2_SB(...) ILVRL_W2(v16i8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
@@ -1721,6 +1597,25 @@
out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
}
+/* Description : Sign extend byte elements from input vector and return
+ halfword results in pair of vectors
+ Arguments : Input - in (byte vector)
+ Outputs - out0, out1 (sign extended halfword vectors)
+ Return Type - signed halfword
+ Details : The sign bit of each byte element of input vector 'in' is
+ extracted and interleaved right with the same vector 'in' to
+ generate 8 signed halfword elements in 'out0'.
+ It is then interleaved left with 'in' to
+ generate 8 signed halfword elements in 'out1'.
+*/
+#define UNPCK_SB_SH(in, out0, out1) \
+ { \
+ v16i8 tmp_m; \
+ \
+ tmp_m = __msa_clti_s_b((v16i8)in, 0); \
+ ILVRL_B2_SH(tmp_m, in, out0, out1); \
+ }
+
/* Description : Zero extend unsigned byte elements to halfword elements
Arguments : Input - in (unsigned byte vector)
Outputs - out0, out1 (unsigned halfword vectors)
@@ -1879,8 +1774,6 @@
out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
\
tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \
- tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \
- tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \
tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \
out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
@@ -2034,19 +1927,17 @@
/* Description : Converts inputs to unsigned bytes, interleave, average & store
as 8x4 unsigned byte block
- Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
- pdst, stride
+ Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, pdst, stride
*/
-#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \
- pdst, stride) \
- { \
- v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- \
- tmp0_m = PCKEV_XORI128_UB(in0, in1); \
- tmp1_m = PCKEV_XORI128_UB(in2, in3); \
- ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
- AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
- ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \
+#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \
+ { \
+ v16u8 tmp0_m, tmp1_m; \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ \
+ tmp0_m = PCKEV_XORI128_UB(in0, in1); \
+ tmp1_m = PCKEV_XORI128_UB(in2, in3); \
+ AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \
+ ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
}
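Note: a hedged reading of this macro change: the four destination rows now
arrive packed pairwise (what the removed ILVR_D2_UB call used to produce),
so the macro averages directly against dst0/dst1, and the new pdst_m local
keeps the pdst argument from being evaluated more than once.

/* Before (sketch): ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);
 * After: callers pass the two packed vectors directly as dst0/dst1. */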
/* Description : Pack even byte elements and store byte vector in destination
diff --git a/libvpx/vpx_dsp/mips/sad_mmi.c b/libvpx/vpx_dsp/mips/sad_mmi.c
new file mode 100644
index 000000000..33bd3fe7f
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/sad_mmi.c
@@ -0,0 +1,805 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/asmdefs_mmi.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#define SAD_SRC_REF_ABS_SUB_64 \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x27(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x20(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x2f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x28(%[src]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x27(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x20(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x2f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x28(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x37(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x30(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x3f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x38(%[src]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x37(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x30(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x3f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x38(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+
+#define SAD_SRC_REF_ABS_SUB_32 \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+
+#define SAD_SRC_REF_ABS_SUB_16 \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+
+#define SAD_SRC_REF_ABS_SUB_8 \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x00(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
+
+#if _MIPS_SIM == _ABIO32
+#define SAD_SRC_REF_ABS_SUB_4 \
+ "ulw %[tmp0], 0x00(%[src]) \n\t" \
+ "mtc1 %[tmp0], %[ftmp1] \n\t" \
+ "ulw %[tmp0], 0x00(%[ref]) \n\t" \
+ "mtc1 %[tmp0], %[ftmp2] \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
+ "mthc1 $0, %[ftmp1] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
+#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */
+#define SAD_SRC_REF_ABS_SUB_4 \
+ "gslwlc1 %[ftmp1], 0x03(%[src]) \n\t" \
+ "gslwrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gslwlc1 %[ftmp2], 0x03(%[ref]) \n\t" \
+ "gslwrc1 %[ftmp2], 0x00(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
+ "mthc1 $0, %[ftmp1] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
+#endif /* _MIPS_SIM == _ABIO32 */
+
+#define SAD_SRC_AVGREF_ABS_SUB_64 \
+ "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \
+ "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \
+ "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x17(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x10(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x1f(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x18(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \
+ "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \
+ "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
+ "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x27(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x20(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x2f(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x28(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x27(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x20(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x2f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x28(%[ref]) \n\t" \
+ "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \
+ "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
+ "gsldlc1 %[ftmp1], 0x27(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x20(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x2f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x28(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x37(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x30(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x3f(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x38(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x37(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x30(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x3f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x38(%[ref]) \n\t" \
+ "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \
+ "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
+ "gsldlc1 %[ftmp1], 0x37(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x30(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x3f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x38(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+
+#define SAD_SRC_AVGREF_ABS_SUB_32 \
+ "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \
+ "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \
+ "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x17(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x10(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x1f(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x18(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \
+ "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \
+ "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
+ "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+
+#define SAD_SRC_AVGREF_ABS_SUB_16 \
+ "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \
+ "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \
+ "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+
+#define SAD_SRC_AVGREF_ABS_SUB_8 \
+ "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x00(%[ref]) \n\t" \
+ "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
+
+#if _MIPS_SIM == _ABIO32
+#define SAD_SRC_AVGREF_ABS_SUB_4 \
+ "ulw %[tmp0], 0x00(%[second_pred]) \n\t" \
+ "mtc1 %[tmp0], %[ftmp1] \n\t" \
+ "ulw %[tmp0], 0x00(%[ref]) \n\t" \
+ "mtc1 %[tmp0], %[ftmp2] \n\t" \
+ "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
+ "mthc1 $0, %[ftmp1] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
+#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */
+#define SAD_SRC_AVGREF_ABS_SUB_4 \
+ "gslwlc1 %[ftmp1], 0x03(%[second_pred]) \n\t" \
+ "gslwrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \
+ "gslwlc1 %[ftmp2], 0x03(%[ref]) \n\t" \
+ "gslwrc1 %[ftmp2], 0x00(%[ref]) \n\t" \
+ "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
+ "mthc1 $0, %[ftmp1] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
+#endif /* _MIPS_SIM == _ABIO32 */
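Note (annotated excerpt, not a change): in the 4-wide macros only the low
32 bits of the FP register hold pixel data after the 32-bit load, so
"mthc1 $0, %[ftmp1]" zeroes the upper half before "biadd" sums the byte
lanes; without it, stale upper bytes would leak into the SAD total.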
+
+// Depending on the call sites, passing **ref_array would avoid taking the
+// address in each call and allow de-duplication with the 4D variant below.
+#define sadMxNxK_mmi(m, n, k) \
+ void vpx_sad##m##x##n##x##k##_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref_array, int ref_stride, \
+ uint32_t *sad_array) { \
+ int i; \
+ for (i = 0; i < k; ++i) \
+ sad_array[i] = \
+ vpx_sad##m##x##n##_mmi(src, src_stride, &ref_array[i], ref_stride); \
+ }
+
+// This appears to be equivalent to the above when k == 4 and refs is const
+#define sadMxNx4D_mmi(m, n) \
+ void vpx_sad##m##x##n##x4d_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ int i; \
+ for (i = 0; i < 4; ++i) \
+ sad_array[i] = \
+ vpx_sad##m##x##n##_mmi(src, src_stride, ref_array[i], ref_stride); \
+ }
+
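Note: these generators simply loop the single-block SAD. Expansion sketch
of sadMxNxK_mmi(16, 16, 3), modulo formatting:

void vpx_sad16x16x3_mmi(const uint8_t *src, int src_stride,
                        const uint8_t *ref_array, int ref_stride,
                        uint32_t *sad_array) {
  int i;
  for (i = 0; i < 3; ++i)
    sad_array[i] =
        vpx_sad16x16_mmi(src, src_stride, &ref_array[i], ref_stride);
}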
+static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int counter) {
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
+ mips_reg l_counter = counter;
+
+ __asm__ volatile (
+ "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "1: \n\t"
+ // The loop body is unrolled twice to reduce loop overhead.
+ SAD_SRC_REF_ABS_SUB_64
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_REF_ABS_SUB_64
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp5] \n\t"
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+ [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ );
+
+ return sad;
+}
+
+#define vpx_sad64xN(H) \
+ unsigned int vpx_sad64x##H##_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return vpx_sad64x(src, src_stride, ref, ref_stride, H); \
+ }
+
+vpx_sad64xN(64);
+vpx_sad64xN(32);
+sadMxNx4D_mmi(64, 64);
+sadMxNx4D_mmi(64, 32);
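Note: the unrolled MMI loops accumulate a plain sum of absolute differences
over a width x height block. A portable sketch (not part of the patch) the
kernels should agree with:

static unsigned int sad_ref(const uint8_t *src, int src_stride,
                            const uint8_t *ref, int ref_stride,
                            int width, int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c)
      sad += src[c] > ref[c] ? src[c] - ref[c] : ref[c] - src[c];
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}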
+
+static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred,
+ int counter) {
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
+ mips_reg l_counter = counter;
+
+ __asm__ volatile (
+ "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "1: \n\t"
+ // The loop body is unrolled twice to reduce loop overhead.
+ SAD_SRC_AVGREF_ABS_SUB_64
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x40)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_AVGREF_ABS_SUB_64
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x40)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp5] \n\t"
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+ [src]"+&r"(src), [ref]"+&r"(ref),
+ [second_pred]"+&r"((mips_reg)second_pred),
+ [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ );
+
+ return sad;
+}
+
+#define vpx_sad_avg64xN(H) \
+ unsigned int vpx_sad64x##H##_avg_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return vpx_sad_avg64x(src, src_stride, ref, ref_stride, second_pred, H); \
+ }
+
+vpx_sad_avg64xN(64);
+vpx_sad_avg64xN(32);
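Note: the _avg variants take the SAD against the rounding average of ref
and second_pred (pavgb computes (a + b + 1) >> 1 per byte). One-row sketch
of the arithmetic:

static unsigned int sad_avg_row_ref(const uint8_t *src, const uint8_t *ref,
                                    const uint8_t *second_pred, int n) {
  unsigned int sad = 0;
  int i;
  for (i = 0; i < n; ++i) {
    const int avg = (ref[i] + second_pred[i] + 1) >> 1;
    sad += src[i] > avg ? src[i] - avg : avg - src[i];
  }
  return sad;
}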
+
+static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int counter) {
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
+ mips_reg l_counter = counter;
+
+ __asm__ volatile (
+ "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "1: \n\t"
+ // The loop body is unrolled twice to reduce loop overhead.
+ SAD_SRC_REF_ABS_SUB_32
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_REF_ABS_SUB_32
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp5] \n\t"
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+ [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ );
+
+ return sad;
+}
+
+#define vpx_sad32xN(H) \
+ unsigned int vpx_sad32x##H##_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return vpx_sad32x(src, src_stride, ref, ref_stride, H); \
+ }
+
+vpx_sad32xN(64);
+vpx_sad32xN(32);
+vpx_sad32xN(16);
+sadMxNx4D_mmi(32, 64);
+sadMxNx4D_mmi(32, 32);
+sadMxNx4D_mmi(32, 16);
+
+static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred,
+ int counter) {
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
+ mips_reg l_counter = counter;
+
+ __asm__ volatile (
+ "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "1: \n\t"
+ // The loop body is unrolled twice to reduce loop overhead.
+ SAD_SRC_AVGREF_ABS_SUB_32
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x20)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_AVGREF_ABS_SUB_32
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x20)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp5] \n\t"
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+ [src]"+&r"(src), [ref]"+&r"(ref),
+ [second_pred]"+&r"((mips_reg)second_pred),
+ [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ );
+
+ return sad;
+}
+
+#define vpx_sad_avg32xN(H) \
+ unsigned int vpx_sad32x##H##_avg_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return vpx_sad_avg32x(src, src_stride, ref, ref_stride, second_pred, H); \
+ }
+
+vpx_sad_avg32xN(64);
+vpx_sad_avg32xN(32);
+vpx_sad_avg32xN(16);
+
+static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int counter) {
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
+ mips_reg l_counter = counter;
+
+ __asm__ volatile (
+ "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "1: \n\t"
+ // The loop body is unrolled twice to reduce loop overhead.
+ SAD_SRC_REF_ABS_SUB_16
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_REF_ABS_SUB_16
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp5] \n\t"
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+ [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ );
+
+ return sad;
+}
+
+#define vpx_sad16xN(H) \
+ unsigned int vpx_sad16x##H##_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return vpx_sad16x(src, src_stride, ref, ref_stride, H); \
+ }
+
+vpx_sad16xN(32);
+vpx_sad16xN(16);
+vpx_sad16xN(8);
+sadMxNxK_mmi(16, 16, 3);
+sadMxNxK_mmi(16, 16, 8);
+sadMxNxK_mmi(16, 8, 3);
+sadMxNxK_mmi(16, 8, 8);
+sadMxNx4D_mmi(16, 32);
+sadMxNx4D_mmi(16, 16);
+sadMxNx4D_mmi(16, 8);
+
+static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred,
+ int counter) {
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
+ mips_reg l_counter = counter;
+
+ __asm__ volatile (
+ "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "1: \n\t"
+ // The loop body is unrolled twice to reduce loop overhead.
+ SAD_SRC_AVGREF_ABS_SUB_16
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x10)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_AVGREF_ABS_SUB_16
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x10)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp5] \n\t"
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+ [src]"+&r"(src), [ref]"+&r"(ref),
+ [second_pred]"+&r"((mips_reg)second_pred),
+ [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ );
+
+ return sad;
+}
+
+#define vpx_sad_avg16xN(H) \
+ unsigned int vpx_sad16x##H##_avg_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return vpx_sad_avg16x(src, src_stride, ref, ref_stride, second_pred, H); \
+ }
+
+vpx_sad_avg16xN(32);
+vpx_sad_avg16xN(16);
+vpx_sad_avg16xN(8);
+
+static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int counter) {
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3;
+ mips_reg l_counter = counter;
+
+ __asm__ volatile (
+ "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
+ "1: \n\t"
+ // The loop body is unrolled twice to reduce loop overhead.
+ SAD_SRC_REF_ABS_SUB_8
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_REF_ABS_SUB_8
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp3] \n\t"
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
+ [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ );
+
+ return sad;
+}
+
+#define vpx_sad8xN(H) \
+ unsigned int vpx_sad8x##H##_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return vpx_sad8x(src, src_stride, ref, ref_stride, H); \
+ }
+
+vpx_sad8xN(16);
+vpx_sad8xN(8);
+vpx_sad8xN(4);
+sadMxNxK_mmi(8, 16, 3);
+sadMxNxK_mmi(8, 16, 8);
+sadMxNxK_mmi(8, 8, 3);
+sadMxNxK_mmi(8, 8, 8);
+sadMxNx4D_mmi(8, 16);
+sadMxNx4D_mmi(8, 8);
+sadMxNx4D_mmi(8, 4);
+
+static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred,
+ int counter) {
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3;
+ mips_reg l_counter = counter;
+
+ __asm__ volatile (
+ "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
+ "1: \n\t"
+ // The loop body is unrolled twice to reduce loop overhead.
+ SAD_SRC_AVGREF_ABS_SUB_8
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x08)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_AVGREF_ABS_SUB_8
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x08)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp3] \n\t"
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
+ [second_pred]"+&r"((mips_reg)second_pred),
+ [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ );
+
+ return sad;
+}
+
+#define vpx_sad_avg8xN(H) \
+ unsigned int vpx_sad8x##H##_avg_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return vpx_sad_avg8x(src, src_stride, ref, ref_stride, second_pred, H); \
+ }
+
+vpx_sad_avg8xN(16);
+vpx_sad_avg8xN(8);
+vpx_sad_avg8xN(4);
+
+static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int counter) {
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3;
+ mips_reg l_counter = counter;
+
+ __asm__ volatile (
+ "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
+ "1: \n\t"
+ // The loop body is unrolled twice to reduce loop overhead.
+ SAD_SRC_REF_ABS_SUB_4
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_REF_ABS_SUB_4
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp3] \n\t"
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
+ [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ );
+
+ return sad;
+}
+
+#define vpx_sad4xN(H) \
+ unsigned int vpx_sad4x##H##_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return vpx_sad4x(src, src_stride, ref, ref_stride, H); \
+ }
+
+vpx_sad4xN(8);
+vpx_sad4xN(4);
+sadMxNxK_mmi(4, 4, 3);
+sadMxNxK_mmi(4, 4, 8);
+sadMxNx4D_mmi(4, 8);
+sadMxNx4D_mmi(4, 4);
+
+static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred,
+ int counter) {
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3;
+ mips_reg l_counter = counter;
+
+ __asm__ volatile (
+ "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
+ "1: \n\t"
+    // Two copies of the loop body per iteration, to reduce loop overhead.
+ SAD_SRC_AVGREF_ABS_SUB_4
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x04)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_AVGREF_ABS_SUB_4
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x04)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp3] \n\t"
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
+ [second_pred]"+&r"((mips_reg)second_pred),
+ [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ );
+
+ return sad;
+}
+
+#define vpx_sad_avg4xN(H) \
+ unsigned int vpx_sad4x##H##_avg_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return vpx_sad_avg4x(src, src_stride, ref, ref_stride, second_pred, H); \
+ }
+
+vpx_sad_avg4xN(8);
+vpx_sad_avg4xN(4);
diff --git a/libvpx/vpx_dsp/mips/sad_msa.c b/libvpx/vpx_dsp/mips/sad_msa.c
index e295123ac..ab681ae9f 100644
--- a/libvpx/vpx_dsp/mips/sad_msa.c
+++ b/libvpx/vpx_dsp/mips/sad_msa.c
@@ -283,96 +283,6 @@ static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
sad_array[2] = HADD_UH_U32(sad2);
}
-static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
- int32_t ht_cnt;
- v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
-
- for (ht_cnt = height >> 1; ht_cnt--;) {
- LD_UB2(src, 16, src0, src1);
- src += src_stride;
- LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
- ref += ref_stride;
-
- sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
- sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
- sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- LD_UB2(src, 16, src0, src1);
- src += src_stride;
- LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
- ref += ref_stride;
-
- sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
- sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
- sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3;
- v8u16 sad0_0 = { 0 };
- v8u16 sad0_1 = { 0 };
- v8u16 sad1_0 = { 0 };
- v8u16 sad1_1 = { 0 };
- v8u16 sad2_0 = { 0 };
- v8u16 sad2_1 = { 0 };
- v4u32 sad;
-
- for (ht_cnt = height; ht_cnt--;) {
- LD_UB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
- LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3);
- ref0_4 = LD_UB(ref + 64);
- ref += ref_stride;
-
- sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
- sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
- SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
- sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
- SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
- sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
- }
-
- sad = __msa_hadd_u_w(sad0_0, sad0_0);
- sad += __msa_hadd_u_w(sad0_1, sad0_1);
- sad_array[0] = HADD_SW_S32((v4i32)sad);
-
- sad = __msa_hadd_u_w(sad1_0, sad1_0);
- sad += __msa_hadd_u_w(sad1_1, sad1_1);
- sad_array[1] = HADD_SW_S32((v4i32)sad);
-
- sad = __msa_hadd_u_w(sad2_0, sad2_0);
- sad += __msa_hadd_u_w(sad2_1, sad2_1);
- sad_array[2] = HADD_SW_S32((v4i32)sad);
-}
-
static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
const uint8_t *ref_ptr, int32_t ref_stride,
int32_t height, uint32_t *sad_array) {
@@ -623,176 +533,6 @@ static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
sad_array[7] = HADD_UH_U32(sad7);
}
-static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
- int32_t ht_cnt;
- v16u8 src0, src1;
- v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
- v8u16 sad3 = { 0 };
- v8u16 sad4 = { 0 };
- v8u16 sad5 = { 0 };
- v8u16 sad6 = { 0 };
- v8u16 sad7 = { 0 };
-
- for (ht_cnt = height; ht_cnt--;) {
- LD_UB2(src, 16, src0, src1);
- src += src_stride;
- LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
- ref += ref_stride;
-
- sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
- sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
- sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
- sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
- sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
- sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
- sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
- sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
- sad_array[3] = HADD_UH_U32(sad3);
- sad_array[4] = HADD_UH_U32(sad4);
- sad_array[5] = HADD_UH_U32(sad5);
- sad_array[6] = HADD_UH_U32(sad6);
- sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
- const uint8_t *src_dup, *ref_dup;
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4;
- v16u8 ref0, ref1, ref2, ref3;
- v8u16 sad0_0 = { 0 };
- v8u16 sad0_1 = { 0 };
- v8u16 sad1_0 = { 0 };
- v8u16 sad1_1 = { 0 };
- v8u16 sad2_0 = { 0 };
- v8u16 sad2_1 = { 0 };
- v8u16 sad3_0 = { 0 };
- v8u16 sad3_1 = { 0 };
- v4u32 sad;
-
- src_dup = src;
- ref_dup = ref;
-
- for (ht_cnt = height; ht_cnt--;) {
- LD_UB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
- LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
- ref += ref_stride;
-
- sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
- sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
- SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
- sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
- SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
- sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
- SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3);
- sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
- }
-
- sad = __msa_hadd_u_w(sad0_0, sad0_0);
- sad += __msa_hadd_u_w(sad0_1, sad0_1);
- sad_array[0] = HADD_SW_S32(sad);
-
- sad = __msa_hadd_u_w(sad1_0, sad1_0);
- sad += __msa_hadd_u_w(sad1_1, sad1_1);
- sad_array[1] = HADD_SW_S32(sad);
-
- sad = __msa_hadd_u_w(sad2_0, sad2_0);
- sad += __msa_hadd_u_w(sad2_1, sad2_1);
- sad_array[2] = HADD_SW_S32(sad);
-
- sad = __msa_hadd_u_w(sad3_0, sad3_0);
- sad += __msa_hadd_u_w(sad3_1, sad3_1);
- sad_array[3] = HADD_SW_S32(sad);
-
- sad0_0 = (v8u16)__msa_ldi_h(0);
- sad0_1 = (v8u16)__msa_ldi_h(0);
- sad1_0 = (v8u16)__msa_ldi_h(0);
- sad1_1 = (v8u16)__msa_ldi_h(0);
- sad2_0 = (v8u16)__msa_ldi_h(0);
- sad2_1 = (v8u16)__msa_ldi_h(0);
- sad3_0 = (v8u16)__msa_ldi_h(0);
- sad3_1 = (v8u16)__msa_ldi_h(0);
-
- for (ht_cnt = 64; ht_cnt--;) {
- LD_UB4(src_dup, 16, src0, src1, src2, src3);
- src_dup += src_stride;
- LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
- ref_dup += ref_stride;
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
- SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4);
- sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
- SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5);
- sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
- SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6);
- sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
- SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7);
- sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
- }
-
- sad = __msa_hadd_u_w(sad0_0, sad0_0);
- sad += __msa_hadd_u_w(sad0_1, sad0_1);
- sad_array[4] = HADD_SW_S32(sad);
-
- sad = __msa_hadd_u_w(sad1_0, sad1_0);
- sad += __msa_hadd_u_w(sad1_1, sad1_1);
- sad_array[5] = HADD_SW_S32(sad);
-
- sad = __msa_hadd_u_w(sad2_0, sad2_0);
- sad += __msa_hadd_u_w(sad2_1, sad2_1);
- sad_array[6] = HADD_SW_S32(sad);
-
- sad = __msa_hadd_u_w(sad3_0, sad3_0);
- sad += __msa_hadd_u_w(sad3_1, sad3_1);
- sad_array[7] = HADD_SW_S32(sad);
-}
-
static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
const uint8_t *const aref_ptr[],
int32_t ref_stride, int32_t height,
@@ -1318,20 +1058,6 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
}
-#define VPX_SAD_32xHEIGHTx3_MSA(height) \
- void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
- }
-
-#define VPX_SAD_64xHEIGHTx3_MSA(height) \
- void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
- }
-
#define VPX_SAD_4xHEIGHTx8_MSA(height) \
void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride, \
@@ -1353,20 +1079,6 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
}
-#define VPX_SAD_32xHEIGHTx8_MSA(height) \
- void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
- }
-
-#define VPX_SAD_64xHEIGHTx8_MSA(height) \
- void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
- }
-
#define VPX_SAD_4xHEIGHTx4D_MSA(height) \
void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *const refs[], \
@@ -1444,43 +1156,31 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
// 64x64
VPX_SAD_64xHEIGHT_MSA(64);
-VPX_SAD_64xHEIGHTx3_MSA(64);
-VPX_SAD_64xHEIGHTx8_MSA(64);
VPX_SAD_64xHEIGHTx4D_MSA(64);
VPX_AVGSAD_64xHEIGHT_MSA(64);
// 64x32
VPX_SAD_64xHEIGHT_MSA(32);
-VPX_SAD_64xHEIGHTx3_MSA(32);
-VPX_SAD_64xHEIGHTx8_MSA(32);
VPX_SAD_64xHEIGHTx4D_MSA(32);
VPX_AVGSAD_64xHEIGHT_MSA(32);
// 32x64
VPX_SAD_32xHEIGHT_MSA(64);
-VPX_SAD_32xHEIGHTx3_MSA(64);
-VPX_SAD_32xHEIGHTx8_MSA(64);
VPX_SAD_32xHEIGHTx4D_MSA(64);
VPX_AVGSAD_32xHEIGHT_MSA(64);
// 32x32
VPX_SAD_32xHEIGHT_MSA(32);
-VPX_SAD_32xHEIGHTx3_MSA(32);
-VPX_SAD_32xHEIGHTx8_MSA(32);
VPX_SAD_32xHEIGHTx4D_MSA(32);
VPX_AVGSAD_32xHEIGHT_MSA(32);
// 32x16
VPX_SAD_32xHEIGHT_MSA(16);
-VPX_SAD_32xHEIGHTx3_MSA(16);
-VPX_SAD_32xHEIGHTx8_MSA(16);
VPX_SAD_32xHEIGHTx4D_MSA(16);
VPX_AVGSAD_32xHEIGHT_MSA(16);
// 16x32
VPX_SAD_16xHEIGHT_MSA(32);
-VPX_SAD_16xHEIGHTx3_MSA(32);
-VPX_SAD_16xHEIGHTx8_MSA(32);
VPX_SAD_16xHEIGHTx4D_MSA(32);
VPX_AVGSAD_16xHEIGHT_MSA(32);
@@ -1514,15 +1214,11 @@ VPX_AVGSAD_8xHEIGHT_MSA(8);
// 8x4
VPX_SAD_8xHEIGHT_MSA(4);
-VPX_SAD_8xHEIGHTx3_MSA(4);
-VPX_SAD_8xHEIGHTx8_MSA(4);
VPX_SAD_8xHEIGHTx4D_MSA(4);
VPX_AVGSAD_8xHEIGHT_MSA(4);
// 4x8
VPX_SAD_4xHEIGHT_MSA(8);
-VPX_SAD_4xHEIGHTx3_MSA(8);
-VPX_SAD_4xHEIGHTx8_MSA(8);
VPX_SAD_4xHEIGHTx4D_MSA(8);
VPX_AVGSAD_4xHEIGHT_MSA(8);
diff --git a/libvpx/vpx_dsp/mips/subtract_mmi.c b/libvpx/vpx_dsp/mips/subtract_mmi.c
new file mode 100644
index 000000000..9f361704a
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/subtract_mmi.c
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+void vpx_subtract_block_mmi(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src,
+ ptrdiff_t src_stride, const uint8_t *pred,
+ ptrdiff_t pred_stride) {
+ double ftmp[13];
+ uint32_t tmp[1];
+
+ if (rows == cols) {
+ switch (rows) {
+ case 4:
+ __asm__ volatile(
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+#if _MIPS_SIM == _ABIO32
+ "ulw %[tmp0], 0x00(%[src]) \n\t"
+ "mtc1 %[tmp0], %[ftmp1] \n\t"
+ "ulw %[tmp0], 0x00(%[pred]) \n\t"
+ "mtc1 %[tmp0], %[ftmp2] \n\t"
+#else
+ "gslwlc1 %[ftmp1], 0x03(%[src]) \n\t"
+ "gslwrc1 %[ftmp1], 0x00(%[src]) \n\t"
+ "gslwlc1 %[ftmp2], 0x03(%[pred]) \n\t"
+ "gslwrc1 %[ftmp2], 0x00(%[pred]) \n\t"
+#endif
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[pred], %[pred], %[pred_stride])
+
+#if _MIPS_SIM == _ABIO32
+ "ulw %[tmp0], 0x00(%[src]) \n\t"
+ "mtc1 %[tmp0], %[ftmp3] \n\t"
+ "ulw %[tmp0], 0x00(%[pred]) \n\t"
+ "mtc1 %[tmp0], %[ftmp4] \n\t"
+#else
+ "gslwlc1 %[ftmp3], 0x03(%[src]) \n\t"
+ "gslwrc1 %[ftmp3], 0x00(%[src]) \n\t"
+ "gslwlc1 %[ftmp4], 0x03(%[pred]) \n\t"
+ "gslwrc1 %[ftmp4], 0x00(%[pred]) \n\t"
+#endif
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[pred], %[pred], %[pred_stride])
+
+#if _MIPS_SIM == _ABIO32
+ "ulw %[tmp0], 0x00(%[src]) \n\t"
+ "mtc1 %[tmp0], %[ftmp5] \n\t"
+ "ulw %[tmp0], 0x00(%[pred]) \n\t"
+ "mtc1 %[tmp0], %[ftmp6] \n\t"
+#else
+ "gslwlc1 %[ftmp5], 0x03(%[src]) \n\t"
+ "gslwrc1 %[ftmp5], 0x00(%[src]) \n\t"
+ "gslwlc1 %[ftmp6], 0x03(%[pred]) \n\t"
+ "gslwrc1 %[ftmp6], 0x00(%[pred]) \n\t"
+#endif
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[pred], %[pred], %[pred_stride])
+
+#if _MIPS_SIM == _ABIO32
+ "ulw %[tmp0], 0x00(%[src]) \n\t"
+ "mtc1 %[tmp0], %[ftmp7] \n\t"
+ "ulw %[tmp0], 0x00(%[pred]) \n\t"
+ "mtc1 %[tmp0], %[ftmp8] \n\t"
+#else
+ "gslwlc1 %[ftmp7], 0x03(%[src]) \n\t"
+ "gslwrc1 %[ftmp7], 0x00(%[src]) \n\t"
+ "gslwlc1 %[ftmp8], 0x03(%[pred]) \n\t"
+ "gslwrc1 %[ftmp8], 0x00(%[pred]) \n\t"
+#endif
+ "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp10], %[ftmp2], %[ftmp0] \n\t"
+ "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t"
+ "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t"
+ "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t"
+ MMI_ADDU(%[diff], %[diff], %[diff_stride])
+ "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp10], %[ftmp4], %[ftmp0] \n\t"
+ "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t"
+ "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t"
+ "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t"
+ MMI_ADDU(%[diff], %[diff], %[diff_stride])
+ "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp10], %[ftmp6], %[ftmp0] \n\t"
+ "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t"
+ "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t"
+ "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t"
+ MMI_ADDU(%[diff], %[diff], %[diff_stride])
+ "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp10], %[ftmp8], %[ftmp0] \n\t"
+ "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t"
+ "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t"
+ "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t"
+ : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]),
+ [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]),
+ [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+ [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]),
+ [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]),
+ [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]),
+#if _MIPS_SIM == _ABIO32
+ [tmp0] "=&r"(tmp[0]),
+#endif
+ [src] "+&r"(src), [pred] "+&r"(pred), [diff] "+&r"(diff)
+ : [src_stride] "r"((mips_reg)src_stride),
+ [pred_stride] "r"((mips_reg)pred_stride),
+ [diff_stride] "r"((mips_reg)(diff_stride * 2))
+ : "memory");
+ break;
+ case 8:
+ __asm__ volatile(
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "li %[tmp0], 0x02 \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[pred]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[pred]) \n\t"
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[pred], %[pred], %[pred_stride])
+ "gsldlc1 %[ftmp3], 0x07(%[src]) \n\t"
+ "gsldrc1 %[ftmp3], 0x00(%[src]) \n\t"
+ "gsldlc1 %[ftmp4], 0x07(%[pred]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[pred]) \n\t"
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[pred], %[pred], %[pred_stride])
+ "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t"
+ "gsldlc1 %[ftmp6], 0x07(%[pred]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[pred]) \n\t"
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[pred], %[pred], %[pred_stride])
+ "gsldlc1 %[ftmp7], 0x07(%[src]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[src]) \n\t"
+ "gsldlc1 %[ftmp8], 0x07(%[pred]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[pred]) \n\t"
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[pred], %[pred], %[pred_stride])
+ "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp11], %[ftmp2], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp2], %[ftmp0] \n\t"
+ "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t"
+ "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t"
+ "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t"
+ "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t"
+ "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t"
+ MMI_ADDU(%[diff], %[diff], %[diff_stride])
+ "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp3], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp11], %[ftmp4], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp4], %[ftmp0] \n\t"
+ "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t"
+ "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t"
+ "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t"
+ "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t"
+ "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t"
+ MMI_ADDU(%[diff], %[diff], %[diff_stride])
+ "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp11], %[ftmp6], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp6], %[ftmp0] \n\t"
+ "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t"
+ "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t"
+ "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t"
+ "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t"
+ "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t"
+ MMI_ADDU(%[diff], %[diff], %[diff_stride])
+ "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp7], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp11], %[ftmp8], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp8], %[ftmp0] \n\t"
+ "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t"
+ "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t"
+ "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t"
+ "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t"
+ "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t"
+ MMI_ADDU(%[diff], %[diff], %[diff_stride])
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ "bnez %[tmp0], 1b \n\t"
+ : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]),
+ [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]),
+ [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+ [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]),
+ [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]),
+ [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]), [src] "+&r"(src),
+ [pred] "+&r"(pred), [diff] "+&r"(diff)
+ : [pred_stride] "r"((mips_reg)pred_stride),
+ [src_stride] "r"((mips_reg)src_stride),
+ [diff_stride] "r"((mips_reg)(diff_stride * 2))
+ : "memory");
+ break;
+ case 16:
+ __asm__ volatile(
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "li %[tmp0], 0x08 \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[pred]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[pred]) \n\t"
+ "gsldlc1 %[ftmp3], 0x0f(%[src]) \n\t"
+ "gsldrc1 %[ftmp3], 0x08(%[src]) \n\t"
+ "gsldlc1 %[ftmp4], 0x0f(%[pred]) \n\t"
+ "gsldrc1 %[ftmp4], 0x08(%[pred]) \n\t"
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[pred], %[pred], %[pred_stride])
+ "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t"
+ "gsldlc1 %[ftmp6], 0x07(%[pred]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[pred]) \n\t"
+ "gsldlc1 %[ftmp7], 0x0f(%[src]) \n\t"
+ "gsldrc1 %[ftmp7], 0x08(%[src]) \n\t"
+ "gsldlc1 %[ftmp8], 0x0f(%[pred]) \n\t"
+ "gsldrc1 %[ftmp8], 0x08(%[pred]) \n\t"
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[pred], %[pred], %[pred_stride])
+ "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp11], %[ftmp2], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp2], %[ftmp0] \n\t"
+ "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t"
+ "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t"
+ "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t"
+ "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t"
+ "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t"
+ "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp3], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp11], %[ftmp4], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp4], %[ftmp0] \n\t"
+ "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t"
+ "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "gssdlc1 %[ftmp9], 0x17(%[diff]) \n\t"
+ "gssdrc1 %[ftmp9], 0x10(%[diff]) \n\t"
+ "gssdlc1 %[ftmp10], 0x1f(%[diff]) \n\t"
+ "gssdrc1 %[ftmp10], 0x18(%[diff]) \n\t"
+ MMI_ADDU(%[diff], %[diff], %[diff_stride])
+ "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp11], %[ftmp6], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp6], %[ftmp0] \n\t"
+ "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t"
+ "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t"
+ "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t"
+ "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t"
+ "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t"
+ "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp7], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp11], %[ftmp8], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp8], %[ftmp0] \n\t"
+ "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t"
+ "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "gssdlc1 %[ftmp9], 0x17(%[diff]) \n\t"
+ "gssdrc1 %[ftmp9], 0x10(%[diff]) \n\t"
+ "gssdlc1 %[ftmp10], 0x1f(%[diff]) \n\t"
+ "gssdrc1 %[ftmp10], 0x18(%[diff]) \n\t"
+ MMI_ADDU(%[diff], %[diff], %[diff_stride])
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ "bnez %[tmp0], 1b \n\t"
+ : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]),
+ [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]),
+ [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+ [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]),
+ [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]),
+ [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]), [src] "+&r"(src),
+ [pred] "+&r"(pred), [diff] "+&r"(diff)
+ : [pred_stride] "r"((mips_reg)pred_stride),
+ [src_stride] "r"((mips_reg)src_stride),
+ [diff_stride] "r"((mips_reg)(diff_stride * 2))
+ : "memory");
+ break;
+      case 32:
+      case 64:
+      default:
+        // No dedicated MMI path for these sizes; fall back to the C version.
+        vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride,
+                             pred, pred_stride);
+        break;
+ }
+ } else {
+ vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, pred,
+ pred_stride);
+ }
+}
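+
+// For reference, each MMI path above vectorizes the scalar operation of
+// vpx_subtract_block_c:
+//   for (r = 0; r < rows; ++r)
+//     for (c = 0; c < cols; ++c)
+//       diff[r * diff_stride + c] =
+//           (int16_t)src[r * src_stride + c] - pred[r * pred_stride + c];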
diff --git a/libvpx/vpx_dsp/mips/variance_mmi.c b/libvpx/vpx_dsp/mips/variance_mmi.c
new file mode 100644
index 000000000..4af60d363
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/variance_mmi.c
@@ -0,0 +1,1280 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/variance.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+static const uint8_t bilinear_filters[8][2] = {
+ { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
+};
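+
+// Each row of the table holds the two taps for a 1/8-pel offset k:
+// { 128 - 16 * k, 16 * k }. The taps always sum to 128 (1 << FILTER_BITS),
+// so, e.g., for k = 3 the filtered value is (80 * a[0] + 48 * a[1] + 64) >> 7,
+// which is what ROUND_POWER_OF_TWO() computes in the two pass helpers below.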
+
+/* Use VARIANCE_SSE_SUM_8_FOR_W64 in vpx_variance64x64, vpx_variance64x32 and
+   vpx_variance32x64; VARIANCE_SSE_SUM_8 accumulates the pixel sums in 16-bit
+   lanes and would overflow for these block sizes. */
+#define VARIANCE_SSE_SUM_8_FOR_W64 \
+ /* sse */ \
+ "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
+ "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
+ "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
+ "paddw %[ftmp10], %[ftmp10], %[ftmp6] \n\t" \
+ "paddw %[ftmp10], %[ftmp10], %[ftmp7] \n\t" \
+ \
+ /* sum */ \
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
+ "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
+ "punpcklhw %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp2], %[ftmp3], %[ftmp0] \n\t" \
+ "punpcklhw %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \
+ "psubw %[ftmp3], %[ftmp1], %[ftmp7] \n\t" \
+ "psubw %[ftmp5], %[ftmp2], %[ftmp8] \n\t" \
+ "punpcklhw %[ftmp1], %[ftmp4], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp2], %[ftmp4], %[ftmp0] \n\t" \
+ "punpcklhw %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \
+ "psubw %[ftmp4], %[ftmp1], %[ftmp7] \n\t" \
+ "psubw %[ftmp6], %[ftmp2], %[ftmp8] \n\t" \
+ "paddw %[ftmp9], %[ftmp9], %[ftmp3] \n\t" \
+ "paddw %[ftmp9], %[ftmp9], %[ftmp4] \n\t" \
+ "paddw %[ftmp9], %[ftmp9], %[ftmp5] \n\t" \
+ "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t"
+
+#define VARIANCE_SSE_SUM_4 \
+ /* sse */ \
+ "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
+ "pmaddhw %[ftmp5], %[ftmp4], %[ftmp4] \n\t" \
+ "paddw %[ftmp6], %[ftmp6], %[ftmp5] \n\t" \
+ \
+ /* sum */ \
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \
+ "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" \
+ "paddh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
+
+#define VARIANCE_SSE_SUM_8 \
+ /* sse */ \
+ "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
+ "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
+ "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
+ "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
+ "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" \
+ \
+ /* sum */ \
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
+ "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
+ "paddh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" \
+ "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" \
+ "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" \
+ "paddh %[ftmp12], %[ftmp12], %[ftmp6] \n\t"
+
+#define VARIANCE_SSE_8 \
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" \
+ "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
+ "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
+ "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
+ "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
+ "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+
+#define VARIANCE_SSE_16 \
+ VARIANCE_SSE_8 \
+ "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" \
+ "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
+ "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
+ "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
+ "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
+ "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A \
+  /* calculate fdata3[0] ~ fdata3[3], store in ftmp2 */ \
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
+ "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
+ "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp3], %[ftmp3], %[filter_x1] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B \
+  /* calculate fdata3[0] ~ fdata3[3], store in ftmp4 */ \
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
+ "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \
+ "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
+ "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
+ "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A \
+ /* calculate: temp2[0] ~ temp2[3] */ \
+ "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" \
+ \
+ /* store: temp2[0] ~ temp2[3] */ \
+ "and %[ftmp2], %[ftmp2], %[mask] \n\t" \
+ "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
+ "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B \
+ /* calculate: temp2[0] ~ temp2[3] */ \
+ "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \
+ "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \
+ \
+ /* store: temp2[0] ~ temp2[3] */ \
+ "and %[ftmp4], %[ftmp4], %[mask] \n\t" \
+ "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
+ "gssdrc1 %[ftmp4], 0x00(%[temp2_ptr]) \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \
+  /* calculate fdata3[0] ~ fdata3[7], store in ftmp2 and ftmp3 */ \
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
+ "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
+ "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \
+ "pmullh %[ftmp3], %[ftmp3], %[filter_x0] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
+ "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp4], %[ftmp4], %[filter_x1] \n\t" \
+ "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \
+ "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \
+  /* calculate fdata3[0] ~ fdata3[7], store in ftmp8 and ftmp9 */ \
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \
+ "punpcklbh %[ftmp8], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" \
+ "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \
+ "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \
+ "pmullh %[ftmp8], %[ftmp8], %[filter_x0] \n\t" \
+ "pmullh %[ftmp9], %[ftmp9], %[filter_x0] \n\t" \
+ "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \
+ "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp10], %[ftmp10], %[filter_x1] \n\t" \
+ "pmullh %[ftmp11], %[ftmp11], %[filter_x1] \n\t" \
+ "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" \
+ "paddh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" \
+ "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \
+ "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \
+ /* calculate: temp2[0] ~ temp2[3] */ \
+ "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp8], %[filter_y1] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \
+ \
+ /* calculate: temp2[4] ~ temp2[7] */ \
+ "pmullh %[ftmp3], %[ftmp3], %[filter_y0] \n\t" \
+ "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp9], %[filter_y1] \n\t" \
+ "paddh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" \
+ "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" \
+ \
+ /* store: temp2[0] ~ temp2[7] */ \
+ "and %[ftmp2], %[ftmp2], %[mask] \n\t" \
+ "and %[ftmp3], %[ftmp3], %[mask] \n\t" \
+ "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
+ "gssdlc1 %[ftmp2], 0x07(%[temp2_ptr]) \n\t" \
+ "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \
+ /* calculate: temp2[0] ~ temp2[3] */ \
+ "pmullh %[ftmp8], %[ftmp8], %[filter_y0] \n\t" \
+ "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \
+ "paddh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" \
+ "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \
+ \
+ /* calculate: temp2[4] ~ temp2[7] */ \
+ "pmullh %[ftmp9], %[ftmp9], %[filter_y0] \n\t" \
+ "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp3], %[filter_y1] \n\t" \
+ "paddh %[ftmp9], %[ftmp9], %[ftmp1] \n\t" \
+ "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" \
+ \
+ /* store: temp2[0] ~ temp2[7] */ \
+ "and %[ftmp8], %[ftmp8], %[mask] \n\t" \
+ "and %[ftmp9], %[ftmp9], %[mask] \n\t" \
+ "packushb %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
+ "gssdlc1 %[ftmp8], 0x07(%[temp2_ptr]) \n\t" \
+ "gssdrc1 %[ftmp8], 0x00(%[temp2_ptr]) \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A \
+  /* calculate fdata3[0] ~ fdata3[7], store in ftmp2 and ftmp3 */ \
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \
+ \
+  /* calculate fdata3[8] ~ fdata3[15], store in ftmp4 and ftmp5 */ \
+ "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
+ "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \
+ "punpcklbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" \
+ "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \
+ "pmullh %[ftmp5], %[ftmp5], %[filter_x0] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
+ "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp6], %[ftmp6], %[filter_x1] \n\t" \
+ "pmullh %[ftmp7], %[ftmp7], %[filter_x1] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \
+ "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
+ "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \
+ "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B \
+  /* calculate fdata3[0] ~ fdata3[7], store in ftmp8 and ftmp9 */ \
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \
+ \
+  /* calculate fdata3[8] ~ fdata3[15], store in ftmp10 and ftmp11 */ \
+ "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \
+ "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \
+ "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \
+ "punpcklbh %[ftmp12], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp13], %[ftmp1], %[ftmp0] \n\t" \
+ "pmullh %[ftmp10], %[ftmp10], %[filter_x0] \n\t" \
+ "pmullh %[ftmp11], %[ftmp11], %[filter_x0] \n\t" \
+ "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \
+ "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp12], %[ftmp12], %[filter_x1] \n\t" \
+ "pmullh %[ftmp13], %[ftmp13], %[filter_x1] \n\t" \
+ "paddh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" \
+ "paddh %[ftmp11], %[ftmp11], %[ftmp13] \n\t" \
+ "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \
+ "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A \
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \
+ \
+ /* calculate: temp2[8] ~ temp2[11] */ \
+ "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp10], %[filter_y1] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \
+ "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \
+ \
+ /* calculate: temp2[12] ~ temp2[15] */ \
+ "pmullh %[ftmp5], %[ftmp5], %[filter_y0] \n\t" \
+ "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp11], %[filter_y1] \n\t" \
+ "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" \
+ \
+ /* store: temp2[8] ~ temp2[15] */ \
+ "and %[ftmp4], %[ftmp4], %[mask] \n\t" \
+ "and %[ftmp5], %[ftmp5], %[mask] \n\t" \
+ "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
+ "gssdlc1 %[ftmp4], 0x0f(%[temp2_ptr]) \n\t" \
+ "gssdrc1 %[ftmp4], 0x08(%[temp2_ptr]) \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B \
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \
+ \
+ /* calculate: temp2[8] ~ temp2[11] */ \
+ "pmullh %[ftmp10], %[ftmp10], %[filter_y0] \n\t" \
+ "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \
+ "paddh %[ftmp10], %[ftmp10], %[ftmp1] \n\t" \
+ "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \
+ \
+ /* calculate: temp2[12] ~ temp2[15] */ \
+ "pmullh %[ftmp11], %[ftmp11], %[filter_y0] \n\t" \
+ "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp5], %[filter_y1] \n\t" \
+ "paddh %[ftmp11], %[ftmp11], %[ftmp1] \n\t" \
+ "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" \
+ \
+ /* store: temp2[8] ~ temp2[15] */ \
+ "and %[ftmp10], %[ftmp10], %[mask] \n\t" \
+ "and %[ftmp11], %[ftmp11], %[mask] \n\t" \
+ "packushb %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \
+ "gssdlc1 %[ftmp10], 0x0f(%[temp2_ptr]) \n\t" \
+ "gssdrc1 %[ftmp10], 0x08(%[temp2_ptr]) \n\t"
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either the
+// horizontal or vertical direction to produce the filtered output block. Used
+// to implement the first pass of a 2-D separable filter.
+//
+// Produces uint16_t output to retain precision for the next pass. The two
+// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
+// filter is applied horizontally (pixel_step = 1) or vertically
+// (pixel_step = stride), i.e. the offset required to move from one input
+// sample to the next.
+static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ b[j] = ROUND_POWER_OF_TWO(
+ (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
+
+ ++a;
+ }
+
+ a += src_pixels_per_line - output_width;
+ b += output_width;
+ }
+}
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either the
+// horizontal or vertical direction to produce the filtered output block. Used
+// to implement the second pass of a 2-D separable filter.
+//
+// Requires 16-bit input as produced by var_filter_block2d_bil_first_pass. The
+// two filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
+// filter is applied horizontally (pixel_step = 1) or vertically
+// (pixel_step = stride), i.e. the offset required to move from one input
+// sample to the next. Output is 8-bit.
+static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ b[j] = ROUND_POWER_OF_TWO(
+ (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
+ ++a;
+ }
+
+ a += src_pixels_per_line - output_width;
+ b += output_width;
+ }
+}
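+
+// A minimal sketch of how the two passes compose for an 8x8 sub-pixel
+// variance; the helper name subpel_variance8x8_sketch is hypothetical, and
+// the (H + 1) x W first-pass buffer layout follows the usual libvpx pattern.
+static inline uint32_t subpel_variance8x8_sketch(
+    const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
+    int b_stride, uint32_t *sse) {
+  uint16_t fdata3[(8 + 1) * 8];  // horizontal pass emits one extra row
+  uint8_t temp2[8 * 8];
+  var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, 9, 8,
+                                    bilinear_filters[xoffset]);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8,
+                                     bilinear_filters[yoffset]);
+  return vpx_variance8x8_mmi(temp2, 8, b, b_stride, sse);
+}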
+
+static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse, int high) {
+ int sum;
+ double ftmp[12];
+ uint32_t tmp[3];
+
+ *sse = 0;
+
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ MMI_L(%[tmp0], %[high], 0x00)
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+ "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x27(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x20(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x27(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x20(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x2f(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x28(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x2f(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x28(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x37(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x30(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x37(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x30(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x3f(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x38(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x3f(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x38(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ MMI_ADDU(%[b], %[b], %[b_stride])
+ "bnez %[tmp0], 1b \n\t"
+
+ "mfc1 %[tmp1], %[ftmp9] \n\t"
+ "mfhc1 %[tmp2], %[ftmp9] \n\t"
+ "addu %[sum], %[tmp1], %[tmp2] \n\t"
+ "dsrl %[ftmp1], %[ftmp10], %[ftmp11] \n\t"
+ "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
+ "swc1 %[ftmp1], 0x00(%[sse]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
+ [tmp2]"=&r"(tmp[2]),
+ [a]"+&r"(a), [b]"+&r"(b),
+ [sum]"=&r"(sum)
+ : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
+ [high]"r"(&high), [sse]"r"(sse)
+ : "memory"
+ );
+
+ return *sse - (((int64_t)sum * sum) / (64 * high));
+}
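+
+// The return value above applies the variance identity over the block's
+// N = 64 * high pixels: sum((a - b)^2) - (sum(a - b))^2 / N, with the squared
+// sum widened to int64_t before the division to avoid overflow.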
+
+#define VPX_VARIANCE64XN(n) \
+ uint32_t vpx_variance64x##n##_mmi(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ return vpx_variance64x(a, a_stride, b, b_stride, sse, n); \
+ }
+
+VPX_VARIANCE64XN(64)
+VPX_VARIANCE64XN(32)
+
+uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, uint32_t *sse) {
+ int sum;
+ double ftmp[12];
+ uint32_t tmp[3];
+
+ *sse = 0;
+
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "li %[tmp0], 0x40 \n\t"
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+ "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ MMI_ADDU(%[b], %[b], %[b_stride])
+ "bnez %[tmp0], 1b \n\t"
+
+ "mfc1 %[tmp1], %[ftmp9] \n\t"
+ "mfhc1 %[tmp2], %[ftmp9] \n\t"
+ "addu %[sum], %[tmp1], %[tmp2] \n\t"
+ "dsrl %[ftmp1], %[ftmp10], %[ftmp11] \n\t"
+ "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
+ "swc1 %[ftmp1], 0x00(%[sse]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
+ [tmp2]"=&r"(tmp[2]),
+ [a]"+&r"(a), [b]"+&r"(b),
+ [sum]"=&r"(sum)
+ : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
+ [sse]"r"(sse)
+ : "memory"
+ );
+
+  return *sse - (((int64_t)sum * sum) / (32 * 64));
+}
+
+static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse, int high) {
+ int sum;
+ double ftmp[13];
+ uint32_t tmp[3];
+
+ *sse = 0;
+
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ MMI_L(%[tmp0], %[high], 0x00)
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8
+ "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8
+ "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8
+ "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ MMI_ADDU(%[b], %[b], %[b_stride])
+ "bnez %[tmp0], 1b \n\t"
+
+ "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+ "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
+ "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+
+ "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t"
+ "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t"
+ "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+ "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
+ "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
+ "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t"
+ "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
+ "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
+
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
+ [a]"+&r"(a), [b]"+&r"(b)
+ : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
+ [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
+ : "memory"
+ );
+
+ return *sse - (((int64_t)sum * sum) / (32 * high));
+}
+
+#define VPX_VARIANCE32XN(n) \
+ uint32_t vpx_variance32x##n##_mmi(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ return vpx_variance32x(a, a_stride, b, b_stride, sse, n); \
+ }
+
+VPX_VARIANCE32XN(32)
+VPX_VARIANCE32XN(16)
+
+static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse, int high) {
+ int sum;
+ double ftmp[13];
+ uint32_t tmp[3];
+
+ *sse = 0;
+
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ MMI_L(%[tmp0], %[high], 0x00)
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8
+ "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ MMI_ADDU(%[b], %[b], %[b_stride])
+ "bnez %[tmp0], 1b \n\t"
+
+ "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+ "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
+ "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+
+ "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t"
+ "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t"
+ "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+ "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
+ "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
+ "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t"
+ "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
+ "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
+
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
+ [a]"+&r"(a), [b]"+&r"(b)
+ : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
+ [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
+ : "memory"
+ );
+
+ return *sse - (((int64_t)sum * sum) / (16 * high));
+}
+
+#define VPX_VARIANCE16XN(n) \
+ uint32_t vpx_variance16x##n##_mmi(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ return vpx_variance16x(a, a_stride, b, b_stride, sse, n); \
+ }
+
+VPX_VARIANCE16XN(32)
+VPX_VARIANCE16XN(16)
+VPX_VARIANCE16XN(8)
+
+static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse, int high) {
+ int sum;
+ double ftmp[13];
+ uint32_t tmp[3];
+
+ *sse = 0;
+
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ MMI_L(%[tmp0], %[high], 0x00)
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ MMI_ADDU(%[b], %[b], %[b_stride])
+ "bnez %[tmp0], 1b \n\t"
+
+ "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+ "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
+ "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+
+ "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t"
+ "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t"
+ "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+ "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
+ "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
+ "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t"
+ "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
+ "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
+
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
+ [a]"+&r"(a), [b]"+&r"(b)
+ : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
+ [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
+ : "memory"
+ );
+
+ return *sse - (((int64_t)sum * sum) / (8 * high));
+}
+
+#define VPX_VARIANCE8XN(n) \
+ uint32_t vpx_variance8x##n##_mmi(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ return vpx_variance8x(a, a_stride, b, b_stride, sse, n); \
+ }
+
+VPX_VARIANCE8XN(16)
+VPX_VARIANCE8XN(8)
+VPX_VARIANCE8XN(4)
+
+static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse, int high) {
+ int sum;
+ double ftmp[12];
+ uint32_t tmp[3];
+
+ *sse = 0;
+
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp10] \n\t"
+ MMI_L(%[tmp0], %[high], 0x00)
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
+ "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
+ "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t"
+ VARIANCE_SSE_SUM_4
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ MMI_ADDU(%[b], %[b], %[b_stride])
+ "bnez %[tmp0], 1b \n\t"
+
+ "dsrl %[ftmp9], %[ftmp6], %[ftmp10] \n\t"
+ "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t"
+ "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+
+ "punpcklhw %[ftmp3], %[ftmp7], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp4], %[ftmp7], %[ftmp0] \n\t"
+ "punpcklhw %[ftmp5], %[ftmp8], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp6], %[ftmp8], %[ftmp0] \n\t"
+ "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+ "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
+ "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
+ "dsrl %[ftmp0], %[ftmp3], %[ftmp10] \n\t"
+ "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
+ "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]),
+ [tmp0]"=&r"(tmp[0]),
+ [a]"+&r"(a), [b]"+&r"(b)
+ : [a_stride]"r"((mips_reg)a_stride), [b_stride]"r"((mips_reg)b_stride),
+ [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
+ : "memory"
+ );
+
+ return *sse - (((int64_t)sum * sum) / (4 * high));
+}
+
+#define VPX_VARIANCE4XN(n) \
+ uint32_t vpx_variance4x##n##_mmi(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ return vpx_variance4x(a, a_stride, b, b_stride, sse, n); \
+ }
+
+VPX_VARIANCE4XN(8)
+VPX_VARIANCE4XN(4)
+
+static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, uint32_t *sse,
+ uint64_t high) {
+ double ftmp[12];
+ uint32_t tmp[1];
+
+ *sse = 0;
+
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ MMI_L(%[tmp0], %[high], 0x00)
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+
+ "1: \n\t"
+ VARIANCE_SSE_16
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ MMI_ADDU(%[b], %[b], %[b_stride])
+ "bnez %[tmp0], 1b \n\t"
+
+ "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+ "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
+ "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [tmp0]"=&r"(tmp[0]),
+ [a]"+&r"(a), [b]"+&r"(b)
+ : [a_stride]"r"((mips_reg)a_stride), [b_stride]"r"((mips_reg)b_stride),
+ [high]"r"(&high), [sse]"r"(sse)
+ : "memory"
+ );
+
+ return *sse;
+}
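+
+/* Note: unlike the variance helpers above there is no mean correction for
+ * MSE; the MSE of the block is the raw SSE, so the asm only accumulates
+ * squared differences and the function simply returns *sse. */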
+
+#define vpx_mse16xN(n) \
+ uint32_t vpx_mse16x##n##_mmi(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ return vpx_mse16x(a, a_stride, b, b_stride, sse, n); \
+ }
+
+vpx_mse16xN(16);
+vpx_mse16xN(8);
+
+static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, uint32_t *sse,
+ uint64_t high) {
+ double ftmp[12];
+ uint32_t tmp[1];
+
+ *sse = 0;
+
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ MMI_L(%[tmp0], %[high], 0x00)
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+
+ "1: \n\t"
+ VARIANCE_SSE_8
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ MMI_ADDU(%[b], %[b], %[b_stride])
+ "bnez %[tmp0], 1b \n\t"
+
+ "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+ "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
+ "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [tmp0]"=&r"(tmp[0]),
+ [a]"+&r"(a), [b]"+&r"(b)
+ : [a_stride]"r"((mips_reg)a_stride), [b_stride]"r"((mips_reg)b_stride),
+ [high]"r"(&high), [sse]"r"(sse)
+ : "memory"
+ );
+
+ return *sse;
+}
+
+#define vpx_mse8xN(n) \
+ uint32_t vpx_mse8x##n##_mmi(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, uint32_t *sse) { \
+ return vpx_mse8x(a, a_stride, b, b_stride, sse, n); \
+ }
+
+vpx_mse8xN(16);
+vpx_mse8xN(8);
+
+#define SUBPIX_VAR(W, H) \
+ uint32_t vpx_sub_pixel_variance##W##x##H##_mmi( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ \
+ var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+ \
+ return vpx_variance##W##x##H##_mmi(temp2, W, b, b_stride, sse); \
+ }
+
+SUBPIX_VAR(64, 64)
+SUBPIX_VAR(64, 32)
+SUBPIX_VAR(32, 64)
+SUBPIX_VAR(32, 32)
+SUBPIX_VAR(32, 16)
+SUBPIX_VAR(16, 32)
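+
+#if 0
+/* Illustrative reference only (compiled out): a minimal sketch of the
+ * first-pass helper used by SUBPIX_VAR above, assuming FILTER_BITS == 7
+ * and 2-tap bilinear kernels that sum to 128 (as the bilinear_filters
+ * table does).  Each output is the rounded weighted sum of two
+ * horizontally adjacent pixels; the second pass does the same vertically
+ * on the uint16_t intermediate and emits uint8_t. */
+static void bil_first_pass_sketch(const uint8_t *src, uint16_t *out,
+                                  int src_stride, int height, int width,
+                                  const uint8_t *filter) {
+  int i, j;
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      /* ROUND_POWER_OF_TWO(x, 7) == (x + 64) >> 7 */
+      out[j] =
+          (uint16_t)((src[j] * filter[0] + src[j + 1] * filter[1] + 64) >> 7);
+    }
+    src += src_stride;
+    out += width;
+  }
+}
+#endif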
+
+static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ uint8_t *temp2, int counter) {
+ uint8_t *temp2_ptr = temp2;
+ mips_reg l_counter = counter;
+ double ftmp[15];
+ mips_reg tmp[2];
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
+ DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+
+ const uint8_t *filter_x = bilinear_filters[xoffset];
+ const uint8_t *filter_y = bilinear_filters[yoffset];
+
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x07)
+ MMI_MTC1(%[tmp0], %[ftmp14])
+ "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
+ "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
+ "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
+ "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
+
+ // fdata3: fdata3[0] ~ fdata3[15]
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
+
+ // fdata3+a_stride*1: fdata3[0] ~ fdata3[15]
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B
+ // temp2: temp2[0] ~ temp2[15]
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A
+
+ // fdata3+a_stride*2: fdata3[0] ~ fdata3[15]
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
+ // temp2+16*1: temp2[0] ~ temp2[15]
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B
+
+ "1: \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A
+
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B
+ "addiu %[counter], %[counter], -0x01 \n\t"
+ "bnez %[counter], 1b \n\t"
+ : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+ [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+ [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
+ [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
+ [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
+ [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
+ [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr),
+ [counter]"+&r"(l_counter)
+ : [filter_x0] "f"((uint64_t)filter_x[0]),
+ [filter_x1] "f"((uint64_t)filter_x[1]),
+ [filter_y0] "f"((uint64_t)filter_y[0]),
+ [filter_y1] "f"((uint64_t)filter_y[1]),
+ [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40),
+ [mask] "f"(mask)
+ : "memory"
+ );
+}
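+
+/* The _A/_B macro pairs above software-pipeline the two filter passes: the
+ * prologue filters the first three source rows and stores the first two
+ * 16-byte output rows, then each loop iteration consumes two more source
+ * rows and stores two more output rows.  That is why the callers below
+ * pass (H - 2) / 2 as the loop counter. */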
+
+#define SUBPIX_VAR16XN(H) \
+ uint32_t vpx_sub_pixel_variance16x##H##_mmi( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse) { \
+ uint8_t temp2[16 * H]; \
+ var_filter_block2d_bil_16x(a, a_stride, xoffset, yoffset, temp2, \
+ (H - 2) / 2); \
+ \
+ return vpx_variance16x##H##_mmi(temp2, 16, b, b_stride, sse); \
+ }
+
+SUBPIX_VAR16XN(16)
+SUBPIX_VAR16XN(8)
+
+static inline void var_filter_block2d_bil_8x(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ uint8_t *temp2, int counter) {
+ uint8_t *temp2_ptr = temp2;
+ mips_reg l_counter = counter;
+ double ftmp[15];
+ mips_reg tmp[2];
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
+ DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+ const uint8_t *filter_x = bilinear_filters[xoffset];
+ const uint8_t *filter_y = bilinear_filters[yoffset];
+
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x07)
+ MMI_MTC1(%[tmp0], %[ftmp14])
+ "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
+ "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
+ "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
+ "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
+
+ // fdata3: fdata3[0] ~ fdata3[7]
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
+
+ // fdata3+a_stride*1: fdata3[0] ~ fdata3[7]
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B
+ // temp2: temp2[0] ~ temp2[7]
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A
+
+ // fdata3+a_stride*2: fdata3[0] ~ fdata3[7]
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
+ // temp2+8*1: temp2[0] ~ temp2[7]
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B
+
+ "1: \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A
+
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B
+ "addiu %[counter], %[counter], -0x01 \n\t"
+ "bnez %[counter], 1b \n\t"
+ : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+ [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+ [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
+ [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
+ [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
+ [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
+ [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr),
+ [counter]"+&r"(l_counter)
+ : [filter_x0] "f"((uint64_t)filter_x[0]),
+ [filter_x1] "f"((uint64_t)filter_x[1]),
+ [filter_y0] "f"((uint64_t)filter_y[0]),
+ [filter_y1] "f"((uint64_t)filter_y[1]),
+ [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40),
+ [mask] "f"(mask)
+ : "memory"
+ );
+}
+
+#define SUBPIX_VAR8XN(H) \
+ uint32_t vpx_sub_pixel_variance8x##H##_mmi( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse) { \
+ uint8_t temp2[8 * H]; \
+ var_filter_block2d_bil_8x(a, a_stride, xoffset, yoffset, temp2, \
+ (H - 2) / 2); \
+ \
+ return vpx_variance8x##H##_mmi(temp2, 8, b, b_stride, sse); \
+ }
+
+SUBPIX_VAR8XN(16)
+SUBPIX_VAR8XN(8)
+SUBPIX_VAR8XN(4)
+
+static inline void var_filter_block2d_bil_4x(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ uint8_t *temp2, int counter) {
+ uint8_t *temp2_ptr = temp2;
+ mips_reg l_counter = counter;
+ double ftmp[7];
+ mips_reg tmp[2];
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
+ DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+ const uint8_t *filter_x = bilinear_filters[xoffset];
+ const uint8_t *filter_y = bilinear_filters[yoffset];
+
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x07)
+ MMI_MTC1(%[tmp0], %[ftmp6])
+ "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
+ "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
+ "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
+ "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
+ // fdata3: fdata3[0] ~ fdata3[3]
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
+
+ // fdata3+a_stride*1: fdata3[0] ~ fdata3[3]
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B
+ // temp2: temp2[0] ~ temp2[3]
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A
+
+ // fdata3+a_stride*2: fdata3[0] ~ fdata3[3]
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
+ // temp2+4*1: temp2[0] ~ temp2[3]
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B
+
+ "1: \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A
+
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B
+ "addiu %[counter], %[counter], -0x01 \n\t"
+ "bnez %[counter], 1b \n\t"
+ : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+ [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+ [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [a] "+&r"(a),
+ [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter)
+ : [filter_x0] "f"((uint64_t)filter_x[0]),
+ [filter_x1] "f"((uint64_t)filter_x[1]),
+ [filter_y0] "f"((uint64_t)filter_y[0]),
+ [filter_y1] "f"((uint64_t)filter_y[1]),
+ [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40),
+ [mask] "f"(mask)
+ : "memory"
+ );
+}
+
+#define SUBPIX_VAR4XN(H) \
+ uint32_t vpx_sub_pixel_variance4x##H##_mmi( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse) { \
+ uint8_t temp2[4 * H]; \
+ var_filter_block2d_bil_4x(a, a_stride, xoffset, yoffset, temp2, \
+ (H - 2) / 2); \
+ \
+ return vpx_variance4x##H##_mmi(temp2, 4, b, b_stride, sse); \
+ }
+
+SUBPIX_VAR4XN(8)
+SUBPIX_VAR4XN(4)
+
+#define SUBPIX_AVG_VAR(W, H) \
+ uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+ \
+ vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \
+ \
+ return vpx_variance##W##x##H##_mmi(temp3, W, b, b_stride, sse); \
+ }
+
+SUBPIX_AVG_VAR(64, 64)
+SUBPIX_AVG_VAR(64, 32)
+SUBPIX_AVG_VAR(32, 64)
+SUBPIX_AVG_VAR(32, 32)
+SUBPIX_AVG_VAR(32, 16)
+SUBPIX_AVG_VAR(16, 32)
+SUBPIX_AVG_VAR(16, 16)
+SUBPIX_AVG_VAR(16, 8)
+SUBPIX_AVG_VAR(8, 16)
+SUBPIX_AVG_VAR(8, 8)
+SUBPIX_AVG_VAR(8, 4)
+SUBPIX_AVG_VAR(4, 8)
+SUBPIX_AVG_VAR(4, 4)
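+
+/* In the _avg_ variants, vpx_comp_avg_pred_c() blends the filtered block
+ * with the second predictor by rounded averaging before the variance is
+ * taken: per pixel, temp3[i] = (second_pred[i] + temp2[i] + 1) >> 1. */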
diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
index ad2af2866..187a01342 100644
--- a/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
+++ b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
@@ -16,8 +16,9 @@ static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
int8_t *filter) {
+ uint32_t tp0, tp1, tp2, tp3;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 dst0, dst1, dst2, dst3, res2, res3;
+ v16u8 dst0 = { 0 }, res;
v16u8 mask0, mask1, mask2, mask3;
v8i16 filt, res0, res1;
@@ -36,23 +37,23 @@ static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
XORI_B4_128_SB(src0, src1, src2, src3);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
filt0, filt1, filt2, filt3, res0, res1);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
SRARI_H2_SH(res0, res1, FILTER_BITS);
SAT_SH2_SH(res0, res1, 7);
- PCKEV_B2_UB(res0, res0, res1, res1, res2, res3);
- ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
- XORI_B2_128_UB(res2, res3);
- AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3);
- ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+ res = PCKEV_XORI128_UB(res0, res1);
+ res = (v16u8)__msa_aver_u_b(res, dst0);
+ ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
}
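+
+/* The LW4()/INSERT_W4_UB() (and, for 8-wide blocks, LD4()/INSERT_D2_UB())
+ * rewrites throughout this file load exactly one row's worth of dst bytes
+ * as scalars and insert them into a single vector, e.g.
+ *
+ *   LW4(dst, dst_stride, tp0, tp1, tp2, tp3);   // tpN = 4 bytes of row N
+ *   INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);     // dst0 = rows 0..3 packed
+ *
+ * Packing four rows into one register lets the averaging use dst0 directly
+ * and drops the ILVR_W2/ILVR_D2 interleave fix-ups the old code needed. */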
static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
int8_t *filter) {
+ uint32_t tp0, tp1, tp2, tp3;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 dst0 = { 0 }, dst1 = { 0 };
v8i16 filt, vec0, vec1, vec2, vec3;
mask0 = LD_UB(&mc_filt_mask_arr[16]);
@@ -69,7 +70,10 @@ static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
LD_SB4(src, src_stride, src0, src1, src2, src3);
XORI_B4_128_SB(src0, src1, src2, src3);
src += (4 * src_stride);
- LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+ LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+ LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
filt0, filt1, filt2, filt3, vec0, vec1);
LD_SB4(src, src_stride, src0, src1, src2, src3);
@@ -82,10 +86,7 @@ static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
res3);
ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
XORI_B2_128_UB(res0, res2);
- ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
- dst6);
- ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4);
- AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2);
+ AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
ST4x8_UB(res0, res2, dst, dst_stride);
}
@@ -105,8 +106,9 @@ static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
int32_t dst_stride, int8_t *filter,
int32_t height) {
int32_t loop_cnt;
+ int64_t tp0, tp1, tp2, tp3;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
+ v16u8 mask0, mask1, mask2, mask3, dst0 = { 0 }, dst1 = { 0 };
v8i16 filt, out0, out1, out2, out3;
mask0 = LD_UB(&mc_filt_mask_arr[0]);
@@ -127,10 +129,12 @@ static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0, out1,
out2, out3);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
SAT_SH4_SH(out0, out1, out2, out3, 7);
- CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
+ CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst,
dst_stride);
dst += (4 * dst_stride);
}
@@ -309,8 +313,9 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
int8_t *filter) {
+ uint32_t tp0, tp1, tp2, tp3;
v16i8 src0, src1, src2, src3, mask;
- v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
+ v16u8 filt0, dst0 = { 0 }, vec0, vec1, res;
v8u16 vec2, vec3, filt;
mask = LD_SB(&mc_filt_mask_arr[16]);
@@ -320,23 +325,24 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
LD_SB4(src, src_stride, src0, src1, src2, src3);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
SRARI_H2_UH(vec2, vec3, FILTER_BITS);
- PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
- ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
- AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ res = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ res = (v16u8)__msa_aver_u_b(res, dst0);
+ ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
int8_t *filter) {
+ uint32_t tp0, tp1, tp2, tp3;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 dst0 = { 0 }, dst1 = { 0 };
v8u16 vec4, vec5, vec6, vec7, filt;
mask = LD_SB(&mc_filt_mask_arr[16]);
@@ -346,7 +352,10 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+ LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+ LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
@@ -354,13 +363,9 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
res3);
- ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
- dst6);
- AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
- res3);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+ ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
+ AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
+ ST4x8_UB(res0, res2, dst, dst_stride);
}
static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
@@ -378,8 +383,9 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
int8_t *filter) {
+ int64_t tp0, tp1, tp2, tp3;
v16i8 src0, src1, src2, src3, mask;
- v16u8 filt0, dst0, dst1, dst2, dst3;
+ v16u8 filt0, dst0 = { 0 }, dst1 = { 0 };
v8u16 vec0, vec1, vec2, vec3, filt;
mask = LD_SB(&mc_filt_mask_arr[0]);
@@ -394,16 +400,18 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
- dst_stride);
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
+ PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
}
static void common_hz_2t_and_aver_dst_8x8mult_msa(
const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
int8_t *filter, int32_t height) {
+ int64_t tp0, tp1, tp2, tp3;
v16i8 src0, src1, src2, src3, mask;
- v16u8 filt0, dst0, dst1, dst2, dst3;
+ v16u8 filt0, dst0 = { 0 }, dst1 = { 0 };
v8u16 vec0, vec1, vec2, vec3, filt;
mask = LD_SB(&mc_filt_mask_arr[0]);
@@ -419,11 +427,12 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
- PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
- dst_stride);
+ PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
dst += (4 * dst_stride);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
@@ -431,9 +440,10 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
- dst_stride);
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
+ PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
dst += (4 * dst_stride);
if (16 == height) {
@@ -445,10 +455,11 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
LD_SB4(src, src_stride, src0, src1, src2, src3);
- PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
- dst_stride);
+ PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
dst += (4 * dst_stride);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
@@ -456,9 +467,10 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
- dst_stride);
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
+ PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
}
}
@@ -633,9 +645,10 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
+ const int16_t *const filter_x = filter[x0_q4];
int8_t cnt, filt_hor[8];
assert(x_step_q4 == 16);
@@ -668,8 +681,8 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
(int32_t)dst_stride, &filt_hor[3], h);
break;
default:
- vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
} else {
@@ -695,8 +708,8 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
(int32_t)dst_stride, filt_hor, h);
break;
default:
- vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
}
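+
+/* The entry-point signature change in this and the following convolve
+ * files replaces the separate filter_x/filter_y pointers with a single
+ * InterpKernel table plus explicit starting phases.  InterpKernel is one
+ * 8-tap kernel row (typedef int16_t InterpKernel[8] in vpx_filter.h), so
+ * the old per-direction pointers are recovered by indexing the table with
+ * the initial phase:
+ *
+ *   const int16_t *const filter_x = filter[x0_q4];
+ *   const int16_t *const filter_y = filter[y0_q4];
+ */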
diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c
index 1cfa63201..5187cea21 100644
--- a/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c
+++ b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c
@@ -16,8 +16,9 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa(
const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
uint32_t loop_cnt;
+ uint32_t tp0, tp1, tp2, tp3;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
+ v16u8 dst0 = { 0 }, mask0, mask1, mask2, mask3, res;
v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
@@ -59,7 +60,8 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa(
XORI_B4_128_SB(src7, src8, src9, src10);
src += (4 * src_stride);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
@@ -73,14 +75,12 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa(
vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
- ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
SRARI_H2_SH(res0, res1, FILTER_BITS);
SAT_SH2_SH(res0, res1, 7);
- PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
- XORI_B2_128_UB(tmp0, tmp1);
- AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
- ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
+ res = PCKEV_XORI128_UB(res0, res1);
+ res = (v16u8)__msa_aver_u_b(res, dst0);
+ ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
hz_out5 = hz_out9;
@@ -94,10 +94,11 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa(
const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
uint32_t loop_cnt;
+ uint64_t tp0, tp1, tp2, tp3;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
- v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
+ v16u8 dst0 = { 0 }, dst1 = { 0 }, mask0, mask1, mask2, mask3;
v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
@@ -144,7 +145,9 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa(
XORI_B4_128_SB(src7, src8, src9, src10);
src += (4 * src_stride);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
@@ -172,7 +175,7 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa(
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
- CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst,
+ CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst,
dst_stride);
dst += (4 * dst_stride);
@@ -225,9 +228,10 @@ static void common_hv_8ht_8vt_and_aver_dst_64w_msa(
static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(
const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
int8_t *filter_horiz, int8_t *filter_vert) {
+ uint32_t tp0, tp1, tp2, tp3;
v16i8 src0, src1, src2, src3, src4, mask;
v16u8 filt_hz, filt_vt, vec0, vec1;
- v16u8 dst0, dst1, dst2, dst3, res0, res1;
+ v16u8 dst0 = { 0 }, out;
v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
mask = LD_SB(&mc_filt_mask_arr[16]);
@@ -248,21 +252,22 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(
hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+ LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
- AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ out = __msa_aver_u_b(out, dst0);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(
const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
int8_t *filter_horiz, int8_t *filter_vert) {
+ uint32_t tp0, tp1, tp2, tp3;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
- v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1;
+ v16u8 dst0 = { 0 }, dst1 = { 0 };
v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
v8i16 filt;
@@ -289,21 +294,18 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(
hz_out3, hz_out5, 8);
hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
- LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
- ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
- dst6);
+ LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+ LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0,
tmp1, tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2,
- res3);
- AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
- res3);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, res1);
+ AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+ ST4x8_UB(res0, res1, dst, dst_stride);
}
static void common_hv_2ht_2vt_and_aver_dst_4w_msa(
@@ -321,8 +323,9 @@ static void common_hv_2ht_2vt_and_aver_dst_4w_msa(
static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
int8_t *filter_horiz, int8_t *filter_vert) {
+ uint64_t tp0, tp1, tp2, tp3;
v16i8 src0, src1, src2, src3, src4, mask;
- v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
+ v16u8 filt_hz, filt_vt, dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3;
v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
v8i16 filt;
@@ -338,7 +341,9 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
src += (5 * src_stride);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
@@ -357,16 +362,16 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
tmp3 = __msa_dotp_u_h(vec3, filt_vt);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
- dst_stride);
+ PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
}
static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
uint32_t loop_cnt;
+ uint64_t tp0, tp1, tp2, tp3;
v16i8 src0, src1, src2, src3, src4, mask;
- v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
+ v16u8 filt_hz, filt_vt, vec0, dst0 = { 0 }, dst1 = { 0 };
v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
v8i16 filt;
@@ -407,9 +412,10 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
tmp3 = __msa_dotp_u_h(vec0, filt_vt);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
- dst_stride);
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
+ PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
@@ -516,9 +522,10 @@ static void common_hv_2ht_2vt_and_aver_dst_64w_msa(
void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
+ const InterpKernel *filter, int x0_q4, int x_step_q4,
+ int y0_q4, int y_step_q4, int w, int h) {
+ const int16_t *const filter_x = filter[x0_q4];
+ const int16_t *const filter_y = filter[y0_q4];
int8_t cnt, filt_hor[8], filt_ver[8];
assert(x_step_q4 == 16);
@@ -560,14 +567,14 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
&filt_hor[3], &filt_ver[3], h);
break;
default:
- vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
} else if (((const int32_t *)filter_x)[0] == 0 ||
((const int32_t *)filter_y)[0] == 0) {
- vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
} else {
switch (w) {
case 4:
@@ -596,8 +603,8 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
filt_ver, h);
break;
default:
- vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
}
diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
index 146ce3b2f..ef8c90114 100644
--- a/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
+++ b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
@@ -17,8 +17,9 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
int32_t dst_stride, int8_t *filter,
int32_t height) {
uint32_t loop_cnt;
+ uint32_t tp0, tp1, tp2, tp3;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16u8 dst0, dst1, dst2, dst3, out;
+ v16u8 dst0 = { 0 }, out;
v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
v16i8 src10998, filt0, filt1, filt2, filt3;
@@ -43,7 +44,8 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
LD_SB4(src, src_stride, src7, src8, src9, src10);
src += (4 * src_stride);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
src87_r, src98_r, src109_r);
ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
@@ -55,9 +57,6 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
SRARI_H2_SH(out10, out32, FILTER_BITS);
SAT_SH2_SH(out10, out32, 7);
out = PCKEV_XORI128_UB(out10, out32);
- ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
-
- dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0);
out = __msa_aver_u_b(out, dst0);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
@@ -75,8 +74,9 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
int32_t dst_stride, int8_t *filter,
int32_t height) {
uint32_t loop_cnt;
+ uint64_t tp0, tp1, tp2, tp3;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16u8 dst0, dst1, dst2, dst3;
+ v16u8 dst0 = { 0 }, dst1 = { 0 };
v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
v8i16 filt, out0, out1, out2, out3;
@@ -98,7 +98,9 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
LD_SB4(src, src_stride, src7, src8, src9, src10);
src += (4 * src_stride);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
XORI_B4_128_SB(src7, src8, src9, src10);
ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
src87_r, src98_r, src109_r);
@@ -112,7 +114,7 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
filt1, filt2, filt3);
SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
SAT_SH4_SH(out0, out1, out2, out3, 7);
- CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
+ CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst,
dst_stride);
dst += (4 * dst_stride);
@@ -246,8 +248,9 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
int8_t *filter) {
+ uint32_t tp0, tp1, tp2, tp3;
v16i8 src0, src1, src2, src3, src4;
- v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332;
+ v16u8 dst0 = { 0 }, out, filt0, src2110, src4332;
v16i8 src10_r, src32_r, src21_r, src43_r;
v8i16 filt;
v8u16 tmp0, tmp1;
@@ -261,9 +264,8 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
src4 = LD_SB(src);
src += src_stride;
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
- dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0);
+ LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
src32_r, src43_r);
ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
@@ -280,7 +282,8 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
int8_t *filter) {
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ uint32_t tp0, tp1, tp2, tp3;
+ v16u8 dst0 = { 0 }, dst1 = { 0 };
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
v16u8 src2110, src4332, src6554, src8776, filt0;
@@ -294,10 +297,10 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
src += (8 * src_stride);
src8 = LD_SB(src);
- LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
- ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2,
- dst3);
- ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+ LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+ LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
src32_r, src43_r);
ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
@@ -309,9 +312,7 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
- ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
- ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride);
+ ST4x8_UB(src2110, src4332, dst, dst_stride);
}
static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src,
@@ -329,8 +330,9 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
int32_t src_stride, uint8_t *dst,
int32_t dst_stride,
int8_t *filter) {
+ int64_t tp0, tp1, tp2, tp3;
v16u8 src0, src1, src2, src3, src4;
- v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0;
+ v16u8 dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3, filt0;
v8u16 tmp0, tmp1, tmp2, tmp3;
v8i16 filt;
@@ -339,22 +341,24 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
filt0 = (v16u8)__msa_splati_h(filt, 0);
LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
- dst_stride);
+ PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
}
static void common_vt_2t_and_aver_dst_8x8mult_msa(
const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
int8_t *filter, int32_t height) {
uint32_t loop_cnt;
+ int64_t tp0, tp1, tp2, tp3;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
- v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+ v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
v8u16 tmp0, tmp1, tmp2, tmp3;
v8i16 filt;
@@ -369,7 +373,12 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa(
for (loop_cnt = (height >> 3); loop_cnt--;) {
LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
src += (8 * src_stride);
- LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8);
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
+ LD4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst2);
+ INSERT_D2_UB(tp2, tp3, dst3);
ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
vec3);
@@ -378,15 +387,13 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa(
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst,
- dst_stride);
+ PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
dst += (4 * dst_stride);
DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst,
- dst_stride);
+ PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst2, dst3, dst, dst_stride);
dst += (4 * dst_stride);
src0 = src8;
@@ -605,9 +612,10 @@ static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
+ const int16_t *const filter_y = filter[y0_q4];
int8_t cnt, filt_ver[8];
assert(y_step_q4 == 16);
@@ -640,8 +648,8 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
(int32_t)dst_stride, &filt_ver[3], h);
break;
default:
- vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
} else {
@@ -668,8 +676,8 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
(int32_t)dst_stride, filt_ver, h);
break;
default:
- vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
}
diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
index 9e8bf7b51..152dc2610 100644
--- a/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
+++ b/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
@@ -621,9 +621,10 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
+ const int16_t *const filter_x = filter[x0_q4];
int8_t cnt, filt_hor[8];
assert(x_step_q4 == 16);
@@ -656,8 +657,8 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
&filt_hor[3], h);
break;
default:
- vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
} else {
@@ -683,8 +684,8 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
filt_hor, h);
break;
default:
- vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
}
diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c
index b16ec5788..d35a5a7a6 100644
--- a/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c
+++ b/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c
@@ -541,9 +541,11 @@ static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
}
void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x,
- int32_t x_step_q4, const int16_t *filter_y,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int32_t x_step_q4, int y0_q4,
int32_t y_step_q4, int32_t w, int32_t h) {
+ const int16_t *const filter_x = filter[x0_q4];
+ const int16_t *const filter_y = filter[y0_q4];
int8_t cnt, filt_hor[8], filt_ver[8];
assert(x_step_q4 == 16);
@@ -585,14 +587,14 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
&filt_ver[3], (int32_t)h);
break;
default:
- vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
+ vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
} else if (((const int32_t *)filter_x)[0] == 0 ||
((const int32_t *)filter_y)[0] == 0) {
- vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
+ vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+ y0_q4, y_step_q4, w, h);
} else {
switch (w) {
case 4:
@@ -621,9 +623,605 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
(int32_t)h);
break;
default:
- vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
+ vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
}
}
+
+static void filter_horiz_w4_msa(const uint8_t *src_x, ptrdiff_t src_pitch,
+ uint8_t *dst, const int16_t *x_filter) {
+ uint64_t srcd0, srcd1, srcd2, srcd3;
+ uint32_t res;
+ v16u8 src0 = { 0 }, src1 = { 0 }, dst0;
+ v16i8 out0, out1;
+ v16i8 shf1 = { 0, 8, 16, 24, 4, 12, 20, 28, 1, 9, 17, 25, 5, 13, 21, 29 };
+ v16i8 shf2 = shf1 + 2;
+ v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 };
+ v16i8 filt_shf1 = filt_shf0 + 2;
+ v16i8 filt_shf2 = filt_shf0 + 4;
+ v16i8 filt_shf3 = filt_shf0 + 6;
+ v8i16 filt, src0_h, src1_h, src2_h, src3_h, filt0, filt1, filt2, filt3;
+
+ LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3);
+ INSERT_D2_UB(srcd0, srcd1, src0);
+ INSERT_D2_UB(srcd2, srcd3, src1);
+ VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1);
+ XORI_B2_128_SB(out0, out1);
+ UNPCK_SB_SH(out0, src0_h, src1_h);
+ UNPCK_SB_SH(out1, src2_h, src3_h);
+
+ filt = LD_SH(x_filter);
+ VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1);
+ VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3);
+
+ src0_h *= filt0;
+ src0_h += src1_h * filt1;
+ src0_h += src2_h * filt2;
+ src0_h += src3_h * filt3;
+
+ src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8);
+
+ src0_h = __msa_adds_s_h(src0_h, src1_h);
+ src0_h = __msa_srari_h(src0_h, FILTER_BITS);
+ src0_h = __msa_sat_s_h(src0_h, 7);
+ dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
+ res = __msa_copy_u_w((v4i32)dst0, 0);
+ SW(res, dst);
+}
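+
+/* Adding a scalar to a v16i8 shuffle constant (shf2 = shf1 + 2, the
+ * filt_shf* series) is an element-wise add under the GCC vector
+ * extensions, so each derived mask selects the inputs two bytes further
+ * on without spelling out another 16-entry table. */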
+
+static void filter_horiz_w8_msa(const uint8_t *src_x, ptrdiff_t src_pitch,
+ uint8_t *dst, const int16_t *x_filter) {
+ uint64_t srcd0, srcd1, srcd2, srcd3;
+ v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
+ v16u8 tmp0, tmp1, tmp2, tmp3, dst0;
+ v16i8 out0, out1, out2, out3;
+ v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 };
+ v16i8 shf2 = shf1 + 4;
+ v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
+ v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;
+
+ LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3);
+ INSERT_D2_UB(srcd0, srcd1, src0);
+ INSERT_D2_UB(srcd2, srcd3, src1);
+ LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
+ INSERT_D2_UB(srcd0, srcd1, src2);
+ INSERT_D2_UB(srcd2, srcd3, src3);
+
+ filt = LD_SH(x_filter);
+ SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+ SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);
+
+ // transpose
+ VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1);
+ VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3);
+ ILVRL_W2_SB(tmp2, tmp0, out0, out1);
+ ILVRL_W2_SB(tmp3, tmp1, out2, out3);
+
+ XORI_B4_128_SB(out0, out1, out2, out3);
+ UNPCK_SB_SH(out0, src0_h, src1_h);
+ UNPCK_SB_SH(out1, src2_h, src3_h);
+ UNPCK_SB_SH(out2, src4_h, src5_h);
+ UNPCK_SB_SH(out3, src6_h, src7_h);
+
+ src0_h *= filt0;
+ src4_h *= filt4;
+ src0_h += src1_h * filt1;
+ src4_h += src5_h * filt5;
+ src0_h += src2_h * filt2;
+ src4_h += src6_h * filt6;
+ src0_h += src3_h * filt3;
+ src4_h += src7_h * filt7;
+
+ src0_h = __msa_adds_s_h(src0_h, src4_h);
+ src0_h = __msa_srari_h(src0_h, FILTER_BITS);
+ src0_h = __msa_sat_s_h(src0_h, 7);
+ dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
+ ST8x1_UB(dst0, dst);
+}
+
+static void filter_horiz_w16_msa(const uint8_t *src_x, ptrdiff_t src_pitch,
+ uint8_t *dst, const int16_t *x_filter) {
+ uint64_t srcd0, srcd1, srcd2, srcd3;
+ v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
+ v16u8 src4 = { 0 }, src5 = { 0 }, src6 = { 0 }, src7 = { 0 };
+ v16u8 tmp0, tmp1, tmp2, tmp3, dst0;
+ v16i8 out0, out1, out2, out3, out4, out5, out6, out7;
+ v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 };
+ v16i8 shf2 = shf1 + 4;
+ v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
+ v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;
+ v8i16 dst0_h, dst1_h, dst2_h, dst3_h;
+
+ LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3);
+ INSERT_D2_UB(srcd0, srcd1, src0);
+ INSERT_D2_UB(srcd2, srcd3, src1);
+ LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
+ INSERT_D2_UB(srcd0, srcd1, src2);
+ INSERT_D2_UB(srcd2, srcd3, src3);
+ LD4(src_x + 8 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
+ INSERT_D2_UB(srcd0, srcd1, src4);
+ INSERT_D2_UB(srcd2, srcd3, src5);
+ LD4(src_x + 12 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
+ INSERT_D2_UB(srcd0, srcd1, src6);
+ INSERT_D2_UB(srcd2, srcd3, src7);
+
+ filt = LD_SH(x_filter);
+ SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+ SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);
+
+ // transpose
+ VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1);
+ VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3);
+ ILVRL_W2_SB(tmp2, tmp0, out0, out1);
+ ILVRL_W2_SB(tmp3, tmp1, out2, out3);
+ XORI_B4_128_SB(out0, out1, out2, out3);
+
+ UNPCK_SB_SH(out0, src0_h, src1_h);
+ UNPCK_SB_SH(out1, src2_h, src3_h);
+ UNPCK_SB_SH(out2, src4_h, src5_h);
+ UNPCK_SB_SH(out3, src6_h, src7_h);
+
+ VSHF_B2_UB(src4, src5, src4, src5, shf1, shf2, tmp0, tmp1);
+ VSHF_B2_UB(src6, src7, src6, src7, shf1, shf2, tmp2, tmp3);
+ ILVRL_W2_SB(tmp2, tmp0, out4, out5);
+ ILVRL_W2_SB(tmp3, tmp1, out6, out7);
+ XORI_B4_128_SB(out4, out5, out6, out7);
+
+ dst0_h = src0_h * filt0;
+ dst1_h = src4_h * filt4;
+ dst0_h += src1_h * filt1;
+ dst1_h += src5_h * filt5;
+ dst0_h += src2_h * filt2;
+ dst1_h += src6_h * filt6;
+ dst0_h += src3_h * filt3;
+ dst1_h += src7_h * filt7;
+
+ UNPCK_SB_SH(out4, src0_h, src1_h);
+ UNPCK_SB_SH(out5, src2_h, src3_h);
+ UNPCK_SB_SH(out6, src4_h, src5_h);
+ UNPCK_SB_SH(out7, src6_h, src7_h);
+
+ dst2_h = src0_h * filt0;
+ dst3_h = src4_h * filt4;
+ dst2_h += src1_h * filt1;
+ dst3_h += src5_h * filt5;
+ dst2_h += src2_h * filt2;
+ dst3_h += src6_h * filt6;
+ dst2_h += src3_h * filt3;
+ dst3_h += src7_h * filt7;
+
+ ADDS_SH2_SH(dst0_h, dst1_h, dst2_h, dst3_h, dst0_h, dst2_h);
+ SRARI_H2_SH(dst0_h, dst2_h, FILTER_BITS);
+ SAT_SH2_SH(dst0_h, dst2_h, 7);
+ dst0 = PCKEV_XORI128_UB(dst0_h, dst2_h);
+ ST_UB(dst0, dst);
+}
+
+static void transpose4x4_to_dst(const uint8_t *src, uint8_t *dst,
+ ptrdiff_t dst_stride) {
+ v16u8 in0;
+ v16i8 out0 = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+ in0 = LD_UB(src);
+ out0 = __msa_vshf_b(out0, (v16i8)in0, (v16i8)in0);
+ ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
+}
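+
+/* The initializer of out0 doubles as the 4x4 transpose map: output lane k
+ * reads input byte (k % 4) * 4 + k / 4, so the four 4-byte rows of the
+ * 16-byte load come back out as columns after a single vshf.b. */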
+
+static void transpose8x8_to_dst(const uint8_t *src, uint8_t *dst,
+ ptrdiff_t dst_stride) {
+ v16u8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
+ v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 };
+ v16i8 shf2 = shf1 + 4;
+
+ LD_UB4(src, 16, in0, in1, in2, in3);
+ VSHF_B2_UB(in0, in1, in0, in1, shf1, shf2, tmp0, tmp1);
+ VSHF_B2_UB(in2, in3, in2, in3, shf1, shf2, tmp2, tmp3);
+ ILVRL_W2_UB(tmp2, tmp0, out0, out1);
+ ILVRL_W2_UB(tmp3, tmp1, out2, out3);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ ST8x4_UB(out2, out3, dst + 4 * dst_stride, dst_stride);
+}
+
+static void transpose16x16_to_dst(const uint8_t *src, uint8_t *dst,
+ ptrdiff_t dst_stride) {
+ v16u8 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12;
+ v16u8 in13, in14, in15, out0, out1, out2, out3, out4, out5, out6, out7, out8;
+ v16u8 out9, out10, out11, out12, out13, out14, out15;
+
+ LD_UB8(src, 16, in0, in1, in2, in3, in4, in5, in6, in7);
+ LD_UB8(src + 16 * 8, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+
+ TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+ in11, in12, in13, in14, in15, out0, out1, out2, out3,
+ out4, out5, out6, out7);
+ ST_UB8(out0, out1, out2, out3, out4, out5, out6, out7, dst, dst_stride);
+ dst += 8 * dst_stride;
+
+ SLDI_B4_0_UB(in0, in1, in2, in3, in0, in1, in2, in3, 8);
+ SLDI_B4_0_UB(in4, in5, in6, in7, in4, in5, in6, in7, 8);
+ SLDI_B4_0_UB(in8, in9, in10, in11, in8, in9, in10, in11, 8);
+ SLDI_B4_0_UB(in12, in13, in14, in15, in12, in13, in14, in15, 8);
+
+ TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+ in11, in12, in13, in14, in15, out8, out9, out10, out11,
+ out12, out13, out14, out15);
+ ST_UB8(out8, out9, out10, out11, out12, out13, out14, out15, dst, dst_stride);
+}
+
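+// Horizontal scaled convolve, 4 pixels wide: for each output column the 8-tap
+// filter runs down four source rows at the (x_q4 >> SUBPEL_BITS) position,
+// results are collected column-major in |temp|, then transposed out. A zero
+// subpel phase (x_q4 & SUBPEL_MASK == 0) degenerates to copying the filter's
+// center column, src_x[3] (SUBPEL_TAPS / 2 - 1 after the src adjustment).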
+static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
+ int y, z, i;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ for (y = 0; y < h; y += 4) {
+ int x_q4 = x0_q4;
+ for (z = 0; z < 4; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+
+ if (x_q4 & SUBPEL_MASK) {
+ filter_horiz_w4_msa(src_x, src_stride, temp + (z * 4), x_filter);
+ } else {
+ for (i = 0; i < 4; ++i) {
+          temp[z * 4 + i] = src_x[3 + i * src_stride];
+ }
+ }
+
+ x_q4 += x_step_q4;
+ }
+
+ transpose4x4_to_dst(temp, dst, dst_stride);
+
+ src += src_stride * 4;
+ dst += dst_stride * 4;
+ }
+}
+
+static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+ int y, z, i;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+  // This function processes 8x8 areas. The intermediate height is not always
+  // a multiple of 8, so force it up to the next multiple of 8 here; the rows
+  // emitted past the requested height land in the spare rows reserved at the
+  // end of the caller's temp buffer.
+ y = h + (8 - (h & 0x7));
+
+ do {
+ int x_q4 = x0_q4;
+ for (z = 0; z < 8; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+
+ if (x_q4 & SUBPEL_MASK) {
+ filter_horiz_w8_msa(src_x, src_stride, temp + (z * 8), x_filter);
+ } else {
+ for (i = 0; i < 8; ++i) {
+ temp[z * 8 + i] = src_x[3 + i * src_stride];
+ }
+ }
+
+ x_q4 += x_step_q4;
+ }
+
+ transpose8x8_to_dst(temp, dst, dst_stride);
+
+ src += src_stride * 8;
+ dst += dst_stride * 8;
+ } while (y -= 8);
+}
+
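+// As above, but tiles the row in 16-column strips, filling a 16x16 |temp|
+// block per strip before transposing it into place.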
+static void scaledconvolve_horiz_mul16(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[16 * 16]);
+ int x, y, z, i;
+
+ src -= SUBPEL_TAPS / 2 - 1;
+
+  // This function processes 16x16 areas. The intermediate height is not
+  // always a multiple of 16, so force it up to the next multiple of 16 here.
+ y = h + (16 - (h & 0xF));
+
+ do {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; x += 16) {
+ for (z = 0; z < 16; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+
+ if (x_q4 & SUBPEL_MASK) {
+ filter_horiz_w16_msa(src_x, src_stride, temp + (z * 16), x_filter);
+ } else {
+ for (i = 0; i < 16; ++i) {
+ temp[z * 16 + i] = src_x[3 + i * src_stride];
+ }
+ }
+
+ x_q4 += x_step_q4;
+ }
+
+ transpose16x16_to_dst(temp, dst + x, dst_stride);
+ }
+
+ src += src_stride * 16;
+ dst += dst_stride * 16;
+ } while (y -= 16);
+}
+
+static void filter_vert_w4_msa(const uint8_t *src_y, ptrdiff_t src_pitch,
+ uint8_t *dst, const int16_t *y_filter) {
+ uint32_t srcw0, srcw1, srcw2, srcw3, srcw4, srcw5, srcw6, srcw7;
+ uint32_t res;
+ v16u8 src0 = { 0 }, src1 = { 0 }, dst0;
+ v16i8 out0, out1;
+ v16i8 shf1 = { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 };
+ v16i8 shf2 = shf1 + 8;
+ v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 };
+ v16i8 filt_shf1 = filt_shf0 + 2;
+ v16i8 filt_shf2 = filt_shf0 + 4;
+ v16i8 filt_shf3 = filt_shf0 + 6;
+ v8i16 filt, src0_h, src1_h, src2_h, src3_h;
+ v8i16 filt0, filt1, filt2, filt3;
+
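+  // Rows r and r + 4 share a vector: after the shuffles the low half of each
+  // 16-bit vector holds row r and the high half row r + 4, while the
+  // filt_shf* masks splat tap k into the low half and tap k + 4 into the
+  // high half. The sldi/adds pair below then folds the two partial sums.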
+ LW4(src_y, src_pitch, srcw0, srcw1, srcw2, srcw3);
+ LW4(src_y + 4 * src_pitch, src_pitch, srcw4, srcw5, srcw6, srcw7);
+ INSERT_W4_UB(srcw0, srcw1, srcw2, srcw3, src0);
+ INSERT_W4_UB(srcw4, srcw5, srcw6, srcw7, src1);
+ VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1);
+ XORI_B2_128_SB(out0, out1);
+ UNPCK_SB_SH(out0, src0_h, src1_h);
+ UNPCK_SB_SH(out1, src2_h, src3_h);
+
+ filt = LD_SH(y_filter);
+ VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1);
+ VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3);
+
+ src0_h *= filt0;
+ src0_h += src1_h * filt1;
+ src0_h += src2_h * filt2;
+ src0_h += src3_h * filt3;
+
+ src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8);
+
+ src0_h = __msa_adds_s_h(src0_h, src1_h);
+ src0_h = __msa_srari_h(src0_h, FILTER_BITS);
+ src0_h = __msa_sat_s_h(src0_h, 7);
+ dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
+ res = __msa_copy_u_w((v4i32)dst0, 0);
+ SW(res, dst);
+}
+
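+// 8-wide vertical filter: a straight 8-tap MAC over eight source rows, two
+// rows packed per vector.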
+static void filter_vert_w8_msa(const uint8_t *src_y, ptrdiff_t src_pitch,
+ uint8_t *dst, const int16_t *y_filter) {
+ uint64_t srcd0, srcd1, srcd2, srcd3;
+ v16u8 dst0;
+ v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
+ v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
+ v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;
+
+ LD4(src_y, src_pitch, srcd0, srcd1, srcd2, srcd3);
+ INSERT_D2_SB(srcd0, srcd1, src0);
+ INSERT_D2_SB(srcd2, srcd3, src1);
+ LD4(src_y + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
+ INSERT_D2_SB(srcd0, srcd1, src2);
+ INSERT_D2_SB(srcd2, srcd3, src3);
+
+ filt = LD_SH(y_filter);
+ SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+ SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ UNPCK_SB_SH(src0, src0_h, src1_h);
+ UNPCK_SB_SH(src1, src2_h, src3_h);
+ UNPCK_SB_SH(src2, src4_h, src5_h);
+ UNPCK_SB_SH(src3, src6_h, src7_h);
+
+ src0_h *= filt0;
+ src4_h *= filt4;
+ src0_h += src1_h * filt1;
+ src4_h += src5_h * filt5;
+ src0_h += src2_h * filt2;
+ src4_h += src6_h * filt6;
+ src0_h += src3_h * filt3;
+ src4_h += src7_h * filt7;
+
+ src0_h = __msa_adds_s_h(src0_h, src4_h);
+ src0_h = __msa_srari_h(src0_h, FILTER_BITS);
+ src0_h = __msa_sat_s_h(src0_h, 7);
+ dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
+ ST8x1_UB(dst0, dst);
+}
+
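+// Vertical filter for widths that are multiples of 16: the taps are splatted
+// once, then each 16-column strip MACs eight full source rows.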
+static void filter_vert_mul_w16_msa(const uint8_t *src_y, ptrdiff_t src_pitch,
+ uint8_t *dst, const int16_t *y_filter,
+ int w) {
+ int x;
+ v16u8 dst0;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
+ v8i16 src8_h, src9_h, src10_h, src11_h, src12_h, src13_h, src14_h, src15_h;
+ v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;
+
+ filt = LD_SH(y_filter);
+ SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+ SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);
+
+ for (x = 0; x < w; x += 16) {
+ LD_SB8(src_y, src_pitch, src0, src1, src2, src3, src4, src5, src6, src7);
+ src_y += 16;
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ XORI_B4_128_SB(src4, src5, src6, src7);
+ UNPCK_SB_SH(src0, src0_h, src1_h);
+ UNPCK_SB_SH(src1, src2_h, src3_h);
+ UNPCK_SB_SH(src2, src4_h, src5_h);
+ UNPCK_SB_SH(src3, src6_h, src7_h);
+ UNPCK_SB_SH(src4, src8_h, src9_h);
+ UNPCK_SB_SH(src5, src10_h, src11_h);
+ UNPCK_SB_SH(src6, src12_h, src13_h);
+ UNPCK_SB_SH(src7, src14_h, src15_h);
+
+ src0_h *= filt0;
+ src1_h *= filt0;
+ src8_h *= filt4;
+ src9_h *= filt4;
+ src0_h += src2_h * filt1;
+ src1_h += src3_h * filt1;
+ src8_h += src10_h * filt5;
+ src9_h += src11_h * filt5;
+ src0_h += src4_h * filt2;
+ src1_h += src5_h * filt2;
+ src8_h += src12_h * filt6;
+ src9_h += src13_h * filt6;
+ src0_h += src6_h * filt3;
+ src1_h += src7_h * filt3;
+ src8_h += src14_h * filt7;
+ src9_h += src15_h * filt7;
+
+ ADDS_SH2_SH(src0_h, src8_h, src1_h, src9_h, src0_h, src1_h);
+ SRARI_H2_SH(src0_h, src1_h, FILTER_BITS);
+ SAT_SH2_SH(src0_h, src1_h, 7);
+ dst0 = PCKEV_XORI128_UB(src0_h, src1_h);
+ ST_UB(dst0, dst);
+ dst += 16;
+ }
+}
+
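+// Vertical scaled convolve: one output row per iteration, taken at the
+// stepped (y_q4 >> SUBPEL_BITS) source row. A zero subpel phase again
+// reduces to copying the center row, src_y + 3 * src_stride.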
+static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (y = 0; y < h; ++y) {
+ const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w4_msa(src_y, src_stride, &dst[y * dst_stride], y_filter);
+ } else {
+ uint32_t srcd = LW(src_y + 3 * src_stride);
+ SW(srcd, dst + y * dst_stride);
+ }
+
+ y_q4 += y_step_q4;
+ }
+}
+
+static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (y = 0; y < h; ++y) {
+ const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w8_msa(src_y, src_stride, &dst[y * dst_stride], y_filter);
+ } else {
+ uint64_t srcd = LD(src_y + 3 * src_stride);
+ SD(srcd, dst + y * dst_stride);
+ }
+
+ y_q4 += y_step_q4;
+ }
+}
+
+static void scaledconvolve_vert_mul16(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
+ int x, y;
+ int y_q4 = y0_q4;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (y = 0; y < h; ++y) {
+ const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_mul_w16_msa(src_y, src_stride, &dst[y * dst_stride], y_filter,
+ w);
+ } else {
+ for (x = 0; x < w; ++x) {
+ dst[x + y * dst_stride] = src_y[x + 3 * src_stride];
+ }
+ }
+
+ y_q4 += y_step_q4;
+ }
+}
+
+void vpx_scaled_2d_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+ // --Require an additional 8 rows for the horiz_w8 transpose tail.
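+  // --Worked check (assuming the maxima h = 64, y_step_q4 = 32, y0_q4 = 15):
+  //   intermediate_height = ((63 * 32 + 15) >> 4) + 8 = 126 + 8 = 134 <= 135.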
+ DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+ assert(x_step_q4 <= 64);
+
+ if ((0 == x0_q4) && (16 == x_step_q4) && (0 == y0_q4) && (16 == y_step_q4)) {
+ vpx_convolve_copy_msa(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ } else {
+ if (w >= 16) {
+ scaledconvolve_horiz_mul16(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4,
+ w, intermediate_height);
+ } else if (w == 8) {
+ scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4,
+ intermediate_height);
+ } else {
+ scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4,
+ intermediate_height);
+ }
+
+ if (w >= 16) {
+ scaledconvolve_vert_mul16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ } else if (w == 8) {
+ scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, h);
+ } else {
+ scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, h);
+ }
+ }
+}
diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c
index 410682271..13fce0077 100644
--- a/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c
+++ b/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c
@@ -628,9 +628,10 @@ static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
+ const int16_t *const filter_y = filter[y0_q4];
int8_t cnt, filt_ver[8];
assert(y_step_q4 == 16);
@@ -663,8 +664,8 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
&filt_ver[3], h);
break;
default:
- vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
} else {
@@ -690,8 +691,8 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
filt_ver, h);
break;
default:
- vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
}
diff --git a/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c
index 45399bad8..ce649935d 100644
--- a/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c
+++ b/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c
@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"
static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst,
@@ -188,13 +189,14 @@ static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int32_t filter_x_stride,
- const int16_t *filter_y, int32_t filter_y_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
int32_t w, int32_t h) {
- (void)filter_x;
- (void)filter_y;
- (void)filter_x_stride;
- (void)filter_y_stride;
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
switch (w) {
case 4: {
diff --git a/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c
index c3d87a4ab..c2ab33a2f 100644
--- a/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c
+++ b/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c
@@ -9,6 +9,7 @@
*/
#include <string.h>
+#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
@@ -198,13 +199,14 @@ static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int32_t filter_x_stride,
- const int16_t *filter_y, int32_t filter_y_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
int32_t w, int32_t h) {
- (void)filter_x;
- (void)filter_y;
- (void)filter_x_stride;
- (void)filter_y_stride;
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
switch (w) {
case 4: {
diff --git a/libvpx/vpx_dsp/mips/vpx_convolve_msa.h b/libvpx/vpx_dsp/mips/vpx_convolve_msa.h
index f75679521..d53244596 100644
--- a/libvpx/vpx_dsp/mips/vpx_convolve_msa.h
+++ b/libvpx/vpx_dsp/mips/vpx_convolve_msa.h
@@ -110,14 +110,13 @@ extern const uint8_t mc_filt_mask_arr[16 * 3];
ST_UB(tmp_m, (pdst)); \
}
-#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, \
- stride) \
- { \
- v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- \
- PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \
- PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
- AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
- ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \
+#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \
+ { \
+ v16u8 tmp0_m, tmp1_m; \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ \
+ PCKEV_B2_UB(in1, in0, in3, in2, tmp0_m, tmp1_m); \
+ AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \
+ ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
}
#endif /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */
diff --git a/libvpx/vpx_dsp/ppc/hadamard_vsx.c b/libvpx/vpx_dsp/ppc/hadamard_vsx.c
index 435e3eb5b..e279b3047 100644
--- a/libvpx/vpx_dsp/ppc/hadamard_vsx.c
+++ b/libvpx/vpx_dsp/ppc/hadamard_vsx.c
@@ -42,7 +42,7 @@ static void vpx_hadamard_s16_8x8_one_pass(int16x8_t v[8]) {
v[7] = vec_add(c1, c5);
}
-void vpx_hadamard_8x8_vsx(const int16_t *src_diff, int src_stride,
+void vpx_hadamard_8x8_vsx(const int16_t *src_diff, ptrdiff_t src_stride,
tran_low_t *coeff) {
int16x8_t v[8];
@@ -71,7 +71,7 @@ void vpx_hadamard_8x8_vsx(const int16_t *src_diff, int src_stride,
store_tran_low(v[7], 0, coeff + 56);
}
-void vpx_hadamard_16x16_vsx(const int16_t *src_diff, int src_stride,
+void vpx_hadamard_16x16_vsx(const int16_t *src_diff, ptrdiff_t src_stride,
tran_low_t *coeff) {
int i;
const uint16x8_t ones = vec_splat_u16(1);
diff --git a/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c b/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c
new file mode 100644
index 000000000..d43a9fd18
--- /dev/null
+++ b/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c
@@ -0,0 +1,1063 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static int16x8_t cospi1_v = { 16364, 16364, 16364, 16364,
+ 16364, 16364, 16364, 16364 };
+static int16x8_t cospi2_v = { 16305, 16305, 16305, 16305,
+ 16305, 16305, 16305, 16305 };
+static int16x8_t cospi3_v = { 16207, 16207, 16207, 16207,
+ 16207, 16207, 16207, 16207 };
+static int16x8_t cospi4_v = { 16069, 16069, 16069, 16069,
+ 16069, 16069, 16069, 16069 };
+static int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069,
+ -16069, -16069, -16069, -16069 };
+static int16x8_t cospi5_v = { 15893, 15893, 15893, 15893,
+ 15893, 15893, 15893, 15893 };
+static int16x8_t cospi6_v = { 15679, 15679, 15679, 15679,
+ 15679, 15679, 15679, 15679 };
+static int16x8_t cospi7_v = { 15426, 15426, 15426, 15426,
+ 15426, 15426, 15426, 15426 };
+static int16x8_t cospi8_v = { 15137, 15137, 15137, 15137,
+ 15137, 15137, 15137, 15137 };
+static int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137,
+ -15137, -15137, -15137, -15137 };
+static int16x8_t cospi9_v = { 14811, 14811, 14811, 14811,
+ 14811, 14811, 14811, 14811 };
+static int16x8_t cospi10_v = { 14449, 14449, 14449, 14449,
+ 14449, 14449, 14449, 14449 };
+static int16x8_t cospi11_v = { 14053, 14053, 14053, 14053,
+ 14053, 14053, 14053, 14053 };
+static int16x8_t cospi12_v = { 13623, 13623, 13623, 13623,
+ 13623, 13623, 13623, 13623 };
+static int16x8_t cospi13_v = { 13160, 13160, 13160, 13160,
+ 13160, 13160, 13160, 13160 };
+static int16x8_t cospi14_v = { 12665, 12665, 12665, 12665,
+ 12665, 12665, 12665, 12665 };
+static int16x8_t cospi15_v = { 12140, 12140, 12140, 12140,
+ 12140, 12140, 12140, 12140 };
+static int16x8_t cospi16_v = { 11585, 11585, 11585, 11585,
+ 11585, 11585, 11585, 11585 };
+static int16x8_t cospi17_v = { 11003, 11003, 11003, 11003,
+ 11003, 11003, 11003, 11003 };
+static int16x8_t cospi18_v = { 10394, 10394, 10394, 10394,
+ 10394, 10394, 10394, 10394 };
+static int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, 9760, 9760, 9760, 9760 };
+static int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, 9102, 9102, 9102, 9102 };
+static int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102,
+ -9102, -9102, -9102, -9102 };
+static int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, 8423, 8423, 8423, 8423 };
+static int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, 7723, 7723, 7723, 7723 };
+static int16x8_t cospi23_v = { 7005, 7005, 7005, 7005, 7005, 7005, 7005, 7005 };
+static int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, 6270, 6270, 6270, 6270 };
+static int16x8_t cospi24_mv = { -6270, -6270, -6270, -6270,
+ -6270, -6270, -6270, -6270 };
+static int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, 5520, 5520, 5520, 5520 };
+static int16x8_t cospi26_v = { 4756, 4756, 4756, 4756, 4756, 4756, 4756, 4756 };
+static int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, 3981, 3981, 3981, 3981 };
+static int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, 3196, 3196, 3196, 3196 };
+static int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, 2404, 2404, 2404, 2404 };
+static int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, 1606, 1606, 1606, 1606 };
+static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 };
+
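+// ROUND_SHIFT_INIT and DCT_CONST_ROUND_SHIFT mirror the scalar
+// dct_const_round_shift(): add 1 << (DCT_CONST_BITS - 1) = 8192, then
+// arithmetic-shift right by DCT_CONST_BITS = 14.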
+#define ROUND_SHIFT_INIT \
+ const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13)); \
+ const uint32x4_t shift14 = vec_splat_u32(14);
+
+#define DCT_CONST_ROUND_SHIFT(vec) vec = vec_sra(vec_add(vec, shift), shift14);
+
+#define PIXEL_ADD_INIT \
+ int16x8_t add8 = vec_splat_s16(8); \
+ uint16x8_t shift4 = vec_splat_u16(4);
+
+#define PIXEL_ADD4(out, in) out = vec_sra(vec_add(in, add8), shift4);
+
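+// One pass of the 4-point inverse DCT over two packed rows: one half of the
+// butterfly is scaled by cospi16_v, the other by cospi8_v/cospi24_v, and the
+// final vec_perm with mask0 swaps out1's halves into output order.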
+#define IDCT4(in0, in1, out0, out1) \
+ t0 = vec_add(in0, in1); \
+ t1 = vec_sub(in0, in1); \
+ tmp16_0 = vec_mergeh(t0, t1); \
+ temp1 = vec_sra(vec_add(vec_mule(tmp16_0, cospi16_v), shift), shift14); \
+ temp2 = vec_sra(vec_add(vec_mulo(tmp16_0, cospi16_v), shift), shift14); \
+ \
+ tmp16_0 = vec_mergel(in0, in1); \
+ temp3 = vec_sub(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \
+ DCT_CONST_ROUND_SHIFT(temp3); \
+ temp4 = vec_add(vec_mule(tmp16_0, cospi8_v), vec_mulo(tmp16_0, cospi24_v)); \
+ DCT_CONST_ROUND_SHIFT(temp4); \
+ \
+ step0 = vec_packs(temp1, temp2); \
+ step1 = vec_packs(temp4, temp3); \
+ out0 = vec_add(step0, step1); \
+ out1 = vec_sub(step0, step1); \
+ out1 = vec_perm(out1, out1, mask0);
+
+void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int32x4_t temp1, temp2, temp3, temp4;
+ int16x8_t step0, step1, tmp16_0, tmp16_1, t_out0, t_out1;
+ uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF,
+ 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 };
+ uint8x16_t mask1 = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 };
+ int16x8_t v0 = load_tran_low(0, input);
+ int16x8_t v1 = load_tran_low(8 * sizeof(*input), input);
+ int16x8_t t0 = vec_mergeh(v0, v1);
+ int16x8_t t1 = vec_mergel(v0, v1);
+
+ uint8x16_t dest0 = vec_vsx_ld(0, dest);
+ uint8x16_t dest1 = vec_vsx_ld(stride, dest);
+ uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
+ uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest);
+ uint8x16_t zerov = vec_splat_u8(0);
+ int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov);
+ int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov);
+ int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov);
+ int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov);
+ uint8x16_t output_v;
+  uint8_t tmp_dest[16];
+  int i, j;
+ ROUND_SHIFT_INIT
+ PIXEL_ADD_INIT;
+
+ v0 = vec_mergeh(t0, t1);
+ v1 = vec_mergel(t0, t1);
+
+ IDCT4(v0, v1, t_out0, t_out1);
+ // transpose
+ t0 = vec_mergeh(t_out0, t_out1);
+ t1 = vec_mergel(t_out0, t_out1);
+ v0 = vec_mergeh(t0, t1);
+ v1 = vec_mergel(t0, t1);
+ IDCT4(v0, v1, t_out0, t_out1);
+
+ PIXEL_ADD4(v0, t_out0);
+ PIXEL_ADD4(v1, t_out1);
+ tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0);
+ tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1);
+ output_v = vec_packsu(tmp16_0, tmp16_1);
+
+ vec_vsx_st(output_v, 0, tmp_dest);
+  for (i = 0; i < 4; i++)
+    for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i];
+}
+
+#define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+ out0 = vec_mergeh(in0, in1); \
+ out1 = vec_mergel(in0, in1); \
+ out2 = vec_mergeh(in2, in3); \
+ out3 = vec_mergel(in2, in3); \
+ out4 = vec_mergeh(in4, in5); \
+ out5 = vec_mergel(in4, in5); \
+ out6 = vec_mergeh(in6, in7); \
+ out7 = vec_mergel(in6, in7); \
+ in0 = (int16x8_t)vec_mergeh((int32x4_t)out0, (int32x4_t)out2); \
+ in1 = (int16x8_t)vec_mergel((int32x4_t)out0, (int32x4_t)out2); \
+ in2 = (int16x8_t)vec_mergeh((int32x4_t)out1, (int32x4_t)out3); \
+ in3 = (int16x8_t)vec_mergel((int32x4_t)out1, (int32x4_t)out3); \
+ in4 = (int16x8_t)vec_mergeh((int32x4_t)out4, (int32x4_t)out6); \
+ in5 = (int16x8_t)vec_mergel((int32x4_t)out4, (int32x4_t)out6); \
+ in6 = (int16x8_t)vec_mergeh((int32x4_t)out5, (int32x4_t)out7); \
+ in7 = (int16x8_t)vec_mergel((int32x4_t)out5, (int32x4_t)out7); \
+ out0 = vec_perm(in0, in4, tr8_mask0); \
+ out1 = vec_perm(in0, in4, tr8_mask1); \
+ out2 = vec_perm(in1, in5, tr8_mask0); \
+ out3 = vec_perm(in1, in5, tr8_mask1); \
+ out4 = vec_perm(in2, in6, tr8_mask0); \
+ out5 = vec_perm(in2, in6, tr8_mask1); \
+ out6 = vec_perm(in3, in7, tr8_mask0); \
+ out7 = vec_perm(in3, in7, tr8_mask1);
+
+/* Computes: temp1 = step[x] * cospi_q - step[y] * cospi_z
+ *           temp2 = step[x] * cospi_z + step[y] * cospi_q */
+#define STEP8_0(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1) \
+ tmp16_0 = vec_mergeh(inpt0, inpt1); \
+ tmp16_1 = vec_mergel(inpt0, inpt1); \
+ temp10 = vec_sub(vec_mule(tmp16_0, cospi0), vec_mulo(tmp16_0, cospi1)); \
+ temp11 = vec_sub(vec_mule(tmp16_1, cospi0), vec_mulo(tmp16_1, cospi1)); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ outpt0 = vec_packs(temp10, temp11); \
+ temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \
+ temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ outpt1 = vec_packs(temp10, temp11);
+
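+/* Computes: temp1 = (step[x] - step[y]) * cospi_q
+ *           temp2 = (step[x] + step[y]) * cospi_q */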
+#define STEP8_1(inpt0, inpt1, outpt0, outpt1, cospi) \
+ tmp16_2 = vec_sub(inpt0, inpt1); \
+ tmp16_3 = vec_add(inpt0, inpt1); \
+ tmp16_0 = vec_mergeh(tmp16_2, tmp16_3); \
+ tmp16_1 = vec_mergel(tmp16_2, tmp16_3); \
+ temp10 = vec_mule(tmp16_0, cospi); \
+ temp11 = vec_mule(tmp16_1, cospi); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ outpt0 = vec_packs(temp10, temp11); \
+ temp10 = vec_mulo(tmp16_0, cospi); \
+ temp11 = vec_mulo(tmp16_1, cospi); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ outpt1 = vec_packs(temp10, temp11);
+
+#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7) \
+ /* stage 1 */ \
+ step0 = in0; \
+ step2 = in4; \
+ step1 = in2; \
+ step3 = in6; \
+ \
+ STEP8_0(in1, in7, step4, step7, cospi28_v, cospi4_v); \
+ STEP8_0(in5, in3, step5, step6, cospi12_v, cospi20_v); \
+ \
+ /* stage 2 */ \
+ STEP8_1(step0, step2, in1, in0, cospi16_v); \
+ STEP8_0(step1, step3, in2, in3, cospi24_v, cospi8_v); \
+ in4 = vec_add(step4, step5); \
+ in5 = vec_sub(step4, step5); \
+ in6 = vec_sub(step7, step6); \
+ in7 = vec_add(step6, step7); \
+ \
+ /* stage 3 */ \
+ step0 = vec_add(in0, in3); \
+ step1 = vec_add(in1, in2); \
+ step2 = vec_sub(in1, in2); \
+ step3 = vec_sub(in0, in3); \
+ step4 = in4; \
+ STEP8_1(in6, in5, step5, step6, cospi16_v); \
+ step7 = in7; \
+ \
+ /* stage 4 */ \
+ in0 = vec_add(step0, step7); \
+ in1 = vec_add(step1, step6); \
+ in2 = vec_add(step2, step5); \
+ in3 = vec_add(step3, step4); \
+ in4 = vec_sub(step3, step4); \
+ in5 = vec_sub(step2, step5); \
+ in6 = vec_sub(step1, step6); \
+ in7 = vec_sub(step0, step7);
+
+#define PIXEL_ADD(in, out, add, shiftx) \
+ out = vec_add(vec_sra(vec_add(in, add), shiftx), out);
+
+static uint8x16_t tr8_mask0 = {
+ 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+};
+static uint8x16_t tr8_mask1 = {
+ 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF,
+ 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F
+};
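+// Full 8x8 inverse DCT plus reconstruction: two IDCT8 passes with
+// interleaved TRANSPOSE8x8 calls, then the (x + 16) >> 5 rounded residual is
+// added to dest, with xxpermdi merging the 8-byte results back into the
+// 16-byte destination rows so only eight pixels per row are overwritten.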
+void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int32x4_t temp10, temp11;
+ int16x8_t step0, step1, step2, step3, step4, step5, step6, step7;
+ int16x8_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp16_0, tmp16_1,
+ tmp16_2, tmp16_3;
+ int16x8_t src0 = load_tran_low(0, input);
+ int16x8_t src1 = load_tran_low(8 * sizeof(*input), input);
+ int16x8_t src2 = load_tran_low(16 * sizeof(*input), input);
+ int16x8_t src3 = load_tran_low(24 * sizeof(*input), input);
+ int16x8_t src4 = load_tran_low(32 * sizeof(*input), input);
+ int16x8_t src5 = load_tran_low(40 * sizeof(*input), input);
+ int16x8_t src6 = load_tran_low(48 * sizeof(*input), input);
+ int16x8_t src7 = load_tran_low(56 * sizeof(*input), input);
+ uint8x16_t dest0 = vec_vsx_ld(0, dest);
+ uint8x16_t dest1 = vec_vsx_ld(stride, dest);
+ uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
+ uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest);
+ uint8x16_t dest4 = vec_vsx_ld(4 * stride, dest);
+ uint8x16_t dest5 = vec_vsx_ld(5 * stride, dest);
+ uint8x16_t dest6 = vec_vsx_ld(6 * stride, dest);
+ uint8x16_t dest7 = vec_vsx_ld(7 * stride, dest);
+ uint8x16_t zerov = vec_splat_u8(0);
+ int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov);
+ int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov);
+ int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov);
+ int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov);
+ int16x8_t d_u4 = (int16x8_t)vec_mergeh(dest4, zerov);
+ int16x8_t d_u5 = (int16x8_t)vec_mergeh(dest5, zerov);
+ int16x8_t d_u6 = (int16x8_t)vec_mergeh(dest6, zerov);
+ int16x8_t d_u7 = (int16x8_t)vec_mergeh(dest7, zerov);
+ int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(1));
+ uint16x8_t shift5 = vec_splat_u16(5);
+ uint8x16_t output0, output1, output2, output3;
+ ROUND_SHIFT_INIT;
+
+ TRANSPOSE8x8(src0, src1, src2, src3, src4, src5, src6, src7, tmp0, tmp1, tmp2,
+ tmp3, tmp4, tmp5, tmp6, tmp7);
+
+ IDCT8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+ TRANSPOSE8x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, src2,
+ src3, src4, src5, src6, src7);
+ IDCT8(src0, src1, src2, src3, src4, src5, src6, src7);
+ PIXEL_ADD(src0, d_u0, add, shift5);
+ PIXEL_ADD(src1, d_u1, add, shift5);
+ PIXEL_ADD(src2, d_u2, add, shift5);
+ PIXEL_ADD(src3, d_u3, add, shift5);
+ PIXEL_ADD(src4, d_u4, add, shift5);
+ PIXEL_ADD(src5, d_u5, add, shift5);
+ PIXEL_ADD(src6, d_u6, add, shift5);
+ PIXEL_ADD(src7, d_u7, add, shift5);
+ output0 = vec_packsu(d_u0, d_u1);
+ output1 = vec_packsu(d_u2, d_u3);
+ output2 = vec_packsu(d_u4, d_u5);
+ output3 = vec_packsu(d_u6, d_u7);
+
+ vec_vsx_st(xxpermdi(output0, dest0, 1), 0, dest);
+ vec_vsx_st(xxpermdi(output0, dest1, 3), stride, dest);
+ vec_vsx_st(xxpermdi(output1, dest2, 1), 2 * stride, dest);
+ vec_vsx_st(xxpermdi(output1, dest3, 3), 3 * stride, dest);
+ vec_vsx_st(xxpermdi(output2, dest4, 1), 4 * stride, dest);
+ vec_vsx_st(xxpermdi(output2, dest5, 3), 5 * stride, dest);
+ vec_vsx_st(xxpermdi(output3, dest6, 1), 6 * stride, dest);
+ vec_vsx_st(xxpermdi(output3, dest7, 3), 7 * stride, dest);
+}
+
+#define LOAD_INPUT16(load, source, offset, step, in0, in1, in2, in3, in4, in5, \
+ in6, in7, in8, in9, inA, inB, inC, inD, inE, inF) \
+ in0 = load(offset, source); \
+ in1 = load((step) + (offset), source); \
+ in2 = load(2 * (step) + (offset), source); \
+ in3 = load(3 * (step) + (offset), source); \
+ in4 = load(4 * (step) + (offset), source); \
+ in5 = load(5 * (step) + (offset), source); \
+ in6 = load(6 * (step) + (offset), source); \
+ in7 = load(7 * (step) + (offset), source); \
+ in8 = load(8 * (step) + (offset), source); \
+ in9 = load(9 * (step) + (offset), source); \
+ inA = load(10 * (step) + (offset), source); \
+ inB = load(11 * (step) + (offset), source); \
+ inC = load(12 * (step) + (offset), source); \
+ inD = load(13 * (step) + (offset), source); \
+ inE = load(14 * (step) + (offset), source); \
+ inF = load(15 * (step) + (offset), source);
+
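+/* Computes: temp1 = (step[x] - step[y]) * cospi_q
+ *           temp2 = (step[x] + step[y]) * cospi_q
+ * like STEP8_1, but the add/sub is applied to the 32-bit products so the
+ * int16 intermediates cannot overflow. */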
+#define STEP16_1(inpt0, inpt1, outpt0, outpt1, cospi) \
+ tmp16_0 = vec_mergeh(inpt0, inpt1); \
+ tmp16_1 = vec_mergel(inpt0, inpt1); \
+ temp10 = vec_mule(tmp16_0, cospi); \
+ temp11 = vec_mule(tmp16_1, cospi); \
+ temp20 = vec_mulo(tmp16_0, cospi); \
+ temp21 = vec_mulo(tmp16_1, cospi); \
+ temp30 = vec_sub(temp10, temp20); \
+ temp10 = vec_add(temp10, temp20); \
+ temp20 = vec_sub(temp11, temp21); \
+ temp21 = vec_add(temp11, temp21); \
+ DCT_CONST_ROUND_SHIFT(temp30); \
+ DCT_CONST_ROUND_SHIFT(temp20); \
+ outpt0 = vec_packs(temp30, temp20); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp21); \
+ outpt1 = vec_packs(temp10, temp21);
+
+#define IDCT16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, inA, inB, \
+ inC, inD, inE, inF, out0, out1, out2, out3, out4, out5, out6, \
+ out7, out8, out9, outA, outB, outC, outD, outE, outF) \
+ /* stage 1 */ \
+ /* out0 = in0; */ \
+ out1 = in8; \
+ out2 = in4; \
+ out3 = inC; \
+ out4 = in2; \
+ out5 = inA; \
+ out6 = in6; \
+ out7 = inE; \
+ out8 = in1; \
+ out9 = in9; \
+ outA = in5; \
+ outB = inD; \
+ outC = in3; \
+ outD = inB; \
+ outE = in7; \
+ outF = inF; \
+ \
+ /* stage 2 */ \
+ /* in0 = out0; */ \
+ in1 = out1; \
+ in2 = out2; \
+ in3 = out3; \
+ in4 = out4; \
+ in5 = out5; \
+ in6 = out6; \
+ in7 = out7; \
+ \
+ STEP8_0(out8, outF, in8, inF, cospi30_v, cospi2_v); \
+ STEP8_0(out9, outE, in9, inE, cospi14_v, cospi18_v); \
+ STEP8_0(outA, outD, inA, inD, cospi22_v, cospi10_v); \
+ STEP8_0(outB, outC, inB, inC, cospi6_v, cospi26_v); \
+ \
+ /* stage 3 */ \
+ out0 = in0; \
+ out1 = in1; \
+ out2 = in2; \
+ out3 = in3; \
+ \
+ STEP8_0(in4, in7, out4, out7, cospi28_v, cospi4_v); \
+ STEP8_0(in5, in6, out5, out6, cospi12_v, cospi20_v); \
+ \
+ out8 = vec_add(in8, in9); \
+ out9 = vec_sub(in8, in9); \
+ outA = vec_sub(inB, inA); \
+ outB = vec_add(inA, inB); \
+ outC = vec_add(inC, inD); \
+ outD = vec_sub(inC, inD); \
+ outE = vec_sub(inF, inE); \
+ outF = vec_add(inE, inF); \
+ \
+ /* stage 4 */ \
+ STEP16_1(out0, out1, in1, in0, cospi16_v); \
+ STEP8_0(out2, out3, in2, in3, cospi24_v, cospi8_v); \
+ in4 = vec_add(out4, out5); \
+ in5 = vec_sub(out4, out5); \
+ in6 = vec_sub(out7, out6); \
+ in7 = vec_add(out6, out7); \
+ \
+ in8 = out8; \
+ inF = outF; \
+ tmp16_0 = vec_mergeh(out9, outE); \
+ tmp16_1 = vec_mergel(out9, outE); \
+ temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \
+ temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ in9 = vec_packs(temp10, temp11); \
+ temp10 = vec_add(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \
+ temp11 = vec_add(vec_mule(tmp16_1, cospi24_v), vec_mulo(tmp16_1, cospi8_v)); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ inE = vec_packs(temp10, temp11); \
+ \
+ tmp16_0 = vec_mergeh(outA, outD); \
+ tmp16_1 = vec_mergel(outA, outD); \
+ temp10 = \
+ vec_sub(vec_mule(tmp16_0, cospi24_mv), vec_mulo(tmp16_0, cospi8_v)); \
+ temp11 = \
+ vec_sub(vec_mule(tmp16_1, cospi24_mv), vec_mulo(tmp16_1, cospi8_v)); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ inA = vec_packs(temp10, temp11); \
+ temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \
+ temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ inD = vec_packs(temp10, temp11); \
+ \
+ inB = outB; \
+ inC = outC; \
+ \
+ /* stage 5 */ \
+ out0 = vec_add(in0, in3); \
+ out1 = vec_add(in1, in2); \
+ out2 = vec_sub(in1, in2); \
+ out3 = vec_sub(in0, in3); \
+ out4 = in4; \
+ STEP16_1(in6, in5, out5, out6, cospi16_v); \
+ out7 = in7; \
+ \
+ out8 = vec_add(in8, inB); \
+ out9 = vec_add(in9, inA); \
+ outA = vec_sub(in9, inA); \
+ outB = vec_sub(in8, inB); \
+ outC = vec_sub(inF, inC); \
+ outD = vec_sub(inE, inD); \
+ outE = vec_add(inD, inE); \
+ outF = vec_add(inC, inF); \
+ \
+ /* stage 6 */ \
+ in0 = vec_add(out0, out7); \
+ in1 = vec_add(out1, out6); \
+ in2 = vec_add(out2, out5); \
+ in3 = vec_add(out3, out4); \
+ in4 = vec_sub(out3, out4); \
+ in5 = vec_sub(out2, out5); \
+ in6 = vec_sub(out1, out6); \
+ in7 = vec_sub(out0, out7); \
+ in8 = out8; \
+ in9 = out9; \
+ STEP16_1(outD, outA, inA, inD, cospi16_v); \
+ STEP16_1(outC, outB, inB, inC, cospi16_v); \
+ inE = outE; \
+ inF = outF; \
+ \
+ /* stage 7 */ \
+ out0 = vec_add(in0, inF); \
+ out1 = vec_add(in1, inE); \
+ out2 = vec_add(in2, inD); \
+ out3 = vec_add(in3, inC); \
+ out4 = vec_add(in4, inB); \
+ out5 = vec_add(in5, inA); \
+ out6 = vec_add(in6, in9); \
+ out7 = vec_add(in7, in8); \
+ out8 = vec_sub(in7, in8); \
+ out9 = vec_sub(in6, in9); \
+ outA = vec_sub(in5, inA); \
+ outB = vec_sub(in4, inB); \
+ outC = vec_sub(in3, inC); \
+ outD = vec_sub(in2, inD); \
+ outE = vec_sub(in1, inE); \
+ outF = vec_sub(in0, inF);
+
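+// Adds the (x + 32) >> 6 rounded residual pair to one 16-pixel destination
+// row and stores it.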
+#define PIXEL_ADD_STORE16(in0, in1, dst, offset) \
+ d_uh = (int16x8_t)vec_mergeh(dst, zerov); \
+ d_ul = (int16x8_t)vec_mergel(dst, zerov); \
+ PIXEL_ADD(in0, d_uh, add, shift6); \
+ PIXEL_ADD(in1, d_ul, add, shift6); \
+ vec_vsx_st(vec_packsu(d_uh, d_ul), offset, dest);
+
+void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int32x4_t temp10, temp11, temp20, temp21, temp30;
+ int16x8_t src00, src01, src02, src03, src04, src05, src06, src07, src10,
+ src11, src12, src13, src14, src15, src16, src17;
+ int16x8_t src20, src21, src22, src23, src24, src25, src26, src27, src30,
+ src31, src32, src33, src34, src35, src36, src37;
+ int16x8_t tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10,
+ tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp16_0, tmp16_1;
+ int16x8_t tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30,
+ tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37;
+ uint8x16_t dest0, dest1, dest2, dest3, dest4, dest5, dest6, dest7, dest8,
+ dest9, destA, destB, destC, destD, destE, destF;
+ int16x8_t d_uh, d_ul;
+ int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2));
+ uint16x8_t shift6 = vec_splat_u16(6);
+ uint8x16_t zerov = vec_splat_u8(0);
+ ROUND_SHIFT_INIT;
+
+ // transform rows
+ // load and transform the upper half of 16x16 matrix
+ LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), src00, src10, src01,
+ src11, src02, src12, src03, src13, src04, src14, src05, src15,
+ src06, src16, src07, src17);
+ TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00,
+ tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07);
+ TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10,
+ tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17);
+ IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10, tmp11,
+ tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, src00, src01, src02, src03,
+ src04, src05, src06, src07, src10, src11, src12, src13, src14, src15,
+ src16, src17);
+ TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00,
+ tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07);
+ TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10,
+ tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17);
+
+ // load and transform the lower half of 16x16 matrix
+ LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input),
+ 8 * sizeof(*input), src20, src30, src21, src31, src22, src32,
+ src23, src33, src24, src34, src25, src35, src26, src36, src27,
+ src37);
+ TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20,
+ tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27);
+ TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30,
+ tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37);
+ IDCT16(tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30, tmp31,
+ tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src20, src21, src22, src23,
+ src24, src25, src26, src27, src30, src31, src32, src33, src34, src35,
+ src36, src37);
+ TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20,
+ tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27);
+ TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30,
+ tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37);
+
+ // transform columns
+ // left half first
+ IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp20, tmp21,
+ tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, src00, src01, src02, src03,
+ src04, src05, src06, src07, src20, src21, src22, src23, src24, src25,
+ src26, src27);
+ // right half
+ IDCT16(tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp30, tmp31,
+ tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src10, src11, src12, src13,
+ src14, src15, src16, src17, src30, src31, src32, src33, src34, src35,
+ src36, src37);
+
+ // load dest
+ LOAD_INPUT16(vec_vsx_ld, dest, 0, stride, dest0, dest1, dest2, dest3, dest4,
+ dest5, dest6, dest7, dest8, dest9, destA, destB, destC, destD,
+ destE, destF);
+
+ PIXEL_ADD_STORE16(src00, src10, dest0, 0);
+ PIXEL_ADD_STORE16(src01, src11, dest1, stride);
+ PIXEL_ADD_STORE16(src02, src12, dest2, 2 * stride);
+ PIXEL_ADD_STORE16(src03, src13, dest3, 3 * stride);
+ PIXEL_ADD_STORE16(src04, src14, dest4, 4 * stride);
+ PIXEL_ADD_STORE16(src05, src15, dest5, 5 * stride);
+ PIXEL_ADD_STORE16(src06, src16, dest6, 6 * stride);
+ PIXEL_ADD_STORE16(src07, src17, dest7, 7 * stride);
+
+ PIXEL_ADD_STORE16(src20, src30, dest8, 8 * stride);
+ PIXEL_ADD_STORE16(src21, src31, dest9, 9 * stride);
+ PIXEL_ADD_STORE16(src22, src32, destA, 10 * stride);
+ PIXEL_ADD_STORE16(src23, src33, destB, 11 * stride);
+ PIXEL_ADD_STORE16(src24, src34, destC, 12 * stride);
+ PIXEL_ADD_STORE16(src25, src35, destD, 13 * stride);
+ PIXEL_ADD_STORE16(src26, src36, destE, 14 * stride);
+ PIXEL_ADD_STORE16(src27, src37, destF, 15 * stride);
+}
+
+#define LOAD_8x32(load, in00, in01, in02, in03, in10, in11, in12, in13, in20, \
+ in21, in22, in23, in30, in31, in32, in33, in40, in41, in42, \
+ in43, in50, in51, in52, in53, in60, in61, in62, in63, in70, \
+ in71, in72, in73, offset) \
+ /* load the first row from the 8x32 block*/ \
+ in00 = load(offset, input); \
+ in01 = load(offset + 16, input); \
+ in02 = load(offset + 2 * 16, input); \
+ in03 = load(offset + 3 * 16, input); \
+ \
+ in10 = load(offset + 4 * 16, input); \
+ in11 = load(offset + 5 * 16, input); \
+ in12 = load(offset + 6 * 16, input); \
+ in13 = load(offset + 7 * 16, input); \
+ \
+ in20 = load(offset + 8 * 16, input); \
+ in21 = load(offset + 9 * 16, input); \
+ in22 = load(offset + 10 * 16, input); \
+ in23 = load(offset + 11 * 16, input); \
+ \
+ in30 = load(offset + 12 * 16, input); \
+ in31 = load(offset + 13 * 16, input); \
+ in32 = load(offset + 14 * 16, input); \
+ in33 = load(offset + 15 * 16, input); \
+ \
+ in40 = load(offset + 16 * 16, input); \
+ in41 = load(offset + 17 * 16, input); \
+ in42 = load(offset + 18 * 16, input); \
+ in43 = load(offset + 19 * 16, input); \
+ \
+ in50 = load(offset + 20 * 16, input); \
+ in51 = load(offset + 21 * 16, input); \
+ in52 = load(offset + 22 * 16, input); \
+ in53 = load(offset + 23 * 16, input); \
+ \
+ in60 = load(offset + 24 * 16, input); \
+ in61 = load(offset + 25 * 16, input); \
+ in62 = load(offset + 26 * 16, input); \
+ in63 = load(offset + 27 * 16, input); \
+ \
+ /* load the last row from the 8x32 block*/ \
+ in70 = load(offset + 28 * 16, input); \
+ in71 = load(offset + 29 * 16, input); \
+ in72 = load(offset + 30 * 16, input); \
+ in73 = load(offset + 31 * 16, input);
+
+/* Computes: temp1 = -step[x] * cospi_q + step[y] * cospi_z
+ *           temp2 = step[x] * cospi_z + step[y] * cospi_q */
+#define STEP32(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1) \
+ tmp16_0 = vec_mergeh(inpt0, inpt1); \
+ tmp16_1 = vec_mergel(inpt0, inpt1); \
+ temp10 = vec_sub(vec_mulo(tmp16_0, cospi1), vec_mule(tmp16_0, cospi0)); \
+ temp11 = vec_sub(vec_mulo(tmp16_1, cospi1), vec_mule(tmp16_1, cospi0)); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ outpt0 = vec_packs(temp10, temp11); \
+ temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \
+ temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ outpt1 = vec_packs(temp10, temp11);
+
+/* Computes: temp1 = -step[x] * cospi_q - step[y] * cospi_z
+ *           temp2 = -step[x] * cospi_z + step[y] * cospi_q */
+#define STEP32_1(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1, cospi1m) \
+ tmp16_0 = vec_mergeh(inpt0, inpt1); \
+ tmp16_1 = vec_mergel(inpt0, inpt1); \
+ temp10 = vec_sub(vec_mulo(tmp16_0, cospi1m), vec_mule(tmp16_0, cospi0)); \
+ temp11 = vec_sub(vec_mulo(tmp16_1, cospi1m), vec_mule(tmp16_1, cospi0)); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ outpt0 = vec_packs(temp10, temp11); \
+ temp10 = vec_sub(vec_mulo(tmp16_0, cospi0), vec_mule(tmp16_0, cospi1)); \
+ temp11 = vec_sub(vec_mulo(tmp16_1, cospi0), vec_mule(tmp16_1, cospi1)); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ outpt1 = vec_packs(temp10, temp11);
+
+#define IDCT32(in0, in1, in2, in3, out) \
+ \
+ /* stage 1 */ \
+ /* out[0][0] = in[0][0]; */ \
+ out[0][1] = in2[0]; \
+ out[0][2] = in1[0]; \
+ out[0][3] = in3[0]; \
+ out[0][4] = in0[4]; \
+ out[0][5] = in2[4]; \
+ out[0][6] = in1[4]; \
+ out[0][7] = in3[4]; \
+ out[1][0] = in0[2]; \
+ out[1][1] = in2[2]; \
+ out[1][2] = in1[2]; \
+ out[1][3] = in3[2]; \
+ out[1][4] = in0[6]; \
+ out[1][5] = in2[6]; \
+ out[1][6] = in1[6]; \
+ out[1][7] = in3[6]; \
+ \
+ STEP8_0(in0[1], in3[7], out[2][0], out[3][7], cospi31_v, cospi1_v); \
+ STEP8_0(in2[1], in1[7], out[2][1], out[3][6], cospi15_v, cospi17_v); \
+ STEP8_0(in1[1], in2[7], out[2][2], out[3][5], cospi23_v, cospi9_v); \
+ STEP8_0(in3[1], in0[7], out[2][3], out[3][4], cospi7_v, cospi25_v); \
+ STEP8_0(in0[5], in3[3], out[2][4], out[3][3], cospi27_v, cospi5_v); \
+ STEP8_0(in2[5], in1[3], out[2][5], out[3][2], cospi11_v, cospi21_v); \
+ STEP8_0(in1[5], in2[3], out[2][6], out[3][1], cospi19_v, cospi13_v); \
+ STEP8_0(in3[5], in0[3], out[2][7], out[3][0], cospi3_v, cospi29_v); \
+ \
+ /* stage 2 */ \
+ /* in0[0] = out[0][0]; */ \
+ in0[1] = out[0][1]; \
+ in0[2] = out[0][2]; \
+ in0[3] = out[0][3]; \
+ in0[4] = out[0][4]; \
+ in0[5] = out[0][5]; \
+ in0[6] = out[0][6]; \
+ in0[7] = out[0][7]; \
+ \
+ STEP8_0(out[1][0], out[1][7], in1[0], in1[7], cospi30_v, cospi2_v); \
+ STEP8_0(out[1][1], out[1][6], in1[1], in1[6], cospi14_v, cospi18_v); \
+ STEP8_0(out[1][2], out[1][5], in1[2], in1[5], cospi22_v, cospi10_v); \
+ STEP8_0(out[1][3], out[1][4], in1[3], in1[4], cospi6_v, cospi26_v); \
+ \
+ in2[0] = vec_add(out[2][0], out[2][1]); \
+ in2[1] = vec_sub(out[2][0], out[2][1]); \
+ in2[2] = vec_sub(out[2][3], out[2][2]); \
+ in2[3] = vec_add(out[2][3], out[2][2]); \
+ in2[4] = vec_add(out[2][4], out[2][5]); \
+ in2[5] = vec_sub(out[2][4], out[2][5]); \
+ in2[6] = vec_sub(out[2][7], out[2][6]); \
+ in2[7] = vec_add(out[2][7], out[2][6]); \
+ in3[0] = vec_add(out[3][0], out[3][1]); \
+ in3[1] = vec_sub(out[3][0], out[3][1]); \
+ in3[2] = vec_sub(out[3][3], out[3][2]); \
+ in3[3] = vec_add(out[3][3], out[3][2]); \
+ in3[4] = vec_add(out[3][4], out[3][5]); \
+ in3[5] = vec_sub(out[3][4], out[3][5]); \
+ in3[6] = vec_sub(out[3][7], out[3][6]); \
+ in3[7] = vec_add(out[3][6], out[3][7]); \
+ \
+ /* stage 3 */ \
+ out[0][0] = in0[0]; \
+ out[0][1] = in0[1]; \
+ out[0][2] = in0[2]; \
+ out[0][3] = in0[3]; \
+ \
+ STEP8_0(in0[4], in0[7], out[0][4], out[0][7], cospi28_v, cospi4_v); \
+ STEP8_0(in0[5], in0[6], out[0][5], out[0][6], cospi12_v, cospi20_v); \
+ \
+ out[1][0] = vec_add(in1[0], in1[1]); \
+ out[1][1] = vec_sub(in1[0], in1[1]); \
+ out[1][2] = vec_sub(in1[3], in1[2]); \
+ out[1][3] = vec_add(in1[2], in1[3]); \
+ out[1][4] = vec_add(in1[4], in1[5]); \
+ out[1][5] = vec_sub(in1[4], in1[5]); \
+ out[1][6] = vec_sub(in1[7], in1[6]); \
+ out[1][7] = vec_add(in1[6], in1[7]); \
+ \
+ out[2][0] = in2[0]; \
+ out[3][7] = in3[7]; \
+ STEP32(in2[1], in3[6], out[2][1], out[3][6], cospi4_v, cospi28_v); \
+ STEP32_1(in2[2], in3[5], out[2][2], out[3][5], cospi28_v, cospi4_v, \
+ cospi4m_v); \
+ out[2][3] = in2[3]; \
+ out[2][4] = in2[4]; \
+ STEP32(in2[5], in3[2], out[2][5], out[3][2], cospi20_v, cospi12_v); \
+ STEP32_1(in2[6], in3[1], out[2][6], out[3][1], cospi12_v, cospi20_v, \
+ cospi20m_v); \
+ out[2][7] = in2[7]; \
+ out[3][0] = in3[0]; \
+ out[3][3] = in3[3]; \
+ out[3][4] = in3[4]; \
+ \
+ /* stage 4 */ \
+ STEP16_1(out[0][0], out[0][1], in0[1], in0[0], cospi16_v); \
+ STEP8_0(out[0][2], out[0][3], in0[2], in0[3], cospi24_v, cospi8_v); \
+ in0[4] = vec_add(out[0][4], out[0][5]); \
+ in0[5] = vec_sub(out[0][4], out[0][5]); \
+ in0[6] = vec_sub(out[0][7], out[0][6]); \
+ in0[7] = vec_add(out[0][7], out[0][6]); \
+ \
+ in1[0] = out[1][0]; \
+ in1[7] = out[1][7]; \
+ STEP32(out[1][1], out[1][6], in1[1], in1[6], cospi8_v, cospi24_v); \
+ STEP32_1(out[1][2], out[1][5], in1[2], in1[5], cospi24_v, cospi8_v, \
+ cospi8m_v); \
+ in1[3] = out[1][3]; \
+ in1[4] = out[1][4]; \
+ \
+ in2[0] = vec_add(out[2][0], out[2][3]); \
+ in2[1] = vec_add(out[2][1], out[2][2]); \
+ in2[2] = vec_sub(out[2][1], out[2][2]); \
+ in2[3] = vec_sub(out[2][0], out[2][3]); \
+ in2[4] = vec_sub(out[2][7], out[2][4]); \
+ in2[5] = vec_sub(out[2][6], out[2][5]); \
+ in2[6] = vec_add(out[2][5], out[2][6]); \
+ in2[7] = vec_add(out[2][4], out[2][7]); \
+ \
+ in3[0] = vec_add(out[3][0], out[3][3]); \
+ in3[1] = vec_add(out[3][1], out[3][2]); \
+ in3[2] = vec_sub(out[3][1], out[3][2]); \
+ in3[3] = vec_sub(out[3][0], out[3][3]); \
+ in3[4] = vec_sub(out[3][7], out[3][4]); \
+ in3[5] = vec_sub(out[3][6], out[3][5]); \
+ in3[6] = vec_add(out[3][5], out[3][6]); \
+ in3[7] = vec_add(out[3][4], out[3][7]); \
+ \
+ /* stage 5 */ \
+ out[0][0] = vec_add(in0[0], in0[3]); \
+ out[0][1] = vec_add(in0[1], in0[2]); \
+ out[0][2] = vec_sub(in0[1], in0[2]); \
+ out[0][3] = vec_sub(in0[0], in0[3]); \
+ out[0][4] = in0[4]; \
+ STEP16_1(in0[6], in0[5], out[0][5], out[0][6], cospi16_v); \
+ out[0][7] = in0[7]; \
+ \
+ out[1][0] = vec_add(in1[0], in1[3]); \
+ out[1][1] = vec_add(in1[1], in1[2]); \
+ out[1][2] = vec_sub(in1[1], in1[2]); \
+ out[1][3] = vec_sub(in1[0], in1[3]); \
+ out[1][4] = vec_sub(in1[7], in1[4]); \
+ out[1][5] = vec_sub(in1[6], in1[5]); \
+ out[1][6] = vec_add(in1[5], in1[6]); \
+ out[1][7] = vec_add(in1[4], in1[7]); \
+ \
+ out[2][0] = in2[0]; \
+ out[2][1] = in2[1]; \
+ STEP32(in2[2], in3[5], out[2][2], out[3][5], cospi8_v, cospi24_v); \
+ STEP32(in2[3], in3[4], out[2][3], out[3][4], cospi8_v, cospi24_v); \
+ STEP32_1(in2[4], in3[3], out[2][4], out[3][3], cospi24_v, cospi8_v, \
+ cospi8m_v); \
+ STEP32_1(in2[5], in3[2], out[2][5], out[3][2], cospi24_v, cospi8_v, \
+ cospi8m_v); \
+ out[2][6] = in2[6]; \
+ out[2][7] = in2[7]; \
+ out[3][0] = in3[0]; \
+ out[3][1] = in3[1]; \
+ out[3][6] = in3[6]; \
+ out[3][7] = in3[7]; \
+ \
+ /* stage 6 */ \
+ in0[0] = vec_add(out[0][0], out[0][7]); \
+ in0[1] = vec_add(out[0][1], out[0][6]); \
+ in0[2] = vec_add(out[0][2], out[0][5]); \
+ in0[3] = vec_add(out[0][3], out[0][4]); \
+ in0[4] = vec_sub(out[0][3], out[0][4]); \
+ in0[5] = vec_sub(out[0][2], out[0][5]); \
+ in0[6] = vec_sub(out[0][1], out[0][6]); \
+ in0[7] = vec_sub(out[0][0], out[0][7]); \
+ in1[0] = out[1][0]; \
+ in1[1] = out[1][1]; \
+ STEP16_1(out[1][5], out[1][2], in1[2], in1[5], cospi16_v); \
+ STEP16_1(out[1][4], out[1][3], in1[3], in1[4], cospi16_v); \
+ in1[6] = out[1][6]; \
+ in1[7] = out[1][7]; \
+ \
+ in2[0] = vec_add(out[2][0], out[2][7]); \
+ in2[1] = vec_add(out[2][1], out[2][6]); \
+ in2[2] = vec_add(out[2][2], out[2][5]); \
+ in2[3] = vec_add(out[2][3], out[2][4]); \
+ in2[4] = vec_sub(out[2][3], out[2][4]); \
+ in2[5] = vec_sub(out[2][2], out[2][5]); \
+ in2[6] = vec_sub(out[2][1], out[2][6]); \
+ in2[7] = vec_sub(out[2][0], out[2][7]); \
+ \
+ in3[0] = vec_sub(out[3][7], out[3][0]); \
+ in3[1] = vec_sub(out[3][6], out[3][1]); \
+ in3[2] = vec_sub(out[3][5], out[3][2]); \
+ in3[3] = vec_sub(out[3][4], out[3][3]); \
+ in3[4] = vec_add(out[3][4], out[3][3]); \
+ in3[5] = vec_add(out[3][5], out[3][2]); \
+ in3[6] = vec_add(out[3][6], out[3][1]); \
+ in3[7] = vec_add(out[3][7], out[3][0]); \
+ \
+ /* stage 7 */ \
+ out[0][0] = vec_add(in0[0], in1[7]); \
+ out[0][1] = vec_add(in0[1], in1[6]); \
+ out[0][2] = vec_add(in0[2], in1[5]); \
+ out[0][3] = vec_add(in0[3], in1[4]); \
+ out[0][4] = vec_add(in0[4], in1[3]); \
+ out[0][5] = vec_add(in0[5], in1[2]); \
+ out[0][6] = vec_add(in0[6], in1[1]); \
+ out[0][7] = vec_add(in0[7], in1[0]); \
+ out[1][0] = vec_sub(in0[7], in1[0]); \
+ out[1][1] = vec_sub(in0[6], in1[1]); \
+ out[1][2] = vec_sub(in0[5], in1[2]); \
+ out[1][3] = vec_sub(in0[4], in1[3]); \
+ out[1][4] = vec_sub(in0[3], in1[4]); \
+ out[1][5] = vec_sub(in0[2], in1[5]); \
+ out[1][6] = vec_sub(in0[1], in1[6]); \
+ out[1][7] = vec_sub(in0[0], in1[7]); \
+ \
+ out[2][0] = in2[0]; \
+ out[2][1] = in2[1]; \
+ out[2][2] = in2[2]; \
+ out[2][3] = in2[3]; \
+ STEP16_1(in3[3], in2[4], out[2][4], out[3][3], cospi16_v); \
+ STEP16_1(in3[2], in2[5], out[2][5], out[3][2], cospi16_v); \
+ STEP16_1(in3[1], in2[6], out[2][6], out[3][1], cospi16_v); \
+ STEP16_1(in3[0], in2[7], out[2][7], out[3][0], cospi16_v); \
+ out[3][4] = in3[4]; \
+ out[3][5] = in3[5]; \
+ out[3][6] = in3[6]; \
+ out[3][7] = in3[7]; \
+ \
+ /* final */ \
+ in0[0] = vec_add(out[0][0], out[3][7]); \
+ in0[1] = vec_add(out[0][1], out[3][6]); \
+ in0[2] = vec_add(out[0][2], out[3][5]); \
+ in0[3] = vec_add(out[0][3], out[3][4]); \
+ in0[4] = vec_add(out[0][4], out[3][3]); \
+ in0[5] = vec_add(out[0][5], out[3][2]); \
+ in0[6] = vec_add(out[0][6], out[3][1]); \
+ in0[7] = vec_add(out[0][7], out[3][0]); \
+ in1[0] = vec_add(out[1][0], out[2][7]); \
+ in1[1] = vec_add(out[1][1], out[2][6]); \
+ in1[2] = vec_add(out[1][2], out[2][5]); \
+ in1[3] = vec_add(out[1][3], out[2][4]); \
+ in1[4] = vec_add(out[1][4], out[2][3]); \
+ in1[5] = vec_add(out[1][5], out[2][2]); \
+ in1[6] = vec_add(out[1][6], out[2][1]); \
+ in1[7] = vec_add(out[1][7], out[2][0]); \
+ in2[0] = vec_sub(out[1][7], out[2][0]); \
+ in2[1] = vec_sub(out[1][6], out[2][1]); \
+ in2[2] = vec_sub(out[1][5], out[2][2]); \
+ in2[3] = vec_sub(out[1][4], out[2][3]); \
+ in2[4] = vec_sub(out[1][3], out[2][4]); \
+ in2[5] = vec_sub(out[1][2], out[2][5]); \
+ in2[6] = vec_sub(out[1][1], out[2][6]); \
+ in2[7] = vec_sub(out[1][0], out[2][7]); \
+ in3[0] = vec_sub(out[0][7], out[3][0]); \
+ in3[1] = vec_sub(out[0][6], out[3][1]); \
+ in3[2] = vec_sub(out[0][5], out[3][2]); \
+ in3[3] = vec_sub(out[0][4], out[3][3]); \
+ in3[4] = vec_sub(out[0][3], out[3][4]); \
+ in3[5] = vec_sub(out[0][2], out[3][5]); \
+ in3[6] = vec_sub(out[0][1], out[3][6]); \
+ in3[7] = vec_sub(out[0][0], out[3][7]);
+
+// Not a full transpose: transposes each 8x8 block within a row of blocks,
+// but does not swap the block rows themselves.
+#define TRANSPOSE_8x32(in, out) \
+ /* transpose 4 of 8x8 blocks */ \
+ TRANSPOSE8x8(in[0][0], in[0][1], in[0][2], in[0][3], in[0][4], in[0][5], \
+ in[0][6], in[0][7], out[0][0], out[0][1], out[0][2], out[0][3], \
+ out[0][4], out[0][5], out[0][6], out[0][7]); \
+ TRANSPOSE8x8(in[1][0], in[1][1], in[1][2], in[1][3], in[1][4], in[1][5], \
+ in[1][6], in[1][7], out[1][0], out[1][1], out[1][2], out[1][3], \
+ out[1][4], out[1][5], out[1][6], out[1][7]); \
+ TRANSPOSE8x8(in[2][0], in[2][1], in[2][2], in[2][3], in[2][4], in[2][5], \
+ in[2][6], in[2][7], out[2][0], out[2][1], out[2][2], out[2][3], \
+ out[2][4], out[2][5], out[2][6], out[2][7]); \
+ TRANSPOSE8x8(in[3][0], in[3][1], in[3][2], in[3][3], in[3][4], in[3][5], \
+ in[3][6], in[3][7], out[3][0], out[3][1], out[3][2], out[3][3], \
+ out[3][4], out[3][5], out[3][6], out[3][7]);
+
+#define PIXEL_ADD_STORE32(in0, in1, in2, in3, step) \
+ dst = vec_vsx_ld((step)*stride, dest); \
+ d_uh = (int16x8_t)vec_mergeh(dst, zerov); \
+ d_ul = (int16x8_t)vec_mergel(dst, zerov); \
+ PIXEL_ADD(in0, d_uh, add, shift6); \
+ PIXEL_ADD(in1, d_ul, add, shift6); \
+ vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride, dest); \
+ dst = vec_vsx_ld((step)*stride + 16, dest); \
+ d_uh = (int16x8_t)vec_mergeh(dst, zerov); \
+ d_ul = (int16x8_t)vec_mergel(dst, zerov); \
+ PIXEL_ADD(in2, d_uh, add, shift6); \
+ PIXEL_ADD(in3, d_ul, add, shift6); \
+ vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride + 16, dest);
+
+#define ADD_STORE_BLOCK(in, offset) \
+ PIXEL_ADD_STORE32(in[0][0], in[1][0], in[2][0], in[3][0], offset + 0); \
+ PIXEL_ADD_STORE32(in[0][1], in[1][1], in[2][1], in[3][1], offset + 1); \
+ PIXEL_ADD_STORE32(in[0][2], in[1][2], in[2][2], in[3][2], offset + 2); \
+ PIXEL_ADD_STORE32(in[0][3], in[1][3], in[2][3], in[3][3], offset + 3); \
+ PIXEL_ADD_STORE32(in[0][4], in[1][4], in[2][4], in[3][4], offset + 4); \
+ PIXEL_ADD_STORE32(in[0][5], in[1][5], in[2][5], in[3][5], offset + 5); \
+ PIXEL_ADD_STORE32(in[0][6], in[1][6], in[2][6], in[3][6], offset + 6); \
+ PIXEL_ADD_STORE32(in[0][7], in[1][7], in[2][7], in[3][7], offset + 7);
+
+void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int16x8_t src0[4][8], src1[4][8], src2[4][8], src3[4][8], tmp[4][8];
+ int16x8_t tmp16_0, tmp16_1;
+ int32x4_t temp10, temp11, temp20, temp21, temp30;
+ uint8x16_t dst;
+ int16x8_t d_uh, d_ul;
+ int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2));
+ uint16x8_t shift6 = vec_splat_u16(6);
+ uint8x16_t zerov = vec_splat_u8(0);
+
+ ROUND_SHIFT_INIT;
+
+ LOAD_8x32(load_tran_low, src0[0][0], src0[1][0], src0[2][0], src0[3][0],
+ src0[0][1], src0[1][1], src0[2][1], src0[3][1], src0[0][2],
+ src0[1][2], src0[2][2], src0[3][2], src0[0][3], src0[1][3],
+ src0[2][3], src0[3][3], src0[0][4], src0[1][4], src0[2][4],
+ src0[3][4], src0[0][5], src0[1][5], src0[2][5], src0[3][5],
+ src0[0][6], src0[1][6], src0[2][6], src0[3][6], src0[0][7],
+ src0[1][7], src0[2][7], src0[3][7], 0);
+ // Rows
+ // transpose the first row of 8x8 blocks
+ TRANSPOSE_8x32(src0, tmp);
+  // run the 32-point transform on the transposed 32x8 strip
+ IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src0);
+ TRANSPOSE_8x32(tmp, src0);
+
+ LOAD_8x32(load_tran_low, src1[0][0], src1[1][0], src1[2][0], src1[3][0],
+ src1[0][1], src1[1][1], src1[2][1], src1[3][1], src1[0][2],
+ src1[1][2], src1[2][2], src1[3][2], src1[0][3], src1[1][3],
+ src1[2][3], src1[3][3], src1[0][4], src1[1][4], src1[2][4],
+ src1[3][4], src1[0][5], src1[1][5], src1[2][5], src1[3][5],
+ src1[0][6], src1[1][6], src1[2][6], src1[3][6], src1[0][7],
+ src1[1][7], src1[2][7], src1[3][7], 512);
+ TRANSPOSE_8x32(src1, tmp);
+ IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src1);
+ TRANSPOSE_8x32(tmp, src1);
+
+ LOAD_8x32(load_tran_low, src2[0][0], src2[1][0], src2[2][0], src2[3][0],
+ src2[0][1], src2[1][1], src2[2][1], src2[3][1], src2[0][2],
+ src2[1][2], src2[2][2], src2[3][2], src2[0][3], src2[1][3],
+ src2[2][3], src2[3][3], src2[0][4], src2[1][4], src2[2][4],
+ src2[3][4], src2[0][5], src2[1][5], src2[2][5], src2[3][5],
+ src2[0][6], src2[1][6], src2[2][6], src2[3][6], src2[0][7],
+ src2[1][7], src2[2][7], src2[3][7], 1024);
+ TRANSPOSE_8x32(src2, tmp);
+ IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src2);
+ TRANSPOSE_8x32(tmp, src2);
+
+ LOAD_8x32(load_tran_low, src3[0][0], src3[1][0], src3[2][0], src3[3][0],
+ src3[0][1], src3[1][1], src3[2][1], src3[3][1], src3[0][2],
+ src3[1][2], src3[2][2], src3[3][2], src3[0][3], src3[1][3],
+ src3[2][3], src3[3][3], src3[0][4], src3[1][4], src3[2][4],
+ src3[3][4], src3[0][5], src3[1][5], src3[2][5], src3[3][5],
+ src3[0][6], src3[1][6], src3[2][6], src3[3][6], src3[0][7],
+ src3[1][7], src3[2][7], src3[3][7], 1536);
+ TRANSPOSE_8x32(src3, tmp);
+ IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src3);
+ TRANSPOSE_8x32(tmp, src3);
+
+ // Columns
+ IDCT32(src0[0], src1[0], src2[0], src3[0], tmp);
+ IDCT32(src0[1], src1[1], src2[1], src3[1], tmp);
+ IDCT32(src0[2], src1[2], src2[2], src3[2], tmp);
+ IDCT32(src0[3], src1[3], src2[3], src3[3], tmp);
+
+ ADD_STORE_BLOCK(src0, 0);
+ ADD_STORE_BLOCK(src1, 8);
+ ADD_STORE_BLOCK(src2, 16);
+ ADD_STORE_BLOCK(src3, 24);
+}
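
Note: the function above is the standard separable 2-D inverse transform: each batch of eight rows is transposed, run through the 32-point IDCT, and transposed back (row pass); the column pass then applies IDCT32 to the assembled columns, and ADD_STORE_BLOCK rounds the result, adds it to the prediction, and saturates. With add = 8 << 2 = 32 and shift6 = 6, the per-pixel reconstruction is ((x + 32) >> 6) added to the destination. A minimal scalar sketch of that final step, assuming PIXEL_ADD follows this usual round-add-clamp pattern (helper names here are illustrative):

#include <stdint.h>

static uint8_t clip_pixel(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Scalar model of PIXEL_ADD/PIXEL_ADD_STORE32: round the IDCT output by
 * (x + 32) >> 6, add it to the predictor, and saturate to 8 bits. */
static void pixel_add_scalar(const int16_t *residual, uint8_t *dest, int n) {
  int i;
  for (i = 0; i < n; ++i)
    dest[i] = clip_pixel(dest[i] + ((residual[i] + 32) >> 6));
}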
diff --git a/libvpx/vpx_dsp/ppc/sad_vsx.c b/libvpx/vpx_dsp/ppc/sad_vsx.c
index 3edb40c31..bb49addae 100644
--- a/libvpx/vpx_dsp/ppc/sad_vsx.c
+++ b/libvpx/vpx_dsp/ppc/sad_vsx.c
@@ -10,9 +10,12 @@
#include <stdlib.h>
+#include "./vpx_dsp_rtcd.h"
+
#include "vpx_dsp/ppc/types_vsx.h"
#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
#define PROCESS16(offset) \
v_a = vec_vsx_ld(offset, a); \
@@ -100,3 +103,152 @@ SAD32(32);
SAD32(64);
SAD64(32);
SAD64(64);
+
+#define SAD16AVG(height) \
+ unsigned int vpx_sad16x##height##_avg_vsx( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[16 * height]); \
+ vpx_comp_avg_pred_vsx(comp_pred, second_pred, 16, height, ref, \
+ ref_stride); \
+ \
+ return vpx_sad16x##height##_vsx(src, src_stride, comp_pred, 16); \
+ }
+
+#define SAD32AVG(height) \
+ unsigned int vpx_sad32x##height##_avg_vsx( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ DECLARE_ALIGNED(32, uint8_t, comp_pred[32 * height]); \
+ vpx_comp_avg_pred_vsx(comp_pred, second_pred, 32, height, ref, \
+ ref_stride); \
+ \
+ return vpx_sad32x##height##_vsx(src, src_stride, comp_pred, 32); \
+ }
+
+#define SAD64AVG(height) \
+ unsigned int vpx_sad64x##height##_avg_vsx( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ DECLARE_ALIGNED(64, uint8_t, comp_pred[64 * height]); \
+ vpx_comp_avg_pred_vsx(comp_pred, second_pred, 64, height, ref, \
+ ref_stride); \
+ return vpx_sad64x##height##_vsx(src, src_stride, comp_pred, 64); \
+ }
+
+SAD16AVG(8);
+SAD16AVG(16);
+SAD16AVG(32);
+SAD32AVG(16);
+SAD32AVG(32);
+SAD32AVG(64);
+SAD64AVG(32);
+SAD64AVG(64);
+
+#define PROCESS16_4D(offset, ref, v_h, v_l) \
+ v_b = vec_vsx_ld(offset, ref); \
+ v_bh = unpack_to_s16_h(v_b); \
+ v_bl = unpack_to_s16_l(v_b); \
+ v_subh = vec_sub(v_h, v_bh); \
+ v_subl = vec_sub(v_l, v_bl); \
+ v_absh = vec_abs(v_subh); \
+ v_absl = vec_abs(v_subl); \
+ v_sad = vec_sum4s(v_absh, v_sad); \
+ v_sad = vec_sum4s(v_absl, v_sad);
+
+#define UNPACK_SRC(offset, srcv_h, srcv_l) \
+ v_a = vec_vsx_ld(offset, src); \
+ srcv_h = unpack_to_s16_h(v_a); \
+ srcv_l = unpack_to_s16_l(v_a);
+
+#define SAD16_4D(height) \
+ void vpx_sad16x##height##x4d_vsx(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ int i; \
+ int y; \
+ unsigned int sad[4]; \
+ uint8x16_t v_a, v_b; \
+ int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \
+ \
+ for (i = 0; i < 4; i++) sad_array[i] = 0; \
+ \
+ for (y = 0; y < height; y++) { \
+ UNPACK_SRC(y *src_stride, v_ah, v_al); \
+ for (i = 0; i < 4; i++) { \
+ int32x4_t v_sad = vec_splat_s32(0); \
+ PROCESS16_4D(y *ref_stride, ref_array[i], v_ah, v_al); \
+ \
+ vec_vsx_st((uint32x4_t)v_sad, 0, sad); \
+ sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \
+ } \
+ } \
+ }
+
+#define SAD32_4D(height) \
+ void vpx_sad32x##height##x4d_vsx(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ int i; \
+ int y; \
+ unsigned int sad[4]; \
+ uint8x16_t v_a, v_b; \
+ int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl; \
+ int16x8_t v_absh, v_absl, v_subh, v_subl; \
+ \
+ for (i = 0; i < 4; i++) sad_array[i] = 0; \
+ \
+ for (y = 0; y < height; y++) { \
+ UNPACK_SRC(y *src_stride, v_ah1, v_al1); \
+ UNPACK_SRC(y *src_stride + 16, v_ah2, v_al2); \
+ for (i = 0; i < 4; i++) { \
+ int32x4_t v_sad = vec_splat_s32(0); \
+ PROCESS16_4D(y *ref_stride, ref_array[i], v_ah1, v_al1); \
+ PROCESS16_4D(y *ref_stride + 16, ref_array[i], v_ah2, v_al2); \
+ \
+ vec_vsx_st((uint32x4_t)v_sad, 0, sad); \
+ sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \
+ } \
+ } \
+ }
+
+#define SAD64_4D(height) \
+ void vpx_sad64x##height##x4d_vsx(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ int i; \
+ int y; \
+ unsigned int sad[4]; \
+ uint8x16_t v_a, v_b; \
+ int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl; \
+ int16x8_t v_ah3, v_al3, v_ah4, v_al4; \
+ int16x8_t v_absh, v_absl, v_subh, v_subl; \
+ \
+ for (i = 0; i < 4; i++) sad_array[i] = 0; \
+ \
+ for (y = 0; y < height; y++) { \
+ UNPACK_SRC(y *src_stride, v_ah1, v_al1); \
+ UNPACK_SRC(y *src_stride + 16, v_ah2, v_al2); \
+ UNPACK_SRC(y *src_stride + 32, v_ah3, v_al3); \
+ UNPACK_SRC(y *src_stride + 48, v_ah4, v_al4); \
+ for (i = 0; i < 4; i++) { \
+ int32x4_t v_sad = vec_splat_s32(0); \
+ PROCESS16_4D(y *ref_stride, ref_array[i], v_ah1, v_al1); \
+ PROCESS16_4D(y *ref_stride + 16, ref_array[i], v_ah2, v_al2); \
+ PROCESS16_4D(y *ref_stride + 32, ref_array[i], v_ah3, v_al3); \
+ PROCESS16_4D(y *ref_stride + 48, ref_array[i], v_ah4, v_al4); \
+ \
+ vec_vsx_st((uint32x4_t)v_sad, 0, sad); \
+ sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \
+ } \
+ } \
+ }
+
+SAD16_4D(8);
+SAD16_4D(16);
+SAD16_4D(32);
+SAD32_4D(16);
+SAD32_4D(32);
+SAD32_4D(64);
+SAD64_4D(32);
+SAD64_4D(64);
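
Note: each x4d kernel computes the same sum of absolute differences as the single-reference version, but against four candidate references that share one stride, accumulating one total per reference in sad_array. A scalar reference model of that contract (illustrative only, not the VSX path):

#include <stdint.h>
#include <stdlib.h>

static void sad_wxh_x4d_model(const uint8_t *src, int src_stride,
                              const uint8_t *const ref_array[4],
                              int ref_stride, uint32_t *sad_array, int w,
                              int h) {
  int i, x, y;
  for (i = 0; i < 4; ++i) {
    uint32_t sad = 0;
    for (y = 0; y < h; ++y)
      for (x = 0; x < w; ++x)
        sad += abs(src[y * src_stride + x] - ref_array[i][y * ref_stride + x]);
    sad_array[i] = sad;
  }
}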
diff --git a/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c b/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c
index 55dcdc2ba..5c3ba4576 100644
--- a/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c
+++ b/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c
@@ -53,13 +53,13 @@ static inline void copy_w64(const uint8_t *src, ptrdiff_t src_stride,
void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int32_t filter_x_stride,
- const int16_t *filter_y, int32_t filter_y_stride,
- int32_t w, int32_t h) {
- (void)filter_x;
- (void)filter_y;
- (void)filter_x_stride;
- (void)filter_y_stride;
+ const InterpKernel *filter, int x0_q4, int x_step_q4,
+ int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) {
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
switch (w) {
case 16: {
@@ -132,14 +132,8 @@ static inline void avg_w64(const uint8_t *src, ptrdiff_t src_stride,
void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int32_t filter_x_stride,
- const int16_t *filter_y, int32_t filter_y_stride,
- int32_t w, int32_t h) {
- (void)filter_x;
- (void)filter_y;
- (void)filter_x_stride;
- (void)filter_y_stride;
-
+ const InterpKernel *filter, int x0_q4, int x_step_q4,
+ int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) {
switch (w) {
case 16: {
avg_w16(src, src_stride, dst, dst_stride, h);
@@ -154,8 +148,8 @@ void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
break;
}
default: {
- vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter_x,
- filter_x_stride, filter_y, filter_y_stride, w, h);
+ vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
}
@@ -299,9 +293,9 @@ static inline void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
static inline void convolve(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *const x_filters, int x0_q4,
- int x_step_q4, const InterpKernel *const y_filters,
- int y0_q4, int y_step_q4, int w, int h) {
+ const InterpKernel *const filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
// Note: Fixed size intermediate buffer, temp, places limits on parameters.
// 2d filtering proceeds in 2 steps:
// (1) Interpolate horizontally into an intermediate buffer, temp.
@@ -324,95 +318,77 @@ static inline void convolve(const uint8_t *src, ptrdiff_t src_stride,
assert(x_step_q4 <= 32);
convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
- x_filters, x0_q4, x_step_q4, w, intermediate_height);
- convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
- y_filters, y0_q4, y_step_q4, w, h);
+ filter, x0_q4, x_step_q4, w, intermediate_height);
+ convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,
+ y0_q4, y_step_q4, w, h);
}
void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- (void)filter_y;
+ (void)y0_q4;
(void)y_step_q4;
- convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
- w, h);
+ convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w,
+ h);
}
void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- (void)filter_y;
+ (void)y0_q4;
(void)y_step_q4;
- convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
- x_step_q4, w, h);
+ convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+ w, h);
}
void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- (void)filter_x;
+ (void)x0_q4;
(void)x_step_q4;
- convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
- w, h);
+ convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w,
+ h);
}
void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- (void)filter_x;
+ (void)x0_q4;
(void)x_step_q4;
- convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
- y_step_q4, w, h);
+ convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4,
+ w, h);
}
void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x,
- int x_step_q4, const int16_t *filter_y, int y_step_q4,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
- filters_y, y0_q4, y_step_q4, w, h);
+ convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4,
+ y_step_q4, w, h);
}
void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
+ const InterpKernel *filter, int x0_q4, int x_step_q4,
+ int y0_q4, int y_step_q4, int w, int h) {
// Fixed size intermediate buffer places limits on parameters.
DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
assert(w <= 64);
assert(h <= 64);
- vpx_convolve8_vsx(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y,
+ vpx_convolve8_vsx(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
y_step_q4, w, h);
- vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
+ vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
}
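
Note: the signature migration above moves the filter-table lookup out of each kernel. The old entry points derived the kernel table and initial phase from a raw int16_t filter pointer via get_filter_base()/get_filter_offset() (the removed lines); the new ones take the InterpKernel table plus x0_q4/y0_q4 directly. A sketch of how a legacy call site adapts, assuming filter_x and filter_y point into the same kernel table (which is what the single filter argument implies) and that the helpers live where shown:

#include <stddef.h>
#include <stdint.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_filter.h" /* assumed home of the helpers below */

static void call_new_api(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4, int w,
                         int h) {
  const InterpKernel *const filter = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filter);
  /* Assumes both directions index the same kernel table. */
  const int y0_q4 = get_filter_offset(filter_y, filter);

  vpx_convolve8_vsx(src, src_stride, dst, dst_stride, filter, x0_q4,
                    x_step_q4, y0_q4, y_step_q4, w, h);
}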
diff --git a/libvpx/vpx_dsp/quantize.c b/libvpx/vpx_dsp/quantize.c
index 3c7f9832f..e37ca92ad 100644
--- a/libvpx/vpx_dsp/quantize.c
+++ b/libvpx/vpx_dsp/quantize.c
@@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
+
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/quantize.h"
#include "vpx_mem/vpx_mem.h"
@@ -123,40 +125,40 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
(void)iscan;
+ (void)skip_block;
+ assert(!skip_block);
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
- if (!skip_block) {
- // Pre-scan pass
- for (i = (int)n_coeffs - 1; i >= 0; i--) {
- const int rc = scan[i];
- const int coeff = coeff_ptr[rc];
-
- if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
- non_zero_count--;
- else
- break;
- }
+ // Pre-scan pass
+ for (i = (int)n_coeffs - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+
+ if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+ non_zero_count--;
+ else
+ break;
+ }
- // Quantization pass: All coefficients with index >= zero_flag are
- // skippable. Note: zero_flag can be zero.
- for (i = 0; i < non_zero_count; i++) {
- const int rc = scan[i];
- const int coeff = coeff_ptr[rc];
- const int coeff_sign = (coeff >> 31);
- const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
- if (abs_coeff >= zbins[rc != 0]) {
- int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
- tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
- quant_shift_ptr[rc != 0]) >>
- 16; // quantization
- qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
- dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
-
- if (tmp) eob = i;
- }
+ // Quantization pass: All coefficients with index >= zero_flag are
+ // skippable. Note: zero_flag can be zero.
+ for (i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ if (abs_coeff >= zbins[rc != 0]) {
+ int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+ tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ quant_shift_ptr[rc != 0]) >>
+ 16; // quantization
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+ if (tmp) eob = i;
}
}
*eob_ptr = eob + 1;
@@ -174,39 +176,38 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
(void)iscan;
+ (void)skip_block;
+ assert(!skip_block);
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
- if (!skip_block) {
- // Pre-scan pass
- for (i = (int)n_coeffs - 1; i >= 0; i--) {
- const int rc = scan[i];
- const int coeff = coeff_ptr[rc];
-
- if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
- non_zero_count--;
- else
- break;
- }
+ // Pre-scan pass
+ for (i = (int)n_coeffs - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+
+ if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+ non_zero_count--;
+ else
+ break;
+ }
- // Quantization pass: All coefficients with index >= zero_flag are
- // skippable. Note: zero_flag can be zero.
- for (i = 0; i < non_zero_count; i++) {
- const int rc = scan[i];
- const int coeff = coeff_ptr[rc];
- const int coeff_sign = (coeff >> 31);
- const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
- if (abs_coeff >= zbins[rc != 0]) {
- const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
- const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
- const uint32_t abs_qcoeff =
- (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);
- qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
- dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
- if (abs_qcoeff) eob = i;
- }
+ // Quantization pass: All coefficients with index >= zero_flag are
+ // skippable. Note: zero_flag can be zero.
+ for (i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ if (abs_coeff >= zbins[rc != 0]) {
+ const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+ if (abs_qcoeff) eob = i;
}
}
*eob_ptr = eob + 1;
@@ -228,41 +229,40 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int idx_arr[1024];
int i, eob = -1;
(void)iscan;
+ (void)skip_block;
+ assert(!skip_block);
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
- if (!skip_block) {
- // Pre-scan pass
- for (i = 0; i < n_coeffs; i++) {
- const int rc = scan[i];
- const int coeff = coeff_ptr[rc];
-
- // If the coefficient is out of the base ZBIN range, keep it for
- // quantization.
- if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
- idx_arr[idx++] = i;
- }
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
- // Quantization pass: only process the coefficients selected in
- // pre-scan pass. Note: idx can be zero.
- for (i = 0; i < idx; i++) {
- const int rc = scan[idx_arr[i]];
- const int coeff = coeff_ptr[rc];
- const int coeff_sign = (coeff >> 31);
- int tmp;
- int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
- abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
- tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
- quant_shift_ptr[rc != 0]) >>
- 15;
+ // If the coefficient is out of the base ZBIN range, keep it for
+ // quantization.
+ if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i;
+ }
- qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
- dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = scan[idx_arr[i]];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ int tmp;
+ int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+ tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
+ quant_shift_ptr[rc != 0]) >>
+ 15;
- if (tmp) eob = idx_arr[i];
- }
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+
+ if (tmp) eob = idx_arr[i];
}
*eob_ptr = eob + 1;
}
@@ -282,38 +282,35 @@ void vpx_highbd_quantize_b_32x32_c(
int idx_arr[1024];
int i, eob = -1;
(void)iscan;
+ (void)skip_block;
+ assert(!skip_block);
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
- if (!skip_block) {
- // Pre-scan pass
- for (i = 0; i < n_coeffs; i++) {
- const int rc = scan[i];
- const int coeff = coeff_ptr[rc];
-
- // If the coefficient is out of the base ZBIN range, keep it for
- // quantization.
- if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
- idx_arr[idx++] = i;
- }
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
- // Quantization pass: only process the coefficients selected in
- // pre-scan pass. Note: idx can be zero.
- for (i = 0; i < idx; i++) {
- const int rc = scan[idx_arr[i]];
- const int coeff = coeff_ptr[rc];
- const int coeff_sign = (coeff >> 31);
- const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- const int64_t tmp1 =
- abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
- const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
- const uint32_t abs_qcoeff =
- (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
- qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
- dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
- if (abs_qcoeff) eob = idx_arr[i];
- }
+ // If the coefficient is out of the base ZBIN range, keep it for
+ // quantization.
+ if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i;
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = scan[idx_arr[i]];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+ if (abs_qcoeff) eob = idx_arr[i];
}
*eob_ptr = eob + 1;
}
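
Note: with skip_block now asserted to be false, the quantization arithmetic runs unconditionally. The 16-bit vpx_quantize_b_c path clamps |coeff| + round to int16, applies the two-stage fixed-point multiply tmp = ((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16, and restores the sign at the end. A scalar walk-through of that formula with illustrative (not codec-derived) parameter values:

#include <stdio.h>

int main(void) {
  const int coeff = -1234;            /* example transform coefficient */
  const int round = 8, quant = 20000, quant_shift = 32768, dequant = 40;
  const int coeff_sign = coeff >> 31; /* 0 for positive, -1 for negative */
  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
  int tmp = abs_coeff + round;        /* int16 clamp omitted for brevity */
  int qcoeff;
  tmp = ((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16;
  qcoeff = (tmp ^ coeff_sign) - coeff_sign;
  printf("qcoeff=%d dqcoeff=%d\n", qcoeff, qcoeff * dequant);
  return 0;
}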
diff --git a/libvpx/vpx_dsp/sad.c b/libvpx/vpx_dsp/sad.c
index 6ceb37e43..18b6dc6e0 100644
--- a/libvpx/vpx_dsp/sad.c
+++ b/libvpx/vpx_dsp/sad.c
@@ -70,8 +70,6 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
/* clang-format off */
// 64x64
sadMxN(64, 64)
-sadMxNxK(64, 64, 3)
-sadMxNxK(64, 64, 8)
sadMxNx4D(64, 64)
// 64x32
@@ -84,8 +82,6 @@ sadMxNx4D(32, 64)
// 32x32
sadMxN(32, 32)
-sadMxNxK(32, 32, 3)
-sadMxNxK(32, 32, 8)
sadMxNx4D(32, 32)
// 32x16
@@ -122,12 +118,10 @@ sadMxNx4D(8, 8)
// 8x4
sadMxN(8, 4)
-sadMxNxK(8, 4, 8)
sadMxNx4D(8, 4)
// 4x8
sadMxN(4, 8)
-sadMxNxK(4, 8, 8)
sadMxNx4D(4, 8)
// 4x4
@@ -183,17 +177,6 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
}
-#define highbd_sadMxNxK(m, n, k) \
- void vpx_highbd_sad##m##x##n##x##k##_c( \
- const uint8_t *src, int src_stride, const uint8_t *ref_array, \
- int ref_stride, uint32_t *sad_array) { \
- int i; \
- for (i = 0; i < k; ++i) { \
- sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, \
- &ref_array[i], ref_stride); \
- } \
- }
-
#define highbd_sadMxNx4D(m, n) \
void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
const uint8_t *const ref_array[], \
@@ -208,8 +191,6 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
/* clang-format off */
// 64x64
highbd_sadMxN(64, 64)
-highbd_sadMxNxK(64, 64, 3)
-highbd_sadMxNxK(64, 64, 8)
highbd_sadMxNx4D(64, 64)
// 64x32
@@ -222,8 +203,6 @@ highbd_sadMxNx4D(32, 64)
// 32x32
highbd_sadMxN(32, 32)
-highbd_sadMxNxK(32, 32, 3)
-highbd_sadMxNxK(32, 32, 8)
highbd_sadMxNx4D(32, 32)
// 32x16
@@ -236,42 +215,30 @@ highbd_sadMxNx4D(16, 32)
// 16x16
highbd_sadMxN(16, 16)
-highbd_sadMxNxK(16, 16, 3)
-highbd_sadMxNxK(16, 16, 8)
highbd_sadMxNx4D(16, 16)
// 16x8
highbd_sadMxN(16, 8)
-highbd_sadMxNxK(16, 8, 3)
-highbd_sadMxNxK(16, 8, 8)
highbd_sadMxNx4D(16, 8)
// 8x16
highbd_sadMxN(8, 16)
-highbd_sadMxNxK(8, 16, 3)
-highbd_sadMxNxK(8, 16, 8)
highbd_sadMxNx4D(8, 16)
// 8x8
highbd_sadMxN(8, 8)
-highbd_sadMxNxK(8, 8, 3)
-highbd_sadMxNxK(8, 8, 8)
highbd_sadMxNx4D(8, 8)
// 8x4
highbd_sadMxN(8, 4)
-highbd_sadMxNxK(8, 4, 8)
highbd_sadMxNx4D(8, 4)
// 4x8
highbd_sadMxN(4, 8)
-highbd_sadMxNxK(4, 8, 8)
highbd_sadMxNx4D(4, 8)
// 4x4
highbd_sadMxN(4, 4)
-highbd_sadMxNxK(4, 4, 3)
-highbd_sadMxNxK(4, 4, 8)
highbd_sadMxNx4D(4, 4)
/* clang-format on */
diff --git a/libvpx/vpx_dsp/skin_detection.c b/libvpx/vpx_dsp/skin_detection.c
new file mode 100644
index 000000000..bbbb6c3a1
--- /dev/null
+++ b/libvpx/vpx_dsp/skin_detection.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/skin_detection.h"
+
+#define MODEL_MODE 1
+
+// Fixed-point skin color model parameters.
+static const int skin_mean[5][2] = { { 7463, 9614 },
+ { 6400, 10240 },
+ { 7040, 10240 },
+ { 8320, 9280 },
+ { 6800, 9614 } };
+static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16
+static const int skin_threshold[6] = { 1570636, 1400000, 800000,
+ 800000, 800000, 800000 }; // q18
+// Thresholds on luminance.
+static const int y_low = 40;
+static const int y_high = 220;
+
+// Evaluates the Mahalanobis distance measure for the input CbCr values.
+static int vpx_evaluate_skin_color_difference(const int cb, const int cr,
+ const int idx) {
+ const int cb_q6 = cb << 6;
+ const int cr_q6 = cr << 6;
+ const int cb_diff_q12 =
+ (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]);
+ const int cbcr_diff_q12 =
+ (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]);
+ const int cr_diff_q12 =
+ (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]);
+ const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
+ const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
+ const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
+ const int skin_diff =
+ skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 +
+ skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2;
+ return skin_diff;
+}
+
+// Checks whether the input YCbCr values correspond to skin color.
+int vpx_skin_pixel(const int y, const int cb, const int cr, int motion) {
+ if (y < y_low || y > y_high) {
+ return 0;
+ } else if (MODEL_MODE == 0) {
+ return (vpx_evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]);
+ } else {
+ int i = 0;
+ // Exit on grey.
+ if (cb == 128 && cr == 128) return 0;
+ // Exit on very strong cb.
+ if (cb > 150 && cr < 110) return 0;
+ for (; i < 5; ++i) {
+ int skin_color_diff = vpx_evaluate_skin_color_difference(cb, cr, i);
+ if (skin_color_diff < skin_threshold[i + 1]) {
+ if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) {
+ return 0;
+ } else if (motion == 0 &&
+ skin_color_diff > (skin_threshold[i + 1] >> 1)) {
+ return 0;
+ } else {
+ return 1;
+ }
+ }
+      // Exit if the difference is much larger than the threshold.
+ if (skin_color_diff > (skin_threshold[i + 1] << 3)) {
+ return 0;
+ }
+ }
+ return 0;
+ }
+}
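
Note: vpx_evaluate_skin_color_difference() is a fixed-point Mahalanobis distance d = (x - m)^T S^-1 (x - m): Cb/Cr are promoted to q6, the squared differences land in q12, (v + (1 << 9)) >> 10 rounds them down to q2, and the q16 inverse-covariance weights give a q18 result compared against skin_threshold[]. A minimal usage sketch with illustrative sample values:

#include "vpx_dsp/skin_detection.h"

int classify_sample(void) {
  /* 8-bit YCbCr triple; the values here are illustrative. motion == 0
   * applies an extra, stricter rejection test inside vpx_skin_pixel(). */
  return vpx_skin_pixel(/*y=*/100, /*cb=*/120, /*cr=*/150, /*motion=*/1);
}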
diff --git a/libvpx/vpx_dsp/skin_detection.h b/libvpx/vpx_dsp/skin_detection.h
new file mode 100644
index 000000000..a2e99baf7
--- /dev/null
+++ b/libvpx/vpx_dsp/skin_detection.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_SKIN_DETECTION_H_
+#define VPX_DSP_SKIN_DETECTION_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int vpx_skin_pixel(const int y, const int cb, const int cr, int motion);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_DSP_SKIN_DETECTION_H_
diff --git a/libvpx/vpx_dsp/txfm_common.h b/libvpx/vpx_dsp/txfm_common.h
index fd27f928e..d01d7085a 100644
--- a/libvpx/vpx_dsp/txfm_common.h
+++ b/libvpx/vpx_dsp/txfm_common.h
@@ -25,42 +25,42 @@
// printf("static const int cospi_%d_64 = %.0f;\n", i,
// round(16384 * cos(i*M_PI/64)));
// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
-static const tran_high_t cospi_1_64 = 16364;
-static const tran_high_t cospi_2_64 = 16305;
-static const tran_high_t cospi_3_64 = 16207;
-static const tran_high_t cospi_4_64 = 16069;
-static const tran_high_t cospi_5_64 = 15893;
-static const tran_high_t cospi_6_64 = 15679;
-static const tran_high_t cospi_7_64 = 15426;
-static const tran_high_t cospi_8_64 = 15137;
-static const tran_high_t cospi_9_64 = 14811;
-static const tran_high_t cospi_10_64 = 14449;
-static const tran_high_t cospi_11_64 = 14053;
-static const tran_high_t cospi_12_64 = 13623;
-static const tran_high_t cospi_13_64 = 13160;
-static const tran_high_t cospi_14_64 = 12665;
-static const tran_high_t cospi_15_64 = 12140;
-static const tran_high_t cospi_16_64 = 11585;
-static const tran_high_t cospi_17_64 = 11003;
-static const tran_high_t cospi_18_64 = 10394;
-static const tran_high_t cospi_19_64 = 9760;
-static const tran_high_t cospi_20_64 = 9102;
-static const tran_high_t cospi_21_64 = 8423;
-static const tran_high_t cospi_22_64 = 7723;
-static const tran_high_t cospi_23_64 = 7005;
-static const tran_high_t cospi_24_64 = 6270;
-static const tran_high_t cospi_25_64 = 5520;
-static const tran_high_t cospi_26_64 = 4756;
-static const tran_high_t cospi_27_64 = 3981;
-static const tran_high_t cospi_28_64 = 3196;
-static const tran_high_t cospi_29_64 = 2404;
-static const tran_high_t cospi_30_64 = 1606;
-static const tran_high_t cospi_31_64 = 804;
+static const tran_coef_t cospi_1_64 = 16364;
+static const tran_coef_t cospi_2_64 = 16305;
+static const tran_coef_t cospi_3_64 = 16207;
+static const tran_coef_t cospi_4_64 = 16069;
+static const tran_coef_t cospi_5_64 = 15893;
+static const tran_coef_t cospi_6_64 = 15679;
+static const tran_coef_t cospi_7_64 = 15426;
+static const tran_coef_t cospi_8_64 = 15137;
+static const tran_coef_t cospi_9_64 = 14811;
+static const tran_coef_t cospi_10_64 = 14449;
+static const tran_coef_t cospi_11_64 = 14053;
+static const tran_coef_t cospi_12_64 = 13623;
+static const tran_coef_t cospi_13_64 = 13160;
+static const tran_coef_t cospi_14_64 = 12665;
+static const tran_coef_t cospi_15_64 = 12140;
+static const tran_coef_t cospi_16_64 = 11585;
+static const tran_coef_t cospi_17_64 = 11003;
+static const tran_coef_t cospi_18_64 = 10394;
+static const tran_coef_t cospi_19_64 = 9760;
+static const tran_coef_t cospi_20_64 = 9102;
+static const tran_coef_t cospi_21_64 = 8423;
+static const tran_coef_t cospi_22_64 = 7723;
+static const tran_coef_t cospi_23_64 = 7005;
+static const tran_coef_t cospi_24_64 = 6270;
+static const tran_coef_t cospi_25_64 = 5520;
+static const tran_coef_t cospi_26_64 = 4756;
+static const tran_coef_t cospi_27_64 = 3981;
+static const tran_coef_t cospi_28_64 = 3196;
+static const tran_coef_t cospi_29_64 = 2404;
+static const tran_coef_t cospi_30_64 = 1606;
+static const tran_coef_t cospi_31_64 = 804;
// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3
-static const tran_high_t sinpi_1_9 = 5283;
-static const tran_high_t sinpi_2_9 = 9929;
-static const tran_high_t sinpi_3_9 = 13377;
-static const tran_high_t sinpi_4_9 = 15212;
+static const tran_coef_t sinpi_1_9 = 5283;
+static const tran_coef_t sinpi_2_9 = 9929;
+static const tran_coef_t sinpi_3_9 = 13377;
+static const tran_coef_t sinpi_4_9 = 15212;
#endif // VPX_DSP_TXFM_COMMON_H_
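
Note: only the declared type changes here; the values still satisfy cospi_k_64 = round(16384 * cos(k * pi / 64)) from the generation comment at the top of the file (e.g. cospi_16_64 = round(16384 * cos(pi/4)) = 11585). A small harness to regenerate and eyeball the table, assuming a POSIX M_PI:

#include <math.h>
#include <stdio.h>

int main(void) {
  int k;
  for (k = 1; k < 32; ++k)
    printf("static const tran_coef_t cospi_%d_64 = %.0f;\n", k,
           round(16384 * cos(k * M_PI / 64)));
  return 0;
}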
diff --git a/libvpx/vpx_dsp/variance.c b/libvpx/vpx_dsp/variance.c
index b1744047a..93bd8f30d 100644
--- a/libvpx/vpx_dsp/variance.c
+++ b/libvpx/vpx_dsp/variance.c
@@ -8,8 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <assert.h>
-
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
@@ -166,7 +164,7 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
- vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
+ vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \
\
return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
}
@@ -226,9 +224,6 @@ MSE(8, 8)
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
int height, const uint8_t *ref, int ref_stride) {
int i, j;
- /* comp_pred and pred must be 16 byte aligned. */
- assert(((intptr_t)comp_pred & 0xf) == 0);
- assert(((intptr_t)pred & 0xf) == 0);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
@@ -468,8 +463,8 @@ static void highbd_var_filter_block2d_bil_second_pass(
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
- vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
+ vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
\
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
dst, dst_stride, sse); \
@@ -488,8 +483,8 @@ static void highbd_var_filter_block2d_bil_second_pass(
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
- vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
+ vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
\
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
dst, dst_stride, sse); \
@@ -508,8 +503,8 @@ static void highbd_var_filter_block2d_bil_second_pass(
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
- vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
+ vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
\
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
dst, dst_stride, sse); \
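
Note: the subpel-avg variance macros now call vpx_comp_avg_pred_c directly, and the 16-byte alignment asserts are gone from it. A minimal sketch of the averaging it performs, assuming the usual stride handling (comp_pred and pred packed at width, ref at ref_stride):

#include <stdint.h>

/* Each output pixel is the rounded mean of the two predictors,
 * i.e. ROUND_POWER_OF_TWO(pred + ref, 1). */
static void comp_avg_pred_model(uint8_t *comp_pred, const uint8_t *pred,
                                int width, int height, const uint8_t *ref,
                                int ref_stride) {
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j)
      comp_pred[j] = (uint8_t)((pred[j] + ref[j] + 1) >> 1);
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}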
diff --git a/libvpx/vpx_dsp/variance.h b/libvpx/vpx_dsp/variance.h
index 4c482551e..100573299 100644
--- a/libvpx/vpx_dsp/variance.h
+++ b/libvpx/vpx_dsp/variance.h
@@ -74,8 +74,6 @@ typedef struct vp9_variance_vtable {
vpx_variance_fn_t vf;
vpx_subpixvariance_fn_t svf;
vpx_subp_avg_variance_fn_t svaf;
- vpx_sad_multi_fn_t sdx3f;
- vpx_sad_multi_fn_t sdx8f;
vpx_sad_multi_d_fn_t sdx4df;
} vp9_variance_fn_ptr_t;
#endif // CONFIG_VP9
diff --git a/libvpx/vpx_dsp/vpx_convolve.c b/libvpx/vpx_dsp/vpx_convolve.c
index 02c5a955a..e55a963f9 100644
--- a/libvpx/vpx_dsp/vpx_convolve.c
+++ b/libvpx/vpx_dsp/vpx_convolve.c
@@ -113,135 +113,107 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
}
}
-static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const InterpKernel *const x_filters,
- int x0_q4, int x_step_q4,
- const InterpKernel *const y_filters, int y0_q4,
- int y_step_q4, int w, int h) {
- // Note: Fixed size intermediate buffer, temp, places limits on parameters.
- // 2d filtering proceeds in 2 steps:
- // (1) Interpolate horizontally into an intermediate buffer, temp.
- // (2) Interpolate temp vertically to derive the sub-pixel result.
- // Deriving the maximum number of rows in the temp buffer (135):
- // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
- // --Largest block size is 64x64 pixels.
- // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
- // original frame (in 1/16th pixel units).
- // --Must round-up because block may be located at sub-pixel position.
- // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
- // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
- uint8_t temp[64 * 135];
- const int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
- assert(w <= 64);
- assert(h <= 64);
- assert(y_step_q4 <= 32);
- assert(x_step_q4 <= 32);
-
- convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
- x_filters, x0_q4, x_step_q4, w, intermediate_height);
- convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
- y_filters, y0_q4, y_step_q4, w, h);
-}
-
void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- (void)filter_y;
+ const InterpKernel *filter, int x0_q4, int x_step_q4,
+ int y0_q4, int y_step_q4, int w, int h) {
+ (void)y0_q4;
(void)y_step_q4;
-
- convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
- w, h);
+ convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w,
+ h);
}
void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- (void)filter_y;
+ (void)y0_q4;
(void)y_step_q4;
-
- convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
- x_step_q4, w, h);
+ convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+ w, h);
}
void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- (void)filter_x;
+ const InterpKernel *filter, int x0_q4, int x_step_q4,
+ int y0_q4, int y_step_q4, int w, int h) {
+ (void)x0_q4;
(void)x_step_q4;
-
- convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
- w, h);
+ convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w,
+ h);
}
void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- (void)filter_x;
+ (void)x0_q4;
(void)x_step_q4;
-
- convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
- y_step_q4, w, h);
+ convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4,
+ w, h);
}
void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x,
- int x_step_q4, const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
- filters_y, y0_q4, y_step_q4, w, h);
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  // When called from the frame-scaling function, the smallest scaling factor
+  // is x1/4 ==> y_step_q4 = 64. Since w and h are at most 16 in that case,
+  // the temp buffer is still big enough.
+ uint8_t temp[64 * 135];
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+ assert(x_step_q4 <= 64);
+
+ convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+ filter, x0_q4, x_step_q4, w, intermediate_height);
+ convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,
+ y0_q4, y_step_q4, w, h);
}
void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x,
- int x_step_q4, const int16_t *filter_y, int y_step_q4,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
// Fixed size intermediate buffer places limits on parameters.
DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
assert(w <= 64);
assert(h <= 64);
- vpx_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y,
+ vpx_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
y_step_q4, w, h);
- vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
+ vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
}
void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x,
- int filter_x_stride, const int16_t *filter_y,
- int filter_y_stride, int w, int h) {
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
int r;
- (void)filter_x;
- (void)filter_x_stride;
- (void)filter_y;
- (void)filter_y_stride;
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
for (r = h; r > 0; --r) {
memcpy(dst, src, w);
@@ -251,15 +223,16 @@ void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
}
void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x,
- int filter_x_stride, const int16_t *filter_y,
- int filter_y_stride, int w, int h) {
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
int x, y;
- (void)filter_x;
- (void)filter_x_stride;
- (void)filter_y;
- (void)filter_y_stride;
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
for (y = 0; y < h; ++y) {
for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
@@ -269,53 +242,52 @@ void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
}
void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x,
- int x_step_q4, const int16_t *filter_y, int y_step_q4,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
}
void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x,
- int x_step_q4, const int16_t *filter_y, int y_step_q4,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
}
void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x,
- int x_step_q4, const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+ y0_q4, y_step_q4, w, h);
}
void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
- vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
}
void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
+ const InterpKernel *filter, int x0_q4, int x_step_q4,
+ int y0_q4, int y_step_q4, int w, int h) {
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
}
void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x,
- int x_step_q4, const int16_t *filter_y, int y_step_q4,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
}
#if CONFIG_VP9_HIGHBITDEPTH
@@ -417,9 +389,9 @@ static void highbd_convolve_avg_vert(const uint16_t *src, ptrdiff_t src_stride,
static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *const x_filters, int x0_q4,
- int x_step_q4, const InterpKernel *const y_filters,
- int y0_q4, int y_step_q4, int w, int h, int bd) {
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h, int bd) {
// Note: Fixed size intermediate buffer, temp, places limits on parameters.
// 2d filtering proceeds in 2 steps:
// (1) Interpolate horizontally into an intermediate buffer, temp.
@@ -442,113 +414,97 @@ static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride,
assert(x_step_q4 <= 32);
highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
- temp, 64, x_filters, x0_q4, x_step_q4, w,
+ temp, 64, filter, x0_q4, x_step_q4, w,
intermediate_height, bd);
highbd_convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
- y_filters, y0_q4, y_step_q4, w, h, bd);
+ filter, y0_q4, y_step_q4, w, h, bd);
}
void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h, int bd) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- (void)filter_y;
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ (void)y0_q4;
(void)y_step_q4;
- highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+ highbd_convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4,
x_step_q4, w, h, bd);
}
void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
int w, int h, int bd) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- (void)filter_y;
+ (void)y0_q4;
(void)y_step_q4;
- highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+ highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4,
x_step_q4, w, h, bd);
}
void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
int h, int bd) {
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- (void)filter_x;
+ (void)x0_q4;
(void)x_step_q4;
- highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
+ highbd_convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4,
y_step_q4, w, h, bd);
}
void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
int w, int h, int bd) {
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- (void)filter_x;
+ (void)x0_q4;
(void)x_step_q4;
- highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
+ highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4,
y_step_q4, w, h, bd);
}
void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
int h, int bd) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
- filters_y, y0_q4, y_step_q4, w, h, bd);
+ highbd_convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+ y0_q4, y_step_q4, w, h, bd);
}
void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
int h, int bd) {
// Fixed size intermediate buffer places limits on parameters.
DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
assert(w <= 64);
assert(h <= 64);
- vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h, bd);
- vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h,
+ vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4,
+ y0_q4, y_step_q4, w, h, bd);
+ vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h,
bd);
}
void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h, int bd) {
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h, int bd) {
int r;
- (void)filter_x;
- (void)filter_x_stride;
- (void)filter_y;
- (void)filter_y_stride;
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
(void)bd;
for (r = h; r > 0; --r) {
@@ -560,15 +516,16 @@ void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride,
void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h, int bd) {
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h, int bd) {
int x, y;
- (void)filter_x;
- (void)filter_x_stride;
- (void)filter_y;
- (void)filter_y_stride;
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
(void)bd;
for (y = 0; y < h; ++y) {
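
[Editor's note, not part of the patch: the convolve.c hunks above replace the
per-axis "const int16_t *filter_x / *filter_y" pointers with a single
"const InterpKernel *filter" plus explicit x0_q4/y0_q4 phase offsets, so the
get_filter_base()/get_filter_offset() lookups move out of every kernel and
into the caller. A minimal sketch of an adapted call site follows; the kernel
table, EIGHTTAP index, and subpel_* locals are assumptions borrowed from
VP9's filter code, not part of this diff.]

    /* Resolve the kernel table and initial phases once, then pass them
     * straight through; the callee no longer re-derives them. */
    const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
    const int x0_q4 = subpel_x;  /* initial horizontal phase, 1/16 pel */
    const int y0_q4 = subpel_y;  /* initial vertical phase, 1/16 pel */
    vpx_convolve8_c(src, src_stride, dst, dst_stride, kernel, x0_q4,
                    x_step_q4, y0_q4, y_step_q4, w, h);
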
diff --git a/libvpx/vpx_dsp/vpx_convolve.h b/libvpx/vpx_dsp/vpx_convolve.h
index 1aedd32bd..7979268a9 100644
--- a/libvpx/vpx_dsp/vpx_convolve.h
+++ b/libvpx/vpx_dsp/vpx_convolve.h
@@ -19,15 +19,15 @@ extern "C" {
typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
int h);
#if CONFIG_VP9_HIGHBITDEPTH
typedef void (*highbd_convolve_fn_t)(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
int w, int h, int bd);
#endif
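
[Editor's note, not part of the patch: both function-pointer typedefs now
carry the same argument list as the concrete kernels, so existing tables of
convolve_fn_t entries keep working once their members are updated. A short
sketch, with "predict" and the argument locals as illustrative names:]

    /* Any conforming kernel, C or SIMD, can sit behind the typedef. */
    convolve_fn_t predict = vpx_convolve8_horiz_c;
    predict(src, src_stride, dst, dst_stride, kernel, x0_q4, x_step_q4,
            y0_q4, y_step_q4, w, h);
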
diff --git a/libvpx/vpx_dsp/vpx_dsp.mk b/libvpx/vpx_dsp/vpx_dsp.mk
index 6ac7182ab..3b1a873cd 100644
--- a/libvpx/vpx_dsp/vpx_dsp.mk
+++ b/libvpx/vpx_dsp/vpx_dsp.mk
@@ -50,12 +50,13 @@ DSP_SRCS-yes += intrapred.c
DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
-DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm
DSP_SRCS-$(HAVE_VSX) += ppc/intrapred_vsx.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_intrin_sse2.c
+DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_intrin_ssse3.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_intrapred_neon.c
endif # CONFIG_VP9_HIGHBITDEPTH
@@ -87,6 +88,8 @@ DSP_SRCS-yes += vpx_filter.h
DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h
DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c
+DSP_SRCS-$(HAVE_SSSE3) += x86/convolve_ssse3.h
+DSP_SRCS-$(HAVE_AVX2) += x86/convolve_avx2.h
DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_8t_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_bilinear_sse2.asm
DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm
@@ -104,6 +107,7 @@ DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_neon.c
endif
DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm
+DSP_SRCS-$(HAVE_NEON) += arm/vpx_scaled_convolve8_neon.c
ifeq ($(HAVE_NEON_ASM),yes)
DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM)
@@ -194,6 +198,9 @@ endif
DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h
DSP_SRCS-$(HAVE_NEON) += arm/fdct_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/fdct16x16_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/fdct32x32_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/fdct_partial_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c
@@ -207,10 +214,13 @@ DSP_SRCS-yes += inv_txfm.c
DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.h
DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c
DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM)
+DSP_SRCS-$(HAVE_VSX) += ppc/inv_txfm_vsx.c
+
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c
@@ -237,6 +247,11 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct4x4_add_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct8x8_add_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct16x16_add_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct32x32_add_sse2.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_inv_txfm_sse4.h
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct4x4_add_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct8x8_add_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct16x16_add_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct32x32_add_sse4.c
endif # !CONFIG_VP9_HIGHBITDEPTH
ifeq ($(HAVE_NEON_ASM),yes)
@@ -264,18 +279,19 @@ ifeq ($(CONFIG_VP9_ENCODER),yes)
DSP_SRCS-yes += quantize.c
DSP_SRCS-yes += quantize.h
+DSP_SRCS-$(HAVE_SSE2) += x86/quantize_x86.h
DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c
+DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c
+DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c
+DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c
endif
-ifeq ($(ARCH_X86_64),yes)
-DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm
-DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx_x86_64.asm
-endif
# avg
DSP_SRCS-yes += avg.c
DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/avg_intrin_avx2.c
DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c
@@ -286,6 +302,10 @@ DSP_SRCS-$(HAVE_VSX) += ppc/hadamard_vsx.c
endif # CONFIG_VP9_ENCODER
+# skin detection
+DSP_SRCS-yes += skin_detection.h
+DSP_SRCS-yes += skin_detection.c
+
ifeq ($(CONFIG_ENCODERS),yes)
DSP_SRCS-yes += sad.c
DSP_SRCS-yes += subtract.c
@@ -300,11 +320,15 @@ DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c
+DSP_SRCS-$(HAVE_MMI) += mips/sad_mmi.c
+DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c
+
DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm
DSP_SRCS-$(HAVE_SSSE3) += x86/sad_ssse3.asm
DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
+DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c
DSP_SRCS-$(HAVE_SSE) += x86/sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE) += x86/sad_sse2.asm
@@ -325,17 +349,19 @@ ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)
DSP_SRCS-yes += variance.c
DSP_SRCS-yes += variance.h
+DSP_SRCS-$(HAVE_NEON) += arm/avg_pred_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c
+DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c
+
DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3
DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c
-DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c
DSP_SRCS-$(HAVE_VSX) += ppc/variance_vsx.c
ifeq ($(ARCH_X86_64),yes)
@@ -354,7 +380,9 @@ endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
# Neon utilities
DSP_SRCS-$(HAVE_NEON) += arm/mem_neon.h
+DSP_SRCS-$(HAVE_NEON) += arm/sum_neon.h
DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h
+DSP_SRCS-$(HAVE_NEON) += arm/vpx_convolve8_neon.h
# PPC VSX utilities
DSP_SRCS-$(HAVE_VSX) += ppc/types_vsx.h
@@ -362,6 +390,7 @@ DSP_SRCS-$(HAVE_VSX) += ppc/transpose_vsx.h
DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h
# X86 utilities
+DSP_SRCS-$(HAVE_SSE2) += x86/mem_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h
DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
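
[Editor's note, not part of the patch: each DSP_SRCS-$(HAVE_FOO) guard in
this makefile expands to DSP_SRCS-yes or DSP_SRCS-no according to the
configured feature flags, and only the "yes" bucket feeds the build, so the
NEON, SSE4_1, AVX2, VSX and MMI sources added here are compiled only on
targets that advertise those extensions.]
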
diff --git a/libvpx/vpx_dsp/vpx_dsp_common.h b/libvpx/vpx_dsp/vpx_dsp_common.h
index 49d36e545..c8c852374 100644
--- a/libvpx/vpx_dsp/vpx_dsp_common.h
+++ b/libvpx/vpx_dsp/vpx_dsp_common.h
@@ -43,6 +43,8 @@ typedef int32_t tran_high_t;
typedef int16_t tran_low_t;
#endif // CONFIG_VP9_HIGHBITDEPTH
+typedef int16_t tran_coef_t;
+
static INLINE uint8_t clip_pixel(int val) {
return (val > 255) ? 255 : (val < 0) ? 0 : val;
}
@@ -55,7 +57,6 @@ static INLINE double fclamp(double value, double low, double high) {
return value < low ? low : (value > high ? high : value);
}
-#if CONFIG_VP9_HIGHBITDEPTH
static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
switch (bd) {
case 8:
@@ -64,7 +65,6 @@ static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
case 12: return (uint16_t)clamp(val, 0, 4095);
}
}
-#endif // CONFIG_VP9_HIGHBITDEPTH
#ifdef __cplusplus
} // extern "C"
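
[Editor's note, not part of the patch: dropping the CONFIG_VP9_HIGHBITDEPTH
guard makes clip_pixel_highbd() available unconditionally, and tran_coef_t
names a 16-bit type for transform constants. An illustrative fragment of the
clamp behaviour, derived from the visible "case 12" (assert.h assumed):]

    #include <assert.h>
    /* bd == 12 clamps to [0, 4095]; negative inputs clamp to 0. */
    assert(clip_pixel_highbd(70000, 12) == 4095);
    assert(clip_pixel_highbd(-1, 12) == 0);
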
diff --git a/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
index c67483641..1a743d910 100644
--- a/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1,3 +1,13 @@
+##
+## Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
sub vpx_dsp_forward_decls() {
print <<EOF
/*
@@ -6,6 +16,7 @@ print <<EOF
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
EOF
}
@@ -19,6 +30,7 @@ if ($opts{arch} eq "x86_64") {
$ssse3_x86_64 = 'ssse3';
$avx_x86_64 = 'avx';
$avx2_x86_64 = 'avx2';
+ $avx512_x86_64 = 'avx512';
}
#
@@ -188,21 +200,25 @@ specialize qw/vpx_dc_128_predictor_32x32 msa neon sse2 vsx/;
# High bitdepth functions
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d207_predictor_4x4 sse2/;
add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_d45_predictor_4x4 neon/;
+ specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/;
add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d63_predictor_4x4 sse2/;
add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_h_predictor_4x4 neon/;
+ specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/;
add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d117_predictor_4x4 sse2/;
add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_d135_predictor_4x4 neon/;
+ specialize qw/vpx_highbd_d135_predictor_4x4 neon sse2/;
add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d153_predictor_4x4 sse2/;
add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/;
@@ -214,30 +230,34 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_highbd_dc_predictor_4x4 neon sse2/;
add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_dc_top_predictor_4x4 neon/;
+ specialize qw/vpx_highbd_dc_top_predictor_4x4 neon sse2/;
add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_dc_left_predictor_4x4 neon/;
+ specialize qw/vpx_highbd_dc_left_predictor_4x4 neon sse2/;
add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_dc_128_predictor_4x4 neon/;
+ specialize qw/vpx_highbd_dc_128_predictor_4x4 neon sse2/;
add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d207_predictor_8x8 ssse3/;
add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_d45_predictor_8x8 neon/;
+ specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/;
add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d63_predictor_8x8 ssse3/;
add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_h_predictor_8x8 neon/;
+ specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/;
add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d117_predictor_8x8 ssse3/;
add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_d135_predictor_8x8 neon/;
+ specialize qw/vpx_highbd_d135_predictor_8x8 neon ssse3/;
add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d153_predictor_8x8 ssse3/;
add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/;
@@ -249,30 +269,34 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_highbd_dc_predictor_8x8 neon sse2/;
add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_dc_top_predictor_8x8 neon/;
+ specialize qw/vpx_highbd_dc_top_predictor_8x8 neon sse2/;
add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_dc_left_predictor_8x8 neon/;
+ specialize qw/vpx_highbd_dc_left_predictor_8x8 neon sse2/;
add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_dc_128_predictor_8x8 neon/;
+ specialize qw/vpx_highbd_dc_128_predictor_8x8 neon sse2/;
add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d207_predictor_16x16 ssse3/;
add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_d45_predictor_16x16 neon/;
+ specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/;
add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d63_predictor_16x16 ssse3/;
add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_h_predictor_16x16 neon/;
+ specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/;
add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d117_predictor_16x16 ssse3/;
add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_d135_predictor_16x16 neon/;
+ specialize qw/vpx_highbd_d135_predictor_16x16 neon ssse3/;
add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d153_predictor_16x16 ssse3/;
add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/;
@@ -284,30 +308,34 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_highbd_dc_predictor_16x16 neon sse2/;
add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_dc_top_predictor_16x16 neon/;
+ specialize qw/vpx_highbd_dc_top_predictor_16x16 neon sse2/;
add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_dc_left_predictor_16x16 neon/;
+ specialize qw/vpx_highbd_dc_left_predictor_16x16 neon sse2/;
add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_dc_128_predictor_16x16 neon/;
+ specialize qw/vpx_highbd_dc_128_predictor_16x16 neon sse2/;
add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d207_predictor_32x32 ssse3/;
add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_d45_predictor_32x32 neon/;
+ specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/;
add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d63_predictor_32x32 ssse3/;
add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_h_predictor_32x32 neon/;
+ specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/;
add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d117_predictor_32x32 ssse3/;
add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_d135_predictor_32x32 neon/;
+ specialize qw/vpx_highbd_d135_predictor_32x32 neon ssse3/;
add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d153_predictor_32x32 ssse3/;
add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/;
@@ -319,81 +347,81 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_highbd_dc_predictor_32x32 neon sse2/;
add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_dc_top_predictor_32x32 neon/;
+ specialize qw/vpx_highbd_dc_top_predictor_32x32 neon sse2/;
add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_dc_left_predictor_32x32 neon/;
+ specialize qw/vpx_highbd_dc_left_predictor_32x32 neon sse2/;
add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_dc_128_predictor_32x32 neon/;
+ specialize qw/vpx_highbd_dc_128_predictor_32x32 neon sse2/;
} # CONFIG_VP9_HIGHBITDEPTH
#
# Sub Pixel Filters
#
-add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx/;
-add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx/;
-add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx/;
-add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx/;
-add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx/;
-add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa vsx/;
+add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx/;
-add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa vsx/;
+add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx/;
-add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa vsx/;
+add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx/;
-add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_2d ssse3/;
+add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_scaled_2d ssse3 neon msa/;
-add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
# Sub Pixel Filters
#
- add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
specialize qw/vpx_highbd_convolve_copy sse2 avx2 neon/;
- add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
specialize qw/vpx_highbd_convolve_avg sse2 avx2 neon/;
- add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
specialize qw/vpx_highbd_convolve8 avx2 neon/, "$sse2_x86_64";
- add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
specialize qw/vpx_highbd_convolve8_horiz avx2 neon/, "$sse2_x86_64";
- add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
specialize qw/vpx_highbd_convolve8_vert avx2 neon/, "$sse2_x86_64";
- add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
specialize qw/vpx_highbd_convolve8_avg avx2 neon/, "$sse2_x86_64";
- add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon/, "$sse2_x86_64";
- add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon/, "$sse2_x86_64";
} # CONFIG_VP9_HIGHBITDEPTH
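
[Editor's note, not part of the patch: each add_proto/specialize pair above
is what regenerates the per-target config/*/vpx_dsp_rtcd.h headers touched by
this change; at run time the dispatcher keeps the C kernel as the fallback
and upgrades the pointer per detected CPU feature. A simplified sketch of the
generated pattern (NEON and AVX2 never share one header in practice):]

    /* Generated-dispatch sketch: default to C, then upgrade by CPU flags
     * inside the setup routine emitted from these definitions. */
    vpx_highbd_convolve8 = vpx_highbd_convolve8_c;
    if (flags & HAS_NEON) vpx_highbd_convolve8 = vpx_highbd_convolve8_neon;
    if (flags & HAS_AVX2) vpx_highbd_convolve8 = vpx_highbd_convolve8_avx2;
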
@@ -487,28 +515,28 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_fdct4x4 neon sse2/;
add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct4x4_1 sse2/;
+ specialize qw/vpx_fdct4x4_1 sse2 neon/;
add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct8x8 neon sse2/;
add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct8x8_1 neon sse2/;
+ specialize qw/vpx_fdct8x8_1 neon sse2 msa/;
add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct16x16 sse2/;
+ specialize qw/vpx_fdct16x16 neon sse2/;
add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct16x16_1 sse2/;
+ specialize qw/vpx_fdct16x16_1 sse2 neon/;
add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct32x32 sse2/;
+ specialize qw/vpx_fdct32x32 neon sse2/;
add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct32x32_rd sse2/;
+ specialize qw/vpx_fdct32x32_rd neon sse2/;
add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct32x32_1 sse2/;
+ specialize qw/vpx_fdct32x32_1 sse2 neon/;
add_proto qw/void vpx_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_highbd_fdct4x4 sse2/;
@@ -517,6 +545,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_highbd_fdct8x8 sse2/;
add_proto qw/void vpx_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_highbd_fdct8x8_1 neon/;
+ $vpx_highbd_fdct8x8_1_neon=vpx_fdct8x8_1_neon;
add_proto qw/void vpx_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_highbd_fdct16x16 sse2/;
@@ -535,7 +565,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_fdct4x4 neon sse2 msa/;
add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct4x4_1 sse2/;
+ specialize qw/vpx_fdct4x4_1 sse2 neon/;
add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";
@@ -544,19 +574,19 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_fdct8x8_1 sse2 neon msa/;
add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct16x16 sse2 msa/;
+ specialize qw/vpx_fdct16x16 neon sse2 msa/;
add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct16x16_1 sse2 msa/;
+ specialize qw/vpx_fdct16x16_1 sse2 neon msa/;
add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct32x32 sse2 avx2 msa/;
+ specialize qw/vpx_fdct32x32 neon sse2 avx2 msa/;
add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct32x32_rd sse2 avx2 msa/;
+ specialize qw/vpx_fdct32x32_rd sse2 avx2 neon msa/;
add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct32x32_1 sse2 msa/;
+ specialize qw/vpx_fdct32x32_1 sse2 neon msa/;
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_VP9_ENCODER
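
[Editor's note, not part of the patch: the "neon" entries gained in this hunk
correspond to the arm/fdct16x16_neon.c, arm/fdct32x32_neon.c and
arm/fdct_partial_neon.c sources added to vpx_dsp.mk earlier in the diff, so
the 16x16 and 32x32 forward transforms now have ARM paths alongside the
existing SSE2/AVX2/MSA ones.]
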
@@ -581,25 +611,24 @@ add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest,
add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
- # Note that there are more specializations appended when CONFIG_VP9_HIGHBITDEPTH is off.
- specialize qw/vpx_idct4x4_16_add neon sse2/;
+ # Note that there are more specializations appended when
+ # CONFIG_VP9_HIGHBITDEPTH is off.
+ specialize qw/vpx_idct4x4_16_add neon sse2 vsx/;
specialize qw/vpx_idct4x4_1_add neon sse2/;
- specialize qw/vpx_idct8x8_64_add neon sse2 ssse3/;
+ specialize qw/vpx_idct8x8_64_add neon sse2 vsx/;
specialize qw/vpx_idct8x8_12_add neon sse2 ssse3/;
specialize qw/vpx_idct8x8_1_add neon sse2/;
- specialize qw/vpx_idct16x16_256_add neon sse2/;
+ specialize qw/vpx_idct16x16_256_add neon sse2 vsx/;
specialize qw/vpx_idct16x16_38_add neon sse2/;
- $vpx_idct16x16_38_add_sse2=vpx_idct16x16_256_add_sse2;
specialize qw/vpx_idct16x16_10_add neon sse2/;
specialize qw/vpx_idct16x16_1_add neon sse2/;
- specialize qw/vpx_idct32x32_1024_add neon sse2 ssse3/;
+ specialize qw/vpx_idct32x32_1024_add neon sse2 vsx/;
specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/;
- $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/;
specialize qw/vpx_idct32x32_1_add neon sse2/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
- # Note that these specializations appends to the above ones.
+ # Note that these specializations are appended to the above ones.
specialize qw/vpx_idct4x4_16_add dspr2 msa/;
specialize qw/vpx_idct4x4_1_add dspr2 msa/;
specialize qw/vpx_idct8x8_64_add dspr2 msa/;
@@ -652,16 +681,15 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
- specialize qw/vpx_highbd_idct4x4_16_add neon sse2/;
- specialize qw/vpx_highbd_idct8x8_64_add neon sse2/;
- specialize qw/vpx_highbd_idct8x8_12_add neon sse2/;
- specialize qw/vpx_highbd_idct16x16_256_add neon sse2/;
- specialize qw/vpx_highbd_idct16x16_38_add neon sse2/;
- $vpx_highbd_idct16x16_38_add_sse2=vpx_highbd_idct16x16_256_add_sse2;
- specialize qw/vpx_highbd_idct16x16_10_add neon sse2/;
- specialize qw/vpx_highbd_idct32x32_1024_add neon/;
- specialize qw/vpx_highbd_idct32x32_135_add neon/;
- specialize qw/vpx_highbd_idct32x32_34_add neon/;
+ specialize qw/vpx_highbd_idct4x4_16_add neon sse2 sse4_1/;
+ specialize qw/vpx_highbd_idct8x8_64_add neon sse2 sse4_1/;
+ specialize qw/vpx_highbd_idct8x8_12_add neon sse2 sse4_1/;
+ specialize qw/vpx_highbd_idct16x16_256_add neon sse2 sse4_1/;
+ specialize qw/vpx_highbd_idct16x16_38_add neon sse2 sse4_1/;
+ specialize qw/vpx_highbd_idct16x16_10_add neon sse2 sse4_1/;
+ specialize qw/vpx_highbd_idct32x32_1024_add neon sse2 sse4_1/;
+ specialize qw/vpx_highbd_idct32x32_135_add neon sse2 sse4_1/;
+ specialize qw/vpx_highbd_idct32x32_34_add neon sse2 sse4_1/;
} # !CONFIG_EMULATE_HARDWARE
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_VP9
@@ -671,10 +699,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64";
+ specialize qw/vpx_quantize_b neon sse2 ssse3 avx/;
add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64";
+ specialize qw/vpx_quantize_b_32x32 neon ssse3 avx/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
@@ -690,49 +718,49 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes") {
# Block subtraction
#
add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/vpx_subtract_block neon msa sse2/;
+specialize qw/vpx_subtract_block neon msa mmi sse2/;
#
# Single block SAD
#
add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad64x64 avx2 neon msa sse2 vsx/;
+specialize qw/vpx_sad64x64 neon avx2 msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad64x32 avx2 msa sse2 vsx/;
+specialize qw/vpx_sad64x32 neon avx2 msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x64 avx2 msa sse2 vsx/;
+specialize qw/vpx_sad32x64 neon avx2 msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x32 avx2 neon msa sse2 vsx/;
+specialize qw/vpx_sad32x32 neon avx2 msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x16 avx2 msa sse2 vsx/;
+specialize qw/vpx_sad32x16 neon avx2 msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x32 msa sse2 vsx/;
+specialize qw/vpx_sad16x32 neon msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x16 neon msa sse2 vsx/;
+specialize qw/vpx_sad16x16 neon msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x8 neon msa sse2 vsx/;
+specialize qw/vpx_sad16x8 neon msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x16 neon msa sse2/;
+specialize qw/vpx_sad8x16 neon msa sse2 mmi/;
add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x8 neon msa sse2/;
+specialize qw/vpx_sad8x8 neon msa sse2 mmi/;
add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x4 msa sse2/;
+specialize qw/vpx_sad8x4 neon msa sse2 mmi/;
add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad4x8 msa sse2/;
+specialize qw/vpx_sad4x8 neon msa sse2 mmi/;
add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad4x4 neon msa sse2/;
+specialize qw/vpx_sad4x4 neon msa sse2 mmi/;
#
# Avg
@@ -748,23 +776,23 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
specialize qw/vpx_minmax_8x8 sse2 neon msa/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
- add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
+ add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
specialize qw/vpx_hadamard_8x8 sse2 neon vsx/, "$ssse3_x86_64";
- add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
- specialize qw/vpx_hadamard_16x16 sse2 neon vsx/;
+ add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+ specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/;
add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
- specialize qw/vpx_satd sse2 neon/;
+ specialize qw/vpx_satd avx2 sse2 neon/;
} else {
- add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
+ add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64";
- add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
- specialize qw/vpx_hadamard_16x16 sse2 neon msa vsx/;
+ add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
+ specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/;
add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
- specialize qw/vpx_satd sse2 neon msa/;
+ specialize qw/vpx_satd avx2 sse2 neon msa/;
}
add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
@@ -778,138 +806,120 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
} # CONFIG_VP9_ENCODER
add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad64x64_avg avx2 msa sse2/;
+specialize qw/vpx_sad64x64_avg neon avx2 msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad64x32_avg avx2 msa sse2/;
+specialize qw/vpx_sad64x32_avg neon avx2 msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad32x64_avg avx2 msa sse2/;
+specialize qw/vpx_sad32x64_avg neon avx2 msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad32x32_avg avx2 msa sse2/;
+specialize qw/vpx_sad32x32_avg neon avx2 msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad32x16_avg avx2 msa sse2/;
+specialize qw/vpx_sad32x16_avg neon avx2 msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x32_avg msa sse2/;
+specialize qw/vpx_sad16x32_avg neon msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x16_avg msa sse2/;
+specialize qw/vpx_sad16x16_avg neon msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x8_avg msa sse2/;
+specialize qw/vpx_sad16x8_avg neon msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad8x16_avg msa sse2/;
+specialize qw/vpx_sad8x16_avg neon msa sse2 mmi/;
add_proto qw/unsigned int vpx_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad8x8_avg msa sse2/;
+specialize qw/vpx_sad8x8_avg neon msa sse2 mmi/;
add_proto qw/unsigned int vpx_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad8x4_avg msa sse2/;
+specialize qw/vpx_sad8x4_avg neon msa sse2 mmi/;
add_proto qw/unsigned int vpx_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad4x8_avg msa sse2/;
+specialize qw/vpx_sad4x8_avg neon msa sse2 mmi/;
add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad4x4_avg msa sse2/;
+specialize qw/vpx_sad4x4_avg neon msa sse2 mmi/;
#
# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
#
# Blocks of 3
-add_proto qw/void vpx_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x64x3 msa/;
-
-add_proto qw/void vpx_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x32x3 msa/;
-
add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x16x3 sse3 ssse3 msa/;
+specialize qw/vpx_sad16x16x3 sse3 ssse3 msa mmi/;
add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x8x3 sse3 ssse3 msa/;
+specialize qw/vpx_sad16x8x3 sse3 ssse3 msa mmi/;
add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x16x3 sse3 msa/;
+specialize qw/vpx_sad8x16x3 sse3 msa mmi/;
add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x3 sse3 msa/;
+specialize qw/vpx_sad8x8x3 sse3 msa mmi/;
add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x3 sse3 msa/;
+specialize qw/vpx_sad4x4x3 sse3 msa mmi/;
# Blocks of 8
-add_proto qw/void vpx_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x64x8 msa/;
-
-add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x32x8 msa/;
-
add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x16x8 sse4_1 msa/;
+specialize qw/vpx_sad16x16x8 sse4_1 msa mmi/;
add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x8x8 sse4_1 msa/;
+specialize qw/vpx_sad16x8x8 sse4_1 msa mmi/;
add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x16x8 sse4_1 msa/;
+specialize qw/vpx_sad8x16x8 sse4_1 msa mmi/;
add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x8 sse4_1 msa/;
-
-add_proto qw/void vpx_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x4x8 msa/;
-
-add_proto qw/void vpx_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x8x8 msa/;
+specialize qw/vpx_sad8x8x8 sse4_1 msa mmi/;
add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x8 sse4_1 msa/;
+specialize qw/vpx_sad4x4x8 sse4_1 msa mmi/;
#
# Multi-block SAD, comparing a reference to N independent blocks
#
add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x64x4d avx2 neon msa sse2/;
+specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi/;
add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x32x4d msa sse2/;
+specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/;
add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x64x4d msa sse2/;
+specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/;
add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x32x4d avx2 neon msa sse2/;
+specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi/;
add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x16x4d msa sse2/;
+specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/;
add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x32x4d msa sse2/;
+specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/;
add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x16x4d neon msa sse2/;
+specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi/;
add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x8x4d msa sse2/;
+specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/;
add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x16x4d msa sse2/;
+specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/;
add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x4d msa sse2/;
+specialize qw/vpx_sad8x8x4d neon msa sse2 mmi/;
add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x4x4d msa sse2/;
+specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/;
add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x8x4d msa sse2/;
+specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/;
add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x4d msa sse2/;
+specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/;
add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
specialize qw/vpx_sum_squares_2d_i16 sse2 msa/;
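
[Editor's note, not part of the patch: the x4d variants score one source
block against four independent reference blocks per call, which is why this
motion-search hot path gains avx512/vsx/mmi entries here. A sketch of a call,
with ref0..ref3 and the strides as illustrative locals:]

    /* Four SADs in one pass instead of four vpx_sad64x64() calls. */
    const uint8_t *const refs[4] = { ref0, ref1, ref2, ref3 };
    uint32_t sad[4];
    vpx_sad64x64x4d(src, src_stride, refs, ref_stride, sad);
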
@@ -1016,43 +1026,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vpx_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
#
- # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
- #
- # Blocks of 3
- add_proto qw/void vpx_highbd_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
- add_proto qw/void vpx_highbd_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
- add_proto qw/void vpx_highbd_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
- add_proto qw/void vpx_highbd_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
- add_proto qw/void vpx_highbd_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
- add_proto qw/void vpx_highbd_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
- add_proto qw/void vpx_highbd_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
- # Blocks of 8
- add_proto qw/void vpx_highbd_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
- add_proto qw/void vpx_highbd_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
- add_proto qw/void vpx_highbd_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
- add_proto qw/void vpx_highbd_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
- add_proto qw/void vpx_highbd_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
- add_proto qw/void vpx_highbd_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
- add_proto qw/void vpx_highbd_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
- add_proto qw/void vpx_highbd_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
- add_proto qw/void vpx_highbd_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
- #
# Multi-block SAD, comparing a reference to N independent blocks
#
add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
@@ -1109,43 +1082,43 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "
# Variance
#
add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance64x64 sse2 avx2 neon msa/;
+ specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance64x32 sse2 avx2 neon msa/;
+ specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance32x64 sse2 neon msa/;
+ specialize qw/vpx_variance32x64 sse2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance32x32 sse2 avx2 neon msa/;
+ specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance32x16 sse2 avx2 neon msa/;
+ specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance16x32 sse2 neon msa/;
+ specialize qw/vpx_variance16x32 sse2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance16x16 sse2 avx2 neon msa/;
+ specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance16x8 sse2 neon msa/;
+ specialize qw/vpx_variance16x8 sse2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance8x16 sse2 neon msa/;
+ specialize qw/vpx_variance8x16 sse2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance8x8 sse2 neon msa/;
+ specialize qw/vpx_variance8x8 sse2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance8x4 sse2 neon msa/;
+ specialize qw/vpx_variance8x4 sse2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance4x8 sse2 neon msa/;
+ specialize qw/vpx_variance4x8 sse2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance4x4 sse2 neon msa/;
+ specialize qw/vpx_variance4x4 sse2 neon msa mmi/;
#
# Specialty Variance
@@ -1157,16 +1130,16 @@ add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, co
specialize qw/vpx_get8x8var sse2 neon msa/;
add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vpx_mse16x16 sse2 avx2 neon msa/;
+ specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi/;
add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vpx_mse16x8 sse2 msa/;
+ specialize qw/vpx_mse16x8 sse2 msa mmi/;
add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vpx_mse8x16 sse2 msa/;
+ specialize qw/vpx_mse8x16 sse2 msa mmi/;
add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vpx_mse8x8 sse2 msa/;
+ specialize qw/vpx_mse8x8 sse2 msa mmi/;
add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
specialize qw/vpx_get_mb_ss sse2 msa vsx/;
@@ -1175,88 +1148,88 @@ add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int
specialize qw/vpx_get4x4sse_cs neon msa vsx/;
add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
- specialize qw/vpx_comp_avg_pred sse2 vsx/;
+ specialize qw/vpx_comp_avg_pred neon sse2 vsx/;
#
# Subpixel Variance
#
add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance64x32 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance64x32 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance32x64 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance32x64 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance32x16 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance32x16 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance16x32 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance16x32 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance16x16 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance16x16 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance16x8 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance16x8 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance8x16 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance8x16 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance8x8 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance8x8 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance8x4 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance8x4 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance4x8 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance4x8 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance4x4 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance4x4 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance64x32 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance64x32 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance32x64 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance32x64 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance32x32 neon avx2 msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance32x16 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance32x16 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance16x32 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance16x32 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance16x16 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance16x16 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance16x8 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance16x8 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance8x16 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance8x16 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance8x8 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance8x8 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance8x4 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance8x4 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance4x8 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance4x8 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance4x4 neon msa mmi sse2 ssse3/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
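These add_proto/specialize entries feed libvpx's run-time CPU dispatch (rtcd): for each prototype the generated vpx_dsp_rtcd.h declares a function pointer that setup_rtcd_internal() aims at the best kernel the running CPU supports, so adding neon/msa/mmi tokens here is what routes these SAD and variance calls to the new kernels. A minimal sketch of the generated pattern (illustrative names, not the literal generated header):

#include <stdint.h>

typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *src, int src_stride,
                                          const uint8_t *ref, int ref_stride,
                                          unsigned int *sse);

unsigned int vpx_variance16x16_c(const uint8_t *, int, const uint8_t *, int,
                                 unsigned int *);
unsigned int vpx_variance16x16_mmi(const uint8_t *, int, const uint8_t *, int,
                                   unsigned int *);

/* The portable C version is the default... */
vpx_variance_fn_t vpx_variance16x16 = vpx_variance16x16_c;

static void setup_rtcd_sketch(int cpu_has_mmi) {
  /* ...and a 'specialize qw/vpx_variance16x16 ... mmi/' line adds: */
  if (cpu_has_mmi) vpx_variance16x16 = vpx_variance16x16_mmi;
}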
diff --git a/libvpx/vpx_dsp/vpx_filter.h b/libvpx/vpx_dsp/vpx_filter.h
index 26d690501..6cea251bc 100644
--- a/libvpx/vpx_dsp/vpx_filter.h
+++ b/libvpx/vpx_dsp/vpx_filter.h
@@ -26,17 +26,6 @@ extern "C" {
typedef int16_t InterpKernel[SUBPEL_TAPS];
-static INLINE const InterpKernel *get_filter_base(const int16_t *filter) {
- // NOTE: This assumes that the filter table is 256-byte aligned.
- // TODO(agrange) Modify to make independent of table alignment.
- return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
-}
-
-static INLINE int get_filter_offset(const int16_t *f,
- const InterpKernel *base) {
- return (int)((const InterpKernel *)(intptr_t)f - base);
-}
-
#ifdef __cplusplus
} // extern "C"
#endif
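The two inline helpers deleted here recovered an InterpKernel table base from a raw filter pointer by masking its low 8 bits, which is only sound while every filter table is 256-byte aligned; the convolve API rework below passes the whole table plus an explicit phase index instead, making the trick unnecessary. For reference, a standalone restatement of the retired trick and the alignment it depended on:

#include <stdint.h>

#define SUBPEL_TAPS 8
typedef int16_t InterpKernel[SUBPEL_TAPS];

/* One kernel row is 8 * 2 = 16 bytes, so a 16-phase table spans
   exactly 256 bytes; with the table 256-byte aligned, the low 8
   address bits of any row pointer encode its phase. */
static const InterpKernel *filter_base(const int16_t *filter) {
  return (const InterpKernel *)((intptr_t)filter & ~(intptr_t)0xFF);
}

static int filter_offset(const int16_t *f, const InterpKernel *base) {
  return (int)((const InterpKernel *)(intptr_t)f - base);
}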
diff --git a/libvpx/vpx_dsp/x86/add_noise_sse2.asm b/libvpx/vpx_dsp/x86/add_noise_sse2.asm
index f758da22d..80cced4ce 100644
--- a/libvpx/vpx_dsp/x86/add_noise_sse2.asm
+++ b/libvpx/vpx_dsp/x86/add_noise_sse2.asm
@@ -11,6 +11,8 @@
%include "vpx_ports/x86_abi_support.asm"
+SECTION .text
+
;void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise,
; int blackclamp, int whiteclamp,
; int width, int height, int pitch)
@@ -26,13 +28,13 @@ sym(vpx_plane_add_noise_sse2):
mov rdx, 0x01010101
mov rax, arg(2)
mul rdx
- movd xmm3, rax
+ movq xmm3, rax
pshufd xmm3, xmm3, 0 ; xmm3 is 16 copies of char in blackclamp
mov rdx, 0x01010101
mov rax, arg(3)
mul rdx
- movd xmm4, rax
+ movq xmm4, rax
pshufd xmm4, xmm4, 0 ; xmm4 is 16 copies of char in whiteclamp
movdqu xmm5, xmm3 ; both clamp = black clamp + white clamp
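The movd -> movq change matters because the preceding mul leaves its product in rax: multiplying the clamp value by 0x01010101 copies the byte into every byte of each 32-bit half, and movq moves the full 64-bit rax instead of only its low dword, so the pshufd splat sees the complete product even when the multiply carries past 32 bits. The replication step, in scalar form:

#include <stdint.h>

/* Multiplying a byte by 0x01010101 copies it into all four byte
   lanes of a 32-bit word, as the asm's mul rdx does. */
static uint32_t broadcast_byte(uint8_t clamp) {
  return (uint32_t)clamp * 0x01010101u;
}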
diff --git a/libvpx/vpx_dsp/x86/avg_intrin_avx2.c b/libvpx/vpx_dsp/x86/avg_intrin_avx2.c
new file mode 100644
index 000000000..ff19ea647
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/avg_intrin_avx2.c
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
+#include "vpx_ports/mem.h"
+
+static void hadamard_col8x2_avx2(__m256i *in, int iter) {
+ __m256i a0 = in[0];
+ __m256i a1 = in[1];
+ __m256i a2 = in[2];
+ __m256i a3 = in[3];
+ __m256i a4 = in[4];
+ __m256i a5 = in[5];
+ __m256i a6 = in[6];
+ __m256i a7 = in[7];
+
+ __m256i b0 = _mm256_add_epi16(a0, a1);
+ __m256i b1 = _mm256_sub_epi16(a0, a1);
+ __m256i b2 = _mm256_add_epi16(a2, a3);
+ __m256i b3 = _mm256_sub_epi16(a2, a3);
+ __m256i b4 = _mm256_add_epi16(a4, a5);
+ __m256i b5 = _mm256_sub_epi16(a4, a5);
+ __m256i b6 = _mm256_add_epi16(a6, a7);
+ __m256i b7 = _mm256_sub_epi16(a6, a7);
+
+ a0 = _mm256_add_epi16(b0, b2);
+ a1 = _mm256_add_epi16(b1, b3);
+ a2 = _mm256_sub_epi16(b0, b2);
+ a3 = _mm256_sub_epi16(b1, b3);
+ a4 = _mm256_add_epi16(b4, b6);
+ a5 = _mm256_add_epi16(b5, b7);
+ a6 = _mm256_sub_epi16(b4, b6);
+ a7 = _mm256_sub_epi16(b5, b7);
+
+ if (iter == 0) {
+ b0 = _mm256_add_epi16(a0, a4);
+ b7 = _mm256_add_epi16(a1, a5);
+ b3 = _mm256_add_epi16(a2, a6);
+ b4 = _mm256_add_epi16(a3, a7);
+ b2 = _mm256_sub_epi16(a0, a4);
+ b6 = _mm256_sub_epi16(a1, a5);
+ b1 = _mm256_sub_epi16(a2, a6);
+ b5 = _mm256_sub_epi16(a3, a7);
+
+ a0 = _mm256_unpacklo_epi16(b0, b1);
+ a1 = _mm256_unpacklo_epi16(b2, b3);
+ a2 = _mm256_unpackhi_epi16(b0, b1);
+ a3 = _mm256_unpackhi_epi16(b2, b3);
+ a4 = _mm256_unpacklo_epi16(b4, b5);
+ a5 = _mm256_unpacklo_epi16(b6, b7);
+ a6 = _mm256_unpackhi_epi16(b4, b5);
+ a7 = _mm256_unpackhi_epi16(b6, b7);
+
+ b0 = _mm256_unpacklo_epi32(a0, a1);
+ b1 = _mm256_unpacklo_epi32(a4, a5);
+ b2 = _mm256_unpackhi_epi32(a0, a1);
+ b3 = _mm256_unpackhi_epi32(a4, a5);
+ b4 = _mm256_unpacklo_epi32(a2, a3);
+ b5 = _mm256_unpacklo_epi32(a6, a7);
+ b6 = _mm256_unpackhi_epi32(a2, a3);
+ b7 = _mm256_unpackhi_epi32(a6, a7);
+
+ in[0] = _mm256_unpacklo_epi64(b0, b1);
+ in[1] = _mm256_unpackhi_epi64(b0, b1);
+ in[2] = _mm256_unpacklo_epi64(b2, b3);
+ in[3] = _mm256_unpackhi_epi64(b2, b3);
+ in[4] = _mm256_unpacklo_epi64(b4, b5);
+ in[5] = _mm256_unpackhi_epi64(b4, b5);
+ in[6] = _mm256_unpacklo_epi64(b6, b7);
+ in[7] = _mm256_unpackhi_epi64(b6, b7);
+ } else {
+ in[0] = _mm256_add_epi16(a0, a4);
+ in[7] = _mm256_add_epi16(a1, a5);
+ in[3] = _mm256_add_epi16(a2, a6);
+ in[4] = _mm256_add_epi16(a3, a7);
+ in[2] = _mm256_sub_epi16(a0, a4);
+ in[6] = _mm256_sub_epi16(a1, a5);
+ in[1] = _mm256_sub_epi16(a2, a6);
+ in[5] = _mm256_sub_epi16(a3, a7);
+ }
+}
+
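hadamard_col8x2_avx2 processes two 8x8 blocks at once, one per 128-bit half of each ymm register. Its three add/sub stages are the classic 8-point Walsh-Hadamard butterfly; a scalar sketch of one column follows (hedged: the SIMD version also permutes the output order and, when iter == 0, transposes the two blocks in-register via the unpack sequences):

#include <stdint.h>

static void hadamard8_sketch(int16_t a[8]) {
  int16_t b[8];
  int i;
  for (i = 0; i < 8; i += 2) { /* stage 1: adjacent pairs */
    b[i] = a[i] + a[i + 1];
    b[i + 1] = a[i] - a[i + 1];
  }
  for (i = 0; i < 8; i += 4) { /* stage 2: pairs of pairs */
    a[i] = b[i] + b[i + 2];
    a[i + 1] = b[i + 1] + b[i + 3];
    a[i + 2] = b[i] - b[i + 2];
    a[i + 3] = b[i + 1] - b[i + 3];
  }
  for (i = 0; i < 4; ++i) { /* stage 3: halves */
    b[i] = a[i] + a[i + 4];
    b[i + 4] = a[i] - a[i + 4];
  }
  for (i = 0; i < 8; ++i) a[i] = b[i];
}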
+static void hadamard_8x8x2_avx2(int16_t const *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ __m256i src[8];
+ src[0] = _mm256_loadu_si256((const __m256i *)src_diff);
+ src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+
+ hadamard_col8x2_avx2(src, 0);
+ hadamard_col8x2_avx2(src, 1);
+
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[0], src[1], 0x20));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[2], src[3], 0x20));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[4], src[5], 0x20));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[6], src[7], 0x20));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[0], src[1], 0x31));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[2], src[3], 0x31));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[4], src[5], 0x31));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[6], src[7], 0x31));
+}
+
+void vpx_hadamard_16x16_avx2(int16_t const *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int idx;
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
+ int16_t *t_coeff = temp_coeff;
+#else
+ int16_t *t_coeff = coeff;
+#endif
+
+ for (idx = 0; idx < 2; ++idx) {
+ int16_t const *src_ptr = src_diff + idx * 8 * src_stride;
+ hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
+ }
+
+ for (idx = 0; idx < 64; idx += 16) {
+ const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+ const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+ const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+ __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
+
+ b0 = _mm256_srai_epi16(b0, 1);
+ b1 = _mm256_srai_epi16(b1, 1);
+ b2 = _mm256_srai_epi16(b2, 1);
+ b3 = _mm256_srai_epi16(b3, 1);
+
+ store_tran_low(_mm256_add_epi16(b0, b2), coeff);
+ store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64);
+ store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128);
+ store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192);
+
+ coeff += 16;
+ t_coeff += 16;
+ }
+}
+
+int vpx_satd_avx2(const tran_low_t *coeff, int length) {
+ const __m256i one = _mm256_set1_epi16(1);
+ __m256i accum = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < length; i += 16) {
+ const __m256i src_line = load_tran_low(coeff);
+ const __m256i abs = _mm256_abs_epi16(src_line);
+ const __m256i sum = _mm256_madd_epi16(abs, one);
+ accum = _mm256_add_epi32(accum, sum);
+ coeff += 16;
+ }
+
+  { // 32-bit horizontal add
+ const __m256i a = _mm256_srli_si256(accum, 8);
+ const __m256i b = _mm256_add_epi32(accum, a);
+ const __m256i c = _mm256_srli_epi64(b, 32);
+ const __m256i d = _mm256_add_epi32(b, c);
+ const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
+ _mm256_extractf128_si256(d, 1));
+ return _mm_cvtsi128_si32(accum_128);
+ }
+}
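vpx_satd_avx2 reduces 16 coefficients per iteration: take absolute values, madd against a vector of ones to fold adjacent 16-bit values into 32-bit lanes, then collapse with the horizontal add above. Its contract is the plain sum of absolute transformed differences, i.e. the equivalent of this reference (tran_low_t assumed 32-bit, as in high-bit-depth builds):

#include <stdint.h>
#include <stdlib.h>

typedef int32_t tran_low_t; /* assumption: CONFIG_VP9_HIGHBITDEPTH */

static int satd_ref(const tran_low_t *coeff, int length) {
  int i, sum = 0;
  for (i = 0; i < length; ++i) sum += abs(coeff[i]);
  return sum;
}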
diff --git a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
index 4e89e07e5..a235ba41d 100644
--- a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
@@ -214,7 +214,7 @@ static void hadamard_col8_sse2(__m128i *in, int iter) {
}
}
-void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
+void vpx_hadamard_8x8_sse2(int16_t const *src_diff, ptrdiff_t src_stride,
tran_low_t *coeff) {
__m128i src[8];
src[0] = _mm_load_si128((const __m128i *)src_diff);
@@ -246,7 +246,7 @@ void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
store_tran_low(src[7], coeff);
}
-void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
+void vpx_hadamard_16x16_sse2(int16_t const *src_diff, ptrdiff_t src_stride,
tran_low_t *coeff) {
int idx;
for (idx = 0; idx < 4; ++idx) {
diff --git a/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h b/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h
index b9116f049..3552c07cd 100644
--- a/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h
+++ b/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h
@@ -21,10 +21,24 @@
static INLINE __m256i load_tran_low(const tran_low_t *a) {
#if CONFIG_VP9_HIGHBITDEPTH
const __m256i a_low = _mm256_loadu_si256((const __m256i *)a);
- return _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8));
+ const __m256i a_high = _mm256_loadu_si256((const __m256i *)(a + 8));
+ return _mm256_packs_epi32(a_low, a_high);
#else
return _mm256_loadu_si256((const __m256i *)a);
#endif
}
+static INLINE void store_tran_low(__m256i a, tran_low_t *b) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i a_hi = _mm256_mulhi_epi16(a, one);
+ const __m256i a_lo = _mm256_mullo_epi16(a, one);
+ const __m256i a_1 = _mm256_unpacklo_epi16(a_lo, a_hi);
+ const __m256i a_2 = _mm256_unpackhi_epi16(a_lo, a_hi);
+ _mm256_storeu_si256((__m256i *)b, a_1);
+ _mm256_storeu_si256((__m256i *)(b + 8), a_2);
+#else
+ _mm256_storeu_si256((__m256i *)b, a);
+#endif
+}
#endif // VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_
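On high-bit-depth builds tran_low_t is 32-bit, so store_tran_low has to sign-extend sixteen 16-bit lanes. Multiplying by 1 with mullo/mulhi yields each lane's low word and its sign word, and the unpacks interleave the two into 32-bit values. Per-lane scalar equivalent:

#include <stdint.h>

/* mullo(a, 1) keeps a's low 16 bits; mulhi(a, 1) is 0x0000 for
   non-negative a and 0xFFFF for negative a; pairing them gives the
   sign-extended 32-bit value. */
static int32_t widen_lane(int16_t a) {
  const uint16_t lo = (uint16_t)a;
  const uint16_t hi = (uint16_t)(a < 0 ? 0xFFFF : 0x0000);
  return (int32_t)(((uint32_t)hi << 16) | lo);
}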
diff --git a/libvpx/vpx_dsp/x86/convolve.h b/libvpx/vpx_dsp/x86/convolve.h
index e69d6c617..68d7589d4 100644
--- a/libvpx/vpx_dsp/x86/convolve.h
+++ b/libvpx/vpx_dsp/x86/convolve.h
@@ -20,14 +20,15 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
uint8_t *output_ptr, ptrdiff_t out_pitch,
uint32_t output_height, const int16_t *filter);
-#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \
void vpx_convolve8_##name##_##opt( \
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
- ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, int w, int h) { \
- (void)filter_x; \
+ ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \
+ int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \
+ const int16_t *filter = filter_kernel[offset]; \
+ (void)x0_q4; \
(void)x_step_q4; \
- (void)filter_y; \
+ (void)y0_q4; \
(void)y_step_q4; \
assert(filter[3] != 128); \
assert(step_q4 == 16); \
@@ -64,32 +65,36 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
} \
}
-#define FUN_CONV_2D(avg, opt) \
- void vpx_convolve8_##avg##opt( \
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
- ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, int w, int h) { \
- assert(filter_x[3] != 128); \
- assert(filter_y[3] != 128); \
- assert(w <= 64); \
- assert(h <= 64); \
- assert(x_step_q4 == 16); \
- assert(y_step_q4 == 16); \
- if (filter_x[0] | filter_x[1] | filter_x[2]) { \
- DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
- vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
- filter_x, x_step_q4, filter_y, y_step_q4, w, \
- h + 7); \
- vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, \
- y_step_q4, w, h); \
- } else { \
- DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
- vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter_x, \
- x_step_q4, filter_y, y_step_q4, w, h + 1); \
- vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter_x, \
- x_step_q4, filter_y, y_step_q4, w, h); \
- } \
+#define FUN_CONV_2D(avg, opt) \
+ void vpx_convolve8_##avg##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
+ int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \
+ const int16_t *filter_x = filter[x0_q4]; \
+ const int16_t *filter_y = filter[y0_q4]; \
+ (void)filter_y; \
+ assert(filter_x[3] != 128); \
+ assert(filter_y[3] != 128); \
+ assert(w <= 64); \
+ assert(h <= 64); \
+ assert(x_step_q4 == 16); \
+ assert(y_step_q4 == 16); \
+ if (filter_x[0] | filter_x[1] | filter_x[2]) { \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
+ vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
+ filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \
+ h + 7); \
+ vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
+ filter, x0_q4, x_step_q4, y0_q4, \
+ y_step_q4, w, h); \
+ } else { \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
+ vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4, \
+ x_step_q4, y0_q4, y_step_q4, w, h + 1); \
+ vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter, \
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, \
+ h); \
+ } \
}
#if CONFIG_VP9_HIGHBITDEPTH
@@ -101,95 +106,97 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
unsigned int output_height,
const int16_t *filter, int bd);
-#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
- void vpx_highbd_convolve8_##name##_##opt( \
- const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
- ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \
- if (step_q4 == 16 && filter[3] != 128) { \
- if (filter[0] | filter[1] | filter[2]) { \
- while (w >= 16) { \
- vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \
- src_start, src_stride, dst, dst_stride, h, filter, bd); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \
- src_start, src_stride, dst, dst_stride, h, filter, bd); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \
- src_start, src_stride, dst, dst_stride, h, filter, bd); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } else { \
- while (w >= 16) { \
- vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \
- src, src_stride, dst, dst_stride, h, filter, bd); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \
- src, src_stride, dst, dst_stride, h, filter, bd); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \
- src, src_stride, dst, dst_stride, h, filter, bd); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } \
- } \
- if (w) { \
- vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, \
- y_step_q4, w, h, bd); \
- } \
- }
-
-#define HIGH_FUN_CONV_2D(avg, opt) \
- void vpx_highbd_convolve8_##avg##opt( \
+#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \
+ void vpx_highbd_convolve8_##name##_##opt( \
const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
- ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \
- assert(w <= 64); \
- assert(h <= 64); \
- if (x_step_q4 == 16 && y_step_q4 == 16) { \
- if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \
- DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
- vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
- fdata2, 64, filter_x, x_step_q4, \
- filter_y, y_step_q4, w, h + 7, bd); \
- vpx_highbd_convolve8_##avg##vert_##opt( \
- fdata2 + 192, 64, dst, dst_stride, filter_x, x_step_q4, filter_y, \
- y_step_q4, w, h, bd); \
+ ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \
+ int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \
+ const int16_t *filter = filter_kernel[offset]; \
+ if (step_q4 == 16 && filter[3] != 128) { \
+ if (filter[0] | filter[1] | filter[2]) { \
+ while (w >= 16) { \
+ vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
} else { \
- DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
- vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
- filter_x, x_step_q4, filter_y, \
- y_step_q4, w, h + 1, bd); \
- vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, \
- y_step_q4, w, h, bd); \
+ while (w >= 16) { \
+ vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
} \
- } else { \
- vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, y_step_q4, \
- w, h, bd); \
} \
+ if (w) { \
+ vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
+ filter_kernel, x0_q4, x_step_q4, y0_q4, \
+ y_step_q4, w, h, bd); \
+ } \
+ }
+
+#define HIGH_FUN_CONV_2D(avg, opt) \
+ void vpx_highbd_convolve8_##avg##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
+ ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
+ int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \
+ const int16_t *filter_x = filter[x0_q4]; \
+ assert(w <= 64); \
+ assert(h <= 64); \
+ if (x_step_q4 == 16 && y_step_q4 == 16) { \
+ if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
+ vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
+ fdata2, 64, filter, x0_q4, x_step_q4, \
+ y0_q4, y_step_q4, w, h + 7, bd); \
+ vpx_highbd_convolve8_##avg##vert_##opt( \
+ fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4, \
+ y0_q4, y_step_q4, w, h, bd); \
+ } else { \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
+ vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, \
+ x0_q4, x_step_q4, y0_q4, y_step_q4, \
+ w, h + 1, bd); \
+ vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
+ filter, x0_q4, x_step_q4, \
+ y0_q4, y_step_q4, w, h, bd); \
+ } \
+ } else { \
+ vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter, \
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, \
+ bd); \
+ } \
}
#endif // CONFIG_VP9_HIGHBITDEPTH
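Throughout this rework the convolution entry points stop taking pre-offset filter_x/filter_y pointers and instead receive the whole InterpKernel table plus the fractional start phases x0_q4/y0_q4 alongside the existing q4 steps; each expanded function then indexes its own filter row, which is what allowed vpx_filter.h to drop the alignment-based base/offset recovery earlier in this patch. A sketch of the selection (hypothetical helper name):

#include <stdint.h>

#define SUBPEL_TAPS 8
typedef int16_t InterpKernel[SUBPEL_TAPS];

/* Equivalent of 'filter_kernel[offset]' in the macros: offset is
   x0_q4 for horizontal passes and y0_q4 for vertical ones, one of
   16 subpel phases in q4 fixed point. */
static const int16_t *phase_row(const InterpKernel *filter_kernel, int q4) {
  return filter_kernel[q4];
}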
diff --git a/libvpx/vpx_dsp/x86/convolve_avx2.h b/libvpx/vpx_dsp/x86/convolve_avx2.h
new file mode 100644
index 000000000..bc96b738f
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/convolve_avx2.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_CONVOLVE_AVX2_H_
+#define VPX_DSP_X86_CONVOLVE_AVX2_H_
+
+#include <immintrin.h> // AVX2
+
+#include "./vpx_config.h"
+
+#if defined(__clang__)
+#if (__clang_major__ > 0 && __clang_major__ < 3) || \
+ (__clang_major__ == 3 && __clang_minor__ <= 3) || \
+ (defined(__APPLE__) && defined(__apple_build_version__) && \
+ ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
+ (__clang_major__ == 5 && __clang_minor__ == 0)))
+#define MM256_BROADCASTSI128_SI256(x) \
+ _mm_broadcastsi128_si256((__m128i const *)&(x))
+#else // clang > 3.3, and not 5.0 on macosx.
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // clang <= 3.3
+#elif defined(__GNUC__)
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
+#define MM256_BROADCASTSI128_SI256(x) \
+ _mm_broadcastsi128_si256((__m128i const *)&(x))
+#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
+#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
+#else // gcc > 4.7
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // gcc <= 4.6
+#else // !(gcc || clang)
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // __clang__
+
+static INLINE void shuffle_filter_avx2(const int16_t *const filter,
+ __m256i *const f) {
+ const __m256i f_values =
+ MM256_BROADCASTSI128_SI256(_mm_load_si128((const __m128i *)filter));
+ // pack and duplicate the filter values
+ f[0] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0200u));
+ f[1] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0604u));
+ f[2] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0a08u));
+ f[3] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0e0cu));
+}
+
+static INLINE __m256i convolve8_16_avx2(const __m256i *const s,
+ const __m256i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m256i k_64 = _mm256_set1_epi16(1 << 6);
+ const __m256i x0 = _mm256_maddubs_epi16(s[0], f[0]);
+ const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]);
+ const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]);
+ const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]);
+ __m256i sum1, sum2;
+
+ // sum the results together, saturating only on the final step
+  // adding x0 with x2 and x1 with x3 is the only order that keeps
+  // the intermediate sums in range for all filters
+ sum1 = _mm256_add_epi16(x0, x2);
+ sum2 = _mm256_add_epi16(x1, x3);
+ // add the rounding offset early to avoid another saturated add
+ sum1 = _mm256_add_epi16(sum1, k_64);
+ sum1 = _mm256_adds_epi16(sum1, sum2);
+  // shift each 16-bit lane right by 7 to complete the rounding
+ sum1 = _mm256_srai_epi16(sum1, 7);
+ return sum1;
+}
+
+static INLINE __m128i convolve8_8_avx2(const __m256i *const s,
+ const __m256i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i x0 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[0]),
+ _mm256_castsi256_si128(f[0]));
+ const __m128i x1 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[1]),
+ _mm256_castsi256_si128(f[1]));
+ const __m128i x2 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[2]),
+ _mm256_castsi256_si128(f[2]));
+ const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]),
+ _mm256_castsi256_si128(f[3]));
+ __m128i sum1, sum2;
+
+ // sum the results together, saturating only on the final step
+  // adding x0 with x2 and x1 with x3 is the only order that keeps
+  // the intermediate sums in range for all filters
+ sum1 = _mm_add_epi16(x0, x2);
+ sum2 = _mm_add_epi16(x1, x3);
+ // add the rounding offset early to avoid another saturated add
+ sum1 = _mm_add_epi16(sum1, k_64);
+ sum1 = _mm_adds_epi16(sum1, sum2);
+  // shift each 16-bit lane right by 7
+ sum1 = _mm_srai_epi16(sum1, 7);
+ return sum1;
+}
+
+#undef MM256_BROADCASTSI128_SI256
+
+#endif // VPX_DSP_X86_CONVOLVE_AVX2_H_
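convolve8_16_avx2 and convolve8_8_avx2 evaluate an 8-tap filter whose taps are q7 fixed point: sum the per-pair products, add the 1 << 6 rounding bias before the single saturating add (so there is exactly one saturation point), then shift right by 7. A scalar model of one output pixel (the SIMD helpers return the 16-bit result and clip only later, when packing to bytes):

#include <stdint.h>

static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

static uint8_t convolve8_px(const uint8_t *src, const int16_t *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += src[k] * filter[k]; /* q7 taps */
  return clip_u8((sum + (1 << 6)) >> 7);             /* round, >> 7 */
}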
diff --git a/libvpx/vpx_dsp/x86/convolve_ssse3.h b/libvpx/vpx_dsp/x86/convolve_ssse3.h
new file mode 100644
index 000000000..e5d452f99
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/convolve_ssse3.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_CONVOLVE_SSSE3_H_
+#define VPX_DSP_X86_CONVOLVE_SSSE3_H_
+
+#include <assert.h>
+#include <tmmintrin.h> // SSSE3
+
+#include "./vpx_config.h"
+
+static INLINE void shuffle_filter_ssse3(const int16_t *const filter,
+ __m128i *const f) {
+ const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+ // pack and duplicate the filter values
+ f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+ f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+ f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+ f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+}
+
+static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter,
+ __m128i *const f) {
+ const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+ // pack and duplicate the filter values
+  // This relies on the high byte of filter[3] always being 0 to zero out
+  // half of f[0] and f[4].
+ assert(filter[3] >= 0 && filter[3] < 256);
+ f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u));
+ f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u));
+ f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u));
+ f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au));
+ f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu));
+}
+
+static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
+ const __m128i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+ const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+ const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+ const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+ __m128i sum1, sum2;
+
+ // sum the results together, saturating only on the final step
+  // adding x0 with x2 and x1 with x3 is the only order that keeps
+  // the intermediate sums in range for all filters
+ sum1 = _mm_add_epi16(x0, x2);
+ sum2 = _mm_add_epi16(x1, x3);
+ // add the rounding offset early to avoid another saturated add
+ sum1 = _mm_add_epi16(sum1, k_64);
+ sum1 = _mm_adds_epi16(sum1, sum2);
+  // shift each 16-bit lane right by 7
+ sum1 = _mm_srai_epi16(sum1, 7);
+ return sum1;
+}
+
+static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
+ const __m128i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+ const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+ const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+ const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+  // compensate for the 64 subtracted from f[1]; x4 is always non-negative.
+ const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64));
+ // add and saturate the results together
+ __m128i temp = _mm_adds_epi16(x0, x3);
+ temp = _mm_adds_epi16(temp, x1);
+ temp = _mm_adds_epi16(temp, x2);
+ temp = _mm_adds_epi16(temp, x4);
+  // add the rounding bias and shift each 16-bit lane right by 7
+ temp = _mm_adds_epi16(temp, k_64);
+ temp = _mm_srai_epi16(temp, 7);
+ return temp;
+}
+
+static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s,
+ const __m128i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+ const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+ const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+ const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+ const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]);
+  // compensate for the 64 subtracted from f[2]; x5 is always non-negative.
+ const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64));
+ __m128i temp;
+
+ // add and saturate the results together
+ temp = _mm_adds_epi16(x0, x1);
+ temp = _mm_adds_epi16(temp, x2);
+ temp = _mm_adds_epi16(temp, x3);
+ temp = _mm_adds_epi16(temp, x4);
+ temp = _mm_adds_epi16(temp, x5);
+  // add the rounding bias and shift each 16-bit lane right by 7
+ temp = _mm_adds_epi16(temp, k_64);
+ temp = _mm_srai_epi16(temp, 7);
+ return temp;
+}
+
+#endif // VPX_DSP_X86_CONVOLVE_SSSE3_H_
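The even/odd "offset" variants exist because pmaddubsw takes signed bytes for the filter operand: the caller is assumed to store the large center tap with 64 already subtracted so it fits in int8, and the extra maddubs against _mm_set1_epi8(64) (x4/x5 above) adds the bias back. The identity being exploited, per tap (a sketch of the compensation only; the exact tap-to-f[] layout follows the shuffles above):

/* If the stored tap is (t - 64) so it fits a signed byte, then
   (t - 64) * src + 64 * src recovers t * src exactly. */
static int apply_biased_tap(int src_px, int stored_tap_minus_64) {
  return stored_tap_minus_64 * src_px + 64 * src_px;
}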
diff --git a/libvpx/vpx_dsp/x86/deblock_sse2.asm b/libvpx/vpx_dsp/x86/deblock_sse2.asm
index bd8fd1248..97cb43b67 100644
--- a/libvpx/vpx_dsp/x86/deblock_sse2.asm
+++ b/libvpx/vpx_dsp/x86/deblock_sse2.asm
@@ -83,6 +83,8 @@
add rbx, 16
%endmacro
+SECTION .text
+
;void vpx_post_proc_down_and_across_mb_row_sse2
;(
; unsigned char *src_ptr,
diff --git a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
index 39d3a3f59..132e06523 100644
--- a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
+++ b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
@@ -51,7 +51,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) {
// When we use them, in one case, they are all the same. In all others
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
- const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
+ const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64);
const __m256i k__cospi_p16_m16 =
pair256_set_epi16(+cospi_16_64, -cospi_16_64);
const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
diff --git a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
index 374433390..32b9bd281 100644
--- a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
+++ b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
@@ -63,7 +63,7 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
// When we use them, in one case, they are all the same. In all others
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
diff --git a/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h b/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h
index 743e55e63..f9abaecf2 100644
--- a/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h
+++ b/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h
@@ -261,7 +261,7 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
// When we use them, in one case, they are all the same. In all others
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
@@ -582,7 +582,7 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
// When we use them, in one case, they are all the same. In all others
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
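The comment repeated in these transforms about "constructing the 32 bit constant corresponding to that pair" means packing two 16-bit cosines into one replicated 32-bit value so pmaddwd can apply both at once to interleaved inputs; k__cospi_p16_p16 is the degenerate case where both halves are equal. A sketch of the packing and of what one pmaddwd lane computes:

#include <stdint.h>

/* Two q14 cosine taps packed low/high into one 32-bit constant. */
static int32_t pair_const(int16_t c0, int16_t c1) {
  return (int32_t)(((uint32_t)(uint16_t)c1 << 16) | (uint16_t)c0);
}

/* With inputs interleaved as (a, b) 16-bit pairs, each 32-bit
   pmaddwd lane evaluates the rotation term a * c0 + b * c1. */
static int32_t madd_lane(int16_t a, int16_t b, int16_t c0, int16_t c1) {
  return a * c0 + b * c1;
}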
diff --git a/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
index b433874f2..32824a03a 100644
--- a/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
+++ b/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
@@ -31,8 +31,8 @@ SECTION .text
INIT_XMM ssse3
cglobal fdct8x8, 3, 5, 13, input, output, stride
- mova m8, [pd_8192]
- mova m12, [pw_11585x2]
+ mova m8, [GLOBAL(pd_8192)]
+ mova m12, [GLOBAL(pw_11585x2)]
lea r3, [2 * strideq]
lea r4, [4 * strideq]
@@ -92,10 +92,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride
; sin(pi / 8), cos(pi / 8)
punpcklwd m2, m10, m9
punpckhwd m10, m9
- pmaddwd m5, m2, [pw_15137_6270]
- pmaddwd m2, [pw_6270_m15137]
- pmaddwd m9, m10, [pw_15137_6270]
- pmaddwd m10, [pw_6270_m15137]
+ pmaddwd m5, m2, [GLOBAL(pw_15137_6270)]
+ pmaddwd m2, [GLOBAL(pw_6270_m15137)]
+ pmaddwd m9, m10, [GLOBAL(pw_15137_6270)]
+ pmaddwd m10, [GLOBAL(pw_6270_m15137)]
paddd m5, m8
paddd m2, m8
paddd m9, m8
@@ -120,10 +120,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride
; sin(pi / 16), cos(pi / 16)
punpcklwd m1, m10, m9
punpckhwd m10, m9
- pmaddwd m7, m1, [pw_16069_3196]
- pmaddwd m1, [pw_3196_m16069]
- pmaddwd m9, m10, [pw_16069_3196]
- pmaddwd m10, [pw_3196_m16069]
+ pmaddwd m7, m1, [GLOBAL(pw_16069_3196)]
+ pmaddwd m1, [GLOBAL(pw_3196_m16069)]
+ pmaddwd m9, m10, [GLOBAL(pw_16069_3196)]
+ pmaddwd m10, [GLOBAL(pw_3196_m16069)]
paddd m7, m8
paddd m1, m8
paddd m9, m8
@@ -138,10 +138,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride
; sin(3 * pi / 16), cos(3 * pi / 16)
punpcklwd m11, m0, m3
punpckhwd m0, m3
- pmaddwd m9, m11, [pw_9102_13623]
- pmaddwd m11, [pw_13623_m9102]
- pmaddwd m3, m0, [pw_9102_13623]
- pmaddwd m0, [pw_13623_m9102]
+ pmaddwd m9, m11, [GLOBAL(pw_9102_13623)]
+ pmaddwd m11, [GLOBAL(pw_13623_m9102)]
+ pmaddwd m3, m0, [GLOBAL(pw_9102_13623)]
+ pmaddwd m0, [GLOBAL(pw_13623_m9102)]
paddd m9, m8
paddd m11, m8
paddd m3, m8
@@ -211,10 +211,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride
; stage 3
punpcklwd m6, m1, m3
punpckhwd m1, m3
- pmaddwd m2, m6, [pw_11585_11585]
- pmaddwd m6, [pw_11585_m11585]
- pmaddwd m3, m1, [pw_11585_11585]
- pmaddwd m1, [pw_11585_m11585]
+ pmaddwd m2, m6, [GLOBAL(pw_11585_11585)]
+ pmaddwd m6, [GLOBAL(pw_11585_m11585)]
+ pmaddwd m3, m1, [GLOBAL(pw_11585_11585)]
+ pmaddwd m1, [GLOBAL(pw_11585_m11585)]
paddd m2, m8
paddd m6, m8
paddd m3, m8
@@ -231,10 +231,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride
punpcklwd m3, m5, m4
punpckhwd m5, m4
- pmaddwd m1, m3, [pw_15137_6270]
- pmaddwd m3, [pw_6270_m15137]
- pmaddwd m4, m5, [pw_15137_6270]
- pmaddwd m5, [pw_6270_m15137]
+ pmaddwd m1, m3, [GLOBAL(pw_15137_6270)]
+ pmaddwd m3, [GLOBAL(pw_6270_m15137)]
+ pmaddwd m4, m5, [GLOBAL(pw_15137_6270)]
+ pmaddwd m5, [GLOBAL(pw_6270_m15137)]
paddd m1, m8
paddd m3, m8
paddd m4, m8
@@ -255,10 +255,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride
; stage 4
punpcklwd m9, m5, m4
punpckhwd m5, m4
- pmaddwd m7, m9, [pw_16069_3196]
- pmaddwd m9, [pw_3196_m16069]
- pmaddwd m4, m5, [pw_16069_3196]
- pmaddwd m5, [pw_3196_m16069]
+ pmaddwd m7, m9, [GLOBAL(pw_16069_3196)]
+ pmaddwd m9, [GLOBAL(pw_3196_m16069)]
+ pmaddwd m4, m5, [GLOBAL(pw_16069_3196)]
+ pmaddwd m5, [GLOBAL(pw_3196_m16069)]
paddd m7, m8
paddd m9, m8
paddd m4, m8
@@ -272,10 +272,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride
punpcklwd m4, m10, m0
punpckhwd m10, m0
- pmaddwd m5, m4, [pw_9102_13623]
- pmaddwd m4, [pw_13623_m9102]
- pmaddwd m0, m10, [pw_9102_13623]
- pmaddwd m10, [pw_13623_m9102]
+ pmaddwd m5, m4, [GLOBAL(pw_9102_13623)]
+ pmaddwd m4, [GLOBAL(pw_13623_m9102)]
+ pmaddwd m0, m10, [GLOBAL(pw_9102_13623)]
+ pmaddwd m10, [GLOBAL(pw_13623_m9102)]
paddd m5, m8
paddd m4, m8
paddd m0, m8
diff --git a/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c b/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
index 2fc7b7430..7e75d5d10 100644
--- a/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
+++ b/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
@@ -18,13 +18,14 @@
void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
int width, int h, int bd) {
- (void)filter_x;
- (void)filter_y;
- (void)filter_x_stride;
- (void)filter_y_stride;
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
(void)bd;
assert(width % 4 == 0);
@@ -99,13 +100,14 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
int width, int h, int bd) {
- (void)filter_x;
- (void)filter_y;
- (void)filter_x_stride;
- (void)filter_y_stride;
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
(void)bd;
assert(width % 4 == 0);
@@ -1073,8 +1075,8 @@ void vpx_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2
#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2
-HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
-HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
HIGH_FUN_CONV_2D(, avx2);
void vpx_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t,
@@ -1098,8 +1100,8 @@ void vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t,
#define vpx_highbd_filter_block1d4_v2_avg_avx2 \
vpx_highbd_filter_block1d4_v2_avg_sse2
-HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, avx2);
-HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2);
+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,
avx2);
HIGH_FUN_CONV_2D(avg_, avx2);
diff --git a/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
index a2412d124..f4f7235d1 100644
--- a/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
+++ b/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
@@ -8,237 +8,343 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <emmintrin.h> // SSE2
+
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
+static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
+ __m128i *const out) {
+ // stage 5
+ out[0] = _mm_add_epi32(in[0], in[3]);
+ out[1] = _mm_add_epi32(in[1], in[2]);
+ out[2] = _mm_sub_epi32(in[1], in[2]);
+ out[3] = _mm_sub_epi32(in[0], in[3]);
+ highbd_butterfly_cospi16_sse2(in[6], in[5], &out[6], &out[5]);
+ out[8] = _mm_add_epi32(in[8], in[11]);
+ out[9] = _mm_add_epi32(in[9], in[10]);
+ out[10] = _mm_sub_epi32(in[9], in[10]);
+ out[11] = _mm_sub_epi32(in[8], in[11]);
+ out[12] = _mm_sub_epi32(in[15], in[12]);
+ out[13] = _mm_sub_epi32(in[14], in[13]);
+ out[14] = _mm_add_epi32(in[14], in[13]);
+ out[15] = _mm_add_epi32(in[15], in[12]);
+}
+
+static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
+ __m128i *const out) {
+ out[0] = _mm_add_epi32(in[0], in[7]);
+ out[1] = _mm_add_epi32(in[1], in[6]);
+ out[2] = _mm_add_epi32(in[2], in[5]);
+ out[3] = _mm_add_epi32(in[3], in[4]);
+ out[4] = _mm_sub_epi32(in[3], in[4]);
+ out[5] = _mm_sub_epi32(in[2], in[5]);
+ out[6] = _mm_sub_epi32(in[1], in[6]);
+ out[7] = _mm_sub_epi32(in[0], in[7]);
+ out[8] = in[8];
+ out[9] = in[9];
+ highbd_butterfly_cospi16_sse2(in[13], in[10], &out[13], &out[10]);
+ highbd_butterfly_cospi16_sse2(in[12], in[11], &out[12], &out[11]);
+ out[14] = in[14];
+ out[15] = in[15];
+}
+
+static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
+ __m128i step1[16], step2[16];
+
+ // stage 2
+ highbd_butterfly_sse2(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_butterfly_sse2(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse2(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_butterfly_sse2(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ highbd_butterfly_sse2(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_butterfly_sse2(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
+ step1[11] = _mm_add_epi32(step2[10], step2[11]);
+ step1[12] = _mm_add_epi32(step2[13], step2[12]);
+ step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+
+ // stage 4
+ highbd_butterfly_cospi16_sse2(io[0], io[8], &step2[0], &step2[1]);
+ highbd_butterfly_sse2(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
+ &step2[13], &step2[10]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step1[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step1[7] = _mm_add_epi32(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ highbd_idct16_4col_stage5(step2, step1);
+ highbd_idct16_4col_stage6(step1, step2);
+ highbd_idct16_4col_stage7(step2, io);
+}
+
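+// Note on the "step1[10] = -step1[10]" style comments above: those lanes
+// hold the negated value of the corresponding step in the C reference
+// idct16. The follow-up butterflies absorb the sign by swapping their
+// constant/output order, which avoids extra negation instructions on the
+// SSE2 unsigned-multiply path (compare the SSE4.1 version of this file,
+// which multiplies signed values directly and passes negative constants
+// instead).
+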
+static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
+ __m128i step1[16], step2[16];
+ __m128i temp1[2], sign[2];
+
+ // stage 2
+ highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_neg_sse2(io[7], cospi_14_64, cospi_18_64, &step2[9],
+ &step2[14]);
+ highbd_partial_butterfly_sse2(io[5], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_partial_butterfly_neg_sse2(io[6], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
+ step1[11] = _mm_add_epi32(step2[10], step2[11]);
+ step1[12] = _mm_add_epi32(step2[13], step2[12]);
+ step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+
+ // stage 4
+ abs_extend_64bit_sse2(io[0], temp1, sign);
+ step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
+ step2[1] = step2[0];
+ highbd_partial_butterfly_sse2(io[4], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
+ &step2[13], &step2[10]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step1[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step1[7] = _mm_add_epi32(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ highbd_idct16_4col_stage5(step2, step1);
+ highbd_idct16_4col_stage6(step1, step2);
+ highbd_idct16_4col_stage7(step2, io);
+}
+
+static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
+ __m128i step1[16], step2[16];
+ __m128i temp[2], sign[2];
+
+ // stage 2
+ highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[10] =
+ _mm_sub_epi32(_mm_setzero_si128(), step2[11]); // step1[10] = -step1[10]
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] =
+ _mm_sub_epi32(_mm_setzero_si128(), step2[12]); // step1[13] = -step1[13]
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+
+ // stage 4
+ abs_extend_64bit_sse2(io[0], temp, sign);
+ step2[0] = multiplication_round_shift_sse2(temp, sign, cospi_16_64);
+ step2[1] = step2[0];
+ step2[2] = _mm_setzero_si128();
+ step2[3] = _mm_setzero_si128();
+ highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
+ &step2[13], &step2[10]);
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ highbd_idct16_4col_stage5(step2, step1);
+ highbd_idct16_4col_stage6(step1, step2);
+ highbd_idct16_4col_stage7(step2, io);
+}
+
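+// In the rewritten functions below, the bd == 8 path packs coefficients to
+// 16 bits and reuses the standard-bitdepth idct16_8col() (eight columns per
+// __m128i); the general path keeps 32-bit intermediates and transforms four
+// columns per register. Both finish with the same reconstruction as the C
+// fallback this patch removes, modeled here as a per-pixel sketch:
+static INLINE void highbd_write_pixel_model(uint16_t *const dest,
+                                            const int32_t out, const int bd) {
+  // Round away the 6 fractional bits, add to the prediction, clip to bd.
+  *dest = highbd_clip_pixel_add(*dest, ROUND_POWER_OF_TWO(out, 6), bd);
+}
+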
void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- tran_low_t out[16 * 16];
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[32];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i rounding = _mm_set1_epi16(32);
- const __m128i max = _mm_set1_epi16(3155);
- const __m128i min = _mm_set1_epi16(-3155);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 16; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
- inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
- }
+ int i;
+ __m128i out[16], *in;
- // Find the min & max for the row transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 32; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform
- idct16_sse2(inptr, inptr + 16);
-
- // Find the min & max for the column transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 32; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
+ if (bd == 8) {
+ __m128i l[16], r[16];
+
+ in = l;
+ for (i = 0; i < 2; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
+ idct16_8col(in, in);
+ in = r;
+ input += 128;
}
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- array_transpose_16x16(inptr, inptr + 16);
- for (i = 0; i < 16; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
- sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(l + i, out);
+ transpose_16bit_8x8(r + i, out + 8);
+ idct16_8col(out, out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
}
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
+ dest += 8;
}
} else {
- // Run the un-optimised row transform
- for (i = 0; i < 16; ++i) {
- vpx_highbd_idct16_c(input, outptr, bd);
- input += 16;
- outptr += 16;
- }
- }
+ __m128i all[4][16];
- if (optimised_cols) {
- idct16_sse2(inptr, inptr + 16);
-
- // Final round & shift and Reconstruction and Store
- {
- __m128i d[2];
- for (i = 0; i < 16; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], rounding);
- inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
- d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
- d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
- inptr[i] = _mm_srai_epi16(inptr[i], 6);
- inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
- d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
- d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
- _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
- }
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
+ highbd_idct16_4col(in);
+ input += 4 * 16;
}
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[16], temp_out[16];
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- vpx_highbd_idct16_c(temp_in, temp_out, bd);
+
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ highbd_idct16_4col(out);
+
for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
}
+ dest += 4;
}
}
}
-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
+void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- tran_low_t out[16 * 16] = { 0 };
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[32];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i rounding = _mm_set1_epi16(32);
- const __m128i max = _mm_set1_epi16(3155);
- const __m128i min = _mm_set1_epi16(-3155);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 16; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
- inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
- }
+ int i;
+ __m128i out[16];
- // Find the min & max for the row transform
- // Since all non-zero dct coefficients are in upper-left 4x4 area,
- // we only need to consider first 4 rows here.
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 4; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform (N.B. This transposes inptr)
- idct16_sse2(inptr, inptr + 16);
-
- // Find the min & max for the column transform
- // N.B. Only first 4 cols contain non-zero coeffs
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 16; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
+ if (bd == 8) {
+ __m128i in[16], temp[16];
+
+ highbd_load_pack_transpose_32bit_8x8(input, 16, in);
+ for (i = 8; i < 16; i++) {
+ in[i] = _mm_setzero_si128();
}
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- // Use fact only first 4 rows contain non-zero coeffs
- array_transpose_8x8(inptr, inptr);
- array_transpose_8x8(inptr + 8, inptr + 16);
- for (i = 0; i < 4; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
- sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+ idct16_8col(in, temp);
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(temp + i, in);
+ idct16_8col(in, out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
}
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
+ dest += 8;
}
} else {
- // Run the un-optimised row transform
- for (i = 0; i < 4; ++i) {
- vpx_highbd_idct16_c(input, outptr, bd);
- input += 16;
- outptr += 16;
+ __m128i all[2][16], *in;
+
+ for (i = 0; i < 2; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(input, 16, in);
+ highbd_idct16x16_38_4col(in);
+ input += 4 * 16;
+ }
+
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ highbd_idct16x16_38_4col(out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
}
}
+}
+
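+// As elsewhere in vpx_dsp, the _256/_38/_10 suffixes name the largest eob a
+// variant handles: _38 assumes the nonzero coefficients fit in the
+// upper-left 8x8 of the 16x16 block, _10 in the upper-left 4x4, which is
+// why only four packed rows are loaded below.
+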
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i;
+ __m128i out[16];
+
+ if (bd == 8) {
+ __m128i in[16], l[16];
+
+ in[0] = load_pack_8_32bit(input + 0 * 16);
+ in[1] = load_pack_8_32bit(input + 1 * 16);
+ in[2] = load_pack_8_32bit(input + 2 * 16);
+ in[3] = load_pack_8_32bit(input + 3 * 16);
- if (optimised_cols) {
- idct16_sse2(inptr, inptr + 16);
-
- // Final round & shift and Reconstruction and Store
- {
- __m128i d[2];
- for (i = 0; i < 16; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], rounding);
- inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
- d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
- d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
- inptr[i] = _mm_srai_epi16(inptr[i], 6);
- inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
- d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
- d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
- _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
+ idct16x16_10_pass1(in, l);
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ idct16x16_10_pass2(l + i, in);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, in[j], bd);
}
+ dest += 8;
}
} else {
- // Run the un-optimised column transform
- tran_low_t temp_in[16], temp_out[16];
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- vpx_highbd_idct16_c(temp_in, temp_out, bd);
+ __m128i all[2][16], *in;
+
+ for (i = 0; i < 2; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_4x4(input, 16, in);
+ highbd_idct16x16_10_4col(in);
+ input += 4 * 16;
+ }
+
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(&all[0][i], out);
+ highbd_idct16x16_10_4col(out);
+
for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
}
+ dest += 4;
}
}
}
diff --git a/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c b/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
new file mode 100644
index 000000000..de097c66a
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
+ __m128i *const out) {
+ // stage 5
+ out[0] = _mm_add_epi32(in[0], in[3]);
+ out[1] = _mm_add_epi32(in[1], in[2]);
+ out[2] = _mm_sub_epi32(in[1], in[2]);
+ out[3] = _mm_sub_epi32(in[0], in[3]);
+ highbd_butterfly_cospi16_sse4_1(in[6], in[5], &out[6], &out[5]);
+ out[8] = _mm_add_epi32(in[8], in[11]);
+ out[9] = _mm_add_epi32(in[9], in[10]);
+ out[10] = _mm_sub_epi32(in[9], in[10]);
+ out[11] = _mm_sub_epi32(in[8], in[11]);
+ out[12] = _mm_sub_epi32(in[15], in[12]);
+ out[13] = _mm_sub_epi32(in[14], in[13]);
+ out[14] = _mm_add_epi32(in[14], in[13]);
+ out[15] = _mm_add_epi32(in[15], in[12]);
+}
+
+static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
+ __m128i *const out) {
+ out[0] = _mm_add_epi32(in[0], in[7]);
+ out[1] = _mm_add_epi32(in[1], in[6]);
+ out[2] = _mm_add_epi32(in[2], in[5]);
+ out[3] = _mm_add_epi32(in[3], in[4]);
+ out[4] = _mm_sub_epi32(in[3], in[4]);
+ out[5] = _mm_sub_epi32(in[2], in[5]);
+ out[6] = _mm_sub_epi32(in[1], in[6]);
+ out[7] = _mm_sub_epi32(in[0], in[7]);
+ out[8] = in[8];
+ out[9] = in[9];
+ highbd_butterfly_cospi16_sse4_1(in[13], in[10], &out[13], &out[10]);
+ highbd_butterfly_cospi16_sse4_1(in[12], in[11], &out[12], &out[11]);
+ out[14] = in[14];
+ out[15] = in[15];
+}
+
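+// SSE4.1 adds the signed 32x32->64 multiply _mm_mul_epi32 (PMULDQ), so
+// unlike the SSE2 version of this file there is no abs/sign bookkeeping,
+// and negative cosine constants (e.g. -cospi_8_64 below) can be passed
+// directly. A scalar sketch of the assumed multiply-round primitive:
+//   out = (int32_t)ROUND_POWER_OF_TWO((int64_t)in * c, DCT_CONST_BITS);
+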
+static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
+ __m128i step1[16], step2[16];
+
+ // stage 2
+ highbd_butterfly_sse4_1(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_butterfly_sse4_1(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse4_1(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_butterfly_sse4_1(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ highbd_butterfly_sse4_1(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_butterfly_sse4_1(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[10] = _mm_sub_epi32(step2[11], step2[10]);
+ step1[11] = _mm_add_epi32(step2[11], step2[10]);
+ step1[12] = _mm_add_epi32(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi32(step2[12], step2[13]);
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+
+ // stage 4
+ highbd_butterfly_cospi16_sse4_1(io[0], io[8], &step2[0], &step2[1]);
+ highbd_butterfly_sse4_1(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
+ &step2[9], &step2[14]);
+ highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
+ &step2[13], &step2[10]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step1[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step1[7] = _mm_add_epi32(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ highbd_idct16_4col_stage5(step2, step1);
+ highbd_idct16_4col_stage6(step1, step2);
+ highbd_idct16_4col_stage7(step2, io);
+}
+
+static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
+ __m128i step1[16], step2[16];
+ __m128i temp1[2];
+
+ // stage 2
+ highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_sse4_1(io[7], -cospi_18_64, cospi_14_64, &step2[9],
+ &step2[14]);
+ highbd_partial_butterfly_sse4_1(io[5], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_partial_butterfly_sse4_1(io[6], -cospi_20_64, cospi_12_64, &step1[5],
+ &step1[6]);
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[10] = _mm_sub_epi32(step2[11], step2[10]);
+ step1[11] = _mm_add_epi32(step2[11], step2[10]);
+ step1[12] = _mm_add_epi32(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi32(step2[12], step2[13]);
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+
+ // stage 4
+ extend_64bit(io[0], temp1);
+ step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64);
+ step2[1] = step2[0];
+ highbd_partial_butterfly_sse4_1(io[4], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
+ &step2[9], &step2[14]);
+ highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
+ &step2[13], &step2[10]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step1[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step1[7] = _mm_add_epi32(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ highbd_idct16_4col_stage5(step2, step1);
+ highbd_idct16_4col_stage6(step1, step2);
+ highbd_idct16_4col_stage7(step2, io);
+}
+
+static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
+ __m128i step1[16], step2[16];
+ __m128i temp[2];
+
+ // stage 2
+ highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+
+ // stage 4
+ extend_64bit(io[0], temp);
+ step2[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
+ step2[1] = step2[0];
+ step2[2] = _mm_setzero_si128();
+ step2[3] = _mm_setzero_si128();
+ highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
+ &step2[9], &step2[14]);
+ highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
+ &step2[13], &step2[10]);
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ highbd_idct16_4col_stage5(step2, step1);
+ highbd_idct16_4col_stage6(step1, step2);
+ highbd_idct16_4col_stage7(step2, io);
+}
+
+void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input,
+ uint16_t *dest, int stride, int bd) {
+ int i;
+ __m128i out[16], *in;
+
+ if (bd == 8) {
+ __m128i l[16], r[16];
+
+ in = l;
+ for (i = 0; i < 2; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
+ idct16_8col(in, in);
+ in = r;
+ input += 128;
+ }
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(l + i, out);
+ transpose_16bit_8x8(r + i, out + 8);
+ idct16_8col(out, out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[4][16];
+
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
+ highbd_idct16_4col(in);
+ input += 4 * 16;
+ }
+
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ highbd_idct16_4col(out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+void vpx_highbd_idct16x16_38_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i;
+ __m128i out[16];
+
+ if (bd == 8) {
+ __m128i in[16], temp[16];
+
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
+ for (i = 8; i < 16; i++) {
+ in[i] = _mm_setzero_si128();
+ }
+ idct16_8col(in, temp);
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(temp + i, in);
+ idct16_8col(in, out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[2][16], *in;
+
+ for (i = 0; i < 2; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(input, 16, in);
+ highbd_idct16x16_38_4col(in);
+ input += 4 * 16;
+ }
+
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ highbd_idct16x16_38_4col(out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+void vpx_highbd_idct16x16_10_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i;
+ __m128i out[16];
+
+ if (bd == 8) {
+ __m128i in[16], l[16];
+
+ in[0] = load_pack_8_32bit(input + 0 * 16);
+ in[1] = load_pack_8_32bit(input + 1 * 16);
+ in[2] = load_pack_8_32bit(input + 2 * 16);
+ in[3] = load_pack_8_32bit(input + 3 * 16);
+
+ idct16x16_10_pass1(in, l);
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ idct16x16_10_pass2(l + i, in);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, in[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[2][16], *in;
+
+ for (i = 0; i < 2; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_4x4(input, 16, in);
+ highbd_idct16x16_10_4col(in);
+ input += 4 * 16;
+ }
+
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(&all[0][i], out);
+ highbd_idct16x16_10_4col(out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
diff --git a/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c b/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c
index 06f265918..c710e8995 100644
--- a/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c
+++ b/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c
@@ -14,6 +14,768 @@
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6(
+ __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) {
+ __m128i step2[32];
+
+ // stage 4
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
+ &step2[13], &step2[10]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[8] = _mm_add_epi32(step2[8], step2[11]);
+ step1[9] = _mm_add_epi32(step2[9], step2[10]);
+ step1[10] = _mm_sub_epi32(step2[9], step2[10]);
+ step1[11] = _mm_sub_epi32(step2[8], step2[11]);
+ step1[12] = _mm_sub_epi32(step2[15], step2[12]);
+ step1[13] = _mm_sub_epi32(step2[14], step2[13]);
+ step1[14] = _mm_add_epi32(step2[14], step2[13]);
+ step1[15] = _mm_add_epi32(step2[15], step2[12]);
+
+ // stage 6
+ out[8] = step1[8];
+ out[9] = step1[9];
+ highbd_butterfly_sse2(step1[13], step1[10], cospi_16_64, cospi_16_64,
+ &out[10], &out[13]);
+ highbd_butterfly_sse2(step1[12], step1[11], cospi_16_64, cospi_16_64,
+ &out[11], &out[12]);
+ out[14] = step1[14];
+ out[15] = step1[15];
+}
+
+static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7(
+ __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step2[32];
+
+ // stage 4
+ step2[16] = _mm_add_epi32(step1[16], step1[19]);
+ step2[17] = _mm_add_epi32(step1[17], step1[18]);
+ step2[18] = _mm_sub_epi32(step1[17], step1[18]);
+ step2[19] = _mm_sub_epi32(step1[16], step1[19]);
+ step2[20] = _mm_sub_epi32(step1[20], step1[23]); // step2[20] = -step2[20]
+ step2[21] = _mm_sub_epi32(step1[21], step1[22]); // step2[21] = -step2[21]
+ step2[22] = _mm_add_epi32(step1[21], step1[22]);
+ step2[23] = _mm_add_epi32(step1[20], step1[23]);
+
+ step2[24] = _mm_add_epi32(step1[27], step1[24]);
+ step2[25] = _mm_add_epi32(step1[26], step1[25]);
+ step2[26] = _mm_sub_epi32(step1[26], step1[25]); // step2[26] = -step2[26]
+ step2[27] = _mm_sub_epi32(step1[27], step1[24]); // step2[27] = -step2[27]
+ step2[28] = _mm_sub_epi32(step1[31], step1[28]);
+ step2[29] = _mm_sub_epi32(step1[30], step1[29]);
+ step2[30] = _mm_add_epi32(step1[29], step1[30]);
+ step2[31] = _mm_add_epi32(step1[28], step1[31]);
+
+ // stage 5
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ highbd_butterfly_sse2(step2[29], step2[18], cospi_24_64, cospi_8_64,
+ &step1[18], &step1[29]);
+ highbd_butterfly_sse2(step2[28], step2[19], cospi_24_64, cospi_8_64,
+ &step1[19], &step1[28]);
+ highbd_butterfly_sse2(step2[20], step2[27], cospi_8_64, cospi_24_64,
+ &step1[27], &step1[20]);
+ highbd_butterfly_sse2(step2[21], step2[26], cospi_8_64, cospi_24_64,
+ &step1[26], &step1[21]);
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ step2[16] = _mm_add_epi32(step1[16], step1[23]);
+ step2[17] = _mm_add_epi32(step1[17], step1[22]);
+ step2[18] = _mm_add_epi32(step1[18], step1[21]);
+ step2[19] = _mm_add_epi32(step1[19], step1[20]);
+ step2[20] = _mm_sub_epi32(step1[19], step1[20]);
+ step2[21] = _mm_sub_epi32(step1[18], step1[21]);
+ step2[22] = _mm_sub_epi32(step1[17], step1[22]);
+ step2[23] = _mm_sub_epi32(step1[16], step1[23]);
+
+ step2[24] = _mm_sub_epi32(step1[31], step1[24]);
+ step2[25] = _mm_sub_epi32(step1[30], step1[25]);
+ step2[26] = _mm_sub_epi32(step1[29], step1[26]);
+ step2[27] = _mm_sub_epi32(step1[28], step1[27]);
+ step2[28] = _mm_add_epi32(step1[27], step1[28]);
+ step2[29] = _mm_add_epi32(step1[26], step1[29]);
+ step2[30] = _mm_add_epi32(step1[25], step1[30]);
+ step2[31] = _mm_add_epi32(step1[24], step1[31]);
+
+ // stage 7
+ out[16] = step2[16];
+ out[17] = step2[17];
+ out[18] = step2[18];
+ out[19] = step2[19];
+ highbd_butterfly_sse2(step2[27], step2[20], cospi_16_64, cospi_16_64,
+ &out[20], &out[27]);
+ highbd_butterfly_sse2(step2[26], step2[21], cospi_16_64, cospi_16_64,
+ &out[21], &out[26]);
+ highbd_butterfly_sse2(step2[25], step2[22], cospi_16_64, cospi_16_64,
+ &out[22], &out[25]);
+ highbd_butterfly_sse2(step2[24], step2[23], cospi_16_64, cospi_16_64,
+ &out[23], &out[24]);
+ out[28] = step2[28];
+ out[29] = step2[29];
+ out[30] = step2[30];
+ out[31] = step2[31];
+}
+
+// Group the coefficient calculation into smaller functions to prevent stack
+// spillover in 32x32 idct optimizations:
+// quarter_1: 0-7
+// quarter_2: 8-15
+// quarter_3_4: 16-23, 24-31
+
+// For each 4x32 block __m128i in[32],
+// the inputs are at indices 0, 4, 8, 12, 16, 20, 24, 28;
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void highbd_idct32_1024_4x32_quarter_1(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ highbd_butterfly_sse2(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_butterfly_sse2(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+
+ // stage 4
+ highbd_butterfly_sse2(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1],
+ &step2[0]);
+ highbd_butterfly_sse2(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 5
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+ &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi32(step1[0], step1[7]);
+ out[1] = _mm_add_epi32(step1[1], step1[6]);
+ out[2] = _mm_add_epi32(step1[2], step1[5]);
+ out[3] = _mm_add_epi32(step1[3], step1[4]);
+ out[4] = _mm_sub_epi32(step1[3], step1[4]);
+ out[5] = _mm_sub_epi32(step1[2], step1[5]);
+ out[6] = _mm_sub_epi32(step1[1], step1[6]);
+ out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32],
+// the inputs are at indices 2, 6, 10, 14, 18, 22, 26, 30;
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void highbd_idct32_1024_4x32_quarter_2(
+ const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 2
+ highbd_butterfly_sse2(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_butterfly_sse2(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse2(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_butterfly_sse2(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+ step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
+ step1[11] = _mm_add_epi32(step2[10], step2[11]);
+ step1[12] = _mm_add_epi32(step2[13], step2[12]);
+ step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
+
+ highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_1024_4x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ highbd_idct32_1024_4x32_quarter_1(in, temp);
+ highbd_idct32_1024_4x32_quarter_2(in, temp);
+ // stage 7
+ highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32],
+// the inputs are at the odd indices
+// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31;
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void highbd_idct32_1024_4x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ highbd_butterfly_sse2(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ highbd_butterfly_sse2(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17],
+ &step1[30]);
+ highbd_butterfly_sse2(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18],
+ &step1[29]);
+ highbd_butterfly_sse2(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19],
+ &step1[28]);
+
+ highbd_butterfly_sse2(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ highbd_butterfly_sse2(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21],
+ &step1[26]);
+
+ highbd_butterfly_sse2(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22],
+ &step1[25]);
+ highbd_butterfly_sse2(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23],
+ &step1[24]);
+
+ // stage 2
+ step2[16] = _mm_add_epi32(step1[16], step1[17]);
+ step2[17] = _mm_sub_epi32(step1[16], step1[17]);
+ step2[18] = _mm_sub_epi32(step1[18], step1[19]); // step2[18] = -step2[18]
+ step2[19] = _mm_add_epi32(step1[18], step1[19]);
+ step2[20] = _mm_add_epi32(step1[20], step1[21]);
+ step2[21] = _mm_sub_epi32(step1[20], step1[21]);
+ step2[22] = _mm_sub_epi32(step1[22], step1[23]); // step2[22] = -step2[22]
+ step2[23] = _mm_add_epi32(step1[22], step1[23]);
+
+ step2[24] = _mm_add_epi32(step1[25], step1[24]);
+ step2[25] = _mm_sub_epi32(step1[25], step1[24]); // step2[25] = -step2[25]
+ step2[26] = _mm_sub_epi32(step1[27], step1[26]);
+ step2[27] = _mm_add_epi32(step1[27], step1[26]);
+ step2[28] = _mm_add_epi32(step1[29], step1[28]);
+ step2[29] = _mm_sub_epi32(step1[29], step1[28]); // step2[29] = -step2[29]
+ step2[30] = _mm_sub_epi32(step1[31], step1[30]);
+ step2[31] = _mm_add_epi32(step1[31], step1[30]);
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64,
+ &step1[17], &step1[30]);
+ highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64,
+ &step1[29], &step1[18]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64,
+ &step1[21], &step1[26]);
+ highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64,
+ &step1[25], &step1[22]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) {
+ __m128i temp[32];
+
+ highbd_idct32_1024_4x32_quarter_1_2(io, temp);
+ highbd_idct32_1024_4x32_quarter_3_4(io, temp);
+ // final stage
+ highbd_add_sub_butterfly(temp, io, 32);
+}
+
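+// highbd_add_sub_butterfly(temp, io, size) performs the final idct stage by
+// folding the array about its midpoint; a scalar sketch of the assumed
+// behavior, shown for size == 32:
+//   for (i = 0; i < 16; i++) {
+//     io[i] = temp[i] + temp[31 - i];
+//     io[31 - i] = temp[i] - temp[31 - i];
+//   }
+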
+void vpx_highbd_idct32x32_1024_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+
+ if (bd == 8) {
+ __m128i col[4][32], io[32];
+
+ // rows
+ for (i = 0; i < 4; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]);
+ highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]);
+ highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]);
+ idct32_1024_8x32(io, col[i]);
+ input += 32 << 3;
+ }
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ // Transpose 32x8 block to 8x32 block
+ transpose_16bit_8x8(col[0] + i, io);
+ transpose_16bit_8x8(col[1] + i, io + 8);
+ transpose_16bit_8x8(col[2] + i, io + 16);
+ transpose_16bit_8x8(col[3] + i, io + 24);
+ idct32_1024_8x32(io, io);
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_8(dest + j * stride, io[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[8][32], out[32], *in;
+
+ for (i = 0; i < 8; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+ highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]);
+ highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]);
+ highbd_idct32_1024_4x32(in);
+ input += 4 * 32;
+ }
+
+ for (i = 0; i < 32; i += 4) {
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ transpose_32bit_4x4(all[4] + i, out + 16);
+ transpose_32bit_4x4(all[5] + i, out + 20);
+ transpose_32bit_4x4(all[6] + i, out + 24);
+ transpose_32bit_4x4(all[7] + i, out + 28);
+ highbd_idct32_1024_4x32(out);
+
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
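+// Layout note for the high-bitdepth path above: all[8][32] holds the 32x32
+// coefficient grid as eight 4-row strips after the first (row) pass;
+// transpose_32bit_4x4() then reassembles 4-column strips from 4x4 tiles for
+// the second (column) pass. This is the same separable rows-then-columns
+// scheme as the bd == 8 path, just with 32-bit lanes, so only four pixels
+// fit per __m128i.
+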
+// -----------------------------------------------------------------------------
+
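+// As with the 16x16 variants, the _1024/_135/_34 suffixes give the largest
+// eob handled: _135 assumes the nonzero coefficients fit in the upper-left
+// 16x16 of the 32x32 block, _34 in the upper-left 8x8.
+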
+// For each 4x32 block __m128i in[32],
+// the inputs are at indices 0, 4, 8, 12;
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void highbd_idct32_135_4x32_quarter_1(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ highbd_partial_butterfly_sse2(in[4], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_partial_butterfly_neg_sse2(in[12], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+
+ // stage 4
+ highbd_partial_butterfly_sse2(in[0], cospi_16_64, cospi_16_64, &step2[1],
+ &step2[0]);
+ highbd_partial_butterfly_sse2(in[8], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 5
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+ &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi32(step1[0], step1[7]);
+ out[1] = _mm_add_epi32(step1[1], step1[6]);
+ out[2] = _mm_add_epi32(step1[2], step1[5]);
+ out[3] = _mm_add_epi32(step1[3], step1[4]);
+ out[4] = _mm_sub_epi32(step1[3], step1[4]);
+ out[5] = _mm_sub_epi32(step1[2], step1[5]);
+ out[6] = _mm_sub_epi32(step1[1], step1[6]);
+ out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32],
+// the inputs are at indices 2, 6, 10, 14;
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void highbd_idct32_135_4x32_quarter_2(
+ const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 2
+ highbd_partial_butterfly_sse2(in[2], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_neg_sse2(in[14], cospi_14_64, cospi_18_64, &step2[9],
+ &step2[14]);
+ highbd_partial_butterfly_sse2(in[10], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_partial_butterfly_neg_sse2(in[6], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+ step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
+ step1[11] = _mm_add_epi32(step2[10], step2[11]);
+ step1[12] = _mm_add_epi32(step2[13], step2[12]);
+ step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
+
+ highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_135_4x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ highbd_idct32_135_4x32_quarter_1(in, temp);
+ highbd_idct32_135_4x32_quarter_2(in, temp);
+ // stage 7
+ highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32],
+// the inputs are at the odd indices
+// 1, 3, 5, 7, 9, 11, 13, 15;
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void highbd_idct32_135_4x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ highbd_partial_butterfly_sse2(in[1], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ highbd_partial_butterfly_neg_sse2(in[15], cospi_15_64, cospi_17_64,
+ &step1[17], &step1[30]);
+ highbd_partial_butterfly_sse2(in[9], cospi_23_64, cospi_9_64, &step1[18],
+ &step1[29]);
+ highbd_partial_butterfly_neg_sse2(in[7], cospi_7_64, cospi_25_64, &step1[19],
+ &step1[28]);
+
+ highbd_partial_butterfly_sse2(in[5], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ highbd_partial_butterfly_neg_sse2(in[11], cospi_11_64, cospi_21_64,
+ &step1[21], &step1[26]);
+
+ highbd_partial_butterfly_sse2(in[13], cospi_19_64, cospi_13_64, &step1[22],
+ &step1[25]);
+ highbd_partial_butterfly_neg_sse2(in[3], cospi_3_64, cospi_29_64, &step1[23],
+ &step1[24]);
+
+ // stage 2
+ step2[16] = _mm_add_epi32(step1[16], step1[17]);
+ step2[17] = _mm_sub_epi32(step1[16], step1[17]);
+ step2[18] = _mm_sub_epi32(step1[18], step1[19]); // step2[18] = -step2[18]
+ step2[19] = _mm_add_epi32(step1[18], step1[19]);
+ step2[20] = _mm_add_epi32(step1[20], step1[21]);
+ step2[21] = _mm_sub_epi32(step1[20], step1[21]);
+ step2[22] = _mm_sub_epi32(step1[22], step1[23]); // step2[22] = -step2[22]
+ step2[23] = _mm_add_epi32(step1[22], step1[23]);
+
+ step2[24] = _mm_add_epi32(step1[25], step1[24]);
+ step2[25] = _mm_sub_epi32(step1[25], step1[24]); // step2[25] = -step2[25]
+ step2[26] = _mm_sub_epi32(step1[27], step1[26]);
+ step2[27] = _mm_add_epi32(step1[27], step1[26]);
+ step2[28] = _mm_add_epi32(step1[29], step1[28]);
+ step2[29] = _mm_sub_epi32(step1[29], step1[28]); // step2[29] = -step2[29]
+ step2[30] = _mm_sub_epi32(step1[31], step1[30]);
+ step2[31] = _mm_add_epi32(step1[31], step1[30]);
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64,
+ &step1[17], &step1[30]);
+ highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64,
+ &step1[29], &step1[18]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64,
+ &step1[21], &step1[26]);
+ highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64,
+ &step1[25], &step1[22]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) {
+ __m128i temp[32];
+
+ highbd_idct32_135_4x32_quarter_1_2(io, temp);
+ highbd_idct32_135_4x32_quarter_3_4(io, temp);
+ // final stage
+ highbd_add_sub_butterfly(temp, io, 32);
+}
+
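+// In the bd == 8 path below, only in[0..15] are reloaded per row-pass
+// iteration; in[16..31] correspond to coefficient columns 16-31, which are
+// known to be zero when eob <= 135, so they are cleared once before the
+// loop rather than reloaded.
+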
+void vpx_highbd_idct32x32_135_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+
+ if (bd == 8) {
+ __m128i col[2][32], in[32], out[32];
+
+ for (i = 16; i < 32; i++) {
+ in[i] = _mm_setzero_si128();
+ }
+
+ // rows
+ for (i = 0; i < 2; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]);
+ idct32_1024_8x32(in, col[i]);
+ input += 32 << 3;
+ }
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ transpose_16bit_8x8(col[0] + i, in);
+ transpose_16bit_8x8(col[1] + i, in + 8);
+ idct32_1024_8x32(in, out);
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[8][32], out[32], *in;
+
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+ highbd_idct32_135_4x32(in);
+ input += 4 * 32;
+ }
+
+ for (i = 0; i < 32; i += 4) {
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ highbd_idct32_135_4x32(out);
+
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+// For each 4x32 block __m128i in[32],
+// the inputs are at indices 0, 4;
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void highbd_idct32_34_4x32_quarter_1(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ highbd_partial_butterfly_sse2(in[4], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+
+ // stage 4
+ highbd_partial_butterfly_sse2(in[0], cospi_16_64, cospi_16_64, &step2[1],
+ &step2[0]);
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
+
+ // stage 5
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[1];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+ &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi32(step1[0], step1[7]);
+ out[1] = _mm_add_epi32(step1[1], step1[6]);
+ out[2] = _mm_add_epi32(step1[2], step1[5]);
+ out[3] = _mm_add_epi32(step1[3], step1[4]);
+ out[4] = _mm_sub_epi32(step1[3], step1[4]);
+ out[5] = _mm_sub_epi32(step1[2], step1[5]);
+ out[6] = _mm_sub_epi32(step1[1], step1[6]);
+ out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32],
+// the inputs are at indices 2, 6;
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/,
+ __m128i *out /*out[16]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 2
+ highbd_partial_butterfly_sse2(in[2], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_neg_sse2(in[6], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+
+ step1[10] =
+ _mm_sub_epi32(_mm_setzero_si128(), step1[10]); // step1[10] = -step1[10]
+ step1[13] =
+ _mm_sub_epi32(_mm_setzero_si128(), step1[13]); // step1[13] = -step1[13]
+ highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_34_4x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ highbd_idct32_34_4x32_quarter_1(in, temp);
+ highbd_idct32_34_4x32_quarter_2(in, temp);
+ // stage 7
+ highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32],
+// the inputs are at the odd indices
+// 1, 3, 5, 7;
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void highbd_idct32_34_4x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ highbd_partial_butterfly_sse2(in[1], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ highbd_partial_butterfly_neg_sse2(in[7], cospi_7_64, cospi_25_64, &step1[19],
+ &step1[28]);
+
+ highbd_partial_butterfly_sse2(in[5], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ highbd_partial_butterfly_neg_sse2(in[3], cospi_3_64, cospi_29_64, &step1[23],
+ &step1[24]);
+
+ // stage 2
+ step2[16] = step1[16];
+ step2[17] = step1[16];
+ step2[18] = step1[19];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[21] = step1[20];
+ step2[22] = step1[23];
+ step2[23] = step1[23];
+
+ step2[24] = step1[24];
+ step2[25] = step1[24];
+ step2[26] = step1[27];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[29] = step1[28];
+ step2[30] = step1[31];
+ step2[31] = step1[31];
+
+ // stage 3
+ step2[18] =
+ _mm_sub_epi32(_mm_setzero_si128(), step2[18]); // step2[18] = -step2[18]
+ step2[22] =
+ _mm_sub_epi32(_mm_setzero_si128(), step2[22]); // step2[22] = -step2[22]
+ step2[25] =
+ _mm_sub_epi32(_mm_setzero_si128(), step2[25]); // step2[25] = -step2[25]
+ step2[29] =
+ _mm_sub_epi32(_mm_setzero_si128(), step2[29]); // step2[29] = -step2[29]
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64,
+ &step1[17], &step1[30]);
+ highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64,
+ &step1[29], &step1[18]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64,
+ &step1[21], &step1[26]);
+ highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64,
+ &step1[25], &step1[22]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) {
+ __m128i temp[32];
+
+ highbd_idct32_34_4x32_quarter_1_2(io, temp);
+ highbd_idct32_34_4x32_quarter_3_4(io, temp);
+ // final stage
+ highbd_add_sub_butterfly(temp, io, 32);
+}
+
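+// With eob <= 34 the nonzero coefficients sit in the upper-left 8x8, so the
+// bd == 8 path below needs just one packed 8x8 load and a single
+// idct32_34_8x32_sse2() row pass, instead of the four row passes of the
+// _1024 version.
+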
+void vpx_highbd_idct32x32_34_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+
+ if (bd == 8) {
+ __m128i col[32], in[32], out[32];
+
+ // rows
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]);
+ idct32_34_8x32_sse2(in, col);
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ transpose_16bit_8x8(col + i, in);
+ idct32_34_8x32_sse2(in, out);
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[8][32], out[32], *in;
+
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+ highbd_idct32_34_4x32(in);
+ input += 4 * 32;
+ }
+
+ for (i = 0; i < 32; i += 4) {
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ highbd_idct32_34_4x32(out);
+
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
highbd_idct_1_add_kernel(input, dest, stride, bd, 32);
diff --git a/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c b/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c
new file mode 100644
index 000000000..2d0a53ac0
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c
@@ -0,0 +1,765 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_ssse3.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6(
+ __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) {
+ __m128i step2[32];
+
+ // stage 4
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
+ &step2[9], &step2[14]);
+ highbd_butterfly_sse4_1(step1[13], step1[10], -cospi_8_64, cospi_24_64,
+ &step2[10], &step2[13]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[8] = _mm_add_epi32(step2[8], step2[11]);
+ step1[9] = _mm_add_epi32(step2[9], step2[10]);
+ step1[10] = _mm_sub_epi32(step2[9], step2[10]);
+ step1[11] = _mm_sub_epi32(step2[8], step2[11]);
+ step1[12] = _mm_sub_epi32(step2[15], step2[12]);
+ step1[13] = _mm_sub_epi32(step2[14], step2[13]);
+ step1[14] = _mm_add_epi32(step2[14], step2[13]);
+ step1[15] = _mm_add_epi32(step2[15], step2[12]);
+
+ // stage 6
+ out[8] = step1[8];
+ out[9] = step1[9];
+ highbd_butterfly_sse4_1(step1[13], step1[10], cospi_16_64, cospi_16_64,
+ &out[10], &out[13]);
+ highbd_butterfly_sse4_1(step1[12], step1[11], cospi_16_64, cospi_16_64,
+ &out[11], &out[12]);
+ out[14] = step1[14];
+ out[15] = step1[15];
+}
+
+static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7(
+ __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step2[32];
+
+ // stage 4
+ step2[16] = _mm_add_epi32(step1[16], step1[19]);
+ step2[17] = _mm_add_epi32(step1[17], step1[18]);
+ step2[18] = _mm_sub_epi32(step1[17], step1[18]);
+ step2[19] = _mm_sub_epi32(step1[16], step1[19]);
+ step2[20] = _mm_sub_epi32(step1[23], step1[20]);
+ step2[21] = _mm_sub_epi32(step1[22], step1[21]);
+ step2[22] = _mm_add_epi32(step1[22], step1[21]);
+ step2[23] = _mm_add_epi32(step1[23], step1[20]);
+
+ step2[24] = _mm_add_epi32(step1[24], step1[27]);
+ step2[25] = _mm_add_epi32(step1[25], step1[26]);
+ step2[26] = _mm_sub_epi32(step1[25], step1[26]);
+ step2[27] = _mm_sub_epi32(step1[24], step1[27]);
+ step2[28] = _mm_sub_epi32(step1[31], step1[28]);
+ step2[29] = _mm_sub_epi32(step1[30], step1[29]);
+ step2[30] = _mm_add_epi32(step1[29], step1[30]);
+ step2[31] = _mm_add_epi32(step1[28], step1[31]);
+
+ // stage 5
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ highbd_butterfly_sse4_1(step2[29], step2[18], cospi_24_64, cospi_8_64,
+ &step1[18], &step1[29]);
+ highbd_butterfly_sse4_1(step2[28], step2[19], cospi_24_64, cospi_8_64,
+ &step1[19], &step1[28]);
+ highbd_butterfly_sse4_1(step2[27], step2[20], -cospi_8_64, cospi_24_64,
+ &step1[20], &step1[27]);
+ highbd_butterfly_sse4_1(step2[26], step2[21], -cospi_8_64, cospi_24_64,
+ &step1[21], &step1[26]);
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ step2[16] = _mm_add_epi32(step1[16], step1[23]);
+ step2[17] = _mm_add_epi32(step1[17], step1[22]);
+ step2[18] = _mm_add_epi32(step1[18], step1[21]);
+ step2[19] = _mm_add_epi32(step1[19], step1[20]);
+ step2[20] = _mm_sub_epi32(step1[19], step1[20]);
+ step2[21] = _mm_sub_epi32(step1[18], step1[21]);
+ step2[22] = _mm_sub_epi32(step1[17], step1[22]);
+ step2[23] = _mm_sub_epi32(step1[16], step1[23]);
+
+ step2[24] = _mm_sub_epi32(step1[31], step1[24]);
+ step2[25] = _mm_sub_epi32(step1[30], step1[25]);
+ step2[26] = _mm_sub_epi32(step1[29], step1[26]);
+ step2[27] = _mm_sub_epi32(step1[28], step1[27]);
+ step2[28] = _mm_add_epi32(step1[27], step1[28]);
+ step2[29] = _mm_add_epi32(step1[26], step1[29]);
+ step2[30] = _mm_add_epi32(step1[25], step1[30]);
+ step2[31] = _mm_add_epi32(step1[24], step1[31]);
+
+ // stage 7
+ out[16] = step2[16];
+ out[17] = step2[17];
+ out[18] = step2[18];
+ out[19] = step2[19];
+ highbd_butterfly_sse4_1(step2[27], step2[20], cospi_16_64, cospi_16_64,
+ &out[20], &out[27]);
+ highbd_butterfly_sse4_1(step2[26], step2[21], cospi_16_64, cospi_16_64,
+ &out[21], &out[26]);
+ highbd_butterfly_sse4_1(step2[25], step2[22], cospi_16_64, cospi_16_64,
+ &out[22], &out[25]);
+ highbd_butterfly_sse4_1(step2[24], step2[23], cospi_16_64, cospi_16_64,
+ &out[23], &out[24]);
+ out[28] = step2[28];
+ out[29] = step2[29];
+ out[30] = step2[30];
+ out[31] = step2[31];
+}
+
+// Group the coefficient calculations into smaller functions to prevent stack
+// spillover in the 32x32 idct optimizations:
+// quarter_1: 0-7
+// quarter_2: 8-15
+// quarter_3_4: 16-23, 24-31
+
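+// A scalar model of the butterfly vectorized by highbd_butterfly_sse4_1(),
+// for reference only (guarded out of the build; the helper name is
+// hypothetical, and the rounding convention assumes DCT_CONST_BITS == 14 as
+// in txfm_common.h):
+#if 0
+static INLINE void scalar_highbd_butterfly(int32_t in0, int32_t in1, int c0,
+                                           int c1, int32_t *out0,
+                                           int32_t *out1) {
+  // out0 = in0 * c0 - in1 * c1 and out1 = in0 * c1 + in1 * c0, each rounded
+  // from 64 bits back to 32 bits with DCT_CONST_BITS fractional bits.
+  const int64_t t0 = (int64_t)in0 * c0 - (int64_t)in1 * c1;
+  const int64_t t1 = (int64_t)in0 * c1 + (int64_t)in1 * c0;
+  *out0 = (int32_t)((t0 + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
+  *out1 = (int32_t)((t1 + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
+}
+#endif
+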
+// For each 4x32 block __m128i in[32],
+// the inputs are at indices 0, 4, 8, 12, 16, 20, 24, 28;
+// the outputs are pixels 0-7 in __m128i out[32].
+static INLINE void highbd_idct32_1024_4x32_quarter_1(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ highbd_butterfly_sse4_1(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_butterfly_sse4_1(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+
+ // stage 4
+ highbd_butterfly_sse4_1(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1],
+ &step2[0]);
+ highbd_butterfly_sse4_1(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 5
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64,
+ &step1[5], &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi32(step1[0], step1[7]);
+ out[1] = _mm_add_epi32(step1[1], step1[6]);
+ out[2] = _mm_add_epi32(step1[2], step1[5]);
+ out[3] = _mm_add_epi32(step1[3], step1[4]);
+ out[4] = _mm_sub_epi32(step1[3], step1[4]);
+ out[5] = _mm_sub_epi32(step1[2], step1[5]);
+ out[6] = _mm_sub_epi32(step1[1], step1[6]);
+ out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32],
+// the inputs are at indices 2, 6, 10, 14, 18, 22, 26, 30;
+// the outputs are pixels 8-15 in __m128i out[32].
+static INLINE void highbd_idct32_1024_4x32_quarter_2(
+ const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 2
+ highbd_butterfly_sse4_1(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_butterfly_sse4_1(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse4_1(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_butterfly_sse4_1(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+ step1[10] = _mm_sub_epi32(step2[11], step2[10]);
+ step1[11] = _mm_add_epi32(step2[11], step2[10]);
+ step1[12] = _mm_add_epi32(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi32(step2[12], step2[13]);
+
+ highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_1024_4x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ highbd_idct32_1024_4x32_quarter_1(in, temp);
+ highbd_idct32_1024_4x32_quarter_2(in, temp);
+ // stage 7
+ highbd_add_sub_butterfly(temp, out, 16);
+}
+
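+// A scalar model of the mirror recombination done by
+// highbd_add_sub_butterfly(), for reference only (guarded out of the build;
+// the helper name is hypothetical): element i of the first half pairs with
+// element size - 1 - i of the second half.
+#if 0
+static INLINE void scalar_add_sub_butterfly(const int32_t *in, int32_t *out,
+                                            int size) {
+  int i;
+  for (i = 0; i < size / 2; ++i) {
+    out[i] = in[i] + in[size - 1 - i];             // first half: sums
+    out[size - 1 - i] = in[i] - in[size - 1 - i];  // second half: differences
+  }
+}
+#endif
+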
+// For each 4x32 block __m128i in[32],
+// the inputs are at the odd indices
+// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31;
+// the outputs are pixels 16-23 and 24-31 in __m128i out[32].
+static INLINE void highbd_idct32_1024_4x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ highbd_butterfly_sse4_1(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ highbd_butterfly_sse4_1(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17],
+ &step1[30]);
+ highbd_butterfly_sse4_1(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18],
+ &step1[29]);
+ highbd_butterfly_sse4_1(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19],
+ &step1[28]);
+
+ highbd_butterfly_sse4_1(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ highbd_butterfly_sse4_1(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21],
+ &step1[26]);
+
+ highbd_butterfly_sse4_1(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22],
+ &step1[25]);
+ highbd_butterfly_sse4_1(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23],
+ &step1[24]);
+
+ // stage 2
+ step2[16] = _mm_add_epi32(step1[16], step1[17]);
+ step2[17] = _mm_sub_epi32(step1[16], step1[17]);
+ step2[18] = _mm_sub_epi32(step1[19], step1[18]);
+ step2[19] = _mm_add_epi32(step1[19], step1[18]);
+ step2[20] = _mm_add_epi32(step1[20], step1[21]);
+ step2[21] = _mm_sub_epi32(step1[20], step1[21]);
+ step2[22] = _mm_sub_epi32(step1[23], step1[22]);
+ step2[23] = _mm_add_epi32(step1[23], step1[22]);
+
+ step2[24] = _mm_add_epi32(step1[24], step1[25]);
+ step2[25] = _mm_sub_epi32(step1[24], step1[25]);
+ step2[26] = _mm_sub_epi32(step1[27], step1[26]);
+ step2[27] = _mm_add_epi32(step1[27], step1[26]);
+ step2[28] = _mm_add_epi32(step1[28], step1[29]);
+ step2[29] = _mm_sub_epi32(step1[28], step1[29]);
+ step2[30] = _mm_sub_epi32(step1[31], step1[30]);
+ step2[31] = _mm_add_epi32(step1[31], step1[30]);
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64,
+ &step1[17], &step1[30]);
+ highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64,
+ &step1[18], &step1[29]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64,
+ &step1[21], &step1[26]);
+ highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64,
+ &step1[22], &step1[25]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) {
+ __m128i temp[32];
+
+ highbd_idct32_1024_4x32_quarter_1_2(io, temp);
+ highbd_idct32_1024_4x32_quarter_3_4(io, temp);
+ // final stage
+ highbd_add_sub_butterfly(temp, io, 32);
+}
+
+void vpx_highbd_idct32x32_1024_add_sse4_1(const tran_low_t *input,
+ uint16_t *dest, int stride, int bd) {
+ int i, j;
+
+ if (bd == 8) {
+ __m128i col[4][32], io[32];
+
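+    // When bd == 8 the intermediate values fit in 16 bits, so the
+    // coefficients are packed and the faster 16-bit 8x32 kernel is reused.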
+ // rows
+ for (i = 0; i < 4; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]);
+ highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]);
+ highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]);
+ idct32_1024_8x32(io, col[i]);
+ input += 32 << 3;
+ }
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ // Transpose 32x8 block to 8x32 block
+ transpose_16bit_8x8(col[0] + i, io);
+ transpose_16bit_8x8(col[1] + i, io + 8);
+ transpose_16bit_8x8(col[2] + i, io + 16);
+ transpose_16bit_8x8(col[3] + i, io + 24);
+ idct32_1024_8x32(io, io);
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_8(dest + j * stride, io[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[8][32], out[32], *in;
+
+ for (i = 0; i < 8; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+ highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]);
+ highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]);
+ highbd_idct32_1024_4x32(in);
+ input += 4 * 32;
+ }
+
+ for (i = 0; i < 32; i += 4) {
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ transpose_32bit_4x4(all[4] + i, out + 16);
+ transpose_32bit_4x4(all[5] + i, out + 20);
+ transpose_32bit_4x4(all[6] + i, out + 24);
+ transpose_32bit_4x4(all[7] + i, out + 28);
+ highbd_idct32_1024_4x32(out);
+
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+
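+// In the reduced-coefficient paths below, the second butterfly input is
+// known to be zero, so highbd_partial_butterfly_sse4_1() presumably reduces
+// to two rounded multiplications of a single input:
+//   out0 = round(in * c0), out1 = round(in * c1)
+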
+// For each 4x32 block __m128i in[32],
+// the inputs are at indices 0, 4, 8, 12;
+// the outputs are pixels 0-7 in __m128i out[32].
+static INLINE void highbd_idct32_135_4x32_quarter_1(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_partial_butterfly_sse4_1(in[12], -cospi_20_64, cospi_12_64, &step1[5],
+ &step1[6]);
+
+ // stage 4
+ highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1],
+ &step2[0]);
+ highbd_partial_butterfly_sse4_1(in[8], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 5
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64,
+ &step1[5], &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi32(step1[0], step1[7]);
+ out[1] = _mm_add_epi32(step1[1], step1[6]);
+ out[2] = _mm_add_epi32(step1[2], step1[5]);
+ out[3] = _mm_add_epi32(step1[3], step1[4]);
+ out[4] = _mm_sub_epi32(step1[3], step1[4]);
+ out[5] = _mm_sub_epi32(step1[2], step1[5]);
+ out[6] = _mm_sub_epi32(step1[1], step1[6]);
+ out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32],
+// the inputs are at indices 2, 6, 10, 14;
+// the outputs are pixels 8-15 in __m128i out[32].
+static INLINE void highbd_idct32_135_4x32_quarter_2(
+ const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 2
+ highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_sse4_1(in[14], -cospi_18_64, cospi_14_64, &step2[9],
+ &step2[14]);
+ highbd_partial_butterfly_sse4_1(in[10], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+ step1[10] = _mm_sub_epi32(step2[11], step2[10]);
+ step1[11] = _mm_add_epi32(step2[11], step2[10]);
+ step1[12] = _mm_add_epi32(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi32(step2[12], step2[13]);
+
+ highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_135_4x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ highbd_idct32_135_4x32_quarter_1(in, temp);
+ highbd_idct32_135_4x32_quarter_2(in, temp);
+ // stage 7
+ highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32],
+// the inputs are at the odd indices 1, 3, 5, 7, 9, 11, 13, 15;
+// the outputs are pixels 16-23 and 24-31 in __m128i out[32].
+static INLINE void highbd_idct32_135_4x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ highbd_partial_butterfly_sse4_1(in[15], -cospi_17_64, cospi_15_64, &step1[17],
+ &step1[30]);
+ highbd_partial_butterfly_sse4_1(in[9], cospi_23_64, cospi_9_64, &step1[18],
+ &step1[29]);
+ highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19],
+ &step1[28]);
+
+ highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ highbd_partial_butterfly_sse4_1(in[11], -cospi_21_64, cospi_11_64, &step1[21],
+ &step1[26]);
+
+ highbd_partial_butterfly_sse4_1(in[13], cospi_19_64, cospi_13_64, &step1[22],
+ &step1[25]);
+ highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23],
+ &step1[24]);
+
+ // stage 2
+ step2[16] = _mm_add_epi32(step1[16], step1[17]);
+ step2[17] = _mm_sub_epi32(step1[16], step1[17]);
+ step2[18] = _mm_sub_epi32(step1[19], step1[18]);
+ step2[19] = _mm_add_epi32(step1[19], step1[18]);
+ step2[20] = _mm_add_epi32(step1[20], step1[21]);
+ step2[21] = _mm_sub_epi32(step1[20], step1[21]);
+ step2[22] = _mm_sub_epi32(step1[23], step1[22]);
+ step2[23] = _mm_add_epi32(step1[23], step1[22]);
+
+ step2[24] = _mm_add_epi32(step1[24], step1[25]);
+ step2[25] = _mm_sub_epi32(step1[24], step1[25]);
+ step2[26] = _mm_sub_epi32(step1[27], step1[26]);
+ step2[27] = _mm_add_epi32(step1[27], step1[26]);
+ step2[28] = _mm_add_epi32(step1[28], step1[29]);
+ step2[29] = _mm_sub_epi32(step1[28], step1[29]);
+ step2[30] = _mm_sub_epi32(step1[31], step1[30]);
+ step2[31] = _mm_add_epi32(step1[31], step1[30]);
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64,
+ &step1[17], &step1[30]);
+ highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64,
+ &step1[18], &step1[29]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64,
+ &step1[21], &step1[26]);
+ highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64,
+ &step1[22], &step1[25]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) {
+ __m128i temp[32];
+
+ highbd_idct32_135_4x32_quarter_1_2(io, temp);
+ highbd_idct32_135_4x32_quarter_3_4(io, temp);
+ // final stage
+ highbd_add_sub_butterfly(temp, io, 32);
+}
+
+void vpx_highbd_idct32x32_135_add_sse4_1(const tran_low_t *input,
+ uint16_t *dest, int stride, int bd) {
+ int i, j;
+
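+  // In the 135-coefficient case only the upper-left 16x16 quadrant of the
+  // input can be nonzero, so only 16 rows are loaded and transformed.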
+ if (bd == 8) {
+ __m128i col[2][32], in[32], out[32];
+
+ // rows
+ for (i = 0; i < 2; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]);
+ idct32_135_8x32_ssse3(in, col[i]);
+ input += 32 << 3;
+ }
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ transpose_16bit_8x8(col[0] + i, in);
+ transpose_16bit_8x8(col[1] + i, in + 8);
+ idct32_135_8x32_ssse3(in, out);
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[8][32], out[32], *in;
+
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+ highbd_idct32_135_4x32(in);
+ input += 4 * 32;
+ }
+
+ for (i = 0; i < 32; i += 4) {
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ highbd_idct32_135_4x32(out);
+
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+// For each 4x32 block __m128i in[32],
+// the inputs are at indices 0, 4;
+// the outputs are pixels 0-7 in __m128i out[32].
+static INLINE void highbd_idct32_34_4x32_quarter_1(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+
+ // stage 4
+ highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1],
+ &step2[0]);
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
+
+ // stage 5
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[1];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64,
+ &step1[5], &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi32(step1[0], step1[7]);
+ out[1] = _mm_add_epi32(step1[1], step1[6]);
+ out[2] = _mm_add_epi32(step1[2], step1[5]);
+ out[3] = _mm_add_epi32(step1[3], step1[4]);
+ out[4] = _mm_sub_epi32(step1[3], step1[4]);
+ out[5] = _mm_sub_epi32(step1[2], step1[5]);
+ out[6] = _mm_sub_epi32(step1[1], step1[6]);
+ out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32],
+// the inputs are at indices 2, 6;
+// the outputs are pixels 8-15 in __m128i out[32].
+static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/,
+ __m128i *out /*out[16]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 2
+ highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+
+ highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_34_4x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ highbd_idct32_34_4x32_quarter_1(in, temp);
+ highbd_idct32_34_4x32_quarter_2(in, temp);
+ // stage 7
+ highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32],
+// the inputs are at the odd indices 1, 3, 5, 7;
+// the outputs are pixels 16-23 and 24-31 in __m128i out[32].
+static INLINE void highbd_idct32_34_4x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19],
+ &step1[28]);
+
+ highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23],
+ &step1[24]);
+
+ // stage 2
+ step2[16] = step1[16];
+ step2[17] = step1[16];
+ step2[18] = step1[19];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[21] = step1[20];
+ step2[22] = step1[23];
+ step2[23] = step1[23];
+
+ step2[24] = step1[24];
+ step2[25] = step1[24];
+ step2[26] = step1[27];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[29] = step1[28];
+ step2[30] = step1[31];
+ step2[31] = step1[31];
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64,
+ &step1[17], &step1[30]);
+ highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64,
+ &step1[18], &step1[29]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64,
+ &step1[21], &step1[26]);
+ highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64,
+ &step1[22], &step1[25]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) {
+ __m128i temp[32];
+
+ highbd_idct32_34_4x32_quarter_1_2(io, temp);
+ highbd_idct32_34_4x32_quarter_3_4(io, temp);
+ // final stage
+ highbd_add_sub_butterfly(temp, io, 32);
+}
+
+void vpx_highbd_idct32x32_34_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+
+ if (bd == 8) {
+ __m128i col[32], in[32], out[32];
+
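+    // In the 34-coefficient case only the upper-left 8x8 block of the input
+    // can be nonzero, so a single 8-row pass suffices.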
+ // rows
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]);
+ idct32_34_8x32_ssse3(in, col);
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ transpose_16bit_8x8(col + i, in);
+ idct32_34_8x32_ssse3(in, out);
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[8][32], out[32], *in;
+
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+ highbd_idct32_34_4x32(in);
+ input += 4 * 32;
+ }
+
+ for (i = 0; i < 32; i += 4) {
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ highbd_idct32_34_4x32(out);
+
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
diff --git a/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c b/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
index 89a2584e3..2e54d2473 100644
--- a/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
+++ b/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
@@ -8,144 +8,152 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <emmintrin.h> // SSE2
+
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
-#include "vpx_dsp/x86/txfm_common_sse2.h"
+
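+// Collects the low 32 bits of four 64-bit products (lanes 0/2 in in0,
+// lanes 1/3 in in1) into one vector, then applies the DCT rounding shift.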
+static INLINE __m128i dct_const_round_shift_4_sse2(const __m128i in0,
+ const __m128i in1) {
+ const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 1
+ const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 2, 3
+ const __m128i t2 = _mm_unpacklo_epi64(t0, t1); // 0, 1, 2, 3
+ return dct_const_round_shift_sse2(t2);
+}
+
+static INLINE void highbd_idct4_small_sse2(__m128i *const io) {
+ const __m128i cospi_p16_p16 = _mm_setr_epi32(cospi_16_64, 0, cospi_16_64, 0);
+ const __m128i cospi_p08_p08 = _mm_setr_epi32(cospi_8_64, 0, cospi_8_64, 0);
+ const __m128i cospi_p24_p24 = _mm_setr_epi32(cospi_24_64, 0, cospi_24_64, 0);
+ __m128i temp1[4], temp2[4], step[4];
+
+ transpose_32bit_4x4(io, io);
+
+  // Note: SSE2 has no 32-bit signed multiply instruction, so _mm_mul_epu32()
+  // is used instead; it only guarantees that the lower 32 bits of each
+  // (signed) result are meaningful, which is sufficient in this function.
+
+ // stage 1
+ temp1[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2]
+ temp2[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2]
+ temp1[1] = _mm_srli_si128(temp1[0], 4); // 1, 3
+ temp2[1] = _mm_srli_si128(temp2[0], 4); // 1, 3
+ temp1[0] = _mm_mul_epu32(temp1[0], cospi_p16_p16); // ([0] + [2])*cospi_16_64
+ temp1[1] = _mm_mul_epu32(temp1[1], cospi_p16_p16); // ([0] + [2])*cospi_16_64
+ temp2[0] = _mm_mul_epu32(temp2[0], cospi_p16_p16); // ([0] - [2])*cospi_16_64
+ temp2[1] = _mm_mul_epu32(temp2[1], cospi_p16_p16); // ([0] - [2])*cospi_16_64
+ step[0] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]);
+ step[1] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]);
+
+ temp1[3] = _mm_srli_si128(io[1], 4);
+ temp2[3] = _mm_srli_si128(io[3], 4);
+ temp1[0] = _mm_mul_epu32(io[1], cospi_p24_p24); // input[1] * cospi_24_64
+ temp1[1] = _mm_mul_epu32(temp1[3], cospi_p24_p24); // input[1] * cospi_24_64
+ temp2[0] = _mm_mul_epu32(io[1], cospi_p08_p08); // input[1] * cospi_8_64
+ temp2[1] = _mm_mul_epu32(temp1[3], cospi_p08_p08); // input[1] * cospi_8_64
+ temp1[2] = _mm_mul_epu32(io[3], cospi_p08_p08); // input[3] * cospi_8_64
+ temp1[3] = _mm_mul_epu32(temp2[3], cospi_p08_p08); // input[3] * cospi_8_64
+ temp2[2] = _mm_mul_epu32(io[3], cospi_p24_p24); // input[3] * cospi_24_64
+ temp2[3] = _mm_mul_epu32(temp2[3], cospi_p24_p24); // input[3] * cospi_24_64
+ temp1[0] = _mm_sub_epi64(temp1[0], temp1[2]); // [1]*cospi_24 - [3]*cospi_8
+ temp1[1] = _mm_sub_epi64(temp1[1], temp1[3]); // [1]*cospi_24 - [3]*cospi_8
+ temp2[0] = _mm_add_epi64(temp2[0], temp2[2]); // [1]*cospi_8 + [3]*cospi_24
+ temp2[1] = _mm_add_epi64(temp2[1], temp2[3]); // [1]*cospi_8 + [3]*cospi_24
+ step[2] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]);
+ step[3] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]);
+
+ // stage 2
+ io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3]
+ io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2]
+ io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2]
+ io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3]
+}
+
+static INLINE void highbd_idct4_large_sse2(__m128i *const io) {
+ __m128i step[4];
+
+ transpose_32bit_4x4(io, io);
+
+ // stage 1
+ highbd_butterfly_cospi16_sse2(io[0], io[2], &step[0], &step[1]);
+ highbd_butterfly_sse2(io[1], io[3], cospi_24_64, cospi_8_64, &step[2],
+ &step[3]);
+
+ // stage 2
+ io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3]
+ io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2]
+ io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2]
+ io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3]
+}
void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- tran_low_t out[4 * 4];
- tran_low_t *outptr = out;
- int i, j;
- __m128i inptr[4];
- __m128i sign_bits[2];
- __m128i temp_mm, min_input, max_input;
- int test;
- int optimised_cols = 0;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i max = _mm_set1_epi16(12043);
- const __m128i min = _mm_set1_epi16(-12043);
- // Load input into __m128i
- inptr[0] = _mm_loadu_si128((const __m128i *)input);
- inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
- inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
- inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
-
- // Pack to 16 bits
- inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
- inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
-
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp_mm = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp_mm);
-
- if (!test) {
- // Do the row transform
- idct4_sse2(inptr);
-
- // Check the min & max values
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp_mm = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp_mm);
-
- if (test) {
- transpose_16bit_4x4(inptr);
- sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
- sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
- inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
- inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
- inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
- inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
- _mm_storeu_si128((__m128i *)outptr, inptr[0]);
- _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
- _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
- _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 4; ++i) {
- vpx_highbd_idct4_c(input, outptr, bd);
- input += 4;
- outptr += 4;
- }
+ int16_t max = 0, min = 0;
+ __m128i io[4], io_short[2];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0));
+ io[1] = _mm_load_si128((const __m128i *)(input + 4));
+ io[2] = _mm_load_si128((const __m128i *)(input + 8));
+ io[3] = _mm_load_si128((const __m128i *)(input + 12));
+
+ io_short[0] = _mm_packs_epi32(io[0], io[1]);
+ io_short[1] = _mm_packs_epi32(io[2], io[3]);
+
+ if (bd != 8) {
+ __m128i max_input, min_input;
+
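+    // Horizontal max/min reduction: fold the vector in half repeatedly (by
+    // 8, 4, then 2 bytes) until lane 0 holds the overall 16-bit max and min,
+    // used below to choose among the transform paths.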
+ max_input = _mm_max_epi16(io_short[0], io_short[1]);
+ min_input = _mm_min_epi16(io_short[0], io_short[1]);
+ max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 8));
+ min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 8));
+ max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 4));
+ min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 4));
+ max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 2));
+ min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 2));
+ max = _mm_extract_epi16(max_input, 0);
+ min = _mm_extract_epi16(min_input, 0);
}
- if (optimised_cols) {
- idct4_sse2(inptr);
-
- // Final round and shift
- inptr[0] = _mm_add_epi16(inptr[0], eight);
- inptr[1] = _mm_add_epi16(inptr[1], eight);
-
- inptr[0] = _mm_srai_epi16(inptr[0], 4);
- inptr[1] = _mm_srai_epi16(inptr[1], 4);
-
- // Reconstruction and Store
- {
- __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
- __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
- d0 = _mm_unpacklo_epi64(
- d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
- d2 = _mm_unpacklo_epi64(
- d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
- d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
- d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
- // store input0
- _mm_storel_epi64((__m128i *)dest, d0);
- // store input1
- d0 = _mm_srli_si128(d0, 8);
- _mm_storel_epi64((__m128i *)(dest + stride), d0);
- // store input2
- _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
- // store input3
- d2 = _mm_srli_si128(d2, 8);
- _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
- }
+ if (bd == 8 || (max < 4096 && min >= -4096)) {
+ idct4_sse2(io_short);
+ idct4_sse2(io_short);
+ io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8));
+ io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8));
+ io[0] = _mm_srai_epi16(io_short[0], 4);
+ io[1] = _mm_srai_epi16(io_short[1], 4);
} else {
- // Run the un-optimised column transform
- tran_low_t temp_in[4], temp_out[4];
- // Columns
- for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
- vpx_highbd_idct4_c(temp_in, temp_out, bd);
- for (j = 0; j < 4; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
- }
+ if (max < 32767 && min > -32768) {
+ highbd_idct4_small_sse2(io);
+ highbd_idct4_small_sse2(io);
+ } else {
+ highbd_idct4_large_sse2(io);
+ highbd_idct4_large_sse2(io);
}
+ io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8));
+ io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
}
+
+ recon_and_store_4x4(io, dest, stride, bd);
}
void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- const __m128i zero = _mm_setzero_si128();
- // Faster than _mm_set1_epi16((1 << bd) - 1).
- const __m128i one = _mm_set1_epi16(1);
- const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
int a1, i;
tran_low_t out;
__m128i dc, d;
- out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
- out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ out = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+ out =
+ HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 4);
dc = _mm_set1_epi16(a1);
for (i = 0; i < 4; ++i) {
d = _mm_loadl_epi64((const __m128i *)dest);
- d = add_dc_clamp(&zero, &max, &dc, &d);
+ d = add_clamp(d, dc, bd);
_mm_storel_epi64((__m128i *)dest, d);
dest += stride;
}
diff --git a/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c b/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
new file mode 100644
index 000000000..38e64f3bc
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+
+static INLINE void highbd_idct4(__m128i *const io) {
+ __m128i temp[2], step[4];
+
+ transpose_32bit_4x4(io, io);
+
+ // stage 1
+ temp[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2]
+ extend_64bit(temp[0], temp);
+ step[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
+ temp[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2]
+ extend_64bit(temp[0], temp);
+ step[1] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
+ highbd_butterfly_sse4_1(io[1], io[3], cospi_24_64, cospi_8_64, &step[2],
+ &step[3]);
+
+ // stage 2
+ io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3]
+ io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2]
+ io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2]
+ io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3]
+}
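+
+// Note: highbd_idct4() transposes its input internally, so two consecutive
+// calls perform the row transform and then the column transform.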
+
+void vpx_highbd_idct4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ __m128i io[4];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0));
+ io[1] = _mm_load_si128((const __m128i *)(input + 4));
+ io[2] = _mm_load_si128((const __m128i *)(input + 8));
+ io[3] = _mm_load_si128((const __m128i *)(input + 12));
+
+ if (bd == 8) {
+ __m128i io_short[2];
+
+ io_short[0] = _mm_packs_epi32(io[0], io[1]);
+ io_short[1] = _mm_packs_epi32(io[2], io[3]);
+ idct4_sse2(io_short);
+ idct4_sse2(io_short);
+ io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8));
+ io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8));
+ io[0] = _mm_srai_epi16(io_short[0], 4);
+ io[1] = _mm_srai_epi16(io_short[1], 4);
+ } else {
+ highbd_idct4(io);
+ highbd_idct4(io);
+ io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8));
+ io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
+ }
+
+ recon_and_store_4x4(io, dest, stride, bd);
+}
diff --git a/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
index 29cc1d30e..909a6b794 100644
--- a/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
+++ b/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
@@ -8,211 +8,203 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <emmintrin.h> // SSE2
+
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
-#include "vpx_dsp/x86/txfm_common_sse2.h"
+
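+// One 1-D pass over a 4-row half of the 8x8 block, computed in 32-bit
+// lanes; the callers below invoke it once per half and per dimension.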
+static void highbd_idct8x8_half1d(__m128i *const io) {
+ __m128i step1[8], step2[8];
+
+ transpose_32bit_4x4x2(io, io);
+
+ // stage 1
+ step1[0] = io[0];
+ step1[2] = io[4];
+ step1[1] = io[2];
+ step1[3] = io[6];
+ highbd_butterfly_sse2(io[1], io[7], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_butterfly_sse2(io[5], io[3], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+
+ // stage 2
+ highbd_butterfly_cospi16_sse2(step1[0], step1[2], &step2[0], &step2[1]);
+ highbd_butterfly_sse2(step1[1], step1[3], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]);
+ step1[7] = step2[7];
+
+ // stage 4
+ highbd_idct8_stage4(step1, io);
+}
+
+static void highbd_idct8x8_12_half1d(__m128i *const io) {
+ __m128i temp1[4], sign[2], step1[8], step2[8];
+
+ transpose_32bit_4x4(io, io);
+
+ // stage 1
+ step1[0] = io[0];
+ step1[1] = io[2];
+ abs_extend_64bit_sse2(io[1], temp1, sign);
+ step1[4] = multiplication_round_shift_sse2(temp1, sign, cospi_28_64);
+ step1[7] = multiplication_round_shift_sse2(temp1, sign, cospi_4_64);
+ abs_extend_64bit_sse2(io[3], temp1, sign);
+ step1[5] = multiplication_neg_round_shift_sse2(temp1, sign, cospi_20_64);
+ step1[6] = multiplication_round_shift_sse2(temp1, sign, cospi_12_64);
+
+ // stage 2
+ abs_extend_64bit_sse2(step1[0], temp1, sign);
+ step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
+ abs_extend_64bit_sse2(step1[1], temp1, sign);
+ step2[2] = multiplication_round_shift_sse2(temp1, sign, cospi_24_64);
+ step2[3] = multiplication_round_shift_sse2(temp1, sign, cospi_8_64);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[0], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[0], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]);
+ step1[7] = step2[7];
+
+ // stage 4
+ highbd_idct8_stage4(step1, io);
+}
void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- tran_low_t out[8 * 8];
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[8];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i sixteen = _mm_set1_epi16(16);
- const __m128i max = _mm_set1_epi16(6201);
- const __m128i min = _mm_set1_epi16(-6201);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 8; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- }
-
- // Find the min & max for the row transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 8; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform
- idct8_sse2(inptr);
-
- // Find the min & max for the column transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 8; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- array_transpose_8x8(inptr, inptr);
- for (i = 0; i < 8; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
+ __m128i io[16];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+ io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));
+ io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+ io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));
+ io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+ io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));
+ io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+ io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));
+
+ if (bd == 8) {
+ __m128i io_short[8];
+
+ io_short[0] = _mm_packs_epi32(io[0], io[4]);
+ io_short[1] = _mm_packs_epi32(io[1], io[5]);
+ io_short[2] = _mm_packs_epi32(io[2], io[6]);
+ io_short[3] = _mm_packs_epi32(io[3], io[7]);
+ io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+ io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+ io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+ io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+ io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+ io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+ io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+ io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+ io_short[4] = _mm_packs_epi32(io[8], io[12]);
+ io_short[5] = _mm_packs_epi32(io[9], io[13]);
+ io_short[6] = _mm_packs_epi32(io[10], io[14]);
+ io_short[7] = _mm_packs_epi32(io[11], io[15]);
+
+ idct8_sse2(io_short);
+ idct8_sse2(io_short);
+ round_shift_8x8(io_short, io);
} else {
- // Run the un-optimised row transform
- for (i = 0; i < 8; ++i) {
- vpx_highbd_idct8_c(input, outptr, bd);
- input += 8;
- outptr += 8;
- }
+ __m128i temp[4];
+
+ highbd_idct8x8_half1d(io);
+
+ io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+ io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+ io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+ io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+ io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+ io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+ io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+ io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+ highbd_idct8x8_half1d(&io[8]);
+
+ temp[0] = io[4];
+ temp[1] = io[5];
+ temp[2] = io[6];
+ temp[3] = io[7];
+ io[4] = io[8];
+ io[5] = io[9];
+ io[6] = io[10];
+ io[7] = io[11];
+ highbd_idct8x8_half1d(io);
+
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ highbd_idct8x8_half1d(&io[8]);
+
+ highbd_idct8x8_final_round(io);
}
- if (optimised_cols) {
- idct8_sse2(inptr);
-
- // Final round & shift and Reconstruction and Store
- {
- __m128i d[8];
- for (i = 0; i < 8; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], sixteen);
- d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
- inptr[i] = _mm_srai_epi16(inptr[i], 5);
- d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
- }
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[8], temp_out[8];
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- vpx_highbd_idct8_c(temp_in, temp_out, bd);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
- }
- }
- }
+ recon_and_store_8x8(io, dest, stride, bd);
}
void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- tran_low_t out[8 * 8] = { 0 };
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[8];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i sixteen = _mm_set1_epi16(16);
- const __m128i max = _mm_set1_epi16(6201);
- const __m128i min = _mm_set1_epi16(-6201);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 8; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- }
+ const __m128i zero = _mm_setzero_si128();
+ __m128i io[16];
- // Find the min & max for the row transform
- // only first 4 row has non-zero coefs
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 4; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform
- idct8_sse2(inptr);
-
- // Find the min & max for the column transform
- // N.B. Only first 4 cols contain non-zero coeffs
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 8; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- // Use fact only first 4 rows contain non-zero coeffs
- array_transpose_4X8(inptr, inptr);
- for (i = 0; i < 4; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 4; ++i) {
- vpx_highbd_idct8_c(input, outptr, bd);
- input += 8;
- outptr += 8;
- }
- }
+ io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+ io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+ io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+ io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+
+ if (bd == 8) {
+ __m128i io_short[8];
+
+ io_short[0] = _mm_packs_epi32(io[0], zero);
+ io_short[1] = _mm_packs_epi32(io[1], zero);
+ io_short[2] = _mm_packs_epi32(io[2], zero);
+ io_short[3] = _mm_packs_epi32(io[3], zero);
- if (optimised_cols) {
- idct8_sse2(inptr);
-
- // Final round & shift and Reconstruction and Store
- {
- __m128i d[8];
- for (i = 0; i < 8; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], sixteen);
- d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
- inptr[i] = _mm_srai_epi16(inptr[i], 5);
- d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
- }
- }
+ idct8x8_12_add_kernel_sse2(io_short);
+ round_shift_8x8(io_short, io);
} else {
- // Run the un-optimised column transform
- tran_low_t temp_in[8], temp_out[8];
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- vpx_highbd_idct8_c(temp_in, temp_out, bd);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
- }
- }
+ __m128i temp[4];
+
+ highbd_idct8x8_12_half1d(io);
+
+ temp[0] = io[4];
+ temp[1] = io[5];
+ temp[2] = io[6];
+ temp[3] = io[7];
+ highbd_idct8x8_12_half1d(io);
+
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ highbd_idct8x8_12_half1d(&io[8]);
+
+ highbd_idct8x8_final_round(io);
}
+
+ recon_and_store_8x8(io, dest, stride, bd);
}
void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest,
diff --git a/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c b/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
new file mode 100644
index 000000000..ae391b2c0
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_ssse3.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+
+static void highbd_idct8x8_half1d(__m128i *const io) {
+ __m128i step1[8], step2[8];
+
+ transpose_32bit_4x4x2(io, io);
+
+ // stage 1
+ step1[0] = io[0];
+ step1[2] = io[4];
+ step1[1] = io[2];
+ step1[3] = io[6];
+ highbd_butterfly_sse4_1(io[1], io[7], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_butterfly_sse4_1(io[5], io[3], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+
+ // stage 2
+ highbd_butterfly_cospi16_sse4_1(step1[0], step1[2], &step2[0], &step2[1]);
+ highbd_butterfly_sse4_1(step1[1], step1[3], cospi_24_64, cospi_8_64,
+ &step2[2], &step2[3]);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]);
+ step1[7] = step2[7];
+
+ // stage 4
+ highbd_idct8_stage4(step1, io);
+}
+
+static void highbd_idct8x8_12_half1d(__m128i *const io) {
+ __m128i temp1[2], step1[8], step2[8];
+
+ transpose_32bit_4x4(io, io);
+
+ // stage 1
+ step1[0] = io[0];
+ step1[1] = io[2];
+ extend_64bit(io[1], temp1);
+ step1[4] = multiplication_round_shift_sse4_1(temp1, cospi_28_64);
+ step1[7] = multiplication_round_shift_sse4_1(temp1, cospi_4_64);
+ extend_64bit(io[3], temp1);
+ step1[5] = multiplication_round_shift_sse4_1(temp1, -cospi_20_64);
+ step1[6] = multiplication_round_shift_sse4_1(temp1, cospi_12_64);
+
+ // stage 2
+ extend_64bit(step1[0], temp1);
+ step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64);
+ extend_64bit(step1[1], temp1);
+ step2[2] = multiplication_round_shift_sse4_1(temp1, cospi_24_64);
+ step2[3] = multiplication_round_shift_sse4_1(temp1, cospi_8_64);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[0], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[0], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]);
+ step1[7] = step2[7];
+
+ // stage 4
+ highbd_idct8_stage4(step1, io);
+}
+
+void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ __m128i io[16];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+ io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));
+ io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+ io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));
+ io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+ io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));
+ io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+ io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));
+
+ if (bd == 8) {
+ __m128i io_short[8];
+
+ io_short[0] = _mm_packs_epi32(io[0], io[4]);
+ io_short[1] = _mm_packs_epi32(io[1], io[5]);
+ io_short[2] = _mm_packs_epi32(io[2], io[6]);
+ io_short[3] = _mm_packs_epi32(io[3], io[7]);
+ io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+ io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+ io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+ io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+ io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+ io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+ io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+ io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+ io_short[4] = _mm_packs_epi32(io[8], io[12]);
+ io_short[5] = _mm_packs_epi32(io[9], io[13]);
+ io_short[6] = _mm_packs_epi32(io[10], io[14]);
+ io_short[7] = _mm_packs_epi32(io[11], io[15]);
+
+ idct8_sse2(io_short);
+ idct8_sse2(io_short);
+ round_shift_8x8(io_short, io);
+ } else {
+ __m128i temp[4];
+
+ highbd_idct8x8_half1d(io);
+
+ io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+ io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+ io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+ io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+ io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+ io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+ io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+ io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+ highbd_idct8x8_half1d(&io[8]);
+
+ temp[0] = io[4];
+ temp[1] = io[5];
+ temp[2] = io[6];
+ temp[3] = io[7];
+ io[4] = io[8];
+ io[5] = io[9];
+ io[6] = io[10];
+ io[7] = io[11];
+ highbd_idct8x8_half1d(io);
+
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ highbd_idct8x8_half1d(&io[8]);
+
+ highbd_idct8x8_final_round(io);
+ }
+
+ recon_and_store_8x8(io, dest, stride, bd);
+}
+
+void vpx_highbd_idct8x8_12_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i io[16];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+ io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+ io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+ io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+
+ if (bd == 8) {
+ __m128i io_short[8];
+
+ io_short[0] = _mm_packs_epi32(io[0], zero);
+ io_short[1] = _mm_packs_epi32(io[1], zero);
+ io_short[2] = _mm_packs_epi32(io[2], zero);
+ io_short[3] = _mm_packs_epi32(io[3], zero);
+
+ idct8x8_12_add_kernel_ssse3(io_short);
+ round_shift_8x8(io_short, io);
+ } else {
+ __m128i temp[4];
+
+ highbd_idct8x8_12_half1d(io);
+
+ temp[0] = io[4];
+ temp[1] = io[5];
+ temp[2] = io[6];
+ temp[3] = io[7];
+ highbd_idct8x8_12_half1d(io);
+
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ highbd_idct8x8_12_half1d(&io[8]);
+
+ highbd_idct8x8_final_round(io);
+ }
+
+ recon_and_store_8x8(io, dest, stride, bd);
+}
diff --git a/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c b/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c
new file mode 100644
index 000000000..2051381aa
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c
@@ -0,0 +1,533 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+// -----------------------------------------------------------------------------
+
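+// Horizontal prediction replicates each pixel of the left column across its
+// row; _mm_shufflelo_epi16/_mm_shufflehi_epi16 with selectors 0x00, 0x55,
+// 0xaa and 0xff broadcast 16-bit lanes 0-3 of the low/high half.
+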
+void vpx_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ (void)above;
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+void vpx_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+ const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+ const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+ const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+ (void)above;
+ (void)bd;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7));
+}
+
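+// Broadcast the low (unpacklo) or high (unpackhi) 64 bits of *row across the
+// register and store one 16-pixel row, then step dst to the next row.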
+static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpacklo_epi64(*row, *row);
+ _mm_store_si128((__m128i *)*dst, val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ *dst += stride;
+}
+
+static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpackhi_epi64(*row, *row);
+ _mm_store_si128((__m128i *)(*dst), val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ *dst += stride;
+}
+
+void vpx_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 2; i++, left += 8) {
+ const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+ const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+ const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+ const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+ h_store_16_unpacklo(&dst, stride, &row0);
+ h_store_16_unpacklo(&dst, stride, &row1);
+ h_store_16_unpacklo(&dst, stride, &row2);
+ h_store_16_unpacklo(&dst, stride, &row3);
+ h_store_16_unpackhi(&dst, stride, &row4);
+ h_store_16_unpackhi(&dst, stride, &row5);
+ h_store_16_unpackhi(&dst, stride, &row6);
+ h_store_16_unpackhi(&dst, stride, &row7);
+ }
+}
+
+static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpacklo_epi64(*row, *row);
+ _mm_store_si128((__m128i *)(*dst), val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ _mm_store_si128((__m128i *)(*dst + 16), val);
+ _mm_store_si128((__m128i *)(*dst + 24), val);
+ *dst += stride;
+}
+
+static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpackhi_epi64(*row, *row);
+ _mm_store_si128((__m128i *)(*dst), val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ _mm_store_si128((__m128i *)(*dst + 16), val);
+ _mm_store_si128((__m128i *)(*dst + 24), val);
+ *dst += stride;
+}
+
+void vpx_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 4; i++, left += 8) {
+ const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+ const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+ const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+ const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+ h_store_32_unpacklo(&dst, stride, &row0);
+ h_store_32_unpacklo(&dst, stride, &row1);
+ h_store_32_unpacklo(&dst, stride, &row2);
+ h_store_32_unpacklo(&dst, stride, &row3);
+ h_store_32_unpackhi(&dst, stride, &row4);
+ h_store_32_unpackhi(&dst, stride, &row5);
+ h_store_32_unpackhi(&dst, stride, &row6);
+ h_store_32_unpackhi(&dst, stride, &row7);
+ }
+}
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
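+// Horizontal sum of the four 16-bit values at ref; the total ends up in
+// lane 0.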
+static INLINE __m128i dc_sum_4(const uint16_t *ref) {
+ const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref);
+ const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
+ const __m128i a = _mm_add_epi16(_dcba, _xxdc);
+ return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
+}
+
+static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
+ const __m128i *dc) {
+ const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
+ int i;
+ for (i = 0; i < 4; ++i, dst += stride) {
+ _mm_storel_epi64((__m128i *)dst, dc_dup);
+ }
+}
+
+void vpx_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i two = _mm_cvtsi32_si128(2);
+ const __m128i sum = dc_sum_4(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
+ (void)above;
+ (void)bd;
+ dc_store_4x4(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i two = _mm_cvtsi32_si128(2);
+ const __m128i sum = dc_sum_4(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
+ (void)left;
+ (void)bd;
+ dc_store_4x4(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_4x4(dst, stride, &dc_dup);
+}
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
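+// Horizontal sum of eight 16-bit values: fold the high 64 bits onto the low
+// half, then reduce the remaining four lanes pairwise; the total ends up in
+// lane 0.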
+static INLINE __m128i dc_sum_8(const uint16_t *ref) {
+ const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref);
+ const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8));
+ const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
+ const __m128i a = _mm_add_epi16(_dcba, _xxdc);
+
+ return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
+}
+
+static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride,
+ const __m128i *dc) {
+ const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
+ const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
+ int i;
+ for (i = 0; i < 8; ++i, dst += stride) {
+ _mm_store_si128((__m128i *)dst, dc_dup);
+ }
+}
+
+void vpx_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i four = _mm_cvtsi32_si128(4);
+ const __m128i sum = dc_sum_8(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
+ (void)above;
+ (void)bd;
+ dc_store_8x8(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i four = _mm_cvtsi32_si128(4);
+ const __m128i sum = dc_sum_8(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
+ (void)left;
+ (void)bd;
+ dc_store_8x8(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_8x8(dst, stride, &dc_dup);
+}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
+static INLINE __m128i dc_sum_16(const uint16_t *ref) {
+ const __m128i sum_lo = dc_sum_8(ref);
+ const __m128i sum_hi = dc_sum_8(ref + 8);
+ return _mm_add_epi16(sum_lo, sum_hi);
+}
+
+static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride,
+ const __m128i *dc) {
+ const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
+ const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
+ int i;
+ for (i = 0; i < 16; ++i, dst += stride) {
+ _mm_store_si128((__m128i *)dst, dc_dup);
+ _mm_store_si128((__m128i *)(dst + 8), dc_dup);
+ }
+}
+
+void vpx_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)above;
+ (void)bd;
+ dc_store_16x16(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)left;
+ (void)bd;
+ dc_store_16x16(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_16x16(dst, stride, &dc_dup);
+}
+
+//------------------------------------------------------------------------------
+// DC 32x32
+
+static INLINE __m128i dc_sum_32(const uint16_t *ref) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sum_a = dc_sum_16(ref);
+ const __m128i sum_b = dc_sum_16(ref + 16);
+  // With 12-bit bd the 32-sample total can reach 32 * 4095 = 131040, which
+  // overflows a 16-bit lane, so widen to 32 bits before the final add.
+ return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero),
+ _mm_unpacklo_epi16(sum_b, zero));
+}
+
+static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride,
+ const __m128i *dc) {
+ const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
+ const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
+ int i;
+ for (i = 0; i < 32; ++i, dst += stride) {
+ _mm_store_si128((__m128i *)dst, dc_dup);
+ _mm_store_si128((__m128i *)(dst + 8), dc_dup);
+ _mm_store_si128((__m128i *)(dst + 16), dc_dup);
+ _mm_store_si128((__m128i *)(dst + 24), dc_dup);
+ }
+}
+
+void vpx_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i sixteen = _mm_cvtsi32_si128(16);
+ const __m128i sum = dc_sum_32(left);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
+ (void)above;
+ (void)bd;
+ dc_store_32x32(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i sixteen = _mm_cvtsi32_si128(16);
+ const __m128i sum = dc_sum_32(above);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
+ (void)left;
+ (void)bd;
+ dc_store_32x32(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_32x32(dst, stride, &dc_dup);
+}
+
+// -----------------------------------------------------------------------------
+// Trick from Pascal: (x + 2y + z + 2) >> 2 can be calculated as
+//   result = avg(x, z)
+//   result -= xor(x, z) & 1
+//   result = avg(result, y)
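+//
+// This works because _mm_avg_epu16 computes (a + b + 1) >> 1 per lane:
+//   avg(x, z) - ((x ^ z) & 1) == (x + z) >> 1
+//   avg((x + z) >> 1, y)      == (((x + z) >> 1) + y + 1) >> 1
+//                             == (x + 2y + z + 2) >> 2
+// (When x + z is odd the dropped LSB cannot affect the final floor by 4.)
+// E.g. x = 3, y = 0, z = 0: avg(3, 0) = 2, 2 - 1 = 1, and
+// avg(1, 0) = 1 == (3 + 0 + 0 + 2) >> 2.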
+static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
+ const __m128i *z) {
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i a = _mm_avg_epu16(*x, *z);
+ const __m128i b =
+ _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
+ return _mm_avg_epu16(b, *y);
+}
+
+void vpx_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));
+ const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);
+ const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
+ const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);
+ const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);
+ const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4);
+ const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0);
+ const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00);
+ const __m128i row0 = _mm_srli_si128(avg2, 6);
+ const __m128i row1 = _mm_srli_si128(avg3, 4);
+ const __m128i row2 = _mm_srli_si128(avg2, 4);
+ const __m128i row3 = _mm_srli_si128(avg3, 2);
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+
+ dst -= stride;
+ dst[0] = _mm_extract_epi16(avg3, 1);
+ dst[stride] = _mm_extract_epi16(avg3, 0);
+}
+
+void vpx_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const int L = left[3];
+ const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));
+ const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);
+ const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
+ const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);
+ const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);
+ const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0);
+ const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC);
+ const __m128i row0 = _mm_srli_si128(avg3, 6);
+ const __m128i row1 = _mm_srli_si128(avg3, 4);
+ const __m128i row2 = _mm_srli_si128(avg3, 2);
+ const __m128i row3 = avg3;
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+void vpx_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const int L = left[3];
+ const __m128i XXXXXABC = _mm_loadu_si128((const __m128i *)(above - 5));
+ const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0);
+ const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1);
+ const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2);
+ const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3);
+ const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2);
+ const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4);
+ const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00);
+ const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0);
+ const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3);
+ const __m128i row2 = _mm_srli_si128(row3, 4);
+ const __m128i row1 = _mm_srli_si128(row3, 8);
+ const __m128i row0 = _mm_srli_si128(avg3, 4);
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst[0] = _mm_extract_epi16(avg2, 3);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+void vpx_highbd_d207_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i IJKL0000 = _mm_load_si128((const __m128i *)left);
+ const __m128i LLLL0000 = _mm_shufflelo_epi16(IJKL0000, 0xff);
+ const __m128i IJKLLLLL = _mm_unpacklo_epi64(IJKL0000, LLLL0000);
+ const __m128i JKLLLLL0 = _mm_srli_si128(IJKLLLLL, 2);
+ const __m128i KLLLLL00 = _mm_srli_si128(IJKLLLLL, 4);
+ const __m128i avg3 = avg3_epu16(&IJKLLLLL, &JKLLLLL0, &KLLLLL00);
+ const __m128i avg2 = _mm_avg_epu16(IJKLLLLL, JKLLLLL0);
+ const __m128i row0 = _mm_unpacklo_epi16(avg2, avg3);
+ const __m128i row1 = _mm_srli_si128(row0, 4);
+ const __m128i row2 = _mm_srli_si128(row0, 8);
+ const __m128i row3 = LLLL0000;
+ (void)above;
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+void vpx_highbd_d63_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
+ const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
+ const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGH0);
+ const __m128i row0 = avg2;
+ const __m128i row1 = avg3;
+ const __m128i row2 = _mm_srli_si128(avg2, 2);
+ const __m128i row3 = _mm_srli_si128(avg3, 2);
+ (void)left;
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
diff --git a/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c b/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c
new file mode 100644
index 000000000..b9dcef205
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c
@@ -0,0 +1,930 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+// -----------------------------------------------------------------------------
+// Trick from Pascal: (x + 2y + z + 2) >> 2 can be calculated as
+//   result = avg(x, z)
+//   result -= xor(x, z) & 1
+//   result = avg(result, y)
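+//
+// Note that the saturating _mm_subs_epu16 below never actually saturates:
+// when (x ^ z) & 1 == 1, x + z is odd, so avg(x, z) = (x + z + 1) >> 1 >= 1.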
+static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
+ const __m128i *z) {
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i a = _mm_avg_epu16(*x, *z);
+ const __m128i b =
+ _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
+ return _mm_avg_epu16(b, *y);
+}
+
+void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
+ const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
+ (void)left;
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, avg3);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
+ dst[3] = above[7]; // aka H
+}
+
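+// Each successive d45 row equals the previous row shifted left by one pixel,
+// with the replicated right-most above pixel (*ar) entering at the right
+// edge.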
+static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride,
+ __m128i *row, const __m128i *ar) {
+ *row = _mm_alignr_epi8(*ar, *row, 2);
+ _mm_store_si128((__m128i *)*dst, *row);
+ *dst += stride;
+}
+
+void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
+ const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
+ const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
+ const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
+ const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
+ __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
+ (void)left;
+ (void)bd;
+ _mm_store_si128((__m128i *)dst, avg3);
+ dst += stride;
+ d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+ d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+ d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+ d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+ d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+ d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+ d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+}
+
+static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride,
+ __m128i *row_0, __m128i *row_1,
+ const __m128i *ar) {
+ *row_0 = _mm_alignr_epi8(*row_1, *row_0, 2);
+ *row_1 = _mm_alignr_epi8(*ar, *row_1, 2);
+ _mm_store_si128((__m128i *)*dst, *row_0);
+ _mm_store_si128((__m128i *)(*dst + 8), *row_1);
+ *dst += stride;
+}
+
+void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_load_si128((const __m128i *)above);
+ const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff);
+ const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_alignr_epi8(AR, A1, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_alignr_epi8(AR, A1, 4);
+ __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ (void)left;
+ (void)bd;
+ _mm_store_si128((__m128i *)dst, avg3_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+ dst += stride;
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+}
+
+void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_load_si128((const __m128i *)above);
+ const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
+ const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff);
+ const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
+ const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
+ const __m128i B3 = _mm_alignr_epi8(AR, A3, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
+ const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
+ const __m128i C3 = _mm_alignr_epi8(AR, A3, 4);
+ __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+ __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+ int i;
+ (void)left;
+ (void)bd;
+ _mm_store_si128((__m128i *)dst, avg3_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+ _mm_store_si128((__m128i *)(dst + 16), avg3_2);
+ _mm_store_si128((__m128i *)(dst + 24), avg3_3);
+ dst += stride;
+ for (i = 1; i < 32; ++i) {
+ avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
+ avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2);
+ avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2);
+ avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2);
+ _mm_store_si128((__m128i *)dst, avg3_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+ _mm_store_si128((__m128i *)(dst + 16), avg3_2);
+ _mm_store_si128((__m128i *)(dst + 24), avg3_3);
+ dst += stride;
+ }
+}
+
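+// _mm_shuffle_epi8 control that rotates the eight 16-bit lanes of a register:
+// output lane k receives input lane (k + 1) & 7, so lane 0 wraps around into
+// the top lane.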
+DECLARE_ALIGNED(16, static const uint8_t, rotate_right_epu16[16]) = {
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1
+};
+
+static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) {
+ *a = _mm_shuffle_epi8(*a, *rotrw);
+ return *a;
+}
+
+void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
+ const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
+ const __m128i IXABCDEF =
+ _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14);
+ const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF);
+ const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG);
+ const __m128i XIJKLMNO =
+ _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
+ const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2);
+ __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0);
+ __m128i rowa = avg2;
+ __m128i rowb = avg3;
+ int i;
+ (void)bd;
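+  // Each new pair of rows is the previous pair shifted one pixel to the
+  // right, with the next left-column avg3 value entering at the left edge;
+  // rotr_epu16 steps through avg3_left one element per call.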
+ for (i = 0; i < 8; i += 2) {
+ _mm_store_si128((__m128i *)dst, rowa);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, rowb);
+ dst += stride;
+ rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
+ rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14);
+ }
+}
+
+void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i A0 = _mm_load_si128((const __m128i *)above);
+ const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
+ const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+ const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
+ const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+ const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
+ const __m128i L1_ = _mm_srli_si128(L1, 2);
+ __m128i rowa_0 = avg2_0;
+ __m128i rowa_1 = avg2_1;
+ __m128i rowb_0 = avg3_0;
+ __m128i rowb_1 = avg3_1;
+ __m128i avg3_left[2];
+ int i, j;
+ (void)bd;
+ avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
+ avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
+ for (i = 0; i < 2; ++i) {
+ __m128i avg_left = avg3_left[i];
+ for (j = 0; j < 8; j += 2) {
+ _mm_store_si128((__m128i *)dst, rowa_0);
+ _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, rowb_0);
+ _mm_store_si128((__m128i *)(dst + 8), rowb_1);
+ dst += stride;
+ rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
+ rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
+ rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
+ rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
+ }
+ }
+}
+
+void vpx_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i A0 = _mm_load_si128((const __m128i *)above);
+ const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
+ const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
+ const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15));
+ const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23));
+ const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+ const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+ const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
+ const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
+ const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
+ const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
+ const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
+ const __m128i C2 = _mm_alignr_epi8(B2, B1, 14);
+ const __m128i C3 = _mm_alignr_epi8(B3, B2, 14);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+ const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+ const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
+ const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
+ const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
+ const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2);
+ const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2);
+ const __m128i L3_ = _mm_srli_si128(L3, 2);
+ __m128i rowa_0 = avg2_0;
+ __m128i rowa_1 = avg2_1;
+ __m128i rowa_2 = avg2_2;
+ __m128i rowa_3 = avg2_3;
+ __m128i rowb_0 = avg3_0;
+ __m128i rowb_1 = avg3_1;
+ __m128i rowb_2 = avg3_2;
+ __m128i rowb_3 = avg3_3;
+ __m128i avg3_left[4];
+ int i, j;
+ (void)bd;
+ avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
+ avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
+ avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_);
+ avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_);
+ for (i = 0; i < 4; ++i) {
+ __m128i avg_left = avg3_left[i];
+ for (j = 0; j < 8; j += 2) {
+ _mm_store_si128((__m128i *)dst, rowa_0);
+ _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+ _mm_store_si128((__m128i *)(dst + 16), rowa_2);
+ _mm_store_si128((__m128i *)(dst + 24), rowa_3);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, rowb_0);
+ _mm_store_si128((__m128i *)(dst + 8), rowb_1);
+ _mm_store_si128((__m128i *)(dst + 16), rowb_2);
+ _mm_store_si128((__m128i *)(dst + 24), rowb_3);
+ dst += stride;
+ rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
+ rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
+ rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
+ rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
+ rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14);
+ rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14);
+ rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
+ rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
+ }
+ }
+}
+
+void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
+ const __m128i XIJKLMNO =
+ _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
+ const __m128i AXIJKLMN =
+ _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14);
+ const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0);
+ __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
+ __m128i rowa = avg3;
+ int i;
+ (void)bd;
+ for (i = 0; i < 8; ++i) {
+ rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
+ _mm_store_si128((__m128i *)dst, rowa);
+ dst += stride;
+ }
+}
+
+void vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i B0 = _mm_load_si128((const __m128i *)above);
+ const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
+ const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
+ const __m128i C1 = _mm_srli_si128(B1, 2);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+ const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
+ const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
+ __m128i rowa_0 = avg3_0;
+ __m128i rowa_1 = avg3_1;
+ __m128i avg3_left[2];
+ int i, j;
+ (void)bd;
+ avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
+ avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
+ for (i = 0; i < 2; ++i) {
+ __m128i avg_left = avg3_left[i];
+ for (j = 0; j < 8; ++j) {
+ rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
+ rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
+ _mm_store_si128((__m128i *)dst, rowa_0);
+ _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+ dst += stride;
+ }
+ }
+}
+
+void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
+ const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
+ const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
+ const __m128i B0 = _mm_load_si128((const __m128i *)above);
+ const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24));
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
+ const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
+ const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
+ const __m128i C1 = _mm_alignr_epi8(B2, B1, 2);
+ const __m128i C2 = _mm_alignr_epi8(B3, B2, 2);
+ const __m128i C3 = _mm_srli_si128(B3, 2);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+ const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+ const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
+ const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
+ const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
+ const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
+ const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14);
+ const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14);
+ __m128i rowa_0 = avg3_0;
+ __m128i rowa_1 = avg3_1;
+ __m128i rowa_2 = avg3_2;
+ __m128i rowa_3 = avg3_3;
+ __m128i avg3_left[4];
+ int i, j;
+ (void)bd;
+ avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
+ avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
+ avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_);
+ avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_);
+ for (i = 0; i < 4; ++i) {
+ __m128i avg_left = avg3_left[i];
+ for (j = 0; j < 8; ++j) {
+ rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
+ rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
+ rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
+ rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
+ _mm_store_si128((__m128i *)dst, rowa_0);
+ _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+ _mm_store_si128((__m128i *)(dst + 16), rowa_2);
+ _mm_store_si128((__m128i *)(dst + 24), rowa_3);
+ dst += stride;
+ }
+ }
+}
+
+void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2);
+ const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4);
+ const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG);
+ const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
+ const __m128i XIJKLMNO =
+ _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
+ const __m128i AXIJKLMN =
+ _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14);
+ const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
+ const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO);
+ const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left);
+ const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left);
+ const __m128i row0 =
+ _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12);
+ const __m128i row1 =
+ _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12);
+ const __m128i row2 =
+ _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12);
+ const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12);
+ const __m128i row4 =
+ _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12);
+ const __m128i row5 =
+ _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12);
+ const __m128i row6 =
+ _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12);
+ const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12);
+ (void)bd;
+ _mm_store_si128((__m128i *)dst, row0);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row1);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row2);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row3);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row4);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row5);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row6);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row7);
+}
+
+void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_srli_si128(A1, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_srli_si128(A1, 4);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
+ const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+ const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
+ const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
+ const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
+ const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
+ const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
+ __m128i row_0 = avg3_0;
+ __m128i row_1 = avg3_1;
+ __m128i avg2_avg3_left[2][2];
+ int i, j;
+ (void)bd;
+
+ avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
+ avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
+ avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
+ avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
+
+ for (j = 0; j < 2; ++j) {
+ for (i = 0; i < 2; ++i) {
+ const __m128i avg2_avg3 = avg2_avg3_left[j][i];
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ dst += stride;
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ dst += stride;
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ dst += stride;
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ dst += stride;
+ }
+ }
+}
+
+void vpx_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
+ const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
+ const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
+ const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
+ const __m128i B3 = _mm_srli_si128(A3, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
+ const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
+ const __m128i C3 = _mm_srli_si128(A3, 4);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+ const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
+ const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+ const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
+ const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
+ const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
+ const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
+ const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12);
+ const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12);
+ const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
+ const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
+ const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2);
+ const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3);
+ const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
+ const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
+ const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2);
+ const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3);
+ __m128i row_0 = avg3_0;
+ __m128i row_1 = avg3_1;
+ __m128i row_2 = avg3_2;
+ __m128i row_3 = avg3_3;
+ __m128i avg2_avg3_left[4][2];
+ int i, j;
+ (void)bd;
+
+ avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
+ avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
+ avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
+ avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
+ avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2);
+ avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2);
+ avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3);
+ avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3);
+
+ for (j = 0; j < 4; ++j) {
+ for (i = 0; i < 2; ++i) {
+ const __m128i avg2_avg3 = avg2_avg3_left[j][i];
+ row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+ row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ _mm_store_si128((__m128i *)(dst + 16), row_2);
+ _mm_store_si128((__m128i *)(dst + 24), row_3);
+ dst += stride;
+ row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+ row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ _mm_store_si128((__m128i *)(dst + 16), row_2);
+ _mm_store_si128((__m128i *)(dst + 24), row_3);
+ dst += stride;
+ row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+ row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ _mm_store_si128((__m128i *)(dst + 16), row_2);
+ _mm_store_si128((__m128i *)(dst + 24), row_3);
+ dst += stride;
+ row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+ row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ _mm_store_si128((__m128i *)(dst + 16), row_2);
+ _mm_store_si128((__m128i *)(dst + 24), row_3);
+ dst += stride;
+ }
+ }
+}
+
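+// Writes four rows from the interleaved avg2/avg3 stream held in *a and *b;
+// each row advances the stream by one (avg2, avg3) pair, i.e. two 16-bit
+// lanes.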
+static INLINE void d207_store_4x8(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *a, const __m128i *b) {
+ _mm_store_si128((__m128i *)*dst, *a);
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
+ *dst += stride;
+}
+
+void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)left);
+ const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
+ const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
+ const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
+ const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
+ const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
+ const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH);
+ const __m128i out_a = _mm_unpacklo_epi16(avg2, avg3);
+ const __m128i out_b = _mm_unpackhi_epi16(avg2, avg3);
+ (void)above;
+ (void)bd;
+ d207_store_4x8(&dst, stride, &out_a, &out_b);
+ d207_store_4x8(&dst, stride, &out_b, &HHHHHHHH);
+}
+
+static INLINE void d207_store_4x16(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *a, const __m128i *b,
+ const __m128i *c) {
+ _mm_store_si128((__m128i *)*dst, *a);
+ _mm_store_si128((__m128i *)(*dst + 8), *b);
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
+ _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4));
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
+ _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8));
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
+ _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12));
+ *dst += stride;
+}
+
+void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_load_si128((const __m128i *)left);
+ const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i LR0 = _mm_shufflehi_epi16(A1, 0xff);
+ const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_alignr_epi8(LR, A1, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_alignr_epi8(LR, A1, 4);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+ const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+ const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
+ const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
+ const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
+ const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
+ (void)above;
+ (void)bd;
+ d207_store_4x16(&dst, stride, &out_a, &out_b, &out_c);
+ d207_store_4x16(&dst, stride, &out_b, &out_c, &out_d);
+ d207_store_4x16(&dst, stride, &out_c, &out_d, &LR);
+ d207_store_4x16(&dst, stride, &out_d, &LR, &LR);
+}
+
+static INLINE void d207_store_4x32(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *a, const __m128i *b,
+ const __m128i *c, const __m128i *d,
+ const __m128i *e) {
+ _mm_store_si128((__m128i *)*dst, *a);
+ _mm_store_si128((__m128i *)(*dst + 8), *b);
+ _mm_store_si128((__m128i *)(*dst + 16), *c);
+ _mm_store_si128((__m128i *)(*dst + 24), *d);
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
+ _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4));
+ _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 4));
+ _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 4));
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
+ _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8));
+ _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 8));
+ _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 8));
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
+ _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12));
+ _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 12));
+ _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 12));
+ *dst += stride;
+}
+
+void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_load_si128((const __m128i *)left);
+ const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i A2 = _mm_load_si128((const __m128i *)(left + 16));
+ const __m128i A3 = _mm_load_si128((const __m128i *)(left + 24));
+ const __m128i LR0 = _mm_shufflehi_epi16(A3, 0xff);
+ const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
+ const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
+ const __m128i B3 = _mm_alignr_epi8(LR, A3, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
+ const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
+ const __m128i C3 = _mm_alignr_epi8(LR, A3, 4);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+ const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+ const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+ const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+ const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
+ const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
+ const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
+ const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
+ const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
+ const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
+ const __m128i out_e = _mm_unpacklo_epi16(avg2_2, avg3_2);
+ const __m128i out_f = _mm_unpackhi_epi16(avg2_2, avg3_2);
+ const __m128i out_g = _mm_unpacklo_epi16(avg2_3, avg3_3);
+ const __m128i out_h = _mm_unpackhi_epi16(avg2_3, avg3_3);
+ (void)above;
+ (void)bd;
+ d207_store_4x32(&dst, stride, &out_a, &out_b, &out_c, &out_d, &out_e);
+ d207_store_4x32(&dst, stride, &out_b, &out_c, &out_d, &out_e, &out_f);
+ d207_store_4x32(&dst, stride, &out_c, &out_d, &out_e, &out_f, &out_g);
+ d207_store_4x32(&dst, stride, &out_d, &out_e, &out_f, &out_g, &out_h);
+ d207_store_4x32(&dst, stride, &out_e, &out_f, &out_g, &out_h, &LR);
+ d207_store_4x32(&dst, stride, &out_f, &out_g, &out_h, &LR, &LR);
+ d207_store_4x32(&dst, stride, &out_g, &out_h, &LR, &LR, &LR);
+ d207_store_4x32(&dst, stride, &out_h, &LR, &LR, &LR, &LR);
+}
+
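+// Writes four rows: an avg2 row, an avg3 row, then the same pair shifted left
+// by one pixel with the replicated right-most above pixel (*ar) entering at
+// the right edge. Leaves *a and *b shifted for the next call.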
+static INLINE void d63_store_4x8(uint16_t **dst, const ptrdiff_t stride,
+ __m128i *a, __m128i *b, const __m128i *ar) {
+ _mm_store_si128((__m128i *)*dst, *a);
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, *b);
+ *dst += stride;
+ *a = _mm_alignr_epi8(*ar, *a, 2);
+ *b = _mm_alignr_epi8(*ar, *b, 2);
+ _mm_store_si128((__m128i *)*dst, *a);
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, *b);
+ *dst += stride;
+ *a = _mm_alignr_epi8(*ar, *a, 2);
+ *b = _mm_alignr_epi8(*ar, *b, 2);
+}
+
+void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
+ const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
+ const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
+ const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
+ const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
+ __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
+ __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH);
+ (void)left;
+ (void)bd;
+ d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH);
+ d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH);
+}
+
+void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_load_si128((const __m128i *)above);
+ const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff);
+ const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_alignr_epi8(AR, A1, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_alignr_epi8(AR, A1, 4);
+ __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+ __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+ int i;
+ (void)left;
+ (void)bd;
+ for (i = 0; i < 14; i += 2) {
+ _mm_store_si128((__m128i *)dst, avg2_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg2_1);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, avg3_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+ dst += stride;
+ avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2);
+ avg2_1 = _mm_alignr_epi8(AR, avg2_1, 2);
+ avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
+ avg3_1 = _mm_alignr_epi8(AR, avg3_1, 2);
+ }
+ _mm_store_si128((__m128i *)dst, avg2_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg2_1);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, avg3_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+}
+
+void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_load_si128((const __m128i *)above);
+ const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
+ const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff);
+ const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
+ const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
+ const __m128i B3 = _mm_alignr_epi8(AR, A3, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
+ const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
+ const __m128i C3 = _mm_alignr_epi8(AR, A3, 4);
+ __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+ __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+ __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+ __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+ __m128i avg2_2 = _mm_avg_epu16(A2, B2);
+ __m128i avg2_3 = _mm_avg_epu16(A3, B3);
+ int i;
+ (void)left;
+ (void)bd;
+ for (i = 0; i < 30; i += 2) {
+ _mm_store_si128((__m128i *)dst, avg2_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg2_1);
+ _mm_store_si128((__m128i *)(dst + 16), avg2_2);
+ _mm_store_si128((__m128i *)(dst + 24), avg2_3);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, avg3_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+ _mm_store_si128((__m128i *)(dst + 16), avg3_2);
+ _mm_store_si128((__m128i *)(dst + 24), avg3_3);
+ dst += stride;
+ avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2);
+ avg2_1 = _mm_alignr_epi8(avg2_2, avg2_1, 2);
+ avg2_2 = _mm_alignr_epi8(avg2_3, avg2_2, 2);
+ avg2_3 = _mm_alignr_epi8(AR, avg2_3, 2);
+ avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
+ avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2);
+ avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2);
+ avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2);
+ }
+ _mm_store_si128((__m128i *)dst, avg2_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg2_1);
+ _mm_store_si128((__m128i *)(dst + 16), avg2_2);
+ _mm_store_si128((__m128i *)(dst + 24), avg2_3);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, avg3_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+ _mm_store_si128((__m128i *)(dst + 16), avg3_2);
+ _mm_store_si128((__m128i *)(dst + 24), avg3_3);
+}
diff --git a/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h
index ea100c6e1..e0f749552 100644
--- a/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h
+++ b/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -12,59 +12,389 @@
#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
#include <emmintrin.h> // SSE2
+
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
-static INLINE __m128i add_dc_clamp(const __m128i *const min,
- const __m128i *const max,
- const __m128i *const dc,
- const __m128i *const in) {
- __m128i out;
- out = _mm_adds_epi16(*in, *dc);
- out = _mm_max_epi16(out, *min);
- out = _mm_min_epi16(out, *max);
- return out;
+static INLINE void extend_64bit(const __m128i in,
+ __m128i *const out /*out[2]*/) {
+ out[0] = _mm_unpacklo_epi32(in, in); // 0, 0, 1, 1
+ out[1] = _mm_unpackhi_epi32(in, in); // 2, 2, 3, 3
}
-static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input,
- uint16_t *dest, int stride, int bd,
- const int size) {
- const __m128i zero = _mm_setzero_si128();
+static INLINE __m128i wraplow_16bit_shift4(const __m128i in0, const __m128i in1,
+ const __m128i rounding) {
+ __m128i temp[2];
+ temp[0] = _mm_add_epi32(in0, rounding);
+ temp[1] = _mm_add_epi32(in1, rounding);
+ temp[0] = _mm_srai_epi32(temp[0], 4);
+ temp[1] = _mm_srai_epi32(temp[1], 4);
+ return _mm_packs_epi32(temp[0], temp[1]);
+}
+
+static INLINE __m128i wraplow_16bit_shift5(const __m128i in0, const __m128i in1,
+ const __m128i rounding) {
+ __m128i temp[2];
+ temp[0] = _mm_add_epi32(in0, rounding);
+ temp[1] = _mm_add_epi32(in1, rounding);
+ temp[0] = _mm_srai_epi32(temp[0], 5);
+ temp[1] = _mm_srai_epi32(temp[1], 5);
+ return _mm_packs_epi32(temp[0], temp[1]);
+}
+
+static INLINE __m128i dct_const_round_shift_64bit(const __m128i in) {
+ const __m128i t =
+ _mm_add_epi64(in, pair_set_epi32(DCT_CONST_ROUNDING << 2, 0));
+ return _mm_srli_si128(t, 2);
+}
+
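dct_const_round_shift_64bit() relies on two invariants: every caller pre-scales its cosine constant by 4 (pair_set_epi32(c << 2, 0)), and pack_4() reads only the low 32 bits of each 64-bit lane, so the cross-lane spill of the whole-register byte shift is harmless. With DCT_CONST_BITS == 14 and DCT_CONST_ROUNDING == 1 << 13 the arithmetic is exact:

    (x * (c << 2) + (DCT_CONST_ROUNDING << 2)) >> 16
        == (x * c + DCT_CONST_ROUNDING) >> 14
        == dct_const_round_shift(x * c)

and _mm_srli_si128(t, 2) supplies the >> 16 as a 2-byte shift of the whole register.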
+static INLINE __m128i pack_4(const __m128i in0, const __m128i in1) {
+ const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 2
+ const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 1, 3
+ return _mm_unpacklo_epi32(t0, t1); // 0, 1, 2, 3
+}
+
+static INLINE void abs_extend_64bit_sse2(const __m128i in,
+ __m128i *const out /*out[2]*/,
+ __m128i *const sign /*sign[2]*/) {
+ sign[0] = _mm_srai_epi32(in, 31);
+ out[0] = _mm_xor_si128(in, sign[0]);
+ out[0] = _mm_sub_epi32(out[0], sign[0]);
+ sign[1] = _mm_unpackhi_epi32(sign[0], sign[0]); // 64-bit sign of 2, 3
+ sign[0] = _mm_unpacklo_epi32(sign[0], sign[0]); // 64-bit sign of 0, 1
+ out[1] = _mm_unpackhi_epi32(out[0], out[0]); // 2, 3
+ out[0] = _mm_unpacklo_epi32(out[0], out[0]); // 0, 1
+}
+
+// Note: cospi must be non-negative.
+static INLINE __m128i multiply_apply_sign_sse2(const __m128i in,
+ const __m128i sign,
+ const __m128i cospi) {
+ __m128i out = _mm_mul_epu32(in, cospi);
+ out = _mm_xor_si128(out, sign);
+ return _mm_sub_epi64(out, sign);
+}
+
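_mm_mul_epu32 is the only 32x32-to-64-bit multiply SSE2 offers, and it is unsigned, hence the sign-magnitude pattern here: abs_extend_64bit_sse2() strips and records the sign, the product is formed on the absolute value, and this helper restores the sign with the two's-complement identity (p ^ s) - s, where s is all-zeros or all-ones across each 64-bit lane. Scalar form of the identity (illustrative):

    /* s == 0:  (p ^ 0) - 0   == p
     * s == -1: (p ^ -1) + 1  == ~p + 1 == -p */
    static int64_t apply_sign(int64_t p, int64_t s) { return (p ^ s) - s; }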
+// Note: c must be non-negative.
+static INLINE __m128i multiplication_round_shift_sse2(
+ const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/,
+ const int c) {
+ const __m128i pair_c = pair_set_epi32(c << 2, 0);
+ __m128i t0, t1;
+
+ assert(c >= 0);
+ t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c);
+ t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c);
+ t0 = dct_const_round_shift_64bit(t0);
+ t1 = dct_const_round_shift_64bit(t1);
+
+ return pack_4(t0, t1);
+}
+
+// Note: c must be non-negative.
+static INLINE __m128i multiplication_neg_round_shift_sse2(
+ const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/,
+ const int c) {
+ const __m128i pair_c = pair_set_epi32(c << 2, 0);
+ __m128i t0, t1;
+
+ assert(c >= 0);
+ t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c);
+ t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c);
+ t0 = _mm_sub_epi64(_mm_setzero_si128(), t0);
+ t1 = _mm_sub_epi64(_mm_setzero_si128(), t1);
+ t0 = dct_const_round_shift_64bit(t0);
+ t1 = dct_const_round_shift_64bit(t1);
+
+ return pack_4(t0, t1);
+}
+
+// Note: c0 and c1 must be non-negative.
+static INLINE void highbd_butterfly_sse2(const __m128i in0, const __m128i in1,
+ const int c0, const int c1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ const __m128i pair_c0 = pair_set_epi32(c0 << 2, 0);
+ const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0);
+ __m128i temp1[4], temp2[4], sign1[2], sign2[2];
+
+ assert(c0 >= 0);
+ assert(c1 >= 0);
+ abs_extend_64bit_sse2(in0, temp1, sign1);
+ abs_extend_64bit_sse2(in1, temp2, sign2);
+ temp1[2] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c1);
+ temp1[3] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c1);
+ temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c0);
+ temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c0);
+ temp2[2] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c0);
+ temp2[3] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c0);
+ temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c1);
+ temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c1);
+ temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]);
+ temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]);
+ temp2[0] = _mm_add_epi64(temp1[2], temp2[2]);
+ temp2[1] = _mm_add_epi64(temp1[3], temp2[3]);
+ temp1[0] = dct_const_round_shift_64bit(temp1[0]);
+ temp1[1] = dct_const_round_shift_64bit(temp1[1]);
+ temp2[0] = dct_const_round_shift_64bit(temp2[0]);
+ temp2[1] = dct_const_round_shift_64bit(temp2[1]);
+ *out0 = pack_4(temp1[0], temp1[1]);
+ *out1 = pack_4(temp2[0], temp2[1]);
+}
+
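In scalar terms highbd_butterfly_sse2() is the standard inverse-transform rotation, evaluated in 64 bits before the rounded downshift:

    *out0 = dct_const_round_shift(in0 * c0 - in1 * c1);
    *out1 = dct_const_round_shift(in0 * c1 + in1 * c0);

The partial variants below specialize this to a single input (the other leg is zero), and the _neg variant additionally negates one of the products.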
+// Note: c0 and c1 must be non-negative.
+static INLINE void highbd_partial_butterfly_sse2(const __m128i in, const int c0,
+ const int c1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ __m128i temp[2], sign[2];
+
+ assert(c0 >= 0);
+ assert(c1 >= 0);
+ abs_extend_64bit_sse2(in, temp, sign);
+ *out0 = multiplication_round_shift_sse2(temp, sign, c0);
+ *out1 = multiplication_round_shift_sse2(temp, sign, c1);
+}
+
+// Note: c0 and c1 must be non-negative.
+static INLINE void highbd_partial_butterfly_neg_sse2(const __m128i in,
+ const int c0, const int c1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ __m128i temp[2], sign[2];
+
+ assert(c0 >= 0);
+ assert(c1 >= 0);
+ abs_extend_64bit_sse2(in, temp, sign);
+ *out0 = multiplication_neg_round_shift_sse2(temp, sign, c1);
+ *out1 = multiplication_round_shift_sse2(temp, sign, c0);
+}
+
+static INLINE void highbd_butterfly_cospi16_sse2(const __m128i in0,
+ const __m128i in1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ __m128i temp1[2], temp2, sign[2];
+
+ temp2 = _mm_add_epi32(in0, in1);
+ abs_extend_64bit_sse2(temp2, temp1, sign);
+ *out0 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
+ temp2 = _mm_sub_epi32(in0, in1);
+ abs_extend_64bit_sse2(temp2, temp1, sign);
+ *out1 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
+}
+
+// Only does the addition and subtraction butterfly; size = 16 or 32.
+static INLINE void highbd_add_sub_butterfly(const __m128i *in, __m128i *out,
+ int size) {
+ int i = 0;
+ const int num = size >> 1;
+ const int bound = size - 1;
+ while (i < num) {
+ out[i] = _mm_add_epi32(in[i], in[bound - i]);
+ out[bound - i] = _mm_sub_epi32(in[i], in[bound - i]);
+ i++;
+ }
+}
+
+static INLINE void highbd_idct8_stage4(const __m128i *const in,
+ __m128i *const out) {
+ out[0] = _mm_add_epi32(in[0], in[7]);
+ out[1] = _mm_add_epi32(in[1], in[6]);
+ out[2] = _mm_add_epi32(in[2], in[5]);
+ out[3] = _mm_add_epi32(in[3], in[4]);
+ out[4] = _mm_sub_epi32(in[3], in[4]);
+ out[5] = _mm_sub_epi32(in[2], in[5]);
+ out[6] = _mm_sub_epi32(in[1], in[6]);
+ out[7] = _mm_sub_epi32(in[0], in[7]);
+}
+
+static INLINE void highbd_idct8x8_final_round(__m128i *const io) {
+ io[0] = wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16));
+ io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16));
+ io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16));
+ io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16));
+ io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16));
+ io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16));
+ io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16));
+ io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16));
+}
+
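wraplow_16bit_shift5() with a rounding vector of 16 computes (x + 16) >> 5, i.e. ROUND_POWER_OF_TWO(x, 5), on each 32-bit lane and saturates the two halves into one 16-bit register with _mm_packs_epi32; this is the final pass-2 rounding of the 8x8 inverse transform. wraplow_16bit_shift4 is the analogous helper for the 4x4 path.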
+static INLINE void highbd_idct16_4col_stage7(const __m128i *const in,
+ __m128i *const out) {
+ out[0] = _mm_add_epi32(in[0], in[15]);
+ out[1] = _mm_add_epi32(in[1], in[14]);
+ out[2] = _mm_add_epi32(in[2], in[13]);
+ out[3] = _mm_add_epi32(in[3], in[12]);
+ out[4] = _mm_add_epi32(in[4], in[11]);
+ out[5] = _mm_add_epi32(in[5], in[10]);
+ out[6] = _mm_add_epi32(in[6], in[9]);
+ out[7] = _mm_add_epi32(in[7], in[8]);
+ out[8] = _mm_sub_epi32(in[7], in[8]);
+ out[9] = _mm_sub_epi32(in[6], in[9]);
+ out[10] = _mm_sub_epi32(in[5], in[10]);
+ out[11] = _mm_sub_epi32(in[4], in[11]);
+ out[12] = _mm_sub_epi32(in[3], in[12]);
+ out[13] = _mm_sub_epi32(in[2], in[13]);
+ out[14] = _mm_sub_epi32(in[1], in[14]);
+ out[15] = _mm_sub_epi32(in[0], in[15]);
+}
+
+static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1,
+ const int bd) {
+ const __m128i zero = _mm_set1_epi16(0);
// Faster than _mm_set1_epi16((1 << bd) - 1).
const __m128i one = _mm_set1_epi16(1);
const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+ __m128i d;
+
+ d = _mm_adds_epi16(in0, in1);
+ d = _mm_max_epi16(d, zero);
+ d = _mm_min_epi16(d, max);
+
+ return d;
+}
+
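add_clamp() is the high-bitdepth reconstruction step: a saturating 16-bit add of prediction and residual, clamped to the legal pixel range [0, (1 << bd) - 1], with max built by shifting a vector of ones instead of broadcasting a scalar. Per-pixel scalar sketch (add_clamp_ref is illustrative):

    static uint16_t add_clamp_ref(int32_t pred, int32_t residual, int bd) {
      const int32_t max = (1 << bd) - 1;
      int32_t v = pred + residual;
      return (uint16_t)(v < 0 ? 0 : v > max ? max : v);
    }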
+static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input,
+ uint16_t *dest, int stride, int bd,
+ const int size) {
int a1, i, j;
tran_low_t out;
__m128i dc, d;
- out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
- out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ out = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+ out =
+ HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, (size == 8) ? 5 : 6);
dc = _mm_set1_epi16(a1);
for (i = 0; i < size; ++i) {
- for (j = 0; j < (size >> 3); ++j) {
- d = _mm_load_si128((const __m128i *)(&dest[j * 8]));
- d = add_dc_clamp(&zero, &max, &dc, &d);
- _mm_store_si128((__m128i *)(&dest[j * 8]), d);
+ for (j = 0; j < size; j += 8) {
+ d = _mm_load_si128((const __m128i *)(&dest[j]));
+ d = add_clamp(d, dc, bd);
+ _mm_store_si128((__m128i *)(&dest[j]), d);
}
dest += stride;
}
}
-static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
- __m128i ubounded, retval;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi16(1);
- const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
- ubounded = _mm_cmpgt_epi16(value, max);
- retval = _mm_andnot_si128(ubounded, value);
- ubounded = _mm_and_si128(ubounded, max);
- retval = _mm_or_si128(retval, ubounded);
- retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
- return retval;
+static INLINE void recon_and_store_4(const __m128i in, uint16_t *const dest,
+ const int bd) {
+ __m128i d;
+
+ d = _mm_loadl_epi64((const __m128i *)dest);
+ d = add_clamp(d, in, bd);
+ _mm_storel_epi64((__m128i *)dest, d);
+}
+
+static INLINE void recon_and_store_4x2(const __m128i in, uint16_t *const dest,
+ const int stride, const int bd) {
+ __m128i d;
+
+ d = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
+ d = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(d), (const __m64 *)(dest + 1 * stride)));
+ d = add_clamp(d, in, bd);
+ _mm_storel_epi64((__m128i *)(dest + 0 * stride), d);
+ _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d));
+}
+
+static INLINE void recon_and_store_4x4(const __m128i *const in, uint16_t *dest,
+ const int stride, const int bd) {
+ recon_and_store_4x2(in[0], dest, stride, bd);
+ dest += 2 * stride;
+ recon_and_store_4x2(in[1], dest, stride, bd);
+}
+
+static INLINE void recon_and_store_8(const __m128i in, uint16_t **const dest,
+ const int stride, const int bd) {
+ __m128i d;
+
+ d = _mm_load_si128((const __m128i *)(*dest));
+ d = add_clamp(d, in, bd);
+ _mm_store_si128((__m128i *)(*dest), d);
+ *dest += stride;
+}
+
+static INLINE void recon_and_store_8x8(const __m128i *const in, uint16_t *dest,
+ const int stride, const int bd) {
+ recon_and_store_8(in[0], &dest, stride, bd);
+ recon_and_store_8(in[1], &dest, stride, bd);
+ recon_and_store_8(in[2], &dest, stride, bd);
+ recon_and_store_8(in[3], &dest, stride, bd);
+ recon_and_store_8(in[4], &dest, stride, bd);
+ recon_and_store_8(in[5], &dest, stride, bd);
+ recon_and_store_8(in[6], &dest, stride, bd);
+ recon_and_store_8(in[7], &dest, stride, bd);
+}
+
+static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) {
+ const __m128i t0 = _mm_load_si128((const __m128i *)(input + 0));
+ const __m128i t1 = _mm_load_si128((const __m128i *)(input + 4));
+ return _mm_packs_epi32(t0, t1);
+}
+
+static INLINE void highbd_load_pack_transpose_32bit_8x8(const tran_low_t *input,
+ const int stride,
+ __m128i *const in) {
+ in[0] = load_pack_8_32bit(input + 0 * stride);
+ in[1] = load_pack_8_32bit(input + 1 * stride);
+ in[2] = load_pack_8_32bit(input + 2 * stride);
+ in[3] = load_pack_8_32bit(input + 3 * stride);
+ in[4] = load_pack_8_32bit(input + 4 * stride);
+ in[5] = load_pack_8_32bit(input + 5 * stride);
+ in[6] = load_pack_8_32bit(input + 6 * stride);
+ in[7] = load_pack_8_32bit(input + 7 * stride);
+ transpose_16bit_8x8(in, in);
+}
+
+static INLINE void highbd_load_transpose_32bit_8x4(const tran_low_t *input,
+ const int stride,
+ __m128i *in) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
+ in[1] = _mm_load_si128((const __m128i *)(input + 0 * stride + 4));
+ in[2] = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
+ in[3] = _mm_load_si128((const __m128i *)(input + 1 * stride + 4));
+ in[4] = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
+ in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride + 4));
+ in[6] = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
+ in[7] = _mm_load_si128((const __m128i *)(input + 3 * stride + 4));
+ transpose_32bit_8x4(in, in);
+}
+
+static INLINE void highbd_load_transpose_32bit_4x4(const tran_low_t *input,
+ const int stride,
+ __m128i *in) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ transpose_32bit_4x4(in, in);
+}
+
+static INLINE void highbd_write_buffer_8(uint16_t *dest, const __m128i in,
+ const int bd) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ __m128i out;
+
+ out = _mm_adds_epi16(in, final_rounding);
+ out = _mm_srai_epi16(out, 6);
+ recon_and_store_8(out, &dest, 0, bd);
+}
+
+static INLINE void highbd_write_buffer_4(uint16_t *const dest, const __m128i in,
+ const int bd) {
+ const __m128i final_rounding = _mm_set1_epi32(1 << 5);
+ __m128i out;
+
+ out = _mm_add_epi32(in, final_rounding);
+ out = _mm_srai_epi32(out, 6);
+ out = _mm_packs_epi32(out, out);
+ recon_and_store_4(out, dest, bd);
}
#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
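Both highbd_write_buffer helpers above apply ROUND_POWER_OF_TWO(x, 6) (add 1 << 5, then shift right by 6) before reconstruction; the 8-lane variant rounds already-packed 16-bit data with a saturating add, while the 4-lane variant rounds in 32 bits and packs afterwards.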
diff --git a/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h
new file mode 100644
index 000000000..9c8eef40f
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
+#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
+
+#include <smmintrin.h> // SSE4.1
+
+#include "./vpx_config.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+
+static INLINE __m128i multiplication_round_shift_sse4_1(
+ const __m128i *const in /*in[2]*/, const int c) {
+ const __m128i pair_c = pair_set_epi32(c * 4, 0);
+ __m128i t0, t1;
+
+ t0 = _mm_mul_epi32(in[0], pair_c);
+ t1 = _mm_mul_epi32(in[1], pair_c);
+ t0 = dct_const_round_shift_64bit(t0);
+ t1 = dct_const_round_shift_64bit(t1);
+
+ return pack_4(t0, t1);
+}
+
+static INLINE void highbd_butterfly_sse4_1(const __m128i in0, const __m128i in1,
+ const int c0, const int c1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
+ const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
+ __m128i temp1[4], temp2[4];
+
+ extend_64bit(in0, temp1);
+ extend_64bit(in1, temp2);
+ temp1[2] = _mm_mul_epi32(temp1[0], pair_c1);
+ temp1[3] = _mm_mul_epi32(temp1[1], pair_c1);
+ temp1[0] = _mm_mul_epi32(temp1[0], pair_c0);
+ temp1[1] = _mm_mul_epi32(temp1[1], pair_c0);
+ temp2[2] = _mm_mul_epi32(temp2[0], pair_c0);
+ temp2[3] = _mm_mul_epi32(temp2[1], pair_c0);
+ temp2[0] = _mm_mul_epi32(temp2[0], pair_c1);
+ temp2[1] = _mm_mul_epi32(temp2[1], pair_c1);
+ temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]);
+ temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]);
+ temp2[0] = _mm_add_epi64(temp1[2], temp2[2]);
+ temp2[1] = _mm_add_epi64(temp1[3], temp2[3]);
+ temp1[0] = dct_const_round_shift_64bit(temp1[0]);
+ temp1[1] = dct_const_round_shift_64bit(temp1[1]);
+ temp2[0] = dct_const_round_shift_64bit(temp2[0]);
+ temp2[1] = dct_const_round_shift_64bit(temp2[1]);
+ *out0 = pack_4(temp1[0], temp1[1]);
+ *out1 = pack_4(temp2[0], temp2[1]);
+}
+
+static INLINE void highbd_butterfly_cospi16_sse4_1(const __m128i in0,
+ const __m128i in1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ __m128i temp1[2], temp2;
+
+ temp2 = _mm_add_epi32(in0, in1);
+ extend_64bit(temp2, temp1);
+ *out0 = multiplication_round_shift_sse4_1(temp1, cospi_16_64);
+ temp2 = _mm_sub_epi32(in0, in1);
+ extend_64bit(temp2, temp1);
+ *out1 = multiplication_round_shift_sse4_1(temp1, cospi_16_64);
+}
+
+static INLINE void highbd_partial_butterfly_sse4_1(const __m128i in,
+ const int c0, const int c1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ __m128i temp[2];
+
+ extend_64bit(in, temp);
+ *out0 = multiplication_round_shift_sse4_1(temp, c0);
+ *out1 = multiplication_round_shift_sse4_1(temp, c1);
+}
+
+#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
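The SSE4.1 header mirrors the SSE2 helpers minus the sign handling: _mm_mul_epi32 (pmuldq) multiplies the even 32-bit lanes as signed integers into 64-bit products, so extend_64bit() merely duplicates lanes where the SSE2 path needed abs_extend_64bit_sse2() plus the xor/sub fixup. Both pipelines compute the same dct_const_round_shift(x * c):

    /* SSE2:   p = sign_restore(|x| * c)   via _mm_mul_epu32
     * SSE4.1: p = x * c                   via _mm_mul_epi32 */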
diff --git a/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c b/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c
index 8670b2895..ec22db9f4 100644
--- a/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c
+++ b/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c
@@ -12,7 +12,6 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
-#include "vpx_ports/emmintrin_compat.h"
static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
__m128i ubounded;
diff --git a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
index 2362476c1..cedf98aff 100644
--- a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
#include <emmintrin.h>
#include "vpx_dsp/vpx_dsp_common.h"
@@ -37,54 +38,54 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
(void)scan;
+ (void)skip_block;
+ assert(!skip_block);
memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
- if (!skip_block) {
- // Pre-scan pass
- for (i = ((int)count / 4) - 1; i >= 0; i--) {
- __m128i coeffs, cmp1, cmp2;
- int test;
- coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
- cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
- cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
- cmp1 = _mm_and_si128(cmp1, cmp2);
- test = _mm_movemask_epi8(cmp1);
- if (test == 0xffff)
- non_zero_regs--;
- else
- break;
- }
+ // Pre-scan pass
+ for (i = ((int)count / 4) - 1; i >= 0; i--) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (test == 0xffff)
+ non_zero_regs--;
+ else
+ break;
+ }
- // Quantization pass:
- for (i = 0; i < non_zero_regs; i++) {
- __m128i coeffs, coeffs_sign, tmp1, tmp2;
- int test;
- int abs_coeff[4];
- int coeff_sign[4];
-
- coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
- coeffs_sign = _mm_srai_epi32(coeffs, 31);
- coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
- tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
- tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
- tmp1 = _mm_or_si128(tmp1, tmp2);
- test = _mm_movemask_epi8(tmp1);
- _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
- _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
-
- for (j = 0; j < 4; j++) {
- if (test & (1 << (4 * j))) {
- int k = 4 * i + j;
- const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
- const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
- const uint32_t abs_qcoeff =
- (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
- qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
- dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
- if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
- }
+ // Quantization pass:
+ for (i = 0; i < non_zero_regs; i++) {
+ __m128i coeffs, coeffs_sign, tmp1, tmp2;
+ int test;
+ int abs_coeff[4];
+ int coeff_sign[4];
+
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ coeffs_sign = _mm_srai_epi32(coeffs, 31);
+ coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+ tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+ tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+ tmp1 = _mm_or_si128(tmp1, tmp2);
+ test = _mm_movemask_epi8(tmp1);
+ _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
+ _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
+
+ for (j = 0; j < 4; j++) {
+ if (test & (1 << (4 * j))) {
+ int k = 4 * i + j;
+ const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
+ const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
+ qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
+ dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
+ if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
}
}
}
@@ -105,6 +106,9 @@ void vpx_highbd_quantize_b_32x32_sse2(
const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
(void)scan;
+ (void)skip_block;
+ assert(!skip_block);
+
zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
zbins[1] = _mm_set1_epi32(zbin1_tmp);
@@ -116,38 +120,35 @@ void vpx_highbd_quantize_b_32x32_sse2(
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
- if (!skip_block) {
- // Pre-scan pass
- for (i = 0; i < n_coeffs / 4; i++) {
- __m128i coeffs, cmp1, cmp2;
- int test;
- coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
- cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
- cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
- cmp1 = _mm_and_si128(cmp1, cmp2);
- test = _mm_movemask_epi8(cmp1);
- if (!(test & 0xf)) idx_arr[idx++] = i * 4;
- if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
- if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
- if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
- }
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs / 4; i++) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (!(test & 0xf)) idx_arr[idx++] = i * 4;
+ if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
+ if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
+ if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
+ }
- // Quantization pass: only process the coefficients selected in
- // pre-scan pass. Note: idx can be zero.
- for (i = 0; i < idx; i++) {
- const int rc = idx_arr[i];
- const int coeff = coeff_ptr[rc];
- const int coeff_sign = (coeff >> 31);
- const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- const int64_t tmp1 =
- abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
- const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
- const uint32_t abs_qcoeff =
- (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
- qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
- dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
- if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
- }
+ // Quantization pass: only process the coefficients selected in
+ // the pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = idx_arr[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+ qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+ if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
}
*eob_ptr = eob + 1;
}
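With the skip_block branches asserted away, both quantizers keep their two-phase structure: a vector pre-scan that classifies each coefficient against the dead zone (-zbin, zbin), then a scalar pass over the survivors. Each 32-bit compare lane comes out all-zeros or all-ones, so its _mm_movemask_epi8 nibble is 0x0 or 0xf: the 32x32 version quantizes a coefficient only when its nibble is zero (outside the dead zone), while the 16x16 version trims trailing groups of four only while the whole mask is 0xffff. Sketch of the nibble test (is_candidate is illustrative, not from the patch):

    /* lane in 0..3; mask from _mm_movemask_epi8 on the dead-zone compare */
    static int is_candidate(int mask, int lane) {
      return (mask & (0xf << (4 * lane))) == 0; /* outside (-zbin, zbin) */
    }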
diff --git a/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
index 30ee81b68..d9a6932e0 100644
--- a/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
+++ b/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -72,7 +72,7 @@ SECTION .text
paddd m6, m4
mov r1, ssem ; r1 = unsigned int *sse
movd [r1], m7 ; store sse
- movd rax, m6 ; store sum as return value
+ movd eax, m6 ; store sum as return value
%endif
RET
%endmacro
diff --git a/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm b/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm
index 923418a99..e646767e1 100644
--- a/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm
+++ b/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm
@@ -11,6 +11,8 @@
%include "vpx_ports/x86_abi_support.asm"
+SECTION .text
+
;unsigned int vpx_highbd_calc16x16var_sse2
;(
; unsigned char * src_ptr,
diff --git a/libvpx/vpx_dsp/x86/intrapred_sse2.asm b/libvpx/vpx_dsp/x86/intrapred_sse2.asm
index c18095c28..61af6236e 100644
--- a/libvpx/vpx_dsp/x86/intrapred_sse2.asm
+++ b/libvpx/vpx_dsp/x86/intrapred_sse2.asm
@@ -61,7 +61,7 @@ cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
psrlq m3, 8
movd [dstq+strideq ], m3
psrlq m0, 56
- movd tempq, m0
+ movd tempd, m0
mov [dstq+strideq+3], tempb
RESTORE_GOT
diff --git a/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
index f75dab07a..f6e56b6f9 100644
--- a/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
@@ -8,19 +8,29 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <emmintrin.h> // SSE2
+
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
+static INLINE void transpose_16bit_4(__m128i *res) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+ const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+
+ res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
+ res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
+}
+
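transpose_16bit_4() keeps a 4x4 block of 16-bit values in two registers (rows 0-1 in res[0], rows 2-3 in res[1]) and transposes it with two unpack rounds. Tracing the lanes, with rc denoting row r, column c:

    res[0] = 00 01 02 03 10 11 12 13    res[1] = 20 21 22 23 30 31 32 33
    tr0_0  = 00 20 01 21 02 22 03 23    tr0_1  = 10 30 11 31 12 32 13 33
    res[0] = 00 10 20 30 01 11 21 31    res[1] = 02 12 22 32 03 13 23 33

so res[0] ends up holding columns 0-1 and res[1] columns 2-3.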
void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
const __m128i eight = _mm_set1_epi16(8);
__m128i in[2];
// Rows
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8);
+ in[0] = load_input_data8(input);
+ in[1] = load_input_data8(input + 8);
idct4_sse2(in);
// Columns
@@ -41,7 +51,7 @@ void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
int a;
__m128i dc_value, d[2];
- a = (int)dct_const_round_shift(input[0] * cospi_16_64);
+ a = (int)dct_const_round_shift((int16_t)input[0] * cospi_16_64);
a = (int)dct_const_round_shift(a * cospi_16_64);
a = ROUND_POWER_OF_TWO(a, 4);
@@ -69,35 +79,19 @@ void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
*(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
}
-void idct4_sse2(__m128i *in) {
+void idct4_sse2(__m128i *const in) {
const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i u[8], v[8];
+ __m128i u[2];
- transpose_16bit_4x4(in);
+ transpose_16bit_4(in);
// stage 1
u[0] = _mm_unpacklo_epi16(in[0], in[1]);
u[1] = _mm_unpackhi_epi16(in[0], in[1]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-
- u[0] = _mm_packs_epi32(v[0], v[1]);
- u[1] = _mm_packs_epi32(v[3], v[2]);
+ u[0] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_p16_m16, u[0]);
+ u[1] = idct_calc_wraplow_sse2(k__cospi_p08_p24, k__cospi_p24_m08, u[1]);
// stage 2
in[0] = _mm_add_epi16(u[0], u[1]);
@@ -105,7 +99,7 @@ void idct4_sse2(__m128i *in) {
in[1] = _mm_shuffle_epi32(in[1], 0x4E);
}
-void iadst4_sse2(__m128i *in) {
+void iadst4_sse2(__m128i *const in) {
const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
@@ -115,7 +109,7 @@ void iadst4_sse2(__m128i *in) {
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[8], v[8], in7;
- transpose_16bit_4x4(in);
+ transpose_16bit_4(in);
in7 = _mm_srli_si128(in[1], 8);
in7 = _mm_add_epi16(in7, in[0]);
in7 = _mm_sub_epi16(in7, in[1]);
@@ -154,215 +148,93 @@ void iadst4_sse2(__m128i *in) {
in[1] = _mm_packs_epi32(u[2], u[3]);
}
-#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
- { \
- tmp0 = _mm_madd_epi16(lo_0, cst0); \
- tmp1 = _mm_madd_epi16(hi_0, cst0); \
- tmp2 = _mm_madd_epi16(lo_0, cst1); \
- tmp3 = _mm_madd_epi16(hi_0, cst1); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- res0 = _mm_packs_epi32(tmp0, tmp1); \
- res1 = _mm_packs_epi32(tmp2, tmp3); \
- }
-
-#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \
- out4, out5, out6, out7) \
- { \
- /* Stage1 */ \
- { \
- const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
- const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
- const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
- const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
- \
- MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \
- stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \
- } \
- \
- /* Stage2 */ \
- { \
- const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
- const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
- const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
- const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
- \
- MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \
- stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \
- \
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
- \
- tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
- tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
- tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
- tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- } \
- \
- /* Stage4 */ \
- out0 = _mm_add_epi16(stp1_0, stp2_7); \
- out1 = _mm_add_epi16(stp1_1, stp1_6); \
- out2 = _mm_add_epi16(stp1_2, stp1_5); \
- out3 = _mm_add_epi16(stp1_3, stp2_4); \
- out4 = _mm_sub_epi16(stp1_3, stp2_4); \
- out5 = _mm_sub_epi16(stp1_2, stp1_5); \
- out6 = _mm_sub_epi16(stp1_1, stp1_6); \
- out7 = _mm_sub_epi16(stp1_0, stp2_7); \
- }
+static INLINE void load_buffer_8x8(const tran_low_t *const input,
+ __m128i *const in) {
+ in[0] = load_input_data8(input + 0 * 8);
+ in[1] = load_input_data8(input + 1 * 8);
+ in[2] = load_input_data8(input + 2 * 8);
+ in[3] = load_input_data8(input + 3 * 8);
+ in[4] = load_input_data8(input + 4 * 8);
+ in[5] = load_input_data8(input + 5 * 8);
+ in[6] = load_input_data8(input + 6 * 8);
+ in[7] = load_input_data8(input + 7 * 8);
+}
void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 4);
- const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i in[8];
int i;
// Load input data.
- in0 = load_input_data(input);
- in1 = load_input_data(input + 8 * 1);
- in2 = load_input_data(input + 8 * 2);
- in3 = load_input_data(input + 8 * 3);
- in4 = load_input_data(input + 8 * 4);
- in5 = load_input_data(input + 8 * 5);
- in6 = load_input_data(input + 8 * 6);
- in7 = load_input_data(input + 8 * 7);
+ load_buffer_8x8(input, in);
// 2-D
for (i = 0; i < 2; i++) {
- // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
- TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
-
- // 4-stage 1D idct8x8
- IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5,
- in6, in7);
+ idct8_sse2(in);
}
- // Final rounding and shift
- in0 = _mm_adds_epi16(in0, final_rounding);
- in1 = _mm_adds_epi16(in1, final_rounding);
- in2 = _mm_adds_epi16(in2, final_rounding);
- in3 = _mm_adds_epi16(in3, final_rounding);
- in4 = _mm_adds_epi16(in4, final_rounding);
- in5 = _mm_adds_epi16(in5, final_rounding);
- in6 = _mm_adds_epi16(in6, final_rounding);
- in7 = _mm_adds_epi16(in7, final_rounding);
-
- in0 = _mm_srai_epi16(in0, 5);
- in1 = _mm_srai_epi16(in1, 5);
- in2 = _mm_srai_epi16(in2, 5);
- in3 = _mm_srai_epi16(in3, 5);
- in4 = _mm_srai_epi16(in4, 5);
- in5 = _mm_srai_epi16(in5, 5);
- in6 = _mm_srai_epi16(in6, 5);
- in7 = _mm_srai_epi16(in7, 5);
-
- RECON_AND_STORE(dest + 0 * stride, in0);
- RECON_AND_STORE(dest + 1 * stride, in1);
- RECON_AND_STORE(dest + 2 * stride, in2);
- RECON_AND_STORE(dest + 3 * stride, in3);
- RECON_AND_STORE(dest + 4 * stride, in4);
- RECON_AND_STORE(dest + 5 * stride, in5);
- RECON_AND_STORE(dest + 6 * stride, in6);
- RECON_AND_STORE(dest + 7 * stride, in7);
+ write_buffer_8x8(in, dest, stride);
}
-void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride) {
- __m128i dc_value;
- const __m128i zero = _mm_setzero_si128();
- int a;
-
- a = (int)dct_const_round_shift(input[0] * cospi_16_64);
- a = (int)dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 5);
+void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i io[8];
- dc_value = _mm_set1_epi16(a);
+ io[0] = load_input_data4(input + 0 * 8);
+ io[1] = load_input_data4(input + 1 * 8);
+ io[2] = load_input_data4(input + 2 * 8);
+ io[3] = load_input_data4(input + 3 * 8);
- RECON_AND_STORE(dest + 0 * stride, dc_value);
- RECON_AND_STORE(dest + 1 * stride, dc_value);
- RECON_AND_STORE(dest + 2 * stride, dc_value);
- RECON_AND_STORE(dest + 3 * stride, dc_value);
- RECON_AND_STORE(dest + 4 * stride, dc_value);
- RECON_AND_STORE(dest + 5 * stride, dc_value);
- RECON_AND_STORE(dest + 6 * stride, dc_value);
- RECON_AND_STORE(dest + 7 * stride, dc_value);
+ idct8x8_12_add_kernel_sse2(io);
+ write_buffer_8x8(io, dest, stride);
}
-void idct8_sse2(__m128i *in) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+static INLINE void recon_and_store_8_dual(uint8_t *const dest,
+ const __m128i in_x,
+ const int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i d0, d1;
+
+ d0 = _mm_loadl_epi64((__m128i *)(dest + 0 * stride));
+ d1 = _mm_loadl_epi64((__m128i *)(dest + 1 * stride));
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d1 = _mm_unpacklo_epi8(d1, zero);
+ d0 = _mm_add_epi16(in_x, d0);
+ d1 = _mm_add_epi16(in_x, d1);
+ d0 = _mm_packus_epi16(d0, d1);
+ _mm_storel_epi64((__m128i *)(dest + 0 * stride), d0);
+ _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d0));
+}
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i dc_value;
+ tran_high_t a1;
+ tran_low_t out =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
+
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+ dc_value = _mm_set1_epi16((int16_t)a1);
+
+ recon_and_store_8_dual(dest, dc_value, stride);
+ dest += 2 * stride;
+ recon_and_store_8_dual(dest, dc_value, stride);
+ dest += 2 * stride;
+ recon_and_store_8_dual(dest, dc_value, stride);
+ dest += 2 * stride;
+ recon_and_store_8_dual(dest, dc_value, stride);
+}
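The DC-only path performs the whole 2-D transform in scalar: the (int16_t) cast truncates the raw coefficient to its valid 16-bit range before the widening multiply, and applying dct_const_round_shift with cospi_16_64 (11585, roughly 0.7071 in Q14) twice scales the DC by about cos^2(pi/4) = 1/2, so a1 is approximately ROUND_POWER_OF_TWO(input[0] / 2, 5). The broadcast value is then added to all 64 pixels, two rows per recon_and_store_8_dual() call.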
+void idct8_sse2(__m128i *const in) {
// 8x8 Transpose is copied from vpx_fdct8x8_sse2()
- TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0,
- in1, in2, in3, in4, in5, in6, in7);
+ transpose_16bit_8x8(in, in);
// 4-stage 1D idct8x8
- IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3],
- in[4], in[5], in[6], in[7]);
+ idct8(in, in);
}
-void iadst8_sse2(__m128i *in) {
+void iadst8_sse2(__m128i *const in) {
const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
@@ -375,7 +247,7 @@ void iadst8_sse2(__m128i *in) {
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__const_0 = _mm_set1_epi16(0);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
@@ -386,7 +258,7 @@ void iadst8_sse2(__m128i *in) {
__m128i in0, in1, in2, in3, in4, in5, in6, in7;
// transpose
- array_transpose_8x8(in, in);
+ transpose_16bit_8x8(in, in);
// properly aligned for butterfly input
in0 = in[7];
@@ -548,37 +420,10 @@ void iadst8_sse2(__m128i *in) {
u2 = _mm_unpacklo_epi16(s6, s7);
u3 = _mm_unpackhi_epi16(s6, s7);
- v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
- v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
- v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
- v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
- v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
- v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
- v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
- v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
- u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
- u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
- u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
- u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
- u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
- u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
- u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
- u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
- v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
- v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
- v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
- v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
- v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
- v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
- v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
- v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
- s2 = _mm_packs_epi32(v0, v1);
- s3 = _mm_packs_epi32(v2, v3);
- s6 = _mm_packs_epi32(v4, v5);
- s7 = _mm_packs_epi32(v6, v7);
+ s2 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_p16);
+ s3 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_m16);
+ s6 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_p16);
+ s7 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_m16);
in[0] = s0;
in[1] = _mm_sub_epi16(k__const_0, s4);
@@ -590,521 +435,133 @@ void iadst8_sse2(__m128i *in) {
in[7] = _mm_sub_epi16(k__const_0, s1);
}
-void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 4);
- const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+static INLINE void idct16_load8x8(const tran_low_t *const input,
+ __m128i *const in) {
+ in[0] = load_input_data8(input + 0 * 16);
+ in[1] = load_input_data8(input + 1 * 16);
+ in[2] = load_input_data8(input + 2 * 16);
+ in[3] = load_input_data8(input + 3 * 16);
+ in[4] = load_input_data8(input + 4 * 16);
+ in[5] = load_input_data8(input + 5 * 16);
+ in[6] = load_input_data8(input + 6 * 16);
+ in[7] = load_input_data8(input + 7 * 16);
+}
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
- // Rows. Load 4-row input data.
- in0 = load_input_data(input);
- in1 = load_input_data(input + 8 * 1);
- in2 = load_input_data(input + 8 * 2);
- in3 = load_input_data(input + 8 * 3);
-
- // 8x4 Transpose
- TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
- // Stage1
- {
- const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
- const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
-
- tmp0 = _mm_madd_epi16(lo_17, stg1_0);
- tmp2 = _mm_madd_epi16(lo_17, stg1_1);
- tmp4 = _mm_madd_epi16(lo_35, stg1_2);
- tmp6 = _mm_madd_epi16(lo_35, stg1_3);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
- stp1_4 = _mm_packs_epi32(tmp0, tmp2);
- stp1_5 = _mm_packs_epi32(tmp4, tmp6);
- }
+void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i l[16], r[16], out[16], *in;
+ int i;
- // Stage2
- {
- const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
- const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
-
- tmp0 = _mm_madd_epi16(lo_04, stg2_0);
- tmp2 = _mm_madd_epi16(lo_04, stg2_1);
- tmp4 = _mm_madd_epi16(lo_26, stg2_2);
- tmp6 = _mm_madd_epi16(lo_26, stg2_3);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
- stp2_0 = _mm_packs_epi32(tmp0, tmp2);
- stp2_2 = _mm_packs_epi32(tmp6, tmp4);
-
- tmp0 = _mm_add_epi16(stp1_4, stp1_5);
- tmp1 = _mm_sub_epi16(stp1_4, stp1_5);
-
- stp2_4 = tmp0;
- stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
- stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
+ in = l;
+ for (i = 0; i < 2; i++) {
+ idct16_load8x8(input, in);
+ transpose_16bit_8x8(in, in);
+ idct16_load8x8(input + 8, in + 8);
+ transpose_16bit_8x8(in + 8, in + 8);
+ idct16_8col(in, in);
+ in = r;
+ input += 128;
}
- // Stage3
- {
- const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(l + i, out);
+ transpose_16bit_8x8(r + i, out + 8);
+ idct16_8col(out, out);
- tmp4 = _mm_add_epi16(stp2_0, stp2_2);
- tmp6 = _mm_sub_epi16(stp2_0, stp2_2);
+ for (j = 0; j < 16; ++j) {
+ write_buffer_8x1(dest + j * stride, out[j]);
+ }
- stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
- stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
+ dest += 8;
+ }
+}
- tmp0 = _mm_madd_epi16(lo_56, stg3_0);
- tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0
+void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i in[16], temp[16], out[16];
+ int i;
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ idct16_load8x8(input, in);
+ transpose_16bit_8x8(in, in);
- stp1_5 = _mm_packs_epi32(tmp0, tmp2);
+ for (i = 8; i < 16; i++) {
+ in[i] = _mm_setzero_si128();
}
+ idct16_8col(in, temp);
- // Stage4
- tmp0 = _mm_add_epi16(stp1_3, stp2_4);
- tmp1 = _mm_add_epi16(stp1_2, stp1_5);
- tmp2 = _mm_sub_epi16(stp1_3, stp2_4);
- tmp3 = _mm_sub_epi16(stp1_2, stp1_5);
-
- TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
-
- IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4,
- in5, in6, in7);
- // Final rounding and shift
- in0 = _mm_adds_epi16(in0, final_rounding);
- in1 = _mm_adds_epi16(in1, final_rounding);
- in2 = _mm_adds_epi16(in2, final_rounding);
- in3 = _mm_adds_epi16(in3, final_rounding);
- in4 = _mm_adds_epi16(in4, final_rounding);
- in5 = _mm_adds_epi16(in5, final_rounding);
- in6 = _mm_adds_epi16(in6, final_rounding);
- in7 = _mm_adds_epi16(in7, final_rounding);
-
- in0 = _mm_srai_epi16(in0, 5);
- in1 = _mm_srai_epi16(in1, 5);
- in2 = _mm_srai_epi16(in2, 5);
- in3 = _mm_srai_epi16(in3, 5);
- in4 = _mm_srai_epi16(in4, 5);
- in5 = _mm_srai_epi16(in5, 5);
- in6 = _mm_srai_epi16(in6, 5);
- in7 = _mm_srai_epi16(in7, 5);
-
- RECON_AND_STORE(dest + 0 * stride, in0);
- RECON_AND_STORE(dest + 1 * stride, in1);
- RECON_AND_STORE(dest + 2 * stride, in2);
- RECON_AND_STORE(dest + 3 * stride, in3);
- RECON_AND_STORE(dest + 4 * stride, in4);
- RECON_AND_STORE(dest + 5 * stride, in5);
- RECON_AND_STORE(dest + 6 * stride, in6);
- RECON_AND_STORE(dest + 7 * stride, in7);
-}
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(temp + i, in);
+ idct16_8col(in, out);
-#define IDCT16 \
- /* Stage2 */ \
- { \
- const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
- const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
- const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
- const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
- const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
- const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
- const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
- const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
- \
- MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \
- stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \
- \
- MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \
- stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
- const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
- const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
- const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
- \
- MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \
- stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \
- \
- stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
- stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
- stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
- \
- stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
- stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
- } \
- \
- /* Stage4 */ \
- { \
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
- const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
- const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
- const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- \
- MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1, \
- stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \
- \
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
- stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \
- stp2_13) \
- } \
- \
- /* Stage5 */ \
- { \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
- \
- stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
- } \
- \
- /* Stage6 */ \
- { \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
- stp2_12) \
- }
+ for (j = 0; j < 16; ++j) {
+ write_buffer_8x1(dest + j * stride, out[j]);
+ }
-#define IDCT16_10 \
- /* Stage2 */ \
- { \
- const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
- const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
- const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
- const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
- \
- MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \
- stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \
- stp1_12_0) \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
- const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
- \
- MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \
- \
- stp1_9 = stp1_8_0; \
- stp1_10 = stp1_11; \
- \
- stp1_13 = stp1_12_0; \
- stp1_14 = stp1_15; \
- } \
- \
- /* Stage4 */ \
- { \
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
- const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- \
- MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \
- stp2_5 = stp2_4; \
- stp2_6 = stp2_7; \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
- stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \
- stp2_13) \
- } \
- \
- /* Stage5 */ \
- { \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- \
- stp1_2 = stp1_1; \
- stp1_3 = stp1_0; \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
- \
- stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
- } \
- \
- /* Stage6 */ \
- { \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
- stp2_12) \
+ dest += 8;
}
+}
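
The rewritten second pass above delegates the final rounding, shift, and reconstruction to write_buffer_8x1(), which is not shown in this hunk. A minimal sketch of such a helper, assuming the recon_and_store() row helper from inv_txfm_sse2.h; it mirrors the RECON_AND_STORE sequence the removed code open-coded:

// Sketch only: final rounding/shift plus reconstruction for one 8-pixel row.
static INLINE void write_buffer_8x1(uint8_t *const dest, const __m128i in) {
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  __m128i out;
  out = _mm_adds_epi16(in, final_rounding);  // add 32 before shifting
  out = _mm_srai_epi16(out, 6);              // >> 6 undoes the transform scaling
  recon_and_store(dest, out);                // add residual to dest, saturate
}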
-void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- const __m128i zero = _mm_setzero_si128();
-
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in[16], l[16], r[16], *curr1;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_8_0, stp1_12_0;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i in[16], l[16];
int i;
- curr1 = l;
- for (i = 0; i < 2; i++) {
- // 1-D idct
-
- // Load input data.
- in[0] = load_input_data(input);
- in[8] = load_input_data(input + 8 * 1);
- in[1] = load_input_data(input + 8 * 2);
- in[9] = load_input_data(input + 8 * 3);
- in[2] = load_input_data(input + 8 * 4);
- in[10] = load_input_data(input + 8 * 5);
- in[3] = load_input_data(input + 8 * 6);
- in[11] = load_input_data(input + 8 * 7);
- in[4] = load_input_data(input + 8 * 8);
- in[12] = load_input_data(input + 8 * 9);
- in[5] = load_input_data(input + 8 * 10);
- in[13] = load_input_data(input + 8 * 11);
- in[6] = load_input_data(input + 8 * 12);
- in[14] = load_input_data(input + 8 * 13);
- in[7] = load_input_data(input + 8 * 14);
- in[15] = load_input_data(input + 8 * 15);
-
- array_transpose_8x8(in, in);
- array_transpose_8x8(in + 8, in + 8);
-
- IDCT16
-
- // Stage7
- curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
- curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
- curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
- curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
- curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
- curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
- curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
- curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
- curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
- curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
- curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
- curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
- curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
- curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
- curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
- curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
- curr1 = r;
- input += 128;
- }
- for (i = 0; i < 2; i++) {
+ // First 1-D inverse DCT
+ // Load input data.
+ in[0] = load_input_data4(input + 0 * 16);
+ in[1] = load_input_data4(input + 1 * 16);
+ in[2] = load_input_data4(input + 2 * 16);
+ in[3] = load_input_data4(input + 3 * 16);
+
+ idct16x16_10_pass1(in, l);
+
+ // Second 1-D inverse transform, performed per 8x16 block
+ for (i = 0; i < 16; i += 8) {
int j;
- // 1-D idct
- array_transpose_8x8(l + i * 8, in);
- array_transpose_8x8(r + i * 8, in + 8);
-
- IDCT16
-
- // 2-D
- in[0] = _mm_add_epi16(stp2_0, stp1_15);
- in[1] = _mm_add_epi16(stp2_1, stp1_14);
- in[2] = _mm_add_epi16(stp2_2, stp2_13);
- in[3] = _mm_add_epi16(stp2_3, stp2_12);
- in[4] = _mm_add_epi16(stp2_4, stp2_11);
- in[5] = _mm_add_epi16(stp2_5, stp2_10);
- in[6] = _mm_add_epi16(stp2_6, stp1_9);
- in[7] = _mm_add_epi16(stp2_7, stp1_8);
- in[8] = _mm_sub_epi16(stp2_7, stp1_8);
- in[9] = _mm_sub_epi16(stp2_6, stp1_9);
- in[10] = _mm_sub_epi16(stp2_5, stp2_10);
- in[11] = _mm_sub_epi16(stp2_4, stp2_11);
- in[12] = _mm_sub_epi16(stp2_3, stp2_12);
- in[13] = _mm_sub_epi16(stp2_2, stp2_13);
- in[14] = _mm_sub_epi16(stp2_1, stp1_14);
- in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+ idct16x16_10_pass2(l + i, in);
for (j = 0; j < 16; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- RECON_AND_STORE(dest + j * stride, in[j]);
+ write_buffer_8x1(dest + j * stride, in[j]);
}
dest += 8;
}
}
+static INLINE void recon_and_store_16(uint8_t *const dest, const __m128i in_x) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i d0, d1;
+
+ d0 = _mm_load_si128((__m128i *)(dest));
+ d1 = _mm_unpackhi_epi8(d0, zero);
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d0 = _mm_add_epi16(in_x, d0);
+ d1 = _mm_add_epi16(in_x, d1);
+ d0 = _mm_packus_epi16(d0, d1);
+ _mm_store_si128((__m128i *)(dest), d0);
+}
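
recon_and_store_16() widens 16 destination pixels to 16 bits, adds the same eight residual lanes to both halves, and packs back with unsigned saturation; that is exact for the DC-only path below, where every lane holds the same value. A scalar sketch of the per-pixel effect (the name is illustrative):

// Scalar equivalent for the DC-only use: add one constant to 16 pixels and
// saturate the result to [0, 255].
static void recon_and_store_16_dc_c(uint8_t *dest, int dc) {
  int i;
  for (i = 0; i < 16; ++i) {
    const int v = dest[i] + dc;
    dest[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}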
+
void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
__m128i dc_value;
- const __m128i zero = _mm_setzero_si128();
- int a, i;
-
- a = (int)dct_const_round_shift(input[0] * cospi_16_64);
- a = (int)dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 6);
+ int i;
+ tran_high_t a1;
+ tran_low_t out =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
- dc_value = _mm_set1_epi16(a);
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+ dc_value = _mm_set1_epi16((int16_t)a1);
for (i = 0; i < 16; ++i) {
- RECON_AND_STORE(dest + 0, dc_value);
- RECON_AND_STORE(dest + 8, dc_value);
+ recon_and_store_16(dest, dc_value);
dest += stride;
}
}
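
The DC value is scaled by cospi_16_64 with 14-bit rounding twice, once per 1-D pass, then rounded down by 6 bits. A scalar sketch of the arithmetic, assuming DCT_CONST_BITS == 14 and cospi_16_64 == 11585 as in vpx_dsp/txfm_common.h (WRAPLOW's 16-bit wrap is omitted here):

// Sketch of the DC-only computation performed above.
static int16_t idct16x16_dc_sketch(tran_low_t in0) {
  int32_t a = ((int32_t)(int16_t)in0 * 11585 + (1 << 13)) >> 14;  // pass 1
  a = (a * 11585 + (1 << 13)) >> 14;                              // pass 2
  return (int16_t)((a + 32) >> 6);  // ROUND_POWER_OF_TWO(a, 6)
}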
-static void iadst16_8col(__m128i *in) {
+static void iadst16_8col(__m128i *const in) {
// perform 16x16 1-D ADST for 8 columns
__m128i s[16], x[16], u[32], v[32];
const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -1132,8 +589,8 @@ static void iadst16_8col(__m128i *in) {
const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
@@ -1505,1718 +962,371 @@ static void iadst16_8col(__m128i *in) {
u[6] = _mm_unpacklo_epi16(s[14], s[15]);
u[7] = _mm_unpackhi_epi16(s[14], s[15]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
- v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
- v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
- v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+ in[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_m16);
+ in[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16);
+ in[4] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16);
+ in[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16);
+ in[6] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p16_p16);
+ in[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m16_p16);
+ in[5] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m16_m16);
+ in[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p16_m16);
in[0] = s[0];
in[1] = _mm_sub_epi16(kZero, s[8]);
in[2] = s[12];
in[3] = _mm_sub_epi16(kZero, s[4]);
- in[4] = _mm_packs_epi32(v[4], v[5]);
- in[5] = _mm_packs_epi32(v[12], v[13]);
- in[6] = _mm_packs_epi32(v[8], v[9]);
- in[7] = _mm_packs_epi32(v[0], v[1]);
- in[8] = _mm_packs_epi32(v[2], v[3]);
- in[9] = _mm_packs_epi32(v[10], v[11]);
- in[10] = _mm_packs_epi32(v[14], v[15]);
- in[11] = _mm_packs_epi32(v[6], v[7]);
in[12] = s[5];
in[13] = _mm_sub_epi16(kZero, s[13]);
in[14] = s[9];
in[15] = _mm_sub_epi16(kZero, s[1]);
}
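
The idct_calc_wraplow_sse2() calls above fold the deleted madd/round/shift/pack sequences into one helper. A sketch of the presumed shape, reproducing the arithmetic of the removed lines (the pack saturates; the real helper also provides the WRAPLOW semantics of the C reference):

static INLINE __m128i idct_calc_wraplow_sse2(const __m128i lo, const __m128i hi,
                                             const __m128i c) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i t0 = _mm_madd_epi16(lo, c);  // pairwise multiply-accumulate
  __m128i t1 = _mm_madd_epi16(hi, c);
  t0 = _mm_srai_epi32(_mm_add_epi32(t0, rounding), DCT_CONST_BITS);
  t1 = _mm_srai_epi32(_mm_add_epi32(t1, rounding), DCT_CONST_BITS);
  return _mm_packs_epi32(t0, t1);  // back to 16 bits
}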
-static void idct16_8col(__m128i *in) {
- const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
- const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i v[16], u[16], s[16], t[16];
+void idct16_sse2(__m128i *const in0, __m128i *const in1) {
+ transpose_16bit_16x16(in0, in1);
+ idct16_8col(in0, in0);
+ idct16_8col(in1, in1);
+}
- // stage 1
- s[0] = in[0];
- s[1] = in[8];
- s[2] = in[4];
- s[3] = in[12];
- s[4] = in[2];
- s[5] = in[10];
- s[6] = in[6];
- s[7] = in[14];
- s[8] = in[1];
- s[9] = in[9];
- s[10] = in[5];
- s[11] = in[13];
- s[12] = in[3];
- s[13] = in[11];
- s[14] = in[7];
- s[15] = in[15];
+void iadst16_sse2(__m128i *const in0, __m128i *const in1) {
+ transpose_16bit_16x16(in0, in1);
+ iadst16_8col(in0);
+ iadst16_8col(in1);
+}
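
idct16_sse2() and iadst16_sse2() now share one shape: transpose the 16x16 block, held as two 8x16 halves, then run the 1-D transform on each half. Since the transform is separable, a caller obtains the full 2-D result by applying a wrapper twice; in outline (the driver name is illustrative):

// Outline: two passes of the 1-D transform give the 2-D 16x16 transform.
static void iht16x16_outline(__m128i *const in0, __m128i *const in1) {
  idct16_sse2(in0, in1);  // first dimension (transpose + 8-column transform)
  idct16_sse2(in0, in1);  // second dimension
}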
- // stage 2
- u[0] = _mm_unpacklo_epi16(s[8], s[15]);
- u[1] = _mm_unpackhi_epi16(s[8], s[15]);
- u[2] = _mm_unpacklo_epi16(s[9], s[14]);
- u[3] = _mm_unpackhi_epi16(s[9], s[14]);
- u[4] = _mm_unpacklo_epi16(s[10], s[13]);
- u[5] = _mm_unpackhi_epi16(s[10], s[13]);
- u[6] = _mm_unpacklo_epi16(s[11], s[12]);
- u[7] = _mm_unpackhi_epi16(s[11], s[12]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
- v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
- v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[8] = _mm_packs_epi32(u[0], u[1]);
- s[15] = _mm_packs_epi32(u[2], u[3]);
- s[9] = _mm_packs_epi32(u[4], u[5]);
- s[14] = _mm_packs_epi32(u[6], u[7]);
- s[10] = _mm_packs_epi32(u[8], u[9]);
- s[13] = _mm_packs_epi32(u[10], u[11]);
- s[11] = _mm_packs_epi32(u[12], u[13]);
- s[12] = _mm_packs_epi32(u[14], u[15]);
+// Group the coefficient calculations into smaller functions to keep register
+// pressure low and avoid spilling to the stack in the 32x32 idct
+// optimizations:
+// quarter_1: output pixels 0-7
+// quarter_2: output pixels 8-15
+// quarter_3_4: output pixels 16-23 and 24-31
+
+// For each 8x32 block __m128i in[32]:
+// inputs used: indices 0 and 4
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[8]*/) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i step1[8], step2[8];
// stage 3
- t[0] = s[0];
- t[1] = s[1];
- t[2] = s[2];
- t[3] = s[3];
- u[0] = _mm_unpacklo_epi16(s[4], s[7]);
- u[1] = _mm_unpackhi_epi16(s[4], s[7]);
- u[2] = _mm_unpacklo_epi16(s[5], s[6]);
- u[3] = _mm_unpackhi_epi16(s[5], s[6]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- t[4] = _mm_packs_epi32(u[0], u[1]);
- t[7] = _mm_packs_epi32(u[2], u[3]);
- t[5] = _mm_packs_epi32(u[4], u[5]);
- t[6] = _mm_packs_epi32(u[6], u[7]);
- t[8] = _mm_add_epi16(s[8], s[9]);
- t[9] = _mm_sub_epi16(s[8], s[9]);
- t[10] = _mm_sub_epi16(s[11], s[10]);
- t[11] = _mm_add_epi16(s[10], s[11]);
- t[12] = _mm_add_epi16(s[12], s[13]);
- t[13] = _mm_sub_epi16(s[12], s[13]);
- t[14] = _mm_sub_epi16(s[15], s[14]);
- t[15] = _mm_add_epi16(s[14], s[15]);
+ butterfly(in[4], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
// stage 4
- u[0] = _mm_unpacklo_epi16(t[0], t[1]);
- u[1] = _mm_unpackhi_epi16(t[0], t[1]);
- u[2] = _mm_unpacklo_epi16(t[2], t[3]);
- u[3] = _mm_unpackhi_epi16(t[2], t[3]);
- u[4] = _mm_unpacklo_epi16(t[9], t[14]);
- u[5] = _mm_unpackhi_epi16(t[9], t[14]);
- u[6] = _mm_unpacklo_epi16(t[10], t[13]);
- u[7] = _mm_unpackhi_epi16(t[10], t[13]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
- v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
- v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
- v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
- v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[0] = _mm_packs_epi32(u[0], u[1]);
- s[1] = _mm_packs_epi32(u[2], u[3]);
- s[2] = _mm_packs_epi32(u[4], u[5]);
- s[3] = _mm_packs_epi32(u[6], u[7]);
- s[4] = _mm_add_epi16(t[4], t[5]);
- s[5] = _mm_sub_epi16(t[4], t[5]);
- s[6] = _mm_sub_epi16(t[7], t[6]);
- s[7] = _mm_add_epi16(t[6], t[7]);
- s[8] = t[8];
- s[15] = t[15];
- s[9] = _mm_packs_epi32(u[8], u[9]);
- s[14] = _mm_packs_epi32(u[10], u[11]);
- s[10] = _mm_packs_epi32(u[12], u[13]);
- s[13] = _mm_packs_epi32(u[14], u[15]);
- s[11] = t[11];
- s[12] = t[12];
+ step2[0] = butterfly_cospi16(in[0]);
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
// stage 5
- t[0] = _mm_add_epi16(s[0], s[3]);
- t[1] = _mm_add_epi16(s[1], s[2]);
- t[2] = _mm_sub_epi16(s[1], s[2]);
- t[3] = _mm_sub_epi16(s[0], s[3]);
- t[4] = s[4];
- t[7] = s[7];
-
- u[0] = _mm_unpacklo_epi16(s[5], s[6]);
- u[1] = _mm_unpackhi_epi16(s[5], s[6]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- t[5] = _mm_packs_epi32(u[0], u[1]);
- t[6] = _mm_packs_epi32(u[2], u[3]);
-
- t[8] = _mm_add_epi16(s[8], s[11]);
- t[9] = _mm_add_epi16(s[9], s[10]);
- t[10] = _mm_sub_epi16(s[9], s[10]);
- t[11] = _mm_sub_epi16(s[8], s[11]);
- t[12] = _mm_sub_epi16(s[15], s[12]);
- t[13] = _mm_sub_epi16(s[14], s[13]);
- t[14] = _mm_add_epi16(s[13], s[14]);
- t[15] = _mm_add_epi16(s[12], s[15]);
+ step1[0] = step2[0];
+ step1[1] = step2[0];
+ step1[2] = step2[0];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+ step1[7] = step2[7];
// stage 6
- s[0] = _mm_add_epi16(t[0], t[7]);
- s[1] = _mm_add_epi16(t[1], t[6]);
- s[2] = _mm_add_epi16(t[2], t[5]);
- s[3] = _mm_add_epi16(t[3], t[4]);
- s[4] = _mm_sub_epi16(t[3], t[4]);
- s[5] = _mm_sub_epi16(t[2], t[5]);
- s[6] = _mm_sub_epi16(t[1], t[6]);
- s[7] = _mm_sub_epi16(t[0], t[7]);
- s[8] = t[8];
- s[9] = t[9];
-
- u[0] = _mm_unpacklo_epi16(t[10], t[13]);
- u[1] = _mm_unpackhi_epi16(t[10], t[13]);
- u[2] = _mm_unpacklo_epi16(t[11], t[12]);
- u[3] = _mm_unpackhi_epi16(t[11], t[12]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- s[10] = _mm_packs_epi32(u[0], u[1]);
- s[13] = _mm_packs_epi32(u[2], u[3]);
- s[11] = _mm_packs_epi32(u[4], u[5]);
- s[12] = _mm_packs_epi32(u[6], u[7]);
- s[14] = t[14];
- s[15] = t[15];
-
- // stage 7
- in[0] = _mm_add_epi16(s[0], s[15]);
- in[1] = _mm_add_epi16(s[1], s[14]);
- in[2] = _mm_add_epi16(s[2], s[13]);
- in[3] = _mm_add_epi16(s[3], s[12]);
- in[4] = _mm_add_epi16(s[4], s[11]);
- in[5] = _mm_add_epi16(s[5], s[10]);
- in[6] = _mm_add_epi16(s[6], s[9]);
- in[7] = _mm_add_epi16(s[7], s[8]);
- in[8] = _mm_sub_epi16(s[7], s[8]);
- in[9] = _mm_sub_epi16(s[6], s[9]);
- in[10] = _mm_sub_epi16(s[5], s[10]);
- in[11] = _mm_sub_epi16(s[4], s[11]);
- in[12] = _mm_sub_epi16(s[3], s[12]);
- in[13] = _mm_sub_epi16(s[2], s[13]);
- in[14] = _mm_sub_epi16(s[1], s[14]);
- in[15] = _mm_sub_epi16(s[0], s[15]);
+ out[0] = _mm_add_epi16(step1[0], step1[7]);
+ out[1] = _mm_add_epi16(step1[1], step1[6]);
+ out[2] = _mm_add_epi16(step1[2], step1[5]);
+ out[3] = _mm_add_epi16(step1[3], step1[4]);
+ out[4] = _mm_sub_epi16(step1[3], step1[4]);
+ out[5] = _mm_sub_epi16(step1[2], step1[5]);
+ out[6] = _mm_sub_epi16(step1[1], step1[6]);
+ out[7] = _mm_sub_epi16(step1[0], step1[7]);
}
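
The butterfly() and butterfly_cospi16() helpers replace the unpack/madd/round/shift/pack boilerplate of the removed code. A sketch of the presumed implementation in terms of idct_calc_wraplow_sse2() above: out0 receives in0*c0 - in1*c1 and out1 receives in0*c1 + in1*c0, each rounded and shifted by DCT_CONST_BITS:

static INLINE void butterfly(const __m128i in0, const __m128i in1, const int c0,
                             const int c1, __m128i *const out0,
                             __m128i *const out1) {
  const __m128i cst0 = pair_set_epi16(c0, -c1);  // matches e.g. removed stg3_0
  const __m128i cst1 = pair_set_epi16(c1, c0);   // matches e.g. removed stg3_1
  const __m128i lo = _mm_unpacklo_epi16(in0, in1);
  const __m128i hi = _mm_unpackhi_epi16(in0, in1);
  *out0 = idct_calc_wraplow_sse2(lo, hi, cst0);
  *out1 = idct_calc_wraplow_sse2(lo, hi, cst1);
}

butterfly_cospi16(in) is then the degenerate case c0 == c1 == cospi_16_64 with in1 == 0, i.e. a rounded scaling of in by cospi_16_64.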
-void idct16_sse2(__m128i *in0, __m128i *in1) {
- array_transpose_16x16(in0, in1);
- idct16_8col(in0);
- idct16_8col(in1);
+// For each 8x32 block __m128i in[32]:
+// inputs used: indices 2 and 6
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[16]*/) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i step1[16], step2[16];
+
+ // stage 2
+ butterfly(in[2], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+ butterfly(zero, in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+ // stage 3
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+
+ idct32_8x32_quarter_2_stage_4_to_6(step1, out);
}
-void iadst16_sse2(__m128i *in0, __m128i *in1) {
- array_transpose_16x16(in0, in1);
- iadst16_8col(in0);
- iadst16_8col(in1);
+static INLINE void idct32_34_8x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ idct32_34_8x32_quarter_1(in, temp);
+ idct32_34_8x32_quarter_2(in, temp);
+ // stage 7
+ add_sub_butterfly(temp, out, 16);
}
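
add_sub_butterfly(temp, out, 16) collapses the stage-7 sums and differences into one call. A sketch, assuming it mirrors the removed per-register adds and subs: element i of the first half pairs with element size-1-i of the second half:

static INLINE void add_sub_butterfly(const __m128i *const in,
                                     __m128i *const out, const int size) {
  int i;
  const int num = size >> 1;
  const int bound = size - 1;
  for (i = 0; i < num; ++i) {
    out[i] = _mm_add_epi16(in[i], in[bound - i]);          // sums
    out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]);  // differences
  }
}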
-void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+// For each 8x32 block __m128i in[32]:
+// inputs used: odd indices 1, 3, 5, 7
+// output pixels: 16-23 and 24-31 in __m128i out[32]
+static INLINE void idct32_34_8x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
const __m128i zero = _mm_setzero_si128();
+ __m128i step1[32];
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ // stage 1
+ butterfly(in[1], zero, cospi_31_64, cospi_1_64, &step1[16], &step1[31]);
+ butterfly(zero, in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]);
+ butterfly(in[5], zero, cospi_27_64, cospi_5_64, &step1[20], &step1[27]);
+ butterfly(zero, in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]);
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ // stage 3
+ butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17],
+ &step1[30]);
+ butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18],
+ &step1[29]);
+ butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21],
+ &step1[26]);
+ butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22],
+ &step1[25]);
+
+ idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);
+}
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+void idct32_34_8x32_sse2(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[32]*/) {
+ __m128i temp[32];
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- __m128i in[16], l[16];
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8,
- stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0,
- stp1_12_0;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i;
- // First 1-D inverse DCT
- // Load input data.
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8 * 2);
- in[2] = load_input_data(input + 8 * 4);
- in[3] = load_input_data(input + 8 * 6);
-
- TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
-
- // Stage2
- {
- const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
- const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
-
- tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
- tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
- tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
- tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp5 = _mm_add_epi32(tmp5, rounding);
- tmp7 = _mm_add_epi32(tmp7, rounding);
-
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
- stp2_8 = _mm_packs_epi32(tmp0, tmp2);
- stp2_11 = _mm_packs_epi32(tmp5, tmp7);
- }
+ idct32_34_8x32_quarter_1_2(in, temp);
+ idct32_34_8x32_quarter_3_4(in, temp);
+ // final stage
+ add_sub_butterfly(temp, out, 32);
+}
- // Stage3
- {
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
+// Only the upper-left 8x8 block has non-zero coefficients.
+void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i io[32], col[32];
+ int i;
- tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
- tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
+  // Load input data. Only the top-left 8x8 block needs to be loaded.
+ load_transpose_16bit_8x8(input, 32, io);
+ idct32_34_8x32_sse2(io, col);
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ for (i = 0; i < 32; i += 8) {
+ int j;
+ transpose_16bit_8x8(col + i, io);
+ idct32_34_8x32_sse2(io, io);
- stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
- stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
+ for (j = 0; j < 32; ++j) {
+ write_buffer_8x1(dest + j * stride, io[j]);
+ }
- stp1_4 = _mm_packs_epi32(tmp0, tmp2);
+ dest += 8;
}
+}
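
load_transpose_16bit_8x8(input, 32, io) is assumed to load eight rows of eight coefficients at a row stride of 32 and transpose them, so each __m128i in io[] holds one column of the 8x8 block. A sketch, assuming load_input_data8() and transpose_16bit_8x8() from inv_txfm_sse2.h:

static INLINE void load_transpose_16bit_8x8(const tran_low_t *input,
                                            const int stride,
                                            __m128i *const out) {
  __m128i rows[8];
  int i;
  for (i = 0; i < 8; ++i) {
    rows[i] = load_input_data8(input + i * stride);  // one 8-coefficient row
  }
  transpose_16bit_8x8(rows, out);
}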
- // Stage4
- {
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
-
- tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
- tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
- tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
- tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
- tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
- tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
- tmp5 = _mm_add_epi32(tmp5, rounding);
- tmp7 = _mm_add_epi32(tmp7, rounding);
-
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
- stp1_0 = _mm_packs_epi32(tmp0, tmp0);
- stp1_1 = _mm_packs_epi32(tmp2, tmp2);
- stp2_9 = _mm_packs_epi32(tmp1, tmp3);
- stp2_10 = _mm_packs_epi32(tmp5, tmp7);
-
- stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
- }
+// For each 8x32 block __m128i in[32]:
+// inputs used: indices 0, 4, 8, 12, 16, 20, 24, 28
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void idct32_1024_8x32_quarter_1(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
- // Stage5 and Stage6
- {
- tmp0 = _mm_add_epi16(stp2_8, stp2_11);
- tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
- tmp2 = _mm_add_epi16(stp2_9, stp2_10);
- tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
-
- stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
- stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
- stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
- stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
-
- stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
- stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
- stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
- stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
- }
+ // stage 3
+ butterfly(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+ butterfly(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
- // Stage6
- {
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
-
- tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
- tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
- tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
- tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
- tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
- tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
-
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
-
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
- stp1_6 = _mm_packs_epi32(tmp3, tmp1);
-
- stp2_10 = _mm_packs_epi32(tmp0, zero);
- stp2_13 = _mm_packs_epi32(tmp2, zero);
- stp2_11 = _mm_packs_epi32(tmp4, zero);
- stp2_12 = _mm_packs_epi32(tmp6, zero);
-
- tmp0 = _mm_add_epi16(stp1_0, stp1_4);
- tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
- tmp2 = _mm_add_epi16(stp1_1, stp1_6);
- tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
-
- stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
- stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
- stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
- stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
- stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
- stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
- stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
- stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
- }
+ // stage 4
+ butterfly(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+ butterfly(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+ step2[4] = _mm_add_epi16(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[7], step1[6]);
- // Stage7. Left 8x16 only.
- l[0] = _mm_add_epi16(stp2_0, stp1_15);
- l[1] = _mm_add_epi16(stp2_1, stp1_14);
- l[2] = _mm_add_epi16(stp2_2, stp2_13);
- l[3] = _mm_add_epi16(stp2_3, stp2_12);
- l[4] = _mm_add_epi16(stp2_4, stp2_11);
- l[5] = _mm_add_epi16(stp2_5, stp2_10);
- l[6] = _mm_add_epi16(stp2_6, stp1_9);
- l[7] = _mm_add_epi16(stp2_7, stp1_8);
- l[8] = _mm_sub_epi16(stp2_7, stp1_8);
- l[9] = _mm_sub_epi16(stp2_6, stp1_9);
- l[10] = _mm_sub_epi16(stp2_5, stp2_10);
- l[11] = _mm_sub_epi16(stp2_4, stp2_11);
- l[12] = _mm_sub_epi16(stp2_3, stp2_12);
- l[13] = _mm_sub_epi16(stp2_2, stp2_13);
- l[14] = _mm_sub_epi16(stp2_1, stp1_14);
- l[15] = _mm_sub_epi16(stp2_0, stp1_15);
+ // stage 5
+ step1[0] = _mm_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm_add_epi16(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi16(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+ step1[7] = step2[7];
- // Second 1-D inverse transform, performed per 8x16 block
- for (i = 0; i < 2; i++) {
- int j;
- array_transpose_4X8(l + 8 * i, in);
-
- IDCT16_10
-
- // Stage7
- in[0] = _mm_add_epi16(stp2_0, stp1_15);
- in[1] = _mm_add_epi16(stp2_1, stp1_14);
- in[2] = _mm_add_epi16(stp2_2, stp2_13);
- in[3] = _mm_add_epi16(stp2_3, stp2_12);
- in[4] = _mm_add_epi16(stp2_4, stp2_11);
- in[5] = _mm_add_epi16(stp2_5, stp2_10);
- in[6] = _mm_add_epi16(stp2_6, stp1_9);
- in[7] = _mm_add_epi16(stp2_7, stp1_8);
- in[8] = _mm_sub_epi16(stp2_7, stp1_8);
- in[9] = _mm_sub_epi16(stp2_6, stp1_9);
- in[10] = _mm_sub_epi16(stp2_5, stp2_10);
- in[11] = _mm_sub_epi16(stp2_4, stp2_11);
- in[12] = _mm_sub_epi16(stp2_3, stp2_12);
- in[13] = _mm_sub_epi16(stp2_2, stp2_13);
- in[14] = _mm_sub_epi16(stp2_1, stp1_14);
- in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+ // stage 6
+ out[0] = _mm_add_epi16(step1[0], step1[7]);
+ out[1] = _mm_add_epi16(step1[1], step1[6]);
+ out[2] = _mm_add_epi16(step1[2], step1[5]);
+ out[3] = _mm_add_epi16(step1[3], step1[4]);
+ out[4] = _mm_sub_epi16(step1[3], step1[4]);
+ out[5] = _mm_sub_epi16(step1[2], step1[5]);
+ out[6] = _mm_sub_epi16(step1[1], step1[6]);
+ out[7] = _mm_sub_epi16(step1[0], step1[7]);
+}
- for (j = 0; j < 16; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- RECON_AND_STORE(dest + j * stride, in[j]);
- }
+// For each 8x32 block __m128i in[32]:
+// inputs used: indices 2, 6, 10, 14, 18, 22, 26, 30
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void idct32_1024_8x32_quarter_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[16]*/) {
+ __m128i step1[16], step2[16];
- dest += 8;
- }
+ // stage 2
+ butterfly(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+ butterfly(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]);
+ butterfly(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]);
+ butterfly(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+ // stage 3
+ step1[8] = _mm_add_epi16(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi16(step2[8], step2[9]);
+ step1[10] = _mm_sub_epi16(step2[11], step2[10]);
+ step1[11] = _mm_add_epi16(step2[11], step2[10]);
+ step1[12] = _mm_add_epi16(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi16(step2[12], step2[13]);
+ step1[14] = _mm_sub_epi16(step2[15], step2[14]);
+ step1[15] = _mm_add_epi16(step2[15], step2[14]);
+
+ idct32_8x32_quarter_2_stage_4_to_6(step1, out);
}
-#define LOAD_DQCOEFF(reg, input) \
- { \
- reg = load_input_data(input); \
- input += 8; \
- }
+static INLINE void idct32_1024_8x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ idct32_1024_8x32_quarter_1(in, temp);
+ idct32_1024_8x32_quarter_2(in, temp);
+ // stage 7
+ add_sub_butterfly(temp, out, 16);
+}
-#define IDCT32_34 \
- /* Stage1 */ \
- { \
- const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
- const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
- \
- const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
- const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
- \
- const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
- const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
- \
- const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
- const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
- \
- MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \
- stp1_31); \
- MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \
- stp1_28); \
- MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \
- stp1_27); \
- MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \
- stp1_24); \
- } \
- \
- /* Stage2 */ \
- { \
- const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
- const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
- \
- const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
- const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
- \
- MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \
- stp2_15); \
- MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \
- stp2_12); \
- \
- stp2_16 = stp1_16; \
- stp2_19 = stp1_19; \
- \
- stp2_20 = stp1_20; \
- stp2_23 = stp1_23; \
- \
- stp2_24 = stp1_24; \
- stp2_27 = stp1_27; \
- \
- stp2_28 = stp1_28; \
- stp2_31 = stp1_31; \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
- const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
- \
- const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
- const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
- \
- MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \
- stp1_7); \
- \
- stp1_8 = stp2_8; \
- stp1_11 = stp2_11; \
- stp1_12 = stp2_12; \
- stp1_15 = stp2_15; \
- \
- MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
- stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
- stp1_29) \
- MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
- stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
- stp1_25) \
- \
- stp1_16 = stp2_16; \
- stp1_31 = stp2_31; \
- stp1_19 = stp2_19; \
- stp1_20 = stp2_20; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_27 = stp2_27; \
- stp1_28 = stp2_28; \
- } \
- \
- /* Stage4 */ \
- { \
- const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
- const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
- \
- MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \
- stp2_1); \
- \
- stp2_4 = stp1_4; \
- stp2_5 = stp1_4; \
- stp2_6 = stp1_7; \
- stp2_7 = stp1_7; \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
- stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
- stp2_13) \
- \
- stp2_8 = stp1_8; \
- stp2_15 = stp1_15; \
- stp2_11 = stp1_11; \
- stp2_12 = stp1_12; \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
- stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
- stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
- stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
- stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
- stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
- \
- stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
- stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
- stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
- stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
- stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
- } \
- \
- /* Stage5 */ \
- { \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
- \
- const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
- const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- stp1_0 = stp2_0; \
- stp1_1 = stp2_1; \
- stp1_2 = stp2_1; \
- stp1_3 = stp2_0; \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_4 = stp2_4; \
- stp1_7 = stp2_7; \
- \
- stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- \
- MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
- stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
- stp1_28) \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
- stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
- stp1_26) \
- \
- stp1_22 = stp2_22; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_25 = stp2_25; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
- } \
- \
- /* Stage6 */ \
- { \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
- \
- stp2_8 = stp1_8; \
- stp2_9 = stp1_9; \
- stp2_14 = stp1_14; \
- stp2_15 = stp1_15; \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
- stp2_12) \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
- stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
- stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
- stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
- stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
- stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
- \
- stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
- stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
- stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
- stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
- stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
- } \
- \
- /* Stage7 */ \
- { \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
- const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
- const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
- stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
- stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
- stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
- stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
- stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
- stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
- stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
- stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
- stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- stp1_18 = stp2_18; \
- stp1_19 = stp2_19; \
- \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
- stp1_26) \
- MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
- stp1_24) \
- \
- stp1_28 = stp2_28; \
- stp1_29 = stp2_29; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
- }
+// For each 8x32 block __m128i in[32]:
+// inputs used: odd indices
+// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+// output pixels: 16-23 and 24-31 in __m128i out[32]
+static INLINE void idct32_1024_8x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
-#define IDCT32 \
- /* Stage1 */ \
- { \
- const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
- const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
- const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
- const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
- \
- const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
- const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
- const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
- const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
- \
- const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
- const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
- const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
- const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
- \
- const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
- const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
- const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
- const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
- \
- MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
- stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \
- stp1_30) \
- MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \
- stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \
- MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
- stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
- stp1_21, stp1_26) \
- MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
- stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
- stp1_23, stp1_24) \
- } \
- \
- /* Stage2 */ \
- { \
- const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
- const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
- const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
- const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
- \
- const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
- const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
- const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
- const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
- \
- MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
- stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
- stp2_14) \
- MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
- stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \
- stp2_12) \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
- stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
- stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
- stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
- \
- stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
- stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
- stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
- stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
- \
- stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
- stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
- stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
- stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
- \
- stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
- stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
- stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
- const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
- const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
- const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
- \
- const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
- const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
- \
- MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
- stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
- stp1_6) \
- \
- stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
- stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
- stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
- stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
- stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
- \
- MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
- stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
- stp1_29) \
- MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
- stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
- stp1_25) \
- \
- stp1_16 = stp2_16; \
- stp1_31 = stp2_31; \
- stp1_19 = stp2_19; \
- stp1_20 = stp2_20; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_27 = stp2_27; \
- stp1_28 = stp2_28; \
- } \
- \
- /* Stage4 */ \
- { \
- const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
- const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
- const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
- const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- \
- MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \
- stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \
- \
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
- stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
- stp2_13) \
- \
- stp2_8 = stp1_8; \
- stp2_15 = stp1_15; \
- stp2_11 = stp1_11; \
- stp2_12 = stp1_12; \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
- stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
- stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
- stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
- stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
- stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
- \
- stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
- stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
- stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
- stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
- stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
- } \
- \
- /* Stage5 */ \
- { \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
- \
- const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
- const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_4 = stp2_4; \
- stp1_7 = stp2_7; \
- \
- stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- \
- MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
- stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
- stp1_28) \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
- stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
- stp1_26) \
- \
- stp1_22 = stp2_22; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_25 = stp2_25; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
- } \
- \
- /* Stage6 */ \
- { \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
- \
- stp2_8 = stp1_8; \
- stp2_9 = stp1_9; \
- stp2_14 = stp1_14; \
- stp2_15 = stp1_15; \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
- stp2_12) \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
- stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
- stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
- stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
- stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
- stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
- \
- stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
- stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
- stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
- stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
- stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
- } \
- \
- /* Stage7 */ \
- { \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
- const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
- const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
- stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
- stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
- stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
- stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
- stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
- stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
- stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
- stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
- stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- stp1_18 = stp2_18; \
- stp1_19 = stp2_19; \
- \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
- stp1_26) \
- MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
- stp1_24) \
- \
- stp1_28 = stp2_28; \
- stp1_29 = stp2_29; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
- }
+ // stage 1
+ butterfly(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]);
+ butterfly(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]);
+ butterfly(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]);
+ butterfly(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]);
-// Only upper-left 8x8 has non-zero coeff
-void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-
- // idct constants for each stage
- const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
- const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
- const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
- const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
- const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
- const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
- const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
- const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in[32], col[32];
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
- stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
- stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
- stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i;
+ butterfly(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]);
+ butterfly(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]);
- // Load input data. Only need to load the top left 8x8 block.
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 32);
- in[2] = load_input_data(input + 64);
- in[3] = load_input_data(input + 96);
- in[4] = load_input_data(input + 128);
- in[5] = load_input_data(input + 160);
- in[6] = load_input_data(input + 192);
- in[7] = load_input_data(input + 224);
-
- array_transpose_8x8(in, in);
- IDCT32_34
-
- // 1_D: Store 32 intermediate results for each 8x32 block.
- col[0] = _mm_add_epi16(stp1_0, stp1_31);
- col[1] = _mm_add_epi16(stp1_1, stp1_30);
- col[2] = _mm_add_epi16(stp1_2, stp1_29);
- col[3] = _mm_add_epi16(stp1_3, stp1_28);
- col[4] = _mm_add_epi16(stp1_4, stp1_27);
- col[5] = _mm_add_epi16(stp1_5, stp1_26);
- col[6] = _mm_add_epi16(stp1_6, stp1_25);
- col[7] = _mm_add_epi16(stp1_7, stp1_24);
- col[8] = _mm_add_epi16(stp1_8, stp1_23);
- col[9] = _mm_add_epi16(stp1_9, stp1_22);
- col[10] = _mm_add_epi16(stp1_10, stp1_21);
- col[11] = _mm_add_epi16(stp1_11, stp1_20);
- col[12] = _mm_add_epi16(stp1_12, stp1_19);
- col[13] = _mm_add_epi16(stp1_13, stp1_18);
- col[14] = _mm_add_epi16(stp1_14, stp1_17);
- col[15] = _mm_add_epi16(stp1_15, stp1_16);
- col[16] = _mm_sub_epi16(stp1_15, stp1_16);
- col[17] = _mm_sub_epi16(stp1_14, stp1_17);
- col[18] = _mm_sub_epi16(stp1_13, stp1_18);
- col[19] = _mm_sub_epi16(stp1_12, stp1_19);
- col[20] = _mm_sub_epi16(stp1_11, stp1_20);
- col[21] = _mm_sub_epi16(stp1_10, stp1_21);
- col[22] = _mm_sub_epi16(stp1_9, stp1_22);
- col[23] = _mm_sub_epi16(stp1_8, stp1_23);
- col[24] = _mm_sub_epi16(stp1_7, stp1_24);
- col[25] = _mm_sub_epi16(stp1_6, stp1_25);
- col[26] = _mm_sub_epi16(stp1_5, stp1_26);
- col[27] = _mm_sub_epi16(stp1_4, stp1_27);
- col[28] = _mm_sub_epi16(stp1_3, stp1_28);
- col[29] = _mm_sub_epi16(stp1_2, stp1_29);
- col[30] = _mm_sub_epi16(stp1_1, stp1_30);
- col[31] = _mm_sub_epi16(stp1_0, stp1_31);
- for (i = 0; i < 4; i++) {
- int j;
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(col + i * 8, in);
- IDCT32_34
-
- // 2_D: Calculate the results and store them to destination.
- in[0] = _mm_add_epi16(stp1_0, stp1_31);
- in[1] = _mm_add_epi16(stp1_1, stp1_30);
- in[2] = _mm_add_epi16(stp1_2, stp1_29);
- in[3] = _mm_add_epi16(stp1_3, stp1_28);
- in[4] = _mm_add_epi16(stp1_4, stp1_27);
- in[5] = _mm_add_epi16(stp1_5, stp1_26);
- in[6] = _mm_add_epi16(stp1_6, stp1_25);
- in[7] = _mm_add_epi16(stp1_7, stp1_24);
- in[8] = _mm_add_epi16(stp1_8, stp1_23);
- in[9] = _mm_add_epi16(stp1_9, stp1_22);
- in[10] = _mm_add_epi16(stp1_10, stp1_21);
- in[11] = _mm_add_epi16(stp1_11, stp1_20);
- in[12] = _mm_add_epi16(stp1_12, stp1_19);
- in[13] = _mm_add_epi16(stp1_13, stp1_18);
- in[14] = _mm_add_epi16(stp1_14, stp1_17);
- in[15] = _mm_add_epi16(stp1_15, stp1_16);
- in[16] = _mm_sub_epi16(stp1_15, stp1_16);
- in[17] = _mm_sub_epi16(stp1_14, stp1_17);
- in[18] = _mm_sub_epi16(stp1_13, stp1_18);
- in[19] = _mm_sub_epi16(stp1_12, stp1_19);
- in[20] = _mm_sub_epi16(stp1_11, stp1_20);
- in[21] = _mm_sub_epi16(stp1_10, stp1_21);
- in[22] = _mm_sub_epi16(stp1_9, stp1_22);
- in[23] = _mm_sub_epi16(stp1_8, stp1_23);
- in[24] = _mm_sub_epi16(stp1_7, stp1_24);
- in[25] = _mm_sub_epi16(stp1_6, stp1_25);
- in[26] = _mm_sub_epi16(stp1_5, stp1_26);
- in[27] = _mm_sub_epi16(stp1_4, stp1_27);
- in[28] = _mm_sub_epi16(stp1_3, stp1_28);
- in[29] = _mm_sub_epi16(stp1_2, stp1_29);
- in[30] = _mm_sub_epi16(stp1_1, stp1_30);
- in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+ butterfly(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]);
+ butterfly(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]);
- for (j = 0; j < 32; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- RECON_AND_STORE(dest + j * stride, in[j]);
- }
+ // stage 2
+ step2[16] = _mm_add_epi16(step1[16], step1[17]);
+ step2[17] = _mm_sub_epi16(step1[16], step1[17]);
+ step2[18] = _mm_sub_epi16(step1[19], step1[18]);
+ step2[19] = _mm_add_epi16(step1[19], step1[18]);
+ step2[20] = _mm_add_epi16(step1[20], step1[21]);
+ step2[21] = _mm_sub_epi16(step1[20], step1[21]);
+ step2[22] = _mm_sub_epi16(step1[23], step1[22]);
+ step2[23] = _mm_add_epi16(step1[23], step1[22]);
+
+ step2[24] = _mm_add_epi16(step1[24], step1[25]);
+ step2[25] = _mm_sub_epi16(step1[24], step1[25]);
+ step2[26] = _mm_sub_epi16(step1[27], step1[26]);
+ step2[27] = _mm_add_epi16(step1[27], step1[26]);
+ step2[28] = _mm_add_epi16(step1[28], step1[29]);
+ step2[29] = _mm_sub_epi16(step1[28], step1[29]);
+ step2[30] = _mm_sub_epi16(step1[31], step1[30]);
+ step2[31] = _mm_add_epi16(step1[31], step1[30]);
- dest += 8;
- }
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17],
+ &step1[30]);
+ butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18],
+ &step1[29]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21],
+ &step1[26]);
+ butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22],
+ &step1[25]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);
+}
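A note on the eight stage-1 calls in this function: each butterfly pairs input k with input 32 - k and rotates the pair by (cospi_(32-k)_64, cospi_k_64), filling step1[16..31]. Tabulated as a hypothetical illustration (not part of the patch):

// Each row: first input, second input, first cospi index, second cospi index.
static const int idct32_stage1_pairs[8][4] = {
  { 1, 31, 31, 1 },  { 17, 15, 15, 17 }, { 9, 23, 23, 9 },   { 25, 7, 7, 25 },
  { 5, 27, 27, 5 },  { 21, 11, 11, 21 }, { 13, 19, 19, 13 }, { 29, 3, 3, 29 },
};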
+
+void idct32_1024_8x32(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[32]*/) {
+ __m128i temp[32];
+
+ idct32_1024_8x32_quarter_1_2(in, temp);
+ idct32_1024_8x32_quarter_3_4(in, temp);
+ // final stage
+ add_sub_butterfly(temp, out, 32);
}
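For orientation, idct32_1024_8x32 restates the scalar 32-point idct: quarter_1_2 derives temp[0..15] from the even-frequency inputs, quarter_3_4 derives temp[16..31] from the odd ones, and add_sub_butterfly folds the two halves together. A scalar skeleton of the same shape, with hypothetical helper names:

#include <stdint.h>

// Hypothetical scalar halves, named only for illustration.
void idct32_even_half_scalar(const int16_t in[32], int16_t temp[16]);
void idct32_odd_half_scalar(const int16_t in[32], int16_t temp[16]);

void idct32_scalar(const int16_t in[32], int16_t out[32]) {
  int16_t temp[32];
  idct32_even_half_scalar(in, temp);      // temp[0..15] from in[0,2,...,30]
  idct32_odd_half_scalar(in, temp + 16);  // temp[16..31] from in[1,3,...,31]
  for (int i = 0; i < 16; i++) {          // final add/sub stage
    out[i] = (int16_t)(temp[i] + temp[31 - i]);
    out[31 - i] = (int16_t)(temp[i] - temp[31 - i]);
  }
}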
void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- const __m128i zero = _mm_setzero_si128();
-
- // idct constants for each stage
- const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
- const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
- const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
- const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
- const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
- const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
- const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
- const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
- const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
- const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
- const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
- const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
- const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
- const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
- const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in[32], col[128], zero_idx[16];
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
- stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
- stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
- stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i, j, i32;
+ __m128i col[4][32], io[32];
+ int i;
+ // rows
for (i = 0; i < 4; i++) {
- i32 = (i << 5);
- // First 1-D idct
- // Load input data.
- LOAD_DQCOEFF(in[0], input);
- LOAD_DQCOEFF(in[8], input);
- LOAD_DQCOEFF(in[16], input);
- LOAD_DQCOEFF(in[24], input);
- LOAD_DQCOEFF(in[1], input);
- LOAD_DQCOEFF(in[9], input);
- LOAD_DQCOEFF(in[17], input);
- LOAD_DQCOEFF(in[25], input);
- LOAD_DQCOEFF(in[2], input);
- LOAD_DQCOEFF(in[10], input);
- LOAD_DQCOEFF(in[18], input);
- LOAD_DQCOEFF(in[26], input);
- LOAD_DQCOEFF(in[3], input);
- LOAD_DQCOEFF(in[11], input);
- LOAD_DQCOEFF(in[19], input);
- LOAD_DQCOEFF(in[27], input);
-
- LOAD_DQCOEFF(in[4], input);
- LOAD_DQCOEFF(in[12], input);
- LOAD_DQCOEFF(in[20], input);
- LOAD_DQCOEFF(in[28], input);
- LOAD_DQCOEFF(in[5], input);
- LOAD_DQCOEFF(in[13], input);
- LOAD_DQCOEFF(in[21], input);
- LOAD_DQCOEFF(in[29], input);
- LOAD_DQCOEFF(in[6], input);
- LOAD_DQCOEFF(in[14], input);
- LOAD_DQCOEFF(in[22], input);
- LOAD_DQCOEFF(in[30], input);
- LOAD_DQCOEFF(in[7], input);
- LOAD_DQCOEFF(in[15], input);
- LOAD_DQCOEFF(in[23], input);
- LOAD_DQCOEFF(in[31], input);
-
- // checking if all entries are zero
- zero_idx[0] = _mm_or_si128(in[0], in[1]);
- zero_idx[1] = _mm_or_si128(in[2], in[3]);
- zero_idx[2] = _mm_or_si128(in[4], in[5]);
- zero_idx[3] = _mm_or_si128(in[6], in[7]);
- zero_idx[4] = _mm_or_si128(in[8], in[9]);
- zero_idx[5] = _mm_or_si128(in[10], in[11]);
- zero_idx[6] = _mm_or_si128(in[12], in[13]);
- zero_idx[7] = _mm_or_si128(in[14], in[15]);
- zero_idx[8] = _mm_or_si128(in[16], in[17]);
- zero_idx[9] = _mm_or_si128(in[18], in[19]);
- zero_idx[10] = _mm_or_si128(in[20], in[21]);
- zero_idx[11] = _mm_or_si128(in[22], in[23]);
- zero_idx[12] = _mm_or_si128(in[24], in[25]);
- zero_idx[13] = _mm_or_si128(in[26], in[27]);
- zero_idx[14] = _mm_or_si128(in[28], in[29]);
- zero_idx[15] = _mm_or_si128(in[30], in[31]);
-
- zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
- zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
- zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
- zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
- zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
- zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
- zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
- zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
-
- zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
- zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
- zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
- zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
- zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
- zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
- zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-
- if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
- col[i32 + 0] = _mm_setzero_si128();
- col[i32 + 1] = _mm_setzero_si128();
- col[i32 + 2] = _mm_setzero_si128();
- col[i32 + 3] = _mm_setzero_si128();
- col[i32 + 4] = _mm_setzero_si128();
- col[i32 + 5] = _mm_setzero_si128();
- col[i32 + 6] = _mm_setzero_si128();
- col[i32 + 7] = _mm_setzero_si128();
- col[i32 + 8] = _mm_setzero_si128();
- col[i32 + 9] = _mm_setzero_si128();
- col[i32 + 10] = _mm_setzero_si128();
- col[i32 + 11] = _mm_setzero_si128();
- col[i32 + 12] = _mm_setzero_si128();
- col[i32 + 13] = _mm_setzero_si128();
- col[i32 + 14] = _mm_setzero_si128();
- col[i32 + 15] = _mm_setzero_si128();
- col[i32 + 16] = _mm_setzero_si128();
- col[i32 + 17] = _mm_setzero_si128();
- col[i32 + 18] = _mm_setzero_si128();
- col[i32 + 19] = _mm_setzero_si128();
- col[i32 + 20] = _mm_setzero_si128();
- col[i32 + 21] = _mm_setzero_si128();
- col[i32 + 22] = _mm_setzero_si128();
- col[i32 + 23] = _mm_setzero_si128();
- col[i32 + 24] = _mm_setzero_si128();
- col[i32 + 25] = _mm_setzero_si128();
- col[i32 + 26] = _mm_setzero_si128();
- col[i32 + 27] = _mm_setzero_si128();
- col[i32 + 28] = _mm_setzero_si128();
- col[i32 + 29] = _mm_setzero_si128();
- col[i32 + 30] = _mm_setzero_si128();
- col[i32 + 31] = _mm_setzero_si128();
- continue;
- }
+ load_transpose_16bit_8x8(&input[0], 32, &io[0]);
+ load_transpose_16bit_8x8(&input[8], 32, &io[8]);
+ load_transpose_16bit_8x8(&input[16], 32, &io[16]);
+ load_transpose_16bit_8x8(&input[24], 32, &io[24]);
+ idct32_1024_8x32(io, col[i]);
+ input += 32 << 3;
+ }
+ // columns
+ for (i = 0; i < 32; i += 8) {
// Transpose 32x8 block to 8x32 block
- array_transpose_8x8(in, in);
- array_transpose_8x8(in + 8, in + 8);
- array_transpose_8x8(in + 16, in + 16);
- array_transpose_8x8(in + 24, in + 24);
-
- IDCT32
-
- // 1_D: Store 32 intermediate results for each 8x32 block.
- col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
- col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
- col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
- col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
- col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
- col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
- col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
- col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
- col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
- col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
- col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
- col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
- col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
- col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
- col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
- col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
- col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
- col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
- col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
- col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
- col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
- col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
- col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
- col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
- col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
- col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
- col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
- col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
- col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
- col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
- col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
- col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+ transpose_16bit_8x8(col[0] + i, io);
+ transpose_16bit_8x8(col[1] + i, io + 8);
+ transpose_16bit_8x8(col[2] + i, io + 16);
+ transpose_16bit_8x8(col[3] + i, io + 24);
+
+ idct32_1024_8x32(io, io);
+ store_buffer_8x32(io, dest, stride);
+ dest += 8;
}
- for (i = 0; i < 4; i++) {
- // Second 1-D idct
- j = i << 3;
+}
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(col + j, in);
- array_transpose_8x8(col + j + 32, in + 8);
- array_transpose_8x8(col + j + 64, in + 16);
- array_transpose_8x8(col + j + 96, in + 24);
-
- IDCT32
-
- // 2_D: Calculate the results and store them to destination.
- in[0] = _mm_add_epi16(stp1_0, stp1_31);
- in[1] = _mm_add_epi16(stp1_1, stp1_30);
- in[2] = _mm_add_epi16(stp1_2, stp1_29);
- in[3] = _mm_add_epi16(stp1_3, stp1_28);
- in[4] = _mm_add_epi16(stp1_4, stp1_27);
- in[5] = _mm_add_epi16(stp1_5, stp1_26);
- in[6] = _mm_add_epi16(stp1_6, stp1_25);
- in[7] = _mm_add_epi16(stp1_7, stp1_24);
- in[8] = _mm_add_epi16(stp1_8, stp1_23);
- in[9] = _mm_add_epi16(stp1_9, stp1_22);
- in[10] = _mm_add_epi16(stp1_10, stp1_21);
- in[11] = _mm_add_epi16(stp1_11, stp1_20);
- in[12] = _mm_add_epi16(stp1_12, stp1_19);
- in[13] = _mm_add_epi16(stp1_13, stp1_18);
- in[14] = _mm_add_epi16(stp1_14, stp1_17);
- in[15] = _mm_add_epi16(stp1_15, stp1_16);
- in[16] = _mm_sub_epi16(stp1_15, stp1_16);
- in[17] = _mm_sub_epi16(stp1_14, stp1_17);
- in[18] = _mm_sub_epi16(stp1_13, stp1_18);
- in[19] = _mm_sub_epi16(stp1_12, stp1_19);
- in[20] = _mm_sub_epi16(stp1_11, stp1_20);
- in[21] = _mm_sub_epi16(stp1_10, stp1_21);
- in[22] = _mm_sub_epi16(stp1_9, stp1_22);
- in[23] = _mm_sub_epi16(stp1_8, stp1_23);
- in[24] = _mm_sub_epi16(stp1_7, stp1_24);
- in[25] = _mm_sub_epi16(stp1_6, stp1_25);
- in[26] = _mm_sub_epi16(stp1_5, stp1_26);
- in[27] = _mm_sub_epi16(stp1_4, stp1_27);
- in[28] = _mm_sub_epi16(stp1_3, stp1_28);
- in[29] = _mm_sub_epi16(stp1_2, stp1_29);
- in[30] = _mm_sub_epi16(stp1_1, stp1_30);
- in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i col[2][32], in[32], out[32];
+ int i;
- for (j = 0; j < 32; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- RECON_AND_STORE(dest + j * stride, in[j]);
- }
+ for (i = 16; i < 32; i++) {
+ in[i] = _mm_setzero_si128();
+ }
+ // rows
+ for (i = 0; i < 2; i++) {
+ load_transpose_16bit_8x8(&input[0], 32, &in[0]);
+ load_transpose_16bit_8x8(&input[8], 32, &in[8]);
+ idct32_1024_8x32(in, col[i]);
+ input += 32 << 3;
+ }
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ transpose_16bit_8x8(col[0] + i, in);
+ transpose_16bit_8x8(col[1] + i, in + 8);
+ idct32_1024_8x32(in, out);
+ store_buffer_8x32(out, dest, stride);
dest += 8;
}
}
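Both 32x32 paths above share one 2-D structure: a row pass runs the 1-D idct on transposed 8x32 slices into col[], then a column pass re-transposes, runs the same 1-D idct, and reconstructs into dest. A scalar model of the separable transform (final rounding and the add-to-prediction step, handled here by store_buffer_8x32, are omitted):

#include <stdint.h>

void idct32_scalar(const int16_t in[32], int16_t out[32]);  // sketched above

static void idct32x32_scalar(const int16_t in[32 * 32], int16_t out[32 * 32]) {
  int16_t rows[32 * 32], colv[32], tmp[32];
  int r, c;
  for (r = 0; r < 32; r++)  // row pass
    idct32_scalar(&in[r * 32], &rows[r * 32]);
  for (c = 0; c < 32; c++) {  // column pass
    for (r = 0; r < 32; r++) colv[r] = rows[r * 32 + c];
    idct32_scalar(colv, tmp);
    for (r = 0; r < 32; r++) out[r * 32 + c] = tmp[r];
  }
}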
@@ -3224,19 +1334,17 @@ void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
__m128i dc_value;
- const __m128i zero = _mm_setzero_si128();
- int a, j;
+ int j;
+ tran_high_t a1;
+ tran_low_t out =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
- a = (int)dct_const_round_shift(input[0] * cospi_16_64);
- a = (int)dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 6);
-
- dc_value = _mm_set1_epi16(a);
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+ dc_value = _mm_set1_epi16((int16_t)a1);
for (j = 0; j < 32; ++j) {
- RECON_AND_STORE(dest + 0 + j * stride, dc_value);
- RECON_AND_STORE(dest + 8 + j * stride, dc_value);
- RECON_AND_STORE(dest + 16 + j * stride, dc_value);
- RECON_AND_STORE(dest + 24 + j * stride, dc_value);
+ recon_and_store_16(dest + j * stride + 0, dc_value);
+ recon_and_store_16(dest + j * stride + 16, dc_value);
}
}
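The DC-only variant collapses the whole transform: the single coefficient is scaled by cospi_16_64 once per 1-D pass, rounded down by 6 bits, and splatted over the 32x32 block. A plain-C equivalent of the a1 computation, assuming libvpx's 14-bit constants (cospi_16_64 == 11585, DCT_CONST_BITS == 14):

#include <stdint.h>

// Scalar sketch; WRAPLOW's wrap to int16 is shown as a cast.
static int16_t idct32x32_dc_value(int32_t dc) {
  int32_t a = (int16_t)((dc * 11585 + (1 << 13)) >> 14);  // first 1-D pass
  a = (int16_t)((a * 11585 + (1 << 13)) >> 14);           // second 1-D pass
  return (int16_t)((a + 32) >> 6);                        // final rounding
}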
diff --git a/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
index 0460ab13b..5cd5098f1 100644
--- a/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
@@ -12,272 +12,173 @@
#define VPX_DSP_X86_INV_TXFM_SSE2_H_
#include <emmintrin.h> // SSE2
+
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
-// perform 8x8 transpose
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
- const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
- const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
- const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+static INLINE void idct8x8_12_transpose_16bit_4x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 30 31 32 33 00 01 02 03
+ // in[1]: 20 21 22 23 10 11 12 13
+ // in[2]: 40 41 42 43 70 71 72 73
+ // in[3]: 50 51 52 53 60 61 62 63
+ // to:
+ // tr0_0: 00 10 01 11 02 12 03 13
+ // tr0_1: 20 30 21 31 22 32 23 33
+ // tr0_2: 40 50 41 51 42 52 43 53
+ // tr0_3: 60 70 61 71 62 72 63 73
+ const __m128i tr0_0 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[1], in[0]);
+ const __m128i tr0_2 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in[3], in[2]);
+ // Unpack 32 bit elements resulting in:
+ // tr1_0: 00 10 20 30 01 11 21 31
+ // tr1_1: 02 12 22 32 03 13 23 33
+ // tr1_2: 40 50 60 70 41 51 61 71
+ // tr1_3: 42 52 62 72 43 53 63 73
const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-
- res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
- res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
- res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
- res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
- res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
- res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
- res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
- res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ out[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+ out[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+ out[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+ out[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
}
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3, out4, out5, out6, out7) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
- const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
- const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
- const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
- const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
- const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
- \
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
- \
- out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
- out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
- out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
- out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
- out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
- out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
- out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
- out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
- }
-#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- \
- in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
- in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
- }
+static INLINE __m128i dct_const_round_shift_sse2(const __m128i in) {
+ const __m128i t = _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING));
+ return _mm_srai_epi32(t, DCT_CONST_BITS);
+}
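dct_const_round_shift_sse2 is the vector form of libvpx's fixed-point rounding: add half of 2^DCT_CONST_BITS, then shift right arithmetically. Per 32-bit lane, with DCT_CONST_BITS == 14:

// Scalar equivalent for a single lane; DCT_CONST_ROUNDING == 1 << 13.
static int dct_const_round_shift_scalar(int x) {
  return (x + (1 << 13)) >> 14;
}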
-static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+static INLINE __m128i idct_madd_round_shift_sse2(const __m128i in,
+ const __m128i cospi) {
+ const __m128i t = _mm_madd_epi16(in, cospi);
+ return dct_const_round_shift_sse2(t);
+}
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+// Compute the dot products of in0/in1 with x, round-shift by DCT_CONST_BITS,
+// and pack the two results to int16 with saturation.
+static INLINE __m128i idct_calc_wraplow_sse2(const __m128i in0,
+ const __m128i in1,
+ const __m128i x) {
+ const __m128i t0 = idct_madd_round_shift_sse2(in0, x);
+ const __m128i t1 = idct_madd_round_shift_sse2(in1, x);
+ return _mm_packs_epi32(t0, t1);
+}
- out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
- out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
- out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
- out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
+// Butterfly rotation of a pair of input vectors:
+//   out0 = round_shift(in0 * c0 - in1 * c1)
+//   out1 = round_shift(in0 * c1 + in1 * c0)
+static INLINE void butterfly(const __m128i in0, const __m128i in1, const int c0,
+ const int c1, __m128i *const out0,
+ __m128i *const out1) {
+ const __m128i cst0 = pair_set_epi16(c0, -c1);
+ const __m128i cst1 = pair_set_epi16(c1, c0);
+ const __m128i lo = _mm_unpacklo_epi16(in0, in1);
+ const __m128i hi = _mm_unpackhi_epi16(in0, in1);
+ *out0 = idct_calc_wraplow_sse2(lo, hi, cst0);
+ *out1 = idct_calc_wraplow_sse2(lo, hi, cst1);
}
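butterfly realizes the two-output rotation with two _mm_madd_epi16 passes: cst0 = (c0, -c1) and cst1 = (c1, c0) are multiplied against interleaved (in0, in1) pairs. Per element, in scalar terms (ignoring the int16 saturation that _mm_packs_epi32 applies on the way out):

#include <stdint.h>

// Scalar model of butterfly() for one coefficient pair.
static void butterfly_scalar(int16_t a, int16_t b, int c0, int c1,
                             int16_t *out0, int16_t *out1) {
  *out0 = (int16_t)((a * c0 - b * c1 + (1 << 13)) >> 14);
  *out1 = (int16_t)((a * c1 + b * c0 + (1 << 13)) >> 14);
}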
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
- __m128i tbuf[8];
- array_transpose_8x8(res0, res0);
- array_transpose_8x8(res1, tbuf);
- array_transpose_8x8(res0 + 8, res1);
- array_transpose_8x8(res1 + 8, res1 + 8);
-
- res0[8] = tbuf[0];
- res0[9] = tbuf[1];
- res0[10] = tbuf[2];
- res0[11] = tbuf[3];
- res0[12] = tbuf[4];
- res0[13] = tbuf[5];
- res0[14] = tbuf[6];
- res0[15] = tbuf[7];
+static INLINE __m128i butterfly_cospi16(const __m128i in) {
+ const __m128i cst = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i lo = _mm_unpacklo_epi16(in, _mm_setzero_si128());
+ const __m128i hi = _mm_unpackhi_epi16(in, _mm_setzero_si128());
+ return idct_calc_wraplow_sse2(lo, hi, cst);
}
-// Function to allow 8 bit optimisations to be used when profile 0 is used with
+// Functions to allow 8 bit optimisations to be used when profile 0 is used with
// highbitdepth enabled
-static INLINE __m128i load_input_data(const tran_low_t *data) {
+static INLINE __m128i load_input_data4(const tran_low_t *data) {
#if CONFIG_VP9_HIGHBITDEPTH
- return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5],
- data[6], data[7]);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i in = _mm_load_si128((const __m128i *)data);
+ return _mm_packs_epi32(in, zero);
#else
- return _mm_load_si128((const __m128i *)data);
+ return _mm_loadl_epi64((const __m128i *)data);
#endif
}
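The split into load_input_data4/8 exists because tran_low_t is 32 bits wide when CONFIG_VP9_HIGHBITDEPTH is enabled, so eight coefficients span two XMM registers and must be narrowed to int16 before the 8-bit path can use them; otherwise they are already int16 and a plain load suffices. The highbitdepth case, in scalar form:

#include <stdint.h>

// Scalar picture of load_input_data8 with highbitdepth: eight 32-bit
// coefficients narrowed to int16 with saturation (as _mm_packs_epi32 does).
static void load8_highbd_scalar(const int32_t *data, int16_t out[8]) {
  int i;
  for (i = 0; i < 8; i++) {
    const int32_t v = data[i];
    out[i] = (int16_t)(v < -32768 ? -32768 : (v > 32767 ? 32767 : v));
  }
}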
-static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) {
- in[0] = load_input_data(input + 0 * 16);
- in[1] = load_input_data(input + 1 * 16);
- in[2] = load_input_data(input + 2 * 16);
- in[3] = load_input_data(input + 3 * 16);
- in[4] = load_input_data(input + 4 * 16);
- in[5] = load_input_data(input + 5 * 16);
- in[6] = load_input_data(input + 6 * 16);
- in[7] = load_input_data(input + 7 * 16);
-
- in[8] = load_input_data(input + 8 * 16);
- in[9] = load_input_data(input + 9 * 16);
- in[10] = load_input_data(input + 10 * 16);
- in[11] = load_input_data(input + 11 * 16);
- in[12] = load_input_data(input + 12 * 16);
- in[13] = load_input_data(input + 13 * 16);
- in[14] = load_input_data(input + 14 * 16);
- in[15] = load_input_data(input + 15 * 16);
+static INLINE __m128i load_input_data8(const tran_low_t *data) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i in0 = _mm_load_si128((const __m128i *)data);
+ const __m128i in1 = _mm_load_si128((const __m128i *)(data + 4));
+ return _mm_packs_epi32(in0, in1);
+#else
+ return _mm_load_si128((const __m128i *)data);
+#endif
}
-#define RECON_AND_STORE(dest, in_x) \
- { \
- __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
- d0 = _mm_unpacklo_epi8(d0, zero); \
- d0 = _mm_add_epi16(in_x, d0); \
- d0 = _mm_packus_epi16(d0, d0); \
- _mm_storel_epi64((__m128i *)(dest), d0); \
- }
+static INLINE void load_transpose_16bit_8x8(const tran_low_t *input,
+ const int stride,
+ __m128i *const in) {
+ in[0] = load_input_data8(input + 0 * stride);
+ in[1] = load_input_data8(input + 1 * stride);
+ in[2] = load_input_data8(input + 2 * stride);
+ in[3] = load_input_data8(input + 3 * stride);
+ in[4] = load_input_data8(input + 4 * stride);
+ in[5] = load_input_data8(input + 5 * stride);
+ in[6] = load_input_data8(input + 6 * stride);
+ in[7] = load_input_data8(input + 7 * stride);
+ transpose_16bit_8x8(in, in);
+}
-static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+static INLINE void recon_and_store(uint8_t *const dest, const __m128i in_x) {
const __m128i zero = _mm_setzero_si128();
- // Final rounding and shift
- in[0] = _mm_adds_epi16(in[0], final_rounding);
- in[1] = _mm_adds_epi16(in[1], final_rounding);
- in[2] = _mm_adds_epi16(in[2], final_rounding);
- in[3] = _mm_adds_epi16(in[3], final_rounding);
- in[4] = _mm_adds_epi16(in[4], final_rounding);
- in[5] = _mm_adds_epi16(in[5], final_rounding);
- in[6] = _mm_adds_epi16(in[6], final_rounding);
- in[7] = _mm_adds_epi16(in[7], final_rounding);
- in[8] = _mm_adds_epi16(in[8], final_rounding);
- in[9] = _mm_adds_epi16(in[9], final_rounding);
- in[10] = _mm_adds_epi16(in[10], final_rounding);
- in[11] = _mm_adds_epi16(in[11], final_rounding);
- in[12] = _mm_adds_epi16(in[12], final_rounding);
- in[13] = _mm_adds_epi16(in[13], final_rounding);
- in[14] = _mm_adds_epi16(in[14], final_rounding);
- in[15] = _mm_adds_epi16(in[15], final_rounding);
-
- in[0] = _mm_srai_epi16(in[0], 6);
- in[1] = _mm_srai_epi16(in[1], 6);
- in[2] = _mm_srai_epi16(in[2], 6);
- in[3] = _mm_srai_epi16(in[3], 6);
- in[4] = _mm_srai_epi16(in[4], 6);
- in[5] = _mm_srai_epi16(in[5], 6);
- in[6] = _mm_srai_epi16(in[6], 6);
- in[7] = _mm_srai_epi16(in[7], 6);
- in[8] = _mm_srai_epi16(in[8], 6);
- in[9] = _mm_srai_epi16(in[9], 6);
- in[10] = _mm_srai_epi16(in[10], 6);
- in[11] = _mm_srai_epi16(in[11], 6);
- in[12] = _mm_srai_epi16(in[12], 6);
- in[13] = _mm_srai_epi16(in[13], 6);
- in[14] = _mm_srai_epi16(in[14], 6);
- in[15] = _mm_srai_epi16(in[15], 6);
-
- RECON_AND_STORE(dest + 0 * stride, in[0]);
- RECON_AND_STORE(dest + 1 * stride, in[1]);
- RECON_AND_STORE(dest + 2 * stride, in[2]);
- RECON_AND_STORE(dest + 3 * stride, in[3]);
- RECON_AND_STORE(dest + 4 * stride, in[4]);
- RECON_AND_STORE(dest + 5 * stride, in[5]);
- RECON_AND_STORE(dest + 6 * stride, in[6]);
- RECON_AND_STORE(dest + 7 * stride, in[7]);
- RECON_AND_STORE(dest + 8 * stride, in[8]);
- RECON_AND_STORE(dest + 9 * stride, in[9]);
- RECON_AND_STORE(dest + 10 * stride, in[10]);
- RECON_AND_STORE(dest + 11 * stride, in[11]);
- RECON_AND_STORE(dest + 12 * stride, in[12]);
- RECON_AND_STORE(dest + 13 * stride, in[13]);
- RECON_AND_STORE(dest + 14 * stride, in[14]);
- RECON_AND_STORE(dest + 15 * stride, in[15]);
+ __m128i d0 = _mm_loadl_epi64((__m128i *)(dest));
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d0 = _mm_add_epi16(in_x, d0);
+ d0 = _mm_packus_epi16(d0, d0);
+ _mm_storel_epi64((__m128i *)(dest), d0);
}
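recon_and_store is the standard reconstruction step: widen eight prediction bytes to 16 bits, add the residual, and pack back with unsigned saturation, i.e. a clamp to [0, 255]. Scalar equivalent:

#include <stdint.h>

// Scalar model of recon_and_store for one row of eight pixels.
static void recon_and_store_scalar(uint8_t *dest, const int16_t res[8]) {
  int i;
  for (i = 0; i < 8; i++) {
    const int v = dest[i] + res[i];
    dest[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}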
-#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
- { \
- const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
- const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
- const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
- \
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
- \
- out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
- out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
- out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
- out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
- }
+static INLINE void round_shift_8x8(const __m128i *const in,
+ __m128i *const out) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- }
+ out[0] = _mm_add_epi16(in[0], final_rounding);
+ out[1] = _mm_add_epi16(in[1], final_rounding);
+ out[2] = _mm_add_epi16(in[2], final_rounding);
+ out[3] = _mm_add_epi16(in[3], final_rounding);
+ out[4] = _mm_add_epi16(in[4], final_rounding);
+ out[5] = _mm_add_epi16(in[5], final_rounding);
+ out[6] = _mm_add_epi16(in[6], final_rounding);
+ out[7] = _mm_add_epi16(in[7], final_rounding);
-// Define Macro for multiplying elements by constants and adding them together.
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
- res0, res1, res2, res3) \
- { \
- tmp0 = _mm_madd_epi16(lo_0, cst0); \
- tmp1 = _mm_madd_epi16(hi_0, cst0); \
- tmp2 = _mm_madd_epi16(lo_0, cst1); \
- tmp3 = _mm_madd_epi16(hi_0, cst1); \
- tmp4 = _mm_madd_epi16(lo_1, cst2); \
- tmp5 = _mm_madd_epi16(hi_1, cst2); \
- tmp6 = _mm_madd_epi16(lo_1, cst3); \
- tmp7 = _mm_madd_epi16(hi_1, cst3); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- tmp4 = _mm_add_epi32(tmp4, rounding); \
- tmp5 = _mm_add_epi32(tmp5, rounding); \
- tmp6 = _mm_add_epi32(tmp6, rounding); \
- tmp7 = _mm_add_epi32(tmp7, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
- \
- res0 = _mm_packs_epi32(tmp0, tmp1); \
- res1 = _mm_packs_epi32(tmp2, tmp3); \
- res2 = _mm_packs_epi32(tmp4, tmp5); \
- res3 = _mm_packs_epi32(tmp6, tmp7); \
- }
+ out[0] = _mm_srai_epi16(out[0], 5);
+ out[1] = _mm_srai_epi16(out[1], 5);
+ out[2] = _mm_srai_epi16(out[2], 5);
+ out[3] = _mm_srai_epi16(out[3], 5);
+ out[4] = _mm_srai_epi16(out[4], 5);
+ out[5] = _mm_srai_epi16(out[5], 5);
+ out[6] = _mm_srai_epi16(out[6], 5);
+ out[7] = _mm_srai_epi16(out[7], 5);
+}
+
+static INLINE void write_buffer_8x8(const __m128i *const in,
+ uint8_t *const dest, const int stride) {
+ __m128i t[8];
+
+ round_shift_8x8(in, t);
+
+ recon_and_store(dest + 0 * stride, t[0]);
+ recon_and_store(dest + 1 * stride, t[1]);
+ recon_and_store(dest + 2 * stride, t[2]);
+ recon_and_store(dest + 3 * stride, t[3]);
+ recon_and_store(dest + 4 * stride, t[4]);
+ recon_and_store(dest + 5 * stride, t[5]);
+ recon_and_store(dest + 6 * stride, t[6]);
+ recon_and_store(dest + 7 * stride, t[7]);
+}
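Note the per-size scaling: the 8x8 path above rounds with 1 << 4 and shifts by 5, while the 32x32 helpers below (store_buffer_8x32, write_buffer_8x1) round with 1 << 5 and shift by 6. Saturation differences in the adds aside, both are instances of:

#include <stdint.h>

// Final-stage scaling; shift is 5 for the 8x8 path, 6 for 32x32.
static int16_t final_round_shift(int16_t x, int shift) {
  return (int16_t)((x + (1 << (shift - 1))) >> shift);
}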
static INLINE void recon_and_store4x4_sse2(const __m128i *const in,
uint8_t *const dest,
@@ -307,11 +208,502 @@ static INLINE void recon_and_store4x4_sse2(const __m128i *const in,
*(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
}
-void idct4_sse2(__m128i *in);
-void idct8_sse2(__m128i *in);
-void idct16_sse2(__m128i *in0, __m128i *in1);
-void iadst4_sse2(__m128i *in);
-void iadst8_sse2(__m128i *in);
-void iadst16_sse2(__m128i *in0, __m128i *in1);
+static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ int j = 0;
+ while (j < 32) {
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding);
+
+ in[j] = _mm_srai_epi16(in[j], 6);
+ in[j + 1] = _mm_srai_epi16(in[j + 1], 6);
+
+ recon_and_store(dst, in[j]);
+ dst += stride;
+ recon_and_store(dst, in[j + 1]);
+ dst += stride;
+ j += 2;
+ }
+}
+
+static INLINE void write_buffer_8x1(uint8_t *const dest, const __m128i in) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ __m128i out;
+ out = _mm_adds_epi16(in, final_rounding);
+ out = _mm_srai_epi16(out, 6);
+ recon_and_store(dest, out);
+}
+
+// Addition/subtraction butterfly only (no multiplies); size must be 16 or 32.
+static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out,
+ int size) {
+ int i = 0;
+ const int num = size >> 1;
+ const int bound = size - 1;
+ while (i < num) {
+ out[i] = _mm_add_epi16(in[i], in[bound - i]);
+ out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]);
+ i++;
+ }
+}
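add_sub_butterfly mirrors the final stage of the scalar idct16/idct32 (see its use in idct32_1024_8x32 above): output i is the sum of inputs i and size-1-i, and output size-1-i their difference. In scalar form:

#include <stdint.h>

// Scalar form of add_sub_butterfly; size must be 16 or 32.
static void add_sub_butterfly_scalar(const int16_t *in, int16_t *out,
                                     int size) {
  int i;
  for (i = 0; i < size / 2; i++) {
    out[i] = (int16_t)(in[i] + in[size - 1 - i]);
    out[size - 1 - i] = (int16_t)(in[i] - in[size - 1 - i]);
  }
}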
+
+static INLINE void idct8(const __m128i *const in /*in[8]*/,
+ __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 1
+ butterfly(in[1], in[7], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+ butterfly(in[5], in[3], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
+
+ // stage 2
+ butterfly(in[0], in[4], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+ butterfly(in[2], in[6], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+
+ step2[4] = _mm_add_epi16(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm_add_epi16(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi16(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+ butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+
+ // stage 4
+ out[0] = _mm_add_epi16(step1[0], step2[7]);
+ out[1] = _mm_add_epi16(step1[1], step1[6]);
+ out[2] = _mm_add_epi16(step1[2], step1[5]);
+ out[3] = _mm_add_epi16(step1[3], step2[4]);
+ out[4] = _mm_sub_epi16(step1[3], step2[4]);
+ out[5] = _mm_sub_epi16(step1[2], step1[5]);
+ out[6] = _mm_sub_epi16(step1[1], step1[6]);
+ out[7] = _mm_sub_epi16(step1[0], step2[7]);
+}
+
+static INLINE void idct8x8_12_add_kernel_sse2(__m128i *const io /*io[8]*/) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ __m128i step1[8], step2[8], tmp[4];
+
+ transpose_16bit_4x4(io, io);
+ // io[0]: 00 10 20 30 01 11 21 31
+ // io[1]: 02 12 22 32 03 13 23 33
+
+ // stage 1
+ {
+ const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i lo_1 = _mm_unpackhi_epi16(io[0], zero);
+ const __m128i lo_3 = _mm_unpackhi_epi16(io[1], zero);
+ step1[4] = idct_calc_wraplow_sse2(cp_28_n4, cp_4_28, lo_1); // step1 4&7
+ step1[5] = idct_calc_wraplow_sse2(cp_n20_12, cp_12_20, lo_3); // step1 5&6
+ }
+
+ // stage 2
+ {
+ const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i lo_0 = _mm_unpacklo_epi16(io[0], zero);
+ const __m128i lo_2 = _mm_unpacklo_epi16(io[1], zero);
+ const __m128i t = idct_madd_round_shift_sse2(cp_16_16, lo_0);
+ step2[0] = _mm_packs_epi32(t, t); // step2 0&1
+ step2[2] = idct_calc_wraplow_sse2(cp_8_24, cp_24_n8, lo_2); // step2 3&2
+ step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6
+ step2[6] = _mm_unpackhi_epi64(step2[5], zero); // step2 6
+ }
+
+ // stage 3
+ {
+ const __m128i lo_65 = _mm_unpacklo_epi16(step2[6], step2[5]);
+ tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1
+ tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2
+ step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1
+ step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0
+ step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, lo_65); // step1 5&6
+ }
+
+ // stage 4
+ tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0
+ tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1
+ tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7
+ tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6
+
+ idct8x8_12_transpose_16bit_4x8(tmp, io);
+ io[4] = io[5] = io[6] = io[7] = zero;
+
+ idct8(io, io);
+}
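A packing trick in idct8x8_12_add_kernel_sse2 is worth calling out: with only the top-left 4x4 coefficients non-zero, two logical four-lane values share one register (the "4&7", "5&6" comments), halving the butterfly work until the full idct8 at the end; _mm_unpackhi_epi64 splits such a pair apart again. The splitting step in isolation:

#include <emmintrin.h>

// One register carries two logical values (low 64 bits = a, high 64 bits = b);
// this returns b in the low half of a fresh register, zero above.
static __m128i split_high_half(const __m128i pair) {
  return _mm_unpackhi_epi64(pair, _mm_setzero_si128());
}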
+
+static INLINE void idct16_8col(const __m128i *const in /*in[16]*/,
+ __m128i *const out /*out[16]*/) {
+ __m128i step1[16], step2[16];
+
+ // stage 2
+ butterfly(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+ butterfly(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]);
+ butterfly(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]);
+ butterfly(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+ // stage 3
+ butterfly(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+ butterfly(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
+ step1[8] = _mm_add_epi16(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi16(step2[8], step2[9]);
+ step1[10] = _mm_sub_epi16(step2[11], step2[10]);
+ step1[11] = _mm_add_epi16(step2[10], step2[11]);
+ step1[12] = _mm_add_epi16(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi16(step2[12], step2[13]);
+ step1[14] = _mm_sub_epi16(step2[15], step2[14]);
+ step1[15] = _mm_add_epi16(step2[14], step2[15]);
+
+ // stage 4
+ butterfly(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+ butterfly(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+ butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ butterfly(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13],
+ &step2[10]);
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+ step1[4] = _mm_add_epi16(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+ step1[7] = _mm_add_epi16(step1[6], step1[7]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+ step1[0] = _mm_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm_add_epi16(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi16(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+ butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+ step1[8] = _mm_add_epi16(step2[8], step2[11]);
+ step1[9] = _mm_add_epi16(step2[9], step2[10]);
+ step1[10] = _mm_sub_epi16(step2[9], step2[10]);
+ step1[11] = _mm_sub_epi16(step2[8], step2[11]);
+ step1[12] = _mm_sub_epi16(step2[15], step2[12]);
+ step1[13] = _mm_sub_epi16(step2[14], step2[13]);
+ step1[14] = _mm_add_epi16(step2[14], step2[13]);
+ step1[15] = _mm_add_epi16(step2[15], step2[12]);
+
+ // stage 6
+ step2[0] = _mm_add_epi16(step1[0], step1[7]);
+ step2[1] = _mm_add_epi16(step1[1], step1[6]);
+ step2[2] = _mm_add_epi16(step1[2], step1[5]);
+ step2[3] = _mm_add_epi16(step1[3], step1[4]);
+ step2[4] = _mm_sub_epi16(step1[3], step1[4]);
+ step2[5] = _mm_sub_epi16(step1[2], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[1], step1[6]);
+ step2[7] = _mm_sub_epi16(step1[0], step1[7]);
+ butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10],
+ &step2[13]);
+ butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11],
+ &step2[12]);
+
+ // stage 7
+ out[0] = _mm_add_epi16(step2[0], step1[15]);
+ out[1] = _mm_add_epi16(step2[1], step1[14]);
+ out[2] = _mm_add_epi16(step2[2], step2[13]);
+ out[3] = _mm_add_epi16(step2[3], step2[12]);
+ out[4] = _mm_add_epi16(step2[4], step2[11]);
+ out[5] = _mm_add_epi16(step2[5], step2[10]);
+ out[6] = _mm_add_epi16(step2[6], step1[9]);
+ out[7] = _mm_add_epi16(step2[7], step1[8]);
+ out[8] = _mm_sub_epi16(step2[7], step1[8]);
+ out[9] = _mm_sub_epi16(step2[6], step1[9]);
+ out[10] = _mm_sub_epi16(step2[5], step2[10]);
+ out[11] = _mm_sub_epi16(step2[4], step2[11]);
+ out[12] = _mm_sub_epi16(step2[3], step2[12]);
+ out[13] = _mm_sub_epi16(step2[2], step2[13]);
+ out[14] = _mm_sub_epi16(step2[1], step1[14]);
+ out[15] = _mm_sub_epi16(step2[0], step1[15]);
+}
+
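+// Pass 1 of the 10-coefficient 16x16 idct. Only the top-left 4x4 of the
+// input can be non-zero, so several registers below carry two logical
+// values at once (the "a&b" comments): the first in the low 64 bits and
+// the second in the high 64 bits, split apart with _mm_unpackhi_epi64()
+// when they are needed individually.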
+static INLINE void idct16x16_10_pass1(const __m128i *const input /*input[4]*/,
+ __m128i *const output /*output[16]*/) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ __m128i step1[16], step2[16];
+
+ transpose_16bit_4x4(input, output);
+
+ // stage 2
+ {
+ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ const __m128i lo_1_15 = _mm_unpackhi_epi16(output[0], zero);
+ const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, output[1]);
+ step2[8] = idct_calc_wraplow_sse2(k__cospi_p30_m02, k__cospi_p02_p30,
+ lo_1_15); // step2 8&15
+ step2[11] = idct_calc_wraplow_sse2(k__cospi_p06_m26, k__cospi_p26_p06,
+ lo_13_3); // step2 11&12
+ }
+
+ // stage 3
+ {
+ const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(output[1], zero);
+ step1[4] = idct_calc_wraplow_sse2(k__cospi_p28_m04, k__cospi_p04_p28,
+ lo_2_14); // step1 4&7
+ step1[13] = _mm_unpackhi_epi64(step2[11], zero);
+ step1[14] = _mm_unpackhi_epi64(step2[8], zero);
+ }
+
+ // stage 4
+ {
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(output[0], zero);
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(step2[8], step1[14]);
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(step2[11], step1[13]);
+ const __m128i t = idct_madd_round_shift_sse2(lo_0_8, k__cospi_p16_p16);
+ step1[0] = _mm_packs_epi32(t, t); // step2 0&1, carried in step1[0]
+ step2[9] = idct_calc_wraplow_sse2(k__cospi_m08_p24, k__cospi_p24_p08,
+ lo_9_14); // step2 9&14
+ step2[10] = idct_calc_wraplow_sse2(k__cospi_m24_m08, k__cospi_m08_p24,
+ lo_10_13); // step2 10&13
+ step2[6] = _mm_unpackhi_epi64(step1[4], zero);
+ }
+
+ // stage 5
+ {
+ const __m128i lo_5_6 = _mm_unpacklo_epi16(step1[4], step2[6]);
+ step1[6] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_m16_p16,
+ lo_5_6); // step1 6&5
+ step1[8] = _mm_add_epi16(step2[8], step2[11]);
+ step1[9] = _mm_add_epi16(step2[9], step2[10]);
+ step1[10] = _mm_sub_epi16(step2[9], step2[10]);
+ step1[11] = _mm_sub_epi16(step2[8], step2[11]);
+ step1[12] = _mm_unpackhi_epi64(step1[11], zero);
+ step1[13] = _mm_unpackhi_epi64(step1[10], zero);
+ step1[14] = _mm_unpackhi_epi64(step1[9], zero);
+ step1[15] = _mm_unpackhi_epi64(step1[8], zero);
+ }
+
+ // stage 6
+ {
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(step1[10], step1[13]);
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(step1[11], step1[12]);
+ step2[10] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16,
+ lo_10_13); // step2 10&13
+ step2[11] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16,
+ lo_11_12); // step2 11&12
+ step2[13] = _mm_unpackhi_epi64(step2[10], zero);
+ step2[12] = _mm_unpackhi_epi64(step2[11], zero);
+ step2[3] = _mm_add_epi16(step1[0], step1[4]);
+ step2[1] = _mm_add_epi16(step1[0], step1[6]);
+ step2[6] = _mm_sub_epi16(step1[0], step1[6]);
+ step2[4] = _mm_sub_epi16(step1[0], step1[4]);
+ step2[0] = _mm_unpackhi_epi64(step2[3], zero);
+ step2[2] = _mm_unpackhi_epi64(step2[1], zero);
+ step2[5] = _mm_unpackhi_epi64(step2[6], zero);
+ step2[7] = _mm_unpackhi_epi64(step2[4], zero);
+ }
+
+ // stage 7. Only the left 8x16 half is computed in this pass.
+ output[0] = _mm_add_epi16(step2[0], step1[15]);
+ output[1] = _mm_add_epi16(step2[1], step1[14]);
+ output[2] = _mm_add_epi16(step2[2], step2[13]);
+ output[3] = _mm_add_epi16(step2[3], step2[12]);
+ output[4] = _mm_add_epi16(step2[4], step2[11]);
+ output[5] = _mm_add_epi16(step2[5], step2[10]);
+ output[6] = _mm_add_epi16(step2[6], step1[9]);
+ output[7] = _mm_add_epi16(step2[7], step1[8]);
+ output[8] = _mm_sub_epi16(step2[7], step1[8]);
+ output[9] = _mm_sub_epi16(step2[6], step1[9]);
+ output[10] = _mm_sub_epi16(step2[5], step2[10]);
+ output[11] = _mm_sub_epi16(step2[4], step2[11]);
+ output[12] = _mm_sub_epi16(step2[3], step2[12]);
+ output[13] = _mm_sub_epi16(step2[2], step2[13]);
+ output[14] = _mm_sub_epi16(step2[1], step1[14]);
+ output[15] = _mm_sub_epi16(step2[0], step1[15]);
+}
+
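+// Pass 2 of the 10-coefficient 16x16 idct. Known-zero columns are passed
+// to butterfly() as a zero vector, so each early-stage rotation reduces
+// to scaling the single non-zero input by its cosine pair.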
+static INLINE void idct16x16_10_pass2(__m128i *const l /*l[8]*/,
+ __m128i *const io /*io[16]*/) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i step1[16], step2[16];
+
+ transpose_16bit_4x8(l, io);
+
+ // stage 2
+ butterfly(io[1], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+ butterfly(zero, io[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+ // stage 3
+ butterfly(io[2], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+
+ // stage 4
+ step1[0] = butterfly_cospi16(io[0]);
+ butterfly(step2[15], step2[8], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ butterfly(step2[11], step2[12], -cospi_8_64, -cospi_24_64, &step2[13],
+ &step2[10]);
+
+ // stage 5
+ butterfly(step1[7], step1[4], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+ step1[8] = _mm_add_epi16(step2[8], step2[11]);
+ step1[9] = _mm_add_epi16(step2[9], step2[10]);
+ step1[10] = _mm_sub_epi16(step2[9], step2[10]);
+ step1[11] = _mm_sub_epi16(step2[8], step2[11]);
+ step1[12] = _mm_sub_epi16(step2[15], step2[12]);
+ step1[13] = _mm_sub_epi16(step2[14], step2[13]);
+ step1[14] = _mm_add_epi16(step2[14], step2[13]);
+ step1[15] = _mm_add_epi16(step2[15], step2[12]);
+
+ // stage 6
+ step2[0] = _mm_add_epi16(step1[0], step1[7]);
+ step2[1] = _mm_add_epi16(step1[0], step1[6]);
+ step2[2] = _mm_add_epi16(step1[0], step1[5]);
+ step2[3] = _mm_add_epi16(step1[0], step1[4]);
+ step2[4] = _mm_sub_epi16(step1[0], step1[4]);
+ step2[5] = _mm_sub_epi16(step1[0], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[0], step1[6]);
+ step2[7] = _mm_sub_epi16(step1[0], step1[7]);
+ butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10],
+ &step2[13]);
+ butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11],
+ &step2[12]);
+
+ // stage 7
+ io[0] = _mm_add_epi16(step2[0], step1[15]);
+ io[1] = _mm_add_epi16(step2[1], step1[14]);
+ io[2] = _mm_add_epi16(step2[2], step2[13]);
+ io[3] = _mm_add_epi16(step2[3], step2[12]);
+ io[4] = _mm_add_epi16(step2[4], step2[11]);
+ io[5] = _mm_add_epi16(step2[5], step2[10]);
+ io[6] = _mm_add_epi16(step2[6], step1[9]);
+ io[7] = _mm_add_epi16(step2[7], step1[8]);
+ io[8] = _mm_sub_epi16(step2[7], step1[8]);
+ io[9] = _mm_sub_epi16(step2[6], step1[9]);
+ io[10] = _mm_sub_epi16(step2[5], step2[10]);
+ io[11] = _mm_sub_epi16(step2[4], step2[11]);
+ io[12] = _mm_sub_epi16(step2[3], step2[12]);
+ io[13] = _mm_sub_epi16(step2[2], step2[13]);
+ io[14] = _mm_sub_epi16(step2[1], step1[14]);
+ io[15] = _mm_sub_epi16(step2[0], step1[15]);
+}
+
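+// The stage tails below are shared by the 32x32 idct variants (see the
+// quarter_2 and quarter_3_4 callers in inv_txfm_ssse3.c); only the early,
+// input-dependent stages differ between the 34- and 135-coefficient
+// paths.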
+static INLINE void idct32_8x32_quarter_2_stage_4_to_6(
+ __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) {
+ __m128i step2[32];
+
+ // stage 4
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ butterfly(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10],
+ &step2[13]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[8] = _mm_add_epi16(step2[8], step2[11]);
+ step1[9] = _mm_add_epi16(step2[9], step2[10]);
+ step1[10] = _mm_sub_epi16(step2[9], step2[10]);
+ step1[11] = _mm_sub_epi16(step2[8], step2[11]);
+ step1[12] = _mm_sub_epi16(step2[15], step2[12]);
+ step1[13] = _mm_sub_epi16(step2[14], step2[13]);
+ step1[14] = _mm_add_epi16(step2[14], step2[13]);
+ step1[15] = _mm_add_epi16(step2[15], step2[12]);
+
+ // stage 6
+ out[8] = step1[8];
+ out[9] = step1[9];
+ butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10], &out[13]);
+ butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11], &out[12]);
+ out[14] = step1[14];
+ out[15] = step1[15];
+}
+
+static INLINE void idct32_8x32_quarter_3_4_stage_4_to_7(
+ __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step2[32];
+
+ // stage 4
+ step2[16] = _mm_add_epi16(step1[16], step1[19]);
+ step2[17] = _mm_add_epi16(step1[17], step1[18]);
+ step2[18] = _mm_sub_epi16(step1[17], step1[18]);
+ step2[19] = _mm_sub_epi16(step1[16], step1[19]);
+ step2[20] = _mm_sub_epi16(step1[23], step1[20]);
+ step2[21] = _mm_sub_epi16(step1[22], step1[21]);
+ step2[22] = _mm_add_epi16(step1[22], step1[21]);
+ step2[23] = _mm_add_epi16(step1[23], step1[20]);
+
+ step2[24] = _mm_add_epi16(step1[24], step1[27]);
+ step2[25] = _mm_add_epi16(step1[25], step1[26]);
+ step2[26] = _mm_sub_epi16(step1[25], step1[26]);
+ step2[27] = _mm_sub_epi16(step1[24], step1[27]);
+ step2[28] = _mm_sub_epi16(step1[31], step1[28]);
+ step2[29] = _mm_sub_epi16(step1[30], step1[29]);
+ step2[30] = _mm_add_epi16(step1[29], step1[30]);
+ step2[31] = _mm_add_epi16(step1[28], step1[31]);
+
+ // stage 5
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ butterfly(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18],
+ &step1[29]);
+ butterfly(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19],
+ &step1[28]);
+ butterfly(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20],
+ &step1[27]);
+ butterfly(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21],
+ &step1[26]);
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ out[16] = _mm_add_epi16(step1[16], step1[23]);
+ out[17] = _mm_add_epi16(step1[17], step1[22]);
+ out[18] = _mm_add_epi16(step1[18], step1[21]);
+ out[19] = _mm_add_epi16(step1[19], step1[20]);
+ step2[20] = _mm_sub_epi16(step1[19], step1[20]);
+ step2[21] = _mm_sub_epi16(step1[18], step1[21]);
+ step2[22] = _mm_sub_epi16(step1[17], step1[22]);
+ step2[23] = _mm_sub_epi16(step1[16], step1[23]);
+
+ step2[24] = _mm_sub_epi16(step1[31], step1[24]);
+ step2[25] = _mm_sub_epi16(step1[30], step1[25]);
+ step2[26] = _mm_sub_epi16(step1[29], step1[26]);
+ step2[27] = _mm_sub_epi16(step1[28], step1[27]);
+ out[28] = _mm_add_epi16(step1[27], step1[28]);
+ out[29] = _mm_add_epi16(step1[26], step1[29]);
+ out[30] = _mm_add_epi16(step1[25], step1[30]);
+ out[31] = _mm_add_epi16(step1[24], step1[31]);
+
+ // stage 7
+ butterfly(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20], &out[27]);
+ butterfly(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21], &out[26]);
+ butterfly(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22], &out[25]);
+ butterfly(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23], &out[24]);
+}
+
+void idct4_sse2(__m128i *const in);
+void idct8_sse2(__m128i *const in);
+void idct16_sse2(__m128i *const in0, __m128i *const in1);
+void iadst4_sse2(__m128i *const in);
+void iadst8_sse2(__m128i *const in);
+void iadst16_sse2(__m128i *const in0, __m128i *const in1);
+void idct32_1024_8x32(const __m128i *const in, __m128i *const out);
+void idct32_34_8x32_sse2(const __m128i *const in, __m128i *const out);
+void idct32_34_8x32_ssse3(const __m128i *const in, __m128i *const out);
#endif // VPX_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c b/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c
index 4d2d95787..6e99469b6 100644
--- a/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c
+++ b/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c
@@ -12,1322 +12,353 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_ssse3.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
-void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest,
- int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 4);
- const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i;
-
- // Load input data.
- in0 = load_input_data(input);
- in1 = load_input_data(input + 8 * 1);
- in2 = load_input_data(input + 8 * 2);
- in3 = load_input_data(input + 8 * 3);
- in4 = load_input_data(input + 8 * 4);
- in5 = load_input_data(input + 8 * 5);
- in6 = load_input_data(input + 8 * 6);
- in7 = load_input_data(input + 8 * 7);
-
- // 2-D
- for (i = 0; i < 2; i++) {
- // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
- TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
-
- // 4-stage 1D idct8x8
- {
- /* Stage1 */
- {
- const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);
- const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);
- const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);
- const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);
-
- {
- tmp0 = _mm_madd_epi16(lo_17, stg1_0);
- tmp1 = _mm_madd_epi16(hi_17, stg1_0);
- tmp2 = _mm_madd_epi16(lo_17, stg1_1);
- tmp3 = _mm_madd_epi16(hi_17, stg1_1);
- tmp4 = _mm_madd_epi16(lo_35, stg1_2);
- tmp5 = _mm_madd_epi16(hi_35, stg1_2);
- tmp6 = _mm_madd_epi16(lo_35, stg1_3);
- tmp7 = _mm_madd_epi16(hi_35, stg1_3);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp5 = _mm_add_epi32(tmp5, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
- tmp7 = _mm_add_epi32(tmp7, rounding);
-
- tmp0 = _mm_srai_epi32(tmp0, 14);
- tmp1 = _mm_srai_epi32(tmp1, 14);
- tmp2 = _mm_srai_epi32(tmp2, 14);
- tmp3 = _mm_srai_epi32(tmp3, 14);
- tmp4 = _mm_srai_epi32(tmp4, 14);
- tmp5 = _mm_srai_epi32(tmp5, 14);
- tmp6 = _mm_srai_epi32(tmp6, 14);
- tmp7 = _mm_srai_epi32(tmp7, 14);
-
- stp1_4 = _mm_packs_epi32(tmp0, tmp1);
- stp1_7 = _mm_packs_epi32(tmp2, tmp3);
- stp1_5 = _mm_packs_epi32(tmp4, tmp5);
- stp1_6 = _mm_packs_epi32(tmp6, tmp7);
- }
- }
-
- /* Stage2 */
- {
- const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);
- const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);
-
- {
- tmp0 = _mm_unpacklo_epi16(in0, in4);
- tmp1 = _mm_unpackhi_epi16(in0, in4);
-
- tmp2 = _mm_madd_epi16(tmp0, stk2_0);
- tmp3 = _mm_madd_epi16(tmp1, stk2_0);
- tmp4 = _mm_madd_epi16(tmp0, stk2_1);
- tmp5 = _mm_madd_epi16(tmp1, stk2_1);
-
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp5 = _mm_add_epi32(tmp5, rounding);
-
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-
- stp2_0 = _mm_packs_epi32(tmp2, tmp3);
- stp2_1 = _mm_packs_epi32(tmp4, tmp5);
-
- tmp0 = _mm_madd_epi16(lo_26, stg2_2);
- tmp1 = _mm_madd_epi16(hi_26, stg2_2);
- tmp2 = _mm_madd_epi16(lo_26, stg2_3);
- tmp3 = _mm_madd_epi16(hi_26, stg2_3);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
-
- tmp0 = _mm_srai_epi32(tmp0, 14);
- tmp1 = _mm_srai_epi32(tmp1, 14);
- tmp2 = _mm_srai_epi32(tmp2, 14);
- tmp3 = _mm_srai_epi32(tmp3, 14);
-
- stp2_2 = _mm_packs_epi32(tmp0, tmp1);
- stp2_3 = _mm_packs_epi32(tmp2, tmp3);
- }
-
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
- }
-
- /* Stage3 */
- {
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
-
- tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5);
- tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5);
-
- tmp2 = _mm_madd_epi16(tmp0, stk2_1);
- tmp3 = _mm_madd_epi16(tmp1, stk2_1);
- tmp4 = _mm_madd_epi16(tmp0, stk2_0);
- tmp5 = _mm_madd_epi16(tmp1, stk2_0);
-
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp5 = _mm_add_epi32(tmp5, rounding);
-
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-
- stp1_5 = _mm_packs_epi32(tmp2, tmp3);
- stp1_6 = _mm_packs_epi32(tmp4, tmp5);
- }
-
- /* Stage4 */
- in0 = _mm_add_epi16(stp1_0, stp2_7);
- in1 = _mm_add_epi16(stp1_1, stp1_6);
- in2 = _mm_add_epi16(stp1_2, stp1_5);
- in3 = _mm_add_epi16(stp1_3, stp2_4);
- in4 = _mm_sub_epi16(stp1_3, stp2_4);
- in5 = _mm_sub_epi16(stp1_2, stp1_5);
- in6 = _mm_sub_epi16(stp1_1, stp1_6);
- in7 = _mm_sub_epi16(stp1_0, stp2_7);
- }
- }
-
- // Final rounding and shift
- in0 = _mm_adds_epi16(in0, final_rounding);
- in1 = _mm_adds_epi16(in1, final_rounding);
- in2 = _mm_adds_epi16(in2, final_rounding);
- in3 = _mm_adds_epi16(in3, final_rounding);
- in4 = _mm_adds_epi16(in4, final_rounding);
- in5 = _mm_adds_epi16(in5, final_rounding);
- in6 = _mm_adds_epi16(in6, final_rounding);
- in7 = _mm_adds_epi16(in7, final_rounding);
-
- in0 = _mm_srai_epi16(in0, 5);
- in1 = _mm_srai_epi16(in1, 5);
- in2 = _mm_srai_epi16(in2, 5);
- in3 = _mm_srai_epi16(in3, 5);
- in4 = _mm_srai_epi16(in4, 5);
- in5 = _mm_srai_epi16(in5, 5);
- in6 = _mm_srai_epi16(in6, 5);
- in7 = _mm_srai_epi16(in7, 5);
+static INLINE void partial_butterfly_ssse3(const __m128i in, const int c0,
+ const int c1, __m128i *const out0,
+ __m128i *const out1) {
+ const __m128i cst0 = _mm_set1_epi16(2 * c0);
+ const __m128i cst1 = _mm_set1_epi16(2 * c1);
+ *out0 = _mm_mulhrs_epi16(in, cst0);
+ *out1 = _mm_mulhrs_epi16(in, cst1);
+}
- RECON_AND_STORE(dest + 0 * stride, in0);
- RECON_AND_STORE(dest + 1 * stride, in1);
- RECON_AND_STORE(dest + 2 * stride, in2);
- RECON_AND_STORE(dest + 3 * stride, in3);
- RECON_AND_STORE(dest + 4 * stride, in4);
- RECON_AND_STORE(dest + 5 * stride, in5);
- RECON_AND_STORE(dest + 6 * stride, in6);
- RECON_AND_STORE(dest + 7 * stride, in7);
+static INLINE __m128i partial_butterfly_cospi16_ssse3(const __m128i in) {
+ const __m128i coef_pair = _mm_set1_epi16(2 * cospi_16_64);
+ return _mm_mulhrs_epi16(in, coef_pair);
}
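+
+// A scalar sketch, for illustration only, of why the helpers above pass
+// 2 * c to _mm_mulhrs_epi16(): the intrinsic computes
+// (x * y + (1 << 14)) >> 15 per lane, so with y == 2 * c the result equals
+// (x * c + (1 << 13)) >> 14, i.e. dct_const_round_shift(x * c). This
+// replaces the SSE2 unpack/madd/round/pack sequence with one multiply when
+// a butterfly input is known to be zero. (mulhrs_scalar_ref is a
+// hypothetical name for this sketch.)
+static INLINE int16_t mulhrs_scalar_ref(const int16_t x, const int16_t y) {
+ return (int16_t)(((int32_t)x * y + (1 << 14)) >> 15);
+}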
void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 4);
- const __m128i stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
- const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
- const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
- const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
- const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
- const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
- const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
- const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp0, tmp1, tmp2, tmp3;
-
- // Rows. Load 4-row input data.
- in0 = load_input_data(input);
- in1 = load_input_data(input + 8 * 1);
- in2 = load_input_data(input + 8 * 2);
- in3 = load_input_data(input + 8 * 3);
-
- // 8x4 Transpose
- TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
-
- // Stage1
- tmp0 = _mm_mulhrs_epi16(in0, stg1_0);
- tmp1 = _mm_mulhrs_epi16(in0, stg1_1);
- tmp2 = _mm_mulhrs_epi16(in1, stg1_2);
- tmp3 = _mm_mulhrs_epi16(in1, stg1_3);
-
- stp1_4 = _mm_unpackhi_epi64(tmp0, tmp1);
- stp1_5 = _mm_unpackhi_epi64(tmp2, tmp3);
-
- // Stage2
- tmp0 = _mm_mulhrs_epi16(in0, stg2_0);
- stp2_0 = _mm_unpacklo_epi64(tmp0, tmp0);
-
- tmp1 = _mm_mulhrs_epi16(in1, stg2_2);
- tmp2 = _mm_mulhrs_epi16(in1, stg2_3);
- stp2_2 = _mm_unpacklo_epi64(tmp2, tmp1);
-
- tmp0 = _mm_add_epi16(stp1_4, stp1_5);
- tmp1 = _mm_sub_epi16(stp1_4, stp1_5);
-
- stp2_4 = tmp0;
- stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
- stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
-
- tmp0 = _mm_unpacklo_epi16(stp2_5, stp2_6);
- tmp1 = _mm_madd_epi16(tmp0, stg3_0);
- tmp2 = _mm_madd_epi16(tmp0, stk2_0); // stg3_1 = stk2_0
-
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
- stp1_5 = _mm_packs_epi32(tmp1, tmp2);
-
- // Stage3
- tmp2 = _mm_add_epi16(stp2_0, stp2_2);
- tmp3 = _mm_sub_epi16(stp2_0, stp2_2);
+ __m128i io[8];
- stp1_2 = _mm_unpackhi_epi64(tmp3, tmp2);
- stp1_3 = _mm_unpacklo_epi64(tmp3, tmp2);
-
- // Stage4
- tmp0 = _mm_add_epi16(stp1_3, stp2_4);
- tmp1 = _mm_add_epi16(stp1_2, stp1_5);
- tmp2 = _mm_sub_epi16(stp1_3, stp2_4);
- tmp3 = _mm_sub_epi16(stp1_2, stp1_5);
-
- TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
-
- /* Stage1 */
- stp1_4 = _mm_mulhrs_epi16(in1, stg1_0);
- stp1_7 = _mm_mulhrs_epi16(in1, stg1_1);
- stp1_5 = _mm_mulhrs_epi16(in3, stg1_2);
- stp1_6 = _mm_mulhrs_epi16(in3, stg1_3);
-
- /* Stage2 */
- stp2_0 = _mm_mulhrs_epi16(in0, stg2_0);
- stp2_1 = _mm_mulhrs_epi16(in0, stg2_0);
-
- stp2_2 = _mm_mulhrs_epi16(in2, stg2_2);
- stp2_3 = _mm_mulhrs_epi16(in2, stg2_3);
-
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
-
- /* Stage3 */
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
-
- tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5);
- tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5);
-
- tmp2 = _mm_madd_epi16(tmp0, stk2_0);
- tmp3 = _mm_madd_epi16(tmp1, stk2_0);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
- stp1_6 = _mm_packs_epi32(tmp2, tmp3);
-
- tmp2 = _mm_madd_epi16(tmp0, stk2_1);
- tmp3 = _mm_madd_epi16(tmp1, stk2_1);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
- stp1_5 = _mm_packs_epi32(tmp2, tmp3);
-
- /* Stage4 */
- in0 = _mm_add_epi16(stp1_0, stp2_7);
- in1 = _mm_add_epi16(stp1_1, stp1_6);
- in2 = _mm_add_epi16(stp1_2, stp1_5);
- in3 = _mm_add_epi16(stp1_3, stp2_4);
- in4 = _mm_sub_epi16(stp1_3, stp2_4);
- in5 = _mm_sub_epi16(stp1_2, stp1_5);
- in6 = _mm_sub_epi16(stp1_1, stp1_6);
- in7 = _mm_sub_epi16(stp1_0, stp2_7);
-
- // Final rounding and shift
- in0 = _mm_adds_epi16(in0, final_rounding);
- in1 = _mm_adds_epi16(in1, final_rounding);
- in2 = _mm_adds_epi16(in2, final_rounding);
- in3 = _mm_adds_epi16(in3, final_rounding);
- in4 = _mm_adds_epi16(in4, final_rounding);
- in5 = _mm_adds_epi16(in5, final_rounding);
- in6 = _mm_adds_epi16(in6, final_rounding);
- in7 = _mm_adds_epi16(in7, final_rounding);
-
- in0 = _mm_srai_epi16(in0, 5);
- in1 = _mm_srai_epi16(in1, 5);
- in2 = _mm_srai_epi16(in2, 5);
- in3 = _mm_srai_epi16(in3, 5);
- in4 = _mm_srai_epi16(in4, 5);
- in5 = _mm_srai_epi16(in5, 5);
- in6 = _mm_srai_epi16(in6, 5);
- in7 = _mm_srai_epi16(in7, 5);
-
- RECON_AND_STORE(dest + 0 * stride, in0);
- RECON_AND_STORE(dest + 1 * stride, in1);
- RECON_AND_STORE(dest + 2 * stride, in2);
- RECON_AND_STORE(dest + 3 * stride, in3);
- RECON_AND_STORE(dest + 4 * stride, in4);
- RECON_AND_STORE(dest + 5 * stride, in5);
- RECON_AND_STORE(dest + 6 * stride, in6);
- RECON_AND_STORE(dest + 7 * stride, in7);
-}
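+ // The 12-coefficient shortcut assumes every non-zero coefficient lies in
+ // the top-left 4x4 block, so four 4-wide loads cover the whole input.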
+ io[0] = load_input_data4(input + 0 * 8);
+ io[1] = load_input_data4(input + 1 * 8);
+ io[2] = load_input_data4(input + 2 * 8);
+ io[3] = load_input_data4(input + 3 * 8);
-// Only do addition and subtraction butterfly, size = 16, 32
-static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out,
- int size) {
- int i = 0;
- const int num = size >> 1;
- const int bound = size - 1;
- while (i < num) {
- out[i] = _mm_add_epi16(in[i], in[bound - i]);
- out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]);
- i++;
- }
+ idct8x8_12_add_kernel_ssse3(io);
+ write_buffer_8x8(io, dest, stride);
}
-#define BUTTERFLY_PAIR(x0, x1, co0, co1) \
- do { \
- tmp0 = _mm_madd_epi16(x0, co0); \
- tmp1 = _mm_madd_epi16(x1, co0); \
- tmp2 = _mm_madd_epi16(x0, co1); \
- tmp3 = _mm_madd_epi16(x1, co1); \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- } while (0)
-
-static INLINE void butterfly(const __m128i *x0, const __m128i *x1,
- const __m128i *c0, const __m128i *c1, __m128i *y0,
- __m128i *y1) {
- __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+// Group the coefficient calculations into smaller functions so the 32x32
+// idct optimizations do not spill registers to the stack:
+// quarter_1: 0-7
+// quarter_2: 8-15
+// quarter_3_4: 16-23, 24-31
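+//
+// The quarters compose as in idct32_34_8x32_ssse3() below:
+//   quarter_1_2: temp[0..15] (first half, recombined through its stage 7)
+//   quarter_3_4: temp[16..31] (second half)
+//   add_sub_butterfly(temp, out, 32): out[i] = temp[i] +/- temp[31 - i]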
- u0 = _mm_unpacklo_epi16(*x0, *x1);
- u1 = _mm_unpackhi_epi16(*x0, *x1);
- BUTTERFLY_PAIR(u0, u1, *c0, *c1);
- *y0 = _mm_packs_epi32(tmp0, tmp1);
- *y1 = _mm_packs_epi32(tmp2, tmp3);
+// For each 8x32 block __m128i in[32],
+// inputs with indexes 0 and 4 are non-zero;
+// output pixels 0-7 are written to __m128i out[8].
+static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+
+ // stage 4
+ step2[0] = partial_butterfly_cospi16_ssse3(in[0]);
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
+
+ // stage 5
+ step1[0] = step2[0];
+ step1[1] = step2[0];
+ step1[2] = step2[0];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi16(step1[0], step1[7]);
+ out[1] = _mm_add_epi16(step1[1], step1[6]);
+ out[2] = _mm_add_epi16(step1[2], step1[5]);
+ out[3] = _mm_add_epi16(step1[3], step1[4]);
+ out[4] = _mm_sub_epi16(step1[3], step1[4]);
+ out[5] = _mm_sub_epi16(step1[2], step1[5]);
+ out[6] = _mm_sub_epi16(step1[1], step1[6]);
+ out[7] = _mm_sub_epi16(step1[0], step1[7]);
}
-static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0,
- const __m128i *c1) {
- __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- u0 = _mm_unpacklo_epi16(*x0, *x1);
- u1 = _mm_unpackhi_epi16(*x0, *x1);
- BUTTERFLY_PAIR(u0, u1, *c0, *c1);
- *x0 = _mm_packs_epi32(tmp0, tmp1);
- *x1 = _mm_packs_epi32(tmp2, tmp3);
+// For each 8x32 block __m128i in[32],
+// inputs with indexes 2 and 6 are non-zero;
+// output pixels 8-15 are written to __m128i out[16].
+static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[16]*/) {
+ __m128i step1[16], step2[16];
+
+ // stage 2
+ partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+
+ idct32_8x32_quarter_2_stage_4_to_6(step1, out);
}
-static void idct32_34_first_half(const __m128i *in, __m128i *stp1) {
- const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
- const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
- const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
- const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
-
- const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
- const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- __m128i u0, u1, u2, u3, u4, u5, u6, u7;
- __m128i x0, x1, x4, x5, x6, x7;
- __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
-
- // phase 1
-
- // 0, 15
- u2 = _mm_mulhrs_epi16(in[2], stk2_1); // stp2_15
- u3 = _mm_mulhrs_epi16(in[6], stk2_7); // stp2_12
- v15 = _mm_add_epi16(u2, u3);
- // in[0], in[4]
- x0 = _mm_mulhrs_epi16(in[0], stk4_0); // stp1[0]
- x7 = _mm_mulhrs_epi16(in[4], stk3_1); // stp1[7]
- v0 = _mm_add_epi16(x0, x7); // stp2_0
- stp1[0] = _mm_add_epi16(v0, v15);
- stp1[15] = _mm_sub_epi16(v0, v15);
-
- // in[2], in[6]
- u0 = _mm_mulhrs_epi16(in[2], stk2_0); // stp2_8
- u1 = _mm_mulhrs_epi16(in[6], stk2_6); // stp2_11
- butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5); // stp2_9, stp2_14
- butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7); // stp2_10, stp2_13
-
- v8 = _mm_add_epi16(u0, u1);
- v9 = _mm_add_epi16(u4, u6);
- v10 = _mm_sub_epi16(u4, u6);
- v11 = _mm_sub_epi16(u0, u1);
- v12 = _mm_sub_epi16(u2, u3);
- v13 = _mm_sub_epi16(u5, u7);
- v14 = _mm_add_epi16(u5, u7);
-
- butterfly_self(&v10, &v13, &stg6_0, &stg4_0);
- butterfly_self(&v11, &v12, &stg6_0, &stg4_0);
-
- // 1, 14
- x1 = _mm_mulhrs_epi16(in[0], stk4_0); // stp1[1], stk4_1 = stk4_0
- // stp1[2] = stp1[0], stp1[3] = stp1[1]
- x4 = _mm_mulhrs_epi16(in[4], stk3_0); // stp1[4]
- butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6);
- v1 = _mm_add_epi16(x1, x6); // stp2_1
- v2 = _mm_add_epi16(x0, x5); // stp2_2
- stp1[1] = _mm_add_epi16(v1, v14);
- stp1[14] = _mm_sub_epi16(v1, v14);
-
- stp1[2] = _mm_add_epi16(v2, v13);
- stp1[13] = _mm_sub_epi16(v2, v13);
-
- v3 = _mm_add_epi16(x1, x4); // stp2_3
- v4 = _mm_sub_epi16(x1, x4); // stp2_4
-
- v5 = _mm_sub_epi16(x0, x5); // stp2_5
-
- v6 = _mm_sub_epi16(x1, x6); // stp2_6
- v7 = _mm_sub_epi16(x0, x7); // stp2_7
- stp1[3] = _mm_add_epi16(v3, v12);
- stp1[12] = _mm_sub_epi16(v3, v12);
-
- stp1[6] = _mm_add_epi16(v6, v9);
- stp1[9] = _mm_sub_epi16(v6, v9);
-
- stp1[7] = _mm_add_epi16(v7, v8);
- stp1[8] = _mm_sub_epi16(v7, v8);
-
- stp1[4] = _mm_add_epi16(v4, v11);
- stp1[11] = _mm_sub_epi16(v4, v11);
-
- stp1[5] = _mm_add_epi16(v5, v10);
- stp1[10] = _mm_sub_epi16(v5, v10);
+static INLINE void idct32_34_8x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ idct32_34_8x32_quarter_1(in, temp);
+ idct32_34_8x32_quarter_2(in, temp);
+ // stage 7
+ add_sub_butterfly(temp, out, 16);
}
-static void idct32_34_second_half(const __m128i *in, __m128i *stp1) {
- const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
- const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
- const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
- const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
- const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
- const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
- const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
- const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
- const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
- const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- __m128i v16, v17, v18, v19, v20, v21, v22, v23;
- __m128i v24, v25, v26, v27, v28, v29, v30, v31;
- __m128i u16, u17, u18, u19, u20, u21, u22, u23;
- __m128i u24, u25, u26, u27, u28, u29, u30, u31;
-
- v16 = _mm_mulhrs_epi16(in[1], stk1_0);
- v31 = _mm_mulhrs_epi16(in[1], stk1_1);
-
- v19 = _mm_mulhrs_epi16(in[7], stk1_6);
- v28 = _mm_mulhrs_epi16(in[7], stk1_7);
-
- v20 = _mm_mulhrs_epi16(in[5], stk1_8);
- v27 = _mm_mulhrs_epi16(in[5], stk1_9);
-
- v23 = _mm_mulhrs_epi16(in[3], stk1_14);
- v24 = _mm_mulhrs_epi16(in[3], stk1_15);
-
- butterfly(&v16, &v31, &stg3_4, &stg3_5, &v17, &v30);
- butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29);
- butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26);
- butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25);
-
- u16 = _mm_add_epi16(v16, v19);
- u17 = _mm_add_epi16(v17, v18);
- u18 = _mm_sub_epi16(v17, v18);
- u19 = _mm_sub_epi16(v16, v19);
- u20 = _mm_sub_epi16(v23, v20);
- u21 = _mm_sub_epi16(v22, v21);
- u22 = _mm_add_epi16(v22, v21);
- u23 = _mm_add_epi16(v23, v20);
- u24 = _mm_add_epi16(v24, v27);
- u27 = _mm_sub_epi16(v24, v27);
- u25 = _mm_add_epi16(v25, v26);
- u26 = _mm_sub_epi16(v25, v26);
- u28 = _mm_sub_epi16(v31, v28);
- u31 = _mm_add_epi16(v28, v31);
- u29 = _mm_sub_epi16(v30, v29);
- u30 = _mm_add_epi16(v29, v30);
-
- butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
- butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
- butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
- butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
-
- stp1[16] = _mm_add_epi16(u16, u23);
- stp1[23] = _mm_sub_epi16(u16, u23);
-
- stp1[17] = _mm_add_epi16(u17, u22);
- stp1[22] = _mm_sub_epi16(u17, u22);
-
- stp1[18] = _mm_add_epi16(u18, u21);
- stp1[21] = _mm_sub_epi16(u18, u21);
-
- stp1[19] = _mm_add_epi16(u19, u20);
- stp1[20] = _mm_sub_epi16(u19, u20);
-
- stp1[24] = _mm_sub_epi16(u31, u24);
- stp1[31] = _mm_add_epi16(u24, u31);
-
- stp1[25] = _mm_sub_epi16(u30, u25);
- stp1[30] = _mm_add_epi16(u25, u30);
-
- stp1[26] = _mm_sub_epi16(u29, u26);
- stp1[29] = _mm_add_epi16(u26, u29);
+// For each 8x32 block __m128i in[32],
+// inputs with odd indexes 1, 3, 5 and 7 are non-zero;
+// output pixels 16-23 and 24-31 are written to __m128i out[32].
+static INLINE void idct32_34_8x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32];
+
+ // stage 1
+ partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19],
+ &step1[28]);
+ partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23],
+ &step1[24]);
+
+ // stage 3
+ butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17],
+ &step1[30]);
+ butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18],
+ &step1[29]);
+ butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21],
+ &step1[26]);
+ butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22],
+ &step1[25]);
+
+ idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);
+}
- stp1[27] = _mm_sub_epi16(u28, u27);
- stp1[28] = _mm_add_epi16(u27, u28);
+void idct32_34_8x32_ssse3(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[32]*/) {
+ __m128i temp[32];
- butterfly_self(&stp1[20], &stp1[27], &stg6_0, &stg4_0);
- butterfly_self(&stp1[21], &stp1[26], &stg6_0, &stg4_0);
- butterfly_self(&stp1[22], &stp1[25], &stg6_0, &stg4_0);
- butterfly_self(&stp1[23], &stp1[24], &stg6_0, &stg4_0);
+ idct32_34_8x32_quarter_1_2(in, temp);
+ idct32_34_8x32_quarter_3_4(in, temp);
+ // final stage
+ add_sub_butterfly(temp, out, 32);
}
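+
+// A scalar sketch, for illustration only, of the recombination that
+// add_sub_butterfly(in, out, size) performs above, per 16-bit column.
+// (add_sub_butterfly_scalar_ref is a hypothetical name for this sketch.)
+static INLINE void add_sub_butterfly_scalar_ref(const int16_t *const in,
+ int16_t *const out,
+ const int size) {
+ int i;
+ for (i = 0; i < size / 2; ++i) {
+ out[i] = in[i] + in[size - 1 - i]; // first half: sums
+ out[size - 1 - i] = in[i] - in[size - 1 - i]; // second half: differences
+ }
+}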
// Only upper-left 8x8 has non-zero coeff
void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest,
int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- __m128i in[32], col[32];
- __m128i stp1[32];
+ __m128i io[32], col[32];
int i;
// Load input data. Only need to load the top left 8x8 block.
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 32);
- in[2] = load_input_data(input + 64);
- in[3] = load_input_data(input + 96);
- in[4] = load_input_data(input + 128);
- in[5] = load_input_data(input + 160);
- in[6] = load_input_data(input + 192);
- in[7] = load_input_data(input + 224);
+ load_transpose_16bit_8x8(input, 32, io);
+ idct32_34_8x32_ssse3(io, col);
- array_transpose_8x8(in, in);
- idct32_34_first_half(in, stp1);
- idct32_34_second_half(in, stp1);
-
- // 1_D: Store 32 intermediate results for each 8x32 block.
- add_sub_butterfly(stp1, col, 32);
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < 32; i += 8) {
int j;
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(col + i * 8, in);
- idct32_34_first_half(in, stp1);
- idct32_34_second_half(in, stp1);
+ transpose_16bit_8x8(col + i, io);
+ idct32_34_8x32_ssse3(io, io);
- // 2_D: Calculate the results and store them to destination.
- add_sub_butterfly(stp1, in, 32);
for (j = 0; j < 32; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- RECON_AND_STORE(dest + j * stride, in[j]);
+ write_buffer_8x1(dest + j * stride, io[j]);
}
dest += 8;
}
}
-// in0[16] represents the left 8x16 block
-// in1[16] represents the right 8x16 block
-static void load_buffer_16x16(const tran_low_t *input, __m128i *in0,
- __m128i *in1) {
- int i;
- for (i = 0; i < 16; i++) {
- in0[i] = load_input_data(input);
- in1[i] = load_input_data(input + 8);
- input += 32;
- }
-}
-
-static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0,
- __m128i *out1) {
- array_transpose_8x8(in0, out0);
- array_transpose_8x8(&in0[8], out1);
- array_transpose_8x8(in1, &out0[8]);
- array_transpose_8x8(&in1[8], &out1[8]);
-}
-
-// Group the coefficient calculation into smaller functions
-// to prevent stack spillover:
-// quarter_1: 0-7
-// quarter_2: 8-15
-// quarter_3_4: 16-23, 24-31
-static void idct32_8x32_135_quarter_1(const __m128i *in /*in[16]*/,
- __m128i *out /*out[8]*/) {
- __m128i u0, u1, u2, u3, u4, u5, u6, u7;
- __m128i v0, v1, v2, v3, v4, v5, v6, v7;
-
- {
- const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
- const __m128i stk4_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
- const __m128i stk4_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
- u0 = _mm_mulhrs_epi16(in[0], stk4_0);
- u2 = _mm_mulhrs_epi16(in[8], stk4_2);
- u3 = _mm_mulhrs_epi16(in[8], stk4_3);
- u1 = u0;
- }
-
- v0 = _mm_add_epi16(u0, u3);
- v1 = _mm_add_epi16(u1, u2);
- v2 = _mm_sub_epi16(u1, u2);
- v3 = _mm_sub_epi16(u0, u3);
-
- {
- const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
- const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
- const __m128i stk3_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
- const __m128i stk3_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
- u4 = _mm_mulhrs_epi16(in[4], stk3_0);
- u7 = _mm_mulhrs_epi16(in[4], stk3_1);
- u5 = _mm_mulhrs_epi16(in[12], stk3_2);
- u6 = _mm_mulhrs_epi16(in[12], stk3_3);
- }
-
- v4 = _mm_add_epi16(u4, u5);
- v5 = _mm_sub_epi16(u4, u5);
- v6 = _mm_sub_epi16(u7, u6);
- v7 = _mm_add_epi16(u7, u6);
-
- {
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
- }
-
- out[0] = _mm_add_epi16(v0, v7);
- out[1] = _mm_add_epi16(v1, v6);
- out[2] = _mm_add_epi16(v2, v5);
- out[3] = _mm_add_epi16(v3, v4);
- out[4] = _mm_sub_epi16(v3, v4);
- out[5] = _mm_sub_epi16(v2, v5);
- out[6] = _mm_sub_epi16(v1, v6);
- out[7] = _mm_sub_epi16(v0, v7);
+// For each 8x32 block __m128i in[32],
+// inputs with indexes 0, 4, 8 and 12 are non-zero;
+// output pixels 0-7 are written to __m128i out[8].
+static INLINE void idct32_135_8x32_quarter_1(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+ partial_butterfly_ssse3(in[12], -cospi_20_64, cospi_12_64, &step1[5],
+ &step1[6]);
+
+ // stage 4
+ step2[0] = partial_butterfly_cospi16_ssse3(in[0]);
+ partial_butterfly_ssse3(in[8], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+ step2[4] = _mm_add_epi16(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+ // stage 5
+ step1[0] = _mm_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm_add_epi16(step2[0], step2[2]);
+ step1[2] = _mm_sub_epi16(step2[0], step2[2]);
+ step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi16(step1[0], step1[7]);
+ out[1] = _mm_add_epi16(step1[1], step1[6]);
+ out[2] = _mm_add_epi16(step1[2], step1[5]);
+ out[3] = _mm_add_epi16(step1[3], step1[4]);
+ out[4] = _mm_sub_epi16(step1[3], step1[4]);
+ out[5] = _mm_sub_epi16(step1[2], step1[5]);
+ out[6] = _mm_sub_epi16(step1[1], step1[6]);
+ out[7] = _mm_sub_epi16(step1[0], step1[7]);
}
-static void idct32_8x32_135_quarter_2(const __m128i *in /*in[16]*/,
- __m128i *out /*out[8]*/) {
- __m128i u8, u9, u10, u11, u12, u13, u14, u15;
- __m128i v8, v9, v10, v11, v12, v13, v14, v15;
-
- {
- const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
- const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
- const __m128i stk2_2 = pair_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64);
- const __m128i stk2_3 = pair_set_epi16(2 * cospi_14_64, 2 * cospi_14_64);
- const __m128i stk2_4 = pair_set_epi16(2 * cospi_22_64, 2 * cospi_22_64);
- const __m128i stk2_5 = pair_set_epi16(2 * cospi_10_64, 2 * cospi_10_64);
- const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
- const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
- u8 = _mm_mulhrs_epi16(in[2], stk2_0);
- u15 = _mm_mulhrs_epi16(in[2], stk2_1);
- u9 = _mm_mulhrs_epi16(in[14], stk2_2);
- u14 = _mm_mulhrs_epi16(in[14], stk2_3);
- u10 = _mm_mulhrs_epi16(in[10], stk2_4);
- u13 = _mm_mulhrs_epi16(in[10], stk2_5);
- u11 = _mm_mulhrs_epi16(in[6], stk2_6);
- u12 = _mm_mulhrs_epi16(in[6], stk2_7);
- }
-
- v8 = _mm_add_epi16(u8, u9);
- v9 = _mm_sub_epi16(u8, u9);
- v10 = _mm_sub_epi16(u11, u10);
- v11 = _mm_add_epi16(u11, u10);
- v12 = _mm_add_epi16(u12, u13);
- v13 = _mm_sub_epi16(u12, u13);
- v14 = _mm_sub_epi16(u15, u14);
- v15 = _mm_add_epi16(u15, u14);
-
- {
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
- butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
- }
-
- out[0] = _mm_add_epi16(v8, v11);
- out[1] = _mm_add_epi16(v9, v10);
- out[2] = _mm_sub_epi16(v9, v10);
- out[3] = _mm_sub_epi16(v8, v11);
- out[4] = _mm_sub_epi16(v15, v12);
- out[5] = _mm_sub_epi16(v14, v13);
- out[6] = _mm_add_epi16(v14, v13);
- out[7] = _mm_add_epi16(v15, v12);
-
- {
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
- butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
- }
+// For each 8x32 block __m128i in[32],
+// inputs with indexes 2, 6, 10 and 14 are non-zero;
+// output pixels 8-15 are written to __m128i out[16].
+static INLINE void idct32_135_8x32_quarter_2(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[16]*/) {
+ __m128i step1[16], step2[16];
+
+ // stage 2
+ partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ partial_butterfly_ssse3(in[14], -cospi_18_64, cospi_14_64, &step2[9],
+ &step2[14]);
+ partial_butterfly_ssse3(in[10], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = _mm_add_epi16(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi16(step2[8], step2[9]);
+ step1[10] = _mm_sub_epi16(step2[11], step2[10]);
+ step1[11] = _mm_add_epi16(step2[11], step2[10]);
+ step1[12] = _mm_add_epi16(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi16(step2[12], step2[13]);
+ step1[14] = _mm_sub_epi16(step2[15], step2[14]);
+ step1[15] = _mm_add_epi16(step2[15], step2[14]);
+
+ idct32_8x32_quarter_2_stage_4_to_6(step1, out);
}
-// 8x32 block even indexed 8 inputs of in[16],
-// output first half 16 to out[32]
-static void idct32_8x32_quarter_1_2(const __m128i *in /*in[16]*/,
- __m128i *out /*out[32]*/) {
+static INLINE void idct32_135_8x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
__m128i temp[16];
- idct32_8x32_135_quarter_1(in, temp);
- idct32_8x32_135_quarter_2(in, &temp[8]);
+ idct32_135_8x32_quarter_1(in, temp);
+ idct32_135_8x32_quarter_2(in, temp);
+ // stage 7
add_sub_butterfly(temp, out, 16);
}
-// 8x32 block odd indexed 8 inputs of in[16],
-// output second half 16 to out[32]
-static void idct32_8x32_quarter_3_4(const __m128i *in /*in[16]*/,
- __m128i *out /*out[32]*/) {
- __m128i v16, v17, v18, v19, v20, v21, v22, v23;
- __m128i v24, v25, v26, v27, v28, v29, v30, v31;
- __m128i u16, u17, u18, u19, u20, u21, u22, u23;
- __m128i u24, u25, u26, u27, u28, u29, u30, u31;
-
- {
- const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
- const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
- const __m128i stk1_2 = pair_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64);
- const __m128i stk1_3 = pair_set_epi16(2 * cospi_15_64, 2 * cospi_15_64);
-
- const __m128i stk1_4 = pair_set_epi16(2 * cospi_23_64, 2 * cospi_23_64);
- const __m128i stk1_5 = pair_set_epi16(2 * cospi_9_64, 2 * cospi_9_64);
- const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
- const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
- const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
- const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
- const __m128i stk1_10 = pair_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64);
- const __m128i stk1_11 = pair_set_epi16(2 * cospi_11_64, 2 * cospi_11_64);
-
- const __m128i stk1_12 = pair_set_epi16(2 * cospi_19_64, 2 * cospi_19_64);
- const __m128i stk1_13 = pair_set_epi16(2 * cospi_13_64, 2 * cospi_13_64);
- const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
- const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
- u16 = _mm_mulhrs_epi16(in[1], stk1_0);
- u31 = _mm_mulhrs_epi16(in[1], stk1_1);
- u17 = _mm_mulhrs_epi16(in[15], stk1_2);
- u30 = _mm_mulhrs_epi16(in[15], stk1_3);
-
- u18 = _mm_mulhrs_epi16(in[9], stk1_4);
- u29 = _mm_mulhrs_epi16(in[9], stk1_5);
- u19 = _mm_mulhrs_epi16(in[7], stk1_6);
- u28 = _mm_mulhrs_epi16(in[7], stk1_7);
-
- u20 = _mm_mulhrs_epi16(in[5], stk1_8);
- u27 = _mm_mulhrs_epi16(in[5], stk1_9);
- u21 = _mm_mulhrs_epi16(in[11], stk1_10);
- u26 = _mm_mulhrs_epi16(in[11], stk1_11);
-
- u22 = _mm_mulhrs_epi16(in[13], stk1_12);
- u25 = _mm_mulhrs_epi16(in[13], stk1_13);
- u23 = _mm_mulhrs_epi16(in[3], stk1_14);
- u24 = _mm_mulhrs_epi16(in[3], stk1_15);
- }
-
- v16 = _mm_add_epi16(u16, u17);
- v17 = _mm_sub_epi16(u16, u17);
- v18 = _mm_sub_epi16(u19, u18);
- v19 = _mm_add_epi16(u19, u18);
-
- v20 = _mm_add_epi16(u20, u21);
- v21 = _mm_sub_epi16(u20, u21);
- v22 = _mm_sub_epi16(u23, u22);
- v23 = _mm_add_epi16(u23, u22);
-
- v24 = _mm_add_epi16(u24, u25);
- v25 = _mm_sub_epi16(u24, u25);
- v26 = _mm_sub_epi16(u27, u26);
- v27 = _mm_add_epi16(u27, u26);
-
- v28 = _mm_add_epi16(u28, u29);
- v29 = _mm_sub_epi16(u28, u29);
- v30 = _mm_sub_epi16(u31, u30);
- v31 = _mm_add_epi16(u31, u30);
-
- {
- const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
- const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
- butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
- butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
- butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
- butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
- }
-
- u16 = _mm_add_epi16(v16, v19);
- u17 = _mm_add_epi16(v17, v18);
- u18 = _mm_sub_epi16(v17, v18);
- u19 = _mm_sub_epi16(v16, v19);
- u20 = _mm_sub_epi16(v23, v20);
- u21 = _mm_sub_epi16(v22, v21);
- u22 = _mm_add_epi16(v22, v21);
- u23 = _mm_add_epi16(v23, v20);
-
- u24 = _mm_add_epi16(v24, v27);
- u25 = _mm_add_epi16(v25, v26);
- u26 = _mm_sub_epi16(v25, v26);
- u27 = _mm_sub_epi16(v24, v27);
- u28 = _mm_sub_epi16(v31, v28);
- u29 = _mm_sub_epi16(v30, v29);
- u30 = _mm_add_epi16(v29, v30);
- u31 = _mm_add_epi16(v28, v31);
-
- {
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
- butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
- butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
- butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
- }
-
- out[0] = _mm_add_epi16(u16, u23);
- out[1] = _mm_add_epi16(u17, u22);
- out[2] = _mm_add_epi16(u18, u21);
- out[3] = _mm_add_epi16(u19, u20);
- v20 = _mm_sub_epi16(u19, u20);
- v21 = _mm_sub_epi16(u18, u21);
- v22 = _mm_sub_epi16(u17, u22);
- v23 = _mm_sub_epi16(u16, u23);
-
- v24 = _mm_sub_epi16(u31, u24);
- v25 = _mm_sub_epi16(u30, u25);
- v26 = _mm_sub_epi16(u29, u26);
- v27 = _mm_sub_epi16(u28, u27);
- out[12] = _mm_add_epi16(u27, u28);
- out[13] = _mm_add_epi16(u26, u29);
- out[14] = _mm_add_epi16(u25, u30);
- out[15] = _mm_add_epi16(u24, u31);
-
- {
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]);
- butterfly(&v21, &v26, &stg6_0, &stg4_0, &out[5], &out[10]);
- butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]);
- butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]);
- }
-}
-
-// 8x16 block, input __m128i in[16], output __m128i in[32]
-static void idct32_8x32_135(__m128i *in /*in[32]*/) {
- __m128i out[32];
- idct32_8x32_quarter_1_2(in, out);
- idct32_8x32_quarter_3_4(in, &out[16]);
- add_sub_butterfly(out, in, 32);
-}
-
-static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) {
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- const __m128i zero = _mm_setzero_si128();
- int j = 0;
- while (j < 32) {
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding);
-
- in[j] = _mm_srai_epi16(in[j], 6);
- in[j + 1] = _mm_srai_epi16(in[j + 1], 6);
-
- RECON_AND_STORE(dst, in[j]);
- dst += stride;
- RECON_AND_STORE(dst, in[j + 1]);
- dst += stride;
- j += 2;
- }
-}
-
-static INLINE void recon_and_store(__m128i *in0, __m128i *in1, uint8_t *dest,
- int stride) {
- store_buffer_8x32(in0, dest, stride);
- store_buffer_8x32(in1, dest + 8, stride);
-}
-
-static INLINE void idct32_135(__m128i *col0, __m128i *col1) {
- idct32_8x32_135(col0);
- idct32_8x32_135(col1);
-}
-
-typedef enum { left_16, right_16 } ColsIndicator;
-
-static void transpose_and_copy_16x16(__m128i *in0, __m128i *in1, __m128i *store,
- ColsIndicator cols) {
- switch (cols) {
- case left_16: {
- int i;
- array_transpose_16x16(in0, in1);
- for (i = 0; i < 16; ++i) {
- store[i] = in0[16 + i];
- store[16 + i] = in1[16 + i];
- }
- break;
- }
- case right_16: {
- array_transpose_16x16_2(store, &store[16], in0, in1);
- break;
- }
- default: { assert(0); }
- }
-}
-
-// Only upper-left 16x16 has non-zero coeff
-void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest,
- int stride) {
- // Each array represents an 8x32 block
- __m128i col0[32], col1[32];
- // This array represents a 16x16 block
- __m128i temp[32];
-
- // Load input data. Only need to load the top left 16x16 block.
- load_buffer_16x16(input, col0, col1);
-
- // columns
- array_transpose_16x16(col0, col1);
- idct32_135(col0, col1);
-
- // rows
- transpose_and_copy_16x16(col0, col1, temp, left_16);
- idct32_135(col0, col1);
- recon_and_store(col0, col1, dest, stride);
-
- transpose_and_copy_16x16(col0, col1, temp, right_16);
- idct32_135(col0, col1);
- recon_and_store(col0, col1, dest + 16, stride);
-}
-
-// For each 8x32 block __m128i in[32],
-// Input with index, 2, 6, 10, 14, 18, 22, 26, 30
-// output pixels: 8-15 in __m128i in[32]
-static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/,
- __m128i *out /*out[16]*/) {
- __m128i u8, u9, u10, u11, u12, u13, u14, u15; // stp2_
- __m128i v8, v9, v10, v11, v12, v13, v14, v15; // stp1_
-
- {
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
- butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15);
- butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14);
- }
-
- v8 = _mm_add_epi16(u8, u9);
- v9 = _mm_sub_epi16(u8, u9);
- v14 = _mm_sub_epi16(u15, u14);
- v15 = _mm_add_epi16(u15, u14);
-
- {
- const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
- butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13);
- butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12);
- }
-
- v10 = _mm_sub_epi16(u11, u10);
- v11 = _mm_add_epi16(u11, u10);
- v12 = _mm_add_epi16(u12, u13);
- v13 = _mm_sub_epi16(u12, u13);
-
- {
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
- butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
- }
-
- out[0] = _mm_add_epi16(v8, v11);
- out[1] = _mm_add_epi16(v9, v10);
- out[6] = _mm_add_epi16(v14, v13);
- out[7] = _mm_add_epi16(v15, v12);
-
- out[2] = _mm_sub_epi16(v9, v10);
- out[3] = _mm_sub_epi16(v8, v11);
- out[4] = _mm_sub_epi16(v15, v12);
- out[5] = _mm_sub_epi16(v14, v13);
-
- {
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
- butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
- }
-}
-
-// For each 8x32 block __m128i in[32],
-// Inputs with indices 0, 4, 8, 12, 16, 20, 24, 28
-// output pixels: 0-7 in __m128i in[32]
-static void idct32_full_8x32_quarter_1(const __m128i *in /*in[32]*/,
- __m128i *out /*out[8]*/) {
- __m128i u0, u1, u2, u3, u4, u5, u6, u7; // stp1_
- __m128i v0, v1, v2, v3, v4, v5, v6, v7; // stp2_
-
- {
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
- butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7);
- butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6);
- }
-
- v4 = _mm_add_epi16(u4, u5);
- v5 = _mm_sub_epi16(u4, u5);
- v6 = _mm_sub_epi16(u7, u6);
- v7 = _mm_add_epi16(u7, u6);
-
- {
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
-
- butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1);
- butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3);
- }
-
- v0 = _mm_add_epi16(u0, u3);
- v1 = _mm_add_epi16(u1, u2);
- v2 = _mm_sub_epi16(u1, u2);
- v3 = _mm_sub_epi16(u0, u3);
-
- out[0] = _mm_add_epi16(v0, v7);
- out[1] = _mm_add_epi16(v1, v6);
- out[2] = _mm_add_epi16(v2, v5);
- out[3] = _mm_add_epi16(v3, v4);
- out[4] = _mm_sub_epi16(v3, v4);
- out[5] = _mm_sub_epi16(v2, v5);
- out[6] = _mm_sub_epi16(v1, v6);
- out[7] = _mm_sub_epi16(v0, v7);
-}
-
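Throughout these quarter kernels, each butterfly() call performs a fixed-point rotation by a pair of cospi constants. A minimal scalar sketch of what one call computes, assuming the library's 14-bit cospi_*_64 fixed-point constants (DCT_CONST_BITS == 14); round_shift14 and butterfly_scalar are hypothetical names:

#include <stdint.h>

// Scalar model of butterfly(&a, &b, pair(c0, -c1), pair(c1, c0), &o0, &o1):
// o0 = a*c0 - b*c1 and o1 = a*c1 + b*c0, each rounded back to 16 bits.
static int16_t round_shift14(int32_t x) {
  return (int16_t)((x + (1 << 13)) >> 14);
}

static void butterfly_scalar(int16_t a, int16_t b, int c0, int c1,
                             int16_t *o0, int16_t *o1) {
  *o0 = round_shift14(a * c0 - b * c1);
  *o1 = round_shift14(a * c1 + b * c0);
}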
// For each 8x32 block __m128i in[32],
// Inputs with odd indices:
-// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
-// output pixels: 16-23, 24-31 in __m128i in[32]
-// We avoid hiding an offset, 16, inside this function, so we output 0-15
-// into array out[16].
-static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/,
- __m128i *out /*out[16]*/) {
- __m128i v16, v17, v18, v19, v20, v21, v22, v23;
- __m128i v24, v25, v26, v27, v28, v29, v30, v31;
- __m128i u16, u17, u18, u19, u20, u21, u22, u23;
- __m128i u24, u25, u26, u27, u28, u29, u30, u31;
-
- {
- const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
- const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
- const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
- const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
- const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
- const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
- const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
- const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
- const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
- const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
- const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
- const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
- const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
- const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
- butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31);
- butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30);
- butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29);
- butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28);
-
- butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27);
- butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26);
-
- butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25);
- butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24);
- }
-
- v16 = _mm_add_epi16(u16, u17);
- v17 = _mm_sub_epi16(u16, u17);
- v18 = _mm_sub_epi16(u19, u18);
- v19 = _mm_add_epi16(u19, u18);
-
- v20 = _mm_add_epi16(u20, u21);
- v21 = _mm_sub_epi16(u20, u21);
- v22 = _mm_sub_epi16(u23, u22);
- v23 = _mm_add_epi16(u23, u22);
-
- v24 = _mm_add_epi16(u24, u25);
- v25 = _mm_sub_epi16(u24, u25);
- v26 = _mm_sub_epi16(u27, u26);
- v27 = _mm_add_epi16(u27, u26);
-
- v28 = _mm_add_epi16(u28, u29);
- v29 = _mm_sub_epi16(u28, u29);
- v30 = _mm_sub_epi16(u31, u30);
- v31 = _mm_add_epi16(u31, u30);
-
- {
- const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
- const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
- butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
- butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
- butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
- butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
- }
-
- u16 = _mm_add_epi16(v16, v19);
- u17 = _mm_add_epi16(v17, v18);
- u18 = _mm_sub_epi16(v17, v18);
- u19 = _mm_sub_epi16(v16, v19);
- u20 = _mm_sub_epi16(v23, v20);
- u21 = _mm_sub_epi16(v22, v21);
- u22 = _mm_add_epi16(v22, v21);
- u23 = _mm_add_epi16(v23, v20);
-
- u24 = _mm_add_epi16(v24, v27);
- u25 = _mm_add_epi16(v25, v26);
- u26 = _mm_sub_epi16(v25, v26);
- u27 = _mm_sub_epi16(v24, v27);
-
- u28 = _mm_sub_epi16(v31, v28);
- u29 = _mm_sub_epi16(v30, v29);
- u30 = _mm_add_epi16(v29, v30);
- u31 = _mm_add_epi16(v28, v31);
-
- {
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
- butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
- butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
- butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
- }
-
- out[0] = _mm_add_epi16(u16, u23);
- out[1] = _mm_add_epi16(u17, u22);
- out[2] = _mm_add_epi16(u18, u21);
- out[3] = _mm_add_epi16(u19, u20);
- out[4] = _mm_sub_epi16(u19, u20);
- out[5] = _mm_sub_epi16(u18, u21);
- out[6] = _mm_sub_epi16(u17, u22);
- out[7] = _mm_sub_epi16(u16, u23);
-
- out[8] = _mm_sub_epi16(u31, u24);
- out[9] = _mm_sub_epi16(u30, u25);
- out[10] = _mm_sub_epi16(u29, u26);
- out[11] = _mm_sub_epi16(u28, u27);
- out[12] = _mm_add_epi16(u27, u28);
- out[13] = _mm_add_epi16(u26, u29);
- out[14] = _mm_add_epi16(u25, u30);
- out[15] = _mm_add_epi16(u24, u31);
-
- {
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0);
- butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0);
- butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0);
- butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0);
- }
-}
-
-static void idct32_full_8x32_quarter_1_2(const __m128i *in /*in[32]*/,
- __m128i *out /*out[32]*/) {
- __m128i temp[16];
- idct32_full_8x32_quarter_1(in, temp);
- idct32_full_8x32_quarter_2(in, &temp[8]);
- add_sub_butterfly(temp, out, 16);
+// 1, 3, 5, 7, 9, 11, 13, 15
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void idct32_135_8x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ partial_butterfly_ssse3(in[15], -cospi_17_64, cospi_15_64, &step1[17],
+ &step1[30]);
+ partial_butterfly_ssse3(in[9], cospi_23_64, cospi_9_64, &step1[18],
+ &step1[29]);
+ partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19],
+ &step1[28]);
+
+ partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ partial_butterfly_ssse3(in[11], -cospi_21_64, cospi_11_64, &step1[21],
+ &step1[26]);
+
+ partial_butterfly_ssse3(in[13], cospi_19_64, cospi_13_64, &step1[22],
+ &step1[25]);
+ partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23],
+ &step1[24]);
+
+ // stage 2
+ step2[16] = _mm_add_epi16(step1[16], step1[17]);
+ step2[17] = _mm_sub_epi16(step1[16], step1[17]);
+ step2[18] = _mm_sub_epi16(step1[19], step1[18]);
+ step2[19] = _mm_add_epi16(step1[19], step1[18]);
+ step2[20] = _mm_add_epi16(step1[20], step1[21]);
+ step2[21] = _mm_sub_epi16(step1[20], step1[21]);
+ step2[22] = _mm_sub_epi16(step1[23], step1[22]);
+ step2[23] = _mm_add_epi16(step1[23], step1[22]);
+
+ step2[24] = _mm_add_epi16(step1[24], step1[25]);
+ step2[25] = _mm_sub_epi16(step1[24], step1[25]);
+ step2[26] = _mm_sub_epi16(step1[27], step1[26]);
+ step2[27] = _mm_add_epi16(step1[27], step1[26]);
+ step2[28] = _mm_add_epi16(step1[28], step1[29]);
+ step2[29] = _mm_sub_epi16(step1[28], step1[29]);
+ step2[30] = _mm_sub_epi16(step1[31], step1[30]);
+ step2[31] = _mm_add_epi16(step1[31], step1[30]);
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17],
+ &step1[30]);
+ butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18],
+ &step1[29]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21],
+ &step1[26]);
+ butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22],
+ &step1[25]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);
}
-static void idct32_full_8x32(const __m128i *in /*in[32]*/,
- __m128i *out /*out[32]*/) {
+void idct32_135_8x32_ssse3(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[32]*/) {
__m128i temp[32];
- idct32_full_8x32_quarter_1_2(in, temp);
- idct32_full_8x32_quarter_3_4(in, &temp[16]);
+ idct32_135_8x32_quarter_1_2(in, temp);
+ idct32_135_8x32_quarter_3_4(in, temp);
+ // final stage
add_sub_butterfly(temp, out, 32);
}
-static void load_buffer_8x32(const tran_low_t *input, __m128i *in) {
+void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i col[2][32], io[32];
int i;
- for (i = 0; i < 8; ++i) {
- in[i] = load_input_data(input);
- in[i + 8] = load_input_data(input + 8);
- in[i + 16] = load_input_data(input + 16);
- in[i + 24] = load_input_data(input + 24);
- input += 32;
- }
-}
-
-void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest,
- int stride) {
- __m128i col[128], in[32];
- int i, j;
// rows
- for (i = 0; i < 4; ++i) {
- load_buffer_8x32(input, in);
+ for (i = 0; i < 2; i++) {
+ load_transpose_16bit_8x8(&input[0], 32, &io[0]);
+ load_transpose_16bit_8x8(&input[8], 32, &io[8]);
+ idct32_135_8x32_ssse3(io, col[i]);
input += 32 << 3;
-
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(in, in);
- array_transpose_8x8(in + 8, in + 8);
- array_transpose_8x8(in + 16, in + 16);
- array_transpose_8x8(in + 24, in + 24);
-
- idct32_full_8x32(in, col + (i << 5));
}
// columns
- for (i = 0; i < 4; ++i) {
- j = i << 3;
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(col + j, in);
- array_transpose_8x8(col + j + 32, in + 8);
- array_transpose_8x8(col + j + 64, in + 16);
- array_transpose_8x8(col + j + 96, in + 24);
-
- idct32_full_8x32(in, in);
- store_buffer_8x32(in, dest, stride);
+ for (i = 0; i < 32; i += 8) {
+ transpose_16bit_8x8(col[0] + i, io);
+ transpose_16bit_8x8(col[1] + i, io + 8);
+ idct32_135_8x32_ssse3(io, io);
+ store_buffer_8x32(io, dest, stride);
dest += 8;
}
}
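The rewritten function above keeps the classic separable structure: a 1-D transform over groups of rows, a transpose, then a 1-D transform over groups of columns, with idct32_135_8x32_ssse3() handling eight rows or columns per call. A plain-C outline of that two-pass shape, with the 1-D kernel abstracted as a callback (idct2d_sketch and idct1d_fn are illustrative names, not part of the library):

#include <stdint.h>

typedef void (*idct1d_fn)(const int16_t *in, int16_t *out);  // 32-point 1-D IDCT

static void idct2d_sketch(const int16_t input[32 * 32], int16_t out[32 * 32],
                          idct1d_fn idct1d) {
  int16_t tmp[32 * 32], col[32];
  int r, c;
  // Rows pass: with at most 135 non-zero coefficients only the top-left
  // 16x16 of the input matters, hence only two 8-row iterations above.
  for (r = 0; r < 32; ++r) idct1d(&input[32 * r], &tmp[32 * r]);
  // Columns pass on the transposed intermediate.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r) col[r] = tmp[32 * r + c];
    idct1d(col, col);
    for (r = 0; r < 32; ++r) out[32 * r + c] = col[r];
  }
}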
diff --git a/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h b/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h
new file mode 100644
index 000000000..e785c8eda
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_INV_TXFM_SSSE3_H_
+#define VPX_DSP_X86_INV_TXFM_SSSE3_H_
+
+#include <tmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) {
+ const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64);
+ const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64);
+ const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64);
+ const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i cospi_16_64d = _mm_set1_epi16((int16_t)(2 * cospi_16_64));
+ const __m128i cospi_28_64d = _mm_set1_epi16((int16_t)(2 * cospi_28_64));
+ const __m128i cospi_4_64d = _mm_set1_epi16((int16_t)(2 * cospi_4_64));
+ const __m128i cospi_n20_64d = _mm_set1_epi16((int16_t)(-2 * cospi_20_64));
+ const __m128i cospi_12_64d = _mm_set1_epi16((int16_t)(2 * cospi_12_64));
+ const __m128i cospi_24_64d = _mm_set1_epi16((int16_t)(2 * cospi_24_64));
+ const __m128i cospi_8_64d = _mm_set1_epi16((int16_t)(2 * cospi_8_64));
+ __m128i step1[8], step2[8], tmp[4];
+
+ // pass 1
+
+ transpose_16bit_4x4(io, io);
+ // io[0]: 00 10 20 30 01 11 21 31
+ // io[1]: 02 12 22 32 03 13 23 33
+
+ // stage 1
+ tmp[0] = _mm_unpacklo_epi64(io[0], io[0]);
+ tmp[1] = _mm_unpackhi_epi64(io[0], io[0]);
+ tmp[2] = _mm_unpacklo_epi64(io[1], io[1]);
+ tmp[3] = _mm_unpackhi_epi64(io[1], io[1]);
+ step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d); // step1 4&7
+ step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d); // step1 5&6
+
+ // stage 2
+ step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d); // step2 0&1
+ step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d); // step2 3&2
+ step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6
+ step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]); // step2 6
+
+ // stage 3
+ tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]);
+ step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]); // step1 5&6
+ tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1
+ tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2
+ step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1
+ step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0
+
+ // stage 4
+ tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0
+ tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1
+ tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7
+ tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6
+
+ // pass 2
+
+ idct8x8_12_transpose_16bit_4x8(tmp, io);
+
+ // stage 1
+ step1[4] = _mm_mulhrs_epi16(io[1], cospi_28_64d);
+ step1[7] = _mm_mulhrs_epi16(io[1], cospi_4_64d);
+ step1[5] = _mm_mulhrs_epi16(io[3], cospi_n20_64d);
+ step1[6] = _mm_mulhrs_epi16(io[3], cospi_12_64d);
+
+ // stage 2
+ step2[0] = _mm_mulhrs_epi16(io[0], cospi_16_64d); // step2[1] = step2[0]
+ step2[2] = _mm_mulhrs_epi16(io[2], cospi_24_64d);
+ step2[3] = _mm_mulhrs_epi16(io[2], cospi_8_64d);
+ step2[4] = _mm_add_epi16(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm_add_epi16(step2[0], step2[2]);
+ step1[2] = _mm_sub_epi16(step2[0], step2[2]);
+ step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+ butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+
+ // stage 4
+ io[0] = _mm_add_epi16(step1[0], step2[7]);
+ io[1] = _mm_add_epi16(step1[1], step1[6]);
+ io[2] = _mm_add_epi16(step1[2], step1[5]);
+ io[3] = _mm_add_epi16(step1[3], step2[4]);
+ io[4] = _mm_sub_epi16(step1[3], step2[4]);
+ io[5] = _mm_sub_epi16(step1[2], step1[5]);
+ io[6] = _mm_sub_epi16(step1[1], step1[6]);
+ io[7] = _mm_sub_epi16(step1[0], step2[7]);
+}
+
+void idct32_135_8x32_ssse3(const __m128i *const in, __m128i *const out);
+
+#endif // VPX_DSP_X86_INV_TXFM_SSSE3_H_
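A note on the doubled constants in the kernel above (cospi_28_64d and friends): _mm_mulhrs_epi16(x, k) computes (x * k + (1 << 14)) >> 15 in every 16-bit lane, so feeding it k = 2 * cospi yields (x * cospi + (1 << 13)) >> 14, exactly the 14-bit dct_const_round_shift, in a single instruction. That shortcut is valid whenever the second butterfly input is known to be zero, which is what this 12-coefficient path exploits. A scalar model of one lane (mulhrs_lane is a hypothetical name):

#include <stdint.h>

static int16_t mulhrs_lane(int16_t x, int16_t k) {
  return (int16_t)(((int32_t)x * k + (1 << 14)) >> 15);
}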
diff --git a/libvpx/vpx_dsp/x86/mem_sse2.h b/libvpx/vpx_dsp/x86/mem_sse2.h
new file mode 100644
index 000000000..2ce738fb7
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/mem_sse2.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_MEM_SSE2_H_
+#define VPX_DSP_X86_MEM_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_config.h"
+
+static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride,
+ __m128i *const d) {
+ d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride));
+ d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride));
+ d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride));
+ d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride));
+}
+
+static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride,
+ __m128i *const d) {
+ load_8bit_4x4(s + 0 * stride, stride, &d[0]);
+ load_8bit_4x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride,
+ __m128i *const d) {
+ d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride));
+ d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride));
+ d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride));
+ d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride));
+}
+
+static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride,
+ __m128i *const d) {
+ load_8bit_8x4(s + 0 * stride, stride, &d[0]);
+ load_8bit_8x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void load_8bit_16x8(const uint8_t *const s,
+ const ptrdiff_t stride, __m128i *const d) {
+ d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride));
+ d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride));
+ d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride));
+ d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride));
+ d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride));
+ d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride));
+ d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride));
+ d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride));
+}
+
+static INLINE void loadu_8bit_16x4(const uint8_t *const s,
+ const ptrdiff_t stride, __m128i *const d) {
+ d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride));
+ d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride));
+ d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride));
+ d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride));
+}
+
+static INLINE void loadu_8bit_16x8(const uint8_t *const s,
+ const ptrdiff_t stride, __m128i *const d) {
+ loadu_8bit_16x4(s + 0 * stride, stride, &d[0]);
+ loadu_8bit_16x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
+ _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s));
+}
+
+static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]);
+ *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]);
+ *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]);
+ *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]);
+}
+
+static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ __m128i ss[4];
+
+ ss[0] = s;
+ ss[1] = _mm_srli_si128(s, 4);
+ ss[2] = _mm_srli_si128(s, 8);
+ ss[3] = _mm_srli_si128(s, 12);
+ store_8bit_4x4(ss, d, stride);
+}
+
+static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s,
+ uint8_t *const d,
+ const ptrdiff_t stride) {
+ _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
+ _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]);
+ _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]);
+ _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]);
+}
+
+static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
+ _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]);
+ _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]);
+ _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]);
+ _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]);
+ _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]);
+ _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]);
+ _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]);
+}
+
+static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]);
+ _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]);
+ _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]);
+ _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]);
+}
+
+#endif // VPX_DSP_X86_MEM_SSE2_H_
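A usage sketch for the helpers above (copy_8x8 is a hypothetical caller, not part of the header): the load/store pairs compose directly into block moves.

#include <stddef.h>
#include <stdint.h>
#include "vpx_dsp/x86/mem_sse2.h"

// Copy an 8x8 block of pixels, one 64-bit row per load/store.
static void copy_8x8(const uint8_t *src, ptrdiff_t src_stride,
                     uint8_t *dst, ptrdiff_t dst_stride) {
  __m128i rows[8];
  load_8bit_8x8(src, src_stride, rows);
  store_8bit_8x8(rows, dst, dst_stride);
}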
diff --git a/libvpx/vpx_dsp/x86/quantize_avx.c b/libvpx/vpx_dsp/x86/quantize_avx.c
new file mode 100644
index 000000000..6f4489004
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/quantize_avx.c
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_x86.h"
+
+void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan_ptr,
+ const int16_t *iscan_ptr) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m256i big_zero = _mm256_setzero_si256();
+ int index;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i eob = zero, eob0;
+
+ (void)scan_ptr;
+ (void)skip_block;
+ assert(!skip_block);
+
+ *eob_ptr = 0;
+
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_test_all_zeros(all_zero, all_zero)) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ if (n_coeffs == 16) return;
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_tran_low(coeff0, dqcoeff_ptr);
+ store_tran_low(coeff1, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0,
+ zero);
+ }
+
+ // AC only loop.
+ for (index = 16; index < n_coeffs; index += 16) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_test_all_zeros(all_zero, all_zero)) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ continue;
+ }
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_tran_low(coeff0, dqcoeff_ptr + index);
+ store_tran_low(coeff1, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
+ index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
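The early-out above hinges on _mm_test_all_zeros() (SSE4.1, available when compiling for AVX), which sets ZF when the AND of its two operands is zero; OR-ing the two compare masks first lets one test decide whether all 16 coefficients fell below zbin. A small model of the check (group_all_below_zbin is an illustrative name):

#include <smmintrin.h>  // SSE4.1: _mm_test_all_zeros

// Non-zero when no 16-bit lane of either zbin mask is set, i.e. every
// coefficient in this group of 16 failed the zbin test.
static int group_all_below_zbin(__m128i cmp_mask0, __m128i cmp_mask1) {
  const __m128i either = _mm_or_si128(cmp_mask0, cmp_mask1);
  return _mm_test_all_zeros(either, either);
}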
+
+void vpx_quantize_b_32x32_avx(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m256i big_zero = _mm256_setzero_si256();
+ int index;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i eob = zero, eob0;
+
+ (void)scan_ptr;
+ (void)n_coeffs;
+ (void)skip_block;
+ assert(!skip_block);
+
+ // Setup global values.
+ // The 32x32 case halves zbin and round.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ // Shift with rounding.
+ zbin = _mm_add_epi16(zbin, one);
+ zbin = _mm_srli_epi16(zbin, 1);
+ // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
+ // it is a strict "greater" comparison.
+ zbin = _mm_sub_epi16(zbin, one);
+
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ round = _mm_add_epi16(round, one);
+ round = _mm_srli_epi16(round, 1);
+
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+ shift = _mm_slli_epi16(shift, 1);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC.
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_test_all_zeros(all_zero, all_zero)) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ // Mask out zbin threshold coeffs.
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ // Un-sign to bias rounding like C.
+ // dequant is almost always negative, so this is probably the backwards way
+ // to handle the sign. However, it matches the previous assembly.
+ coeff0 = _mm_abs_epi16(qcoeff0);
+ coeff1 = _mm_abs_epi16(qcoeff1);
+
+ coeff0 = calculate_dqcoeff(coeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = calculate_dqcoeff(coeff1, dequant);
+
+ // "Divide" by 2.
+ coeff0 = _mm_srli_epi16(coeff0, 1);
+ coeff1 = _mm_srli_epi16(coeff1, 1);
+
+ coeff0 = _mm_sign_epi16(coeff0, qcoeff0);
+ coeff1 = _mm_sign_epi16(coeff1, qcoeff1);
+
+ store_tran_low(coeff0, dqcoeff_ptr);
+ store_tran_low(coeff1, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0,
+ zero);
+ }
+
+ // AC only loop.
+ for (index = 16; index < 32 * 32; index += 16) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_test_all_zeros(all_zero, all_zero)) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ continue;
+ }
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ coeff0 = _mm_abs_epi16(qcoeff0);
+ coeff1 = _mm_abs_epi16(qcoeff1);
+
+ coeff0 = calculate_dqcoeff(coeff0, dequant);
+ coeff1 = calculate_dqcoeff(coeff1, dequant);
+
+ coeff0 = _mm_srli_epi16(coeff0, 1);
+ coeff1 = _mm_srli_epi16(coeff1, 1);
+
+ coeff0 = _mm_sign_epi16(coeff0, qcoeff0);
+ coeff1 = _mm_sign_epi16(coeff1, qcoeff1);
+
+ store_tran_low(coeff0, dqcoeff_ptr + index);
+ store_tran_low(coeff1, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
+ index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
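For reference, here is the per-coefficient arithmetic both AVX functions vectorize, written out in scalar form. This is an illustrative sketch matching the intrinsics sequence above, not the authoritative C reference in vpx_dsp; for the 32x32 variant, zbin and round arrive halved, shift doubled, and the dequantized product is halved as shown in the code.

#include <stdint.h>
#include <stdlib.h>

// The SIMD code pre-subtracts 1 from zbin so the strict _mm_cmpgt_epi16
// behaves like this >= test.
static void quantize_coeff_sketch(int16_t coeff, int16_t zbin, int16_t round,
                                  int16_t quant, int16_t shift,
                                  int16_t dequant, int16_t *qcoeff,
                                  int16_t *dqcoeff) {
  const int abs_coeff = abs(coeff);
  *qcoeff = *dqcoeff = 0;
  if (abs_coeff >= zbin) {
    int tmp = abs_coeff + round;        // _mm_adds_epi16 (saturating add)
    tmp = ((tmp * quant) >> 16) + tmp;  // _mm_mulhi_epi16 plus add
    tmp = (tmp * shift) >> 16;          // second mulhi with quant_shift
    *qcoeff = (int16_t)(coeff < 0 ? -tmp : tmp);  // _mm_sign_epi16
    *dqcoeff = (int16_t)(*qcoeff * dequant);      // _mm_mullo_epi16
  }
}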
diff --git a/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm b/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm
deleted file mode 100644
index 01c41291b..000000000
--- a/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm
+++ /dev/null
@@ -1,544 +0,0 @@
-;
-; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
- shift, qcoeff, dqcoeff, dequant, \
- eob, scan, iscan
-
- vzeroupper
-
- ; If we can skip this block, then just zero the output
- cmp skipmp, 0
- jne .blank
-
-%ifnidn %1, b_32x32
-
- ; Special case for ncoeff == 16, as it is frequent and we can save by
- ; not setting up a loop.
- cmp ncoeffmp, 16
- jne .generic
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;; Special case of ncoeff == 16
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-.single:
-
- movifnidn coeffq, coeffmp
- movifnidn zbinq, zbinmp
- mova m0, [zbinq] ; m0 = zbin
-
- ; Get DC and first 15 AC coeffs - in this special case, that is all.
-%if CONFIG_VP9_HIGHBITDEPTH
- ; coeff stored as 32bit numbers but we process them as 16 bit numbers
- mova m9, [coeffq]
- packssdw m9, [coeffq+16] ; m9 = c[i]
- mova m10, [coeffq+32]
- packssdw m10, [coeffq+48] ; m10 = c[i]
-%else
- mova m9, [coeffq] ; m9 = c[i]
- mova m10, [coeffq+16] ; m10 = c[i]
-%endif
-
- mov r0, eobmp ; Output pointer
- mov r1, qcoeffmp ; Output pointer
- mov r2, dqcoeffmp ; Output pointer
-
- pxor m5, m5 ; m5 = dedicated zero
-
- pcmpeqw m4, m4 ; All word lanes -1
- paddw m0, m4 ; m0 = zbin - 1
-
- pabsw m6, m9 ; m6 = abs(m9)
- pabsw m11, m10 ; m11 = abs(m10)
- pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
- punpckhqdq m0, m0
- pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
-
- ; Check if all coeffs are less than zbin. If yes, we just write zeros
- ; to the outputs and we are done.
- por m14, m7, m12
- ptest m14, m14
- jnz .single_nonzero
-
-%if CONFIG_VP9_HIGHBITDEPTH
- mova [r1 ], ymm5
- mova [r1+32], ymm5
- mova [r2 ], ymm5
- mova [r2+32], ymm5
-%else
- mova [r1], ymm5
- mova [r2], ymm5
-%endif
- mov [r0], word 0
-
- vzeroupper
- RET
-
-.single_nonzero:
-
- ; Actual quantization of size 16 block - setup pointers, rounders, etc.
- movifnidn r4, roundmp
- movifnidn r5, quantmp
- mov r3, dequantmp
- mov r6, shiftmp
- mova m1, [r4] ; m1 = round
- mova m2, [r5] ; m2 = quant
- mova m3, [r3] ; m3 = dequant
- mova m4, [r6] ; m4 = shift
-
- mov r3, iscanmp
-
- DEFINE_ARGS eob, qcoeff, dqcoeff, iscan
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
- paddsw m6, m1 ; m6 += round
- punpckhqdq m1, m1
- paddsw m11, m1 ; m11 += round
- pmulhw m8, m6, m2 ; m8 = m6*q>>16
- punpckhqdq m2, m2
- pmulhw m13, m11, m2 ; m13 = m11*q>>16
- paddw m8, m6 ; m8 += m6
- paddw m13, m11 ; m13 += m11
- pmulhw m8, m4 ; m8 = m8*qsh>>16
- punpckhqdq m4, m4
- pmulhw m13, m4 ; m13 = m13*qsh>>16
- psignw m8, m9 ; m8 = reinsert sign
- psignw m13, m10 ; m13 = reinsert sign
- pand m8, m7
- pand m13, m12
-
-%if CONFIG_VP9_HIGHBITDEPTH
- ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
- pcmpgtw m6, m5, m8
- punpckhwd m6, m8, m6
- pmovsxwd m11, m8
- mova [qcoeffq ], m11
- mova [qcoeffq+16], m6
- pcmpgtw m6, m5, m13
- punpckhwd m6, m13, m6
- pmovsxwd m11, m13
- mova [qcoeffq+32], m11
- mova [qcoeffq+48], m6
-%else
- mova [qcoeffq ], m8
- mova [qcoeffq+16], m13
-%endif
-
- pmullw m8, m3 ; dqc[i] = qc[i] * q
- punpckhqdq m3, m3
- pmullw m13, m3 ; dqc[i] = qc[i] * q
-
-%if CONFIG_VP9_HIGHBITDEPTH
- ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
- pcmpgtw m6, m5, m8
- punpckhwd m6, m8, m6
- pmovsxwd m11, m8
- mova [dqcoeffq ], m11
- mova [dqcoeffq+16], m6
- pcmpgtw m6, m5, m13
- punpckhwd m6, m13, m6
- pmovsxwd m11, m13
- mova [dqcoeffq+32], m11
- mova [dqcoeffq+48], m6
-%else
- mova [dqcoeffq ], m8
- mova [dqcoeffq+16], m13
-%endif
-
- mova m6, [iscanq] ; m6 = scan[i]
- mova m11, [iscanq+16] ; m11 = scan[i]
-
- pcmpeqw m8, m8, m5 ; m8 = c[i] == 0
- pcmpeqw m13, m13, m5 ; m13 = c[i] == 0
- psubw m6, m6, m7 ; m6 = scan[i] + 1
- psubw m11, m11, m12 ; m11 = scan[i] + 1
- pandn m8, m8, m6 ; m8 = max(eob)
- pandn m13, m13, m11 ; m13 = max(eob)
- pmaxsw m8, m8, m13
-
- ; Horizontally accumulate/max eobs and write into [eob] memory pointer
- pshufd m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0x1
- pmaxsw m8, m7
- movq rax, m8
- mov [eobq], ax
-
- vzeroupper
- RET
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;; Generic case of ncoeff != 16
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-.generic:
-
-%endif ; %ifnidn %1, b_32x32
-
-DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
- qcoeff, dqcoeff, dequant, eob, scan, iscan
-
- ; Actual quantization loop - setup pointers, rounders, etc.
- movifnidn coeffq, coeffmp
- movifnidn ncoeffq, ncoeffmp
- mov r2, dequantmp
- movifnidn zbinq, zbinmp
- movifnidn roundq, roundmp
- movifnidn quantq, quantmp
- mova m0, [zbinq] ; m0 = zbin
- mova m1, [roundq] ; m1 = round
- mova m2, [quantq] ; m2 = quant
- mova m3, [r2] ; m3 = dequant
- pcmpeqw m4, m4 ; All lanes -1
-%ifidn %1, b_32x32
- psubw m0, m4
- psubw m1, m4
- psrlw m0, 1 ; m0 = (m0 + 1) / 2
- psrlw m1, 1 ; m1 = (m1 + 1) / 2
-%endif
- paddw m0, m4 ; m0 = m0 + 1
-
- mov r2, shiftmp
- mov r3, qcoeffmp
- mova m4, [r2] ; m4 = shift
- mov r4, dqcoeffmp
- mov r5, iscanmp
-%ifidn %1, b_32x32
- psllw m4, 1
-%endif
- pxor m5, m5 ; m5 = dedicated zero
-
- DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
-
-%if CONFIG_VP9_HIGHBITDEPTH
- lea coeffq, [ coeffq+ncoeffq*4]
- lea qcoeffq, [ qcoeffq+ncoeffq*4]
- lea dqcoeffq, [dqcoeffq+ncoeffq*4]
-%else
- lea coeffq, [ coeffq+ncoeffq*2]
- lea qcoeffq, [ qcoeffq+ncoeffq*2]
- lea dqcoeffq, [dqcoeffq+ncoeffq*2]
-%endif
- lea iscanq, [ iscanq+ncoeffq*2]
- neg ncoeffq
-
- ; get DC and first 15 AC coeffs
-%if CONFIG_VP9_HIGHBITDEPTH
- ; coeff stored as 32bit numbers but we require 16bit numbers
- mova m9, [coeffq+ncoeffq*4+ 0]
- packssdw m9, [coeffq+ncoeffq*4+16]
- mova m10, [coeffq+ncoeffq*4+32]
- packssdw m10, [coeffq+ncoeffq*4+48]
-%else
- mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
- mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
-%endif
-
- pabsw m6, m9 ; m6 = abs(m9)
- pabsw m11, m10 ; m11 = abs(m10)
- pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
- punpckhqdq m0, m0
- pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
-
- ; Check if all coeffs are less than zbin. If yes, skip forward quickly.
- por m14, m7, m12
- ptest m14, m14
- jnz .first_nonzero
-
-%if CONFIG_VP9_HIGHBITDEPTH
- mova [qcoeffq+ncoeffq*4 ], ymm5
- mova [qcoeffq+ncoeffq*4+32], ymm5
- mova [dqcoeffq+ncoeffq*4 ], ymm5
- mova [dqcoeffq+ncoeffq*4+32], ymm5
-%else
- mova [qcoeffq+ncoeffq*2], ymm5
- mova [dqcoeffq+ncoeffq*2], ymm5
-%endif
-
- add ncoeffq, mmsize
-
- punpckhqdq m1, m1
- punpckhqdq m2, m2
- punpckhqdq m3, m3
- punpckhqdq m4, m4
- pxor m8, m8
-
- jmp .ac_only_loop
-
-.first_nonzero:
-
- paddsw m6, m1 ; m6 += round
- punpckhqdq m1, m1
- paddsw m11, m1 ; m11 += round
- pmulhw m8, m6, m2 ; m8 = m6*q>>16
- punpckhqdq m2, m2
- pmulhw m13, m11, m2 ; m13 = m11*q>>16
- paddw m8, m6 ; m8 += m6
- paddw m13, m11 ; m13 += m11
- pmulhw m8, m4 ; m8 = m8*qsh>>16
- punpckhqdq m4, m4
- pmulhw m13, m4 ; m13 = m13*qsh>>16
- psignw m8, m9 ; m8 = reinsert sign
- psignw m13, m10 ; m13 = reinsert sign
- pand m8, m7
- pand m13, m12
-
-%if CONFIG_VP9_HIGHBITDEPTH
- ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
- pcmpgtw m6, m5, m8
- punpckhwd m6, m8, m6
- pmovsxwd m11, m8
- mova [qcoeffq+ncoeffq*4+ 0], m11
- mova [qcoeffq+ncoeffq*4+16], m6
- pcmpgtw m6, m5, m13
- punpckhwd m6, m13, m6
- pmovsxwd m11, m13
- mova [qcoeffq+ncoeffq*4+32], m11
- mova [qcoeffq+ncoeffq*4+48], m6
-%else
- mova [qcoeffq+ncoeffq*2+ 0], m8
- mova [qcoeffq+ncoeffq*2+16], m13
-%endif
-
-%ifidn %1, b_32x32
- pabsw m8, m8
- pabsw m13, m13
-%endif
- pmullw m8, m3 ; dqc[i] = qc[i] * q
- punpckhqdq m3, m3
- pmullw m13, m3 ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
- psrlw m8, 1
- psrlw m13, 1
- psignw m8, m9
- psignw m13, m10
-%endif
-
-%if CONFIG_VP9_HIGHBITDEPTH
- ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
- pcmpgtw m6, m5, m8
- punpckhwd m6, m8, m6
- pmovsxwd m11, m8
- mova [dqcoeffq+ncoeffq*4+ 0], m11
- mova [dqcoeffq+ncoeffq*4+16], m6
- pcmpgtw m6, m5, m13
- punpckhwd m6, m13, m6
- pmovsxwd m11, m13
- mova [dqcoeffq+ncoeffq*4+32], m11
- mova [dqcoeffq+ncoeffq*4+48], m6
-%else
- mova [dqcoeffq+ncoeffq*2+ 0], m8
- mova [dqcoeffq+ncoeffq*2+16], m13
-%endif
-
- pcmpeqw m8, m5 ; m8 = c[i] == 0
- pcmpeqw m13, m5 ; m13 = c[i] == 0
- mova m6, [iscanq+ncoeffq*2] ; m6 = scan[i]
- mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
- psubw m6, m7 ; m6 = scan[i] + 1
- psubw m11, m12 ; m11 = scan[i] + 1
- pandn m8, m6 ; m8 = max(eob)
- pandn m13, m11 ; m13 = max(eob)
- pmaxsw m8, m13
- add ncoeffq, mmsize
-
-.ac_only_loop:
-
-%if CONFIG_VP9_HIGHBITDEPTH
- ; pack coeff from 32bit to 16bit array
- mova m9, [coeffq+ncoeffq*4+ 0]
- packssdw m9, [coeffq+ncoeffq*4+16]
- mova m10, [coeffq+ncoeffq*4+32]
- packssdw m10, [coeffq+ncoeffq*4+48]
-%else
- mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
- mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
-%endif
-
- pabsw m6, m9 ; m6 = abs(m9)
- pabsw m11, m10 ; m11 = abs(m10)
- pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
- pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
-
- ; Check if all coeffs are less than zbin. If yes, skip this iteration
- ; and just write the zeros that would result.
- por m14, m7, m12
- ptest m14, m14
- jnz .rest_nonzero
-
-%if CONFIG_VP9_HIGHBITDEPTH
- mova [qcoeffq+ncoeffq*4+ 0], ymm5
- mova [qcoeffq+ncoeffq*4+32], ymm5
- mova [dqcoeffq+ncoeffq*4+ 0], ymm5
- mova [dqcoeffq+ncoeffq*4+32], ymm5
-%else
- mova [qcoeffq+ncoeffq*2+ 0], ymm5
- mova [dqcoeffq+ncoeffq*2+ 0], ymm5
-%endif
- add ncoeffq, mmsize
- jnz .ac_only_loop
-
- ; Horizontally accumulate/max eobs and write into [eob] memory pointer
- mov r2, eobmp
- pshufd m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0x1
- pmaxsw m8, m7
- movq rax, m8
- mov [r2], ax
- vzeroupper
- RET
-
-.rest_nonzero:
- paddsw m6, m1 ; m6 += round
- paddsw m11, m1 ; m11 += round
- pmulhw m14, m6, m2 ; m14 = m6*q>>16
- pmulhw m13, m11, m2 ; m13 = m11*q>>16
- paddw m14, m6 ; m14 += m6
- paddw m13, m11 ; m13 += m11
- pmulhw m14, m4 ; m14 = m14*qsh>>16
- pmulhw m13, m4 ; m13 = m13*qsh>>16
- psignw m14, m9 ; m14 = reinsert sign
- psignw m13, m10 ; m13 = reinsert sign
- pand m14, m7
- pand m13, m12
-
-%if CONFIG_VP9_HIGHBITDEPTH
- ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
- pcmpgtw m6, m5, m14
- punpckhwd m6, m14, m6
- pmovsxwd m11, m14
- mova [qcoeffq+ncoeffq*4+ 0], m11
- mova [qcoeffq+ncoeffq*4+16], m6
- pcmpgtw m6, m5, m13
- punpckhwd m6, m13, m6
- pmovsxwd m11, m13
- mova [qcoeffq+ncoeffq*4+32], m11
- mova [qcoeffq+ncoeffq*4+48], m6
-%else
- mova [qcoeffq+ncoeffq*2+ 0], m14
- mova [qcoeffq+ncoeffq*2+16], m13
-%endif
-
-%ifidn %1, b_32x32
- pabsw m14, m14
- pabsw m13, m13
-%endif
- pmullw m14, m3 ; dqc[i] = qc[i] * q
- pmullw m13, m3 ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
- psrlw m14, 1
- psrlw m13, 1
- psignw m14, m9
- psignw m13, m10
-%endif
-
-%if CONFIG_VP9_HIGHBITDEPTH
- ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
- pcmpgtw m6, m5, m14
- punpckhwd m6, m14, m6
- pmovsxwd m11, m14
- mova [dqcoeffq+ncoeffq*4+ 0], m11
- mova [dqcoeffq+ncoeffq*4+16], m6
- pcmpgtw m6, m5, m13
- punpckhwd m6, m13, m6
- pmovsxwd m11, m13
- mova [dqcoeffq+ncoeffq*4+32], m11
- mova [dqcoeffq+ncoeffq*4+48], m6
-%else
- mova [dqcoeffq+ncoeffq*2+ 0], m14
- mova [dqcoeffq+ncoeffq*2+16], m13
-%endif
-
- pcmpeqw m14, m5 ; m14 = c[i] == 0
- pcmpeqw m13, m5 ; m13 = c[i] == 0
- mova m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
- mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
- psubw m6, m7 ; m6 = scan[i] + 1
- psubw m11, m12 ; m11 = scan[i] + 1
- pandn m14, m6 ; m14 = max(eob)
- pandn m13, m11 ; m13 = max(eob)
- pmaxsw m8, m14
- pmaxsw m8, m13
- add ncoeffq, mmsize
- jnz .ac_only_loop
-
- ; Horizontally accumulate/max eobs and write into [eob] memory pointer
- mov r2, eobmp
- pshufd m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0x1
- pmaxsw m8, m7
- movq rax, m8
- mov [r2], ax
- vzeroupper
- RET
-
- ; Skip-block, i.e. just write all zeroes
-.blank:
-
-DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
- qcoeff, dqcoeff, dequant, eob, scan, iscan
-
- mov r0, dqcoeffmp
- movifnidn ncoeffq, ncoeffmp
- mov r2, qcoeffmp
- mov r3, eobmp
-
-DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
-
-%if CONFIG_VP9_HIGHBITDEPTH
- lea dqcoeffq, [dqcoeffq+ncoeffq*4]
- lea qcoeffq, [ qcoeffq+ncoeffq*4]
-%else
- lea dqcoeffq, [dqcoeffq+ncoeffq*2]
- lea qcoeffq, [ qcoeffq+ncoeffq*2]
-%endif
-
- neg ncoeffq
- pxor m7, m7
-
-.blank_loop:
-%if CONFIG_VP9_HIGHBITDEPTH
- mova [dqcoeffq+ncoeffq*4+ 0], ymm7
- mova [dqcoeffq+ncoeffq*4+32], ymm7
- mova [qcoeffq+ncoeffq*4+ 0], ymm7
- mova [qcoeffq+ncoeffq*4+32], ymm7
-%else
- mova [dqcoeffq+ncoeffq*2+ 0], ymm7
- mova [qcoeffq+ncoeffq*2+ 0], ymm7
-%endif
- add ncoeffq, mmsize
- jl .blank_loop
-
- mov [eobq], word 0
-
- vzeroupper
- RET
-%endmacro
-
-INIT_XMM avx
-QUANTIZE_FN b, 7
-QUANTIZE_FN b_32x32, 7
-
-END
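The pshufd/pshuflw/pmaxsw ladder the deleted assembly used to collapse eight packed eob candidates into one maximum survives in the C replacements as accumulate_eob() (quantize_x86.h). The same reduction in SSE2 intrinsics, mirroring the code removed from quantize_sse2.c below (accumulate_eob_sketch is an illustrative name):

#include <emmintrin.h>
#include <stdint.h>

static uint16_t accumulate_eob_sketch(__m128i eob) {
  __m128i s = _mm_shuffle_epi32(eob, 0xe);  // pshufd  m7, m8, 0xe
  eob = _mm_max_epi16(eob, s);
  s = _mm_shufflelo_epi16(eob, 0xe);        // pshuflw m7, m8, 0xe
  eob = _mm_max_epi16(eob, s);
  s = _mm_shufflelo_epi16(eob, 0x1);        // pshuflw m7, m8, 0x1
  eob = _mm_max_epi16(eob, s);
  return (uint16_t)_mm_extract_epi16(eob, 1);
}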
diff --git a/libvpx/vpx_dsp/x86/quantize_sse2.c b/libvpx/vpx_dsp/x86/quantize_sse2.c
index 32721beb3..c020b398c 100644
--- a/libvpx/vpx_dsp/x86/quantize_sse2.c
+++ b/libvpx/vpx_dsp/x86/quantize_sse2.c
@@ -8,12 +8,14 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
#include <emmintrin.h>
#include <xmmintrin.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_x86.h"
void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr,
@@ -22,202 +24,103 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan_ptr,
const int16_t *iscan_ptr) {
- __m128i zero;
+ const __m128i zero = _mm_setzero_si128();
+ int index = 16;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i eob, eob0;
+
(void)scan_ptr;
+ (void)skip_block;
+ assert(!skip_block);
+
+ // Setup global values.
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ // Poor man's abs().
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
- coeff_ptr += n_coeffs;
- iscan_ptr += n_coeffs;
- qcoeff_ptr += n_coeffs;
- dqcoeff_ptr += n_coeffs;
- n_coeffs = -n_coeffs;
- zero = _mm_setzero_si128();
- if (!skip_block) {
- __m128i eob;
- __m128i zbin;
- __m128i round, quant, dequant, shift;
- {
- __m128i coeff0, coeff1;
-
- // Setup global values
- {
- __m128i pw_1;
- zbin = _mm_load_si128((const __m128i *)zbin_ptr);
- round = _mm_load_si128((const __m128i *)round_ptr);
- quant = _mm_load_si128((const __m128i *)quant_ptr);
- pw_1 = _mm_set1_epi16(1);
- zbin = _mm_sub_epi16(zbin, pw_1);
- dequant = _mm_load_si128((const __m128i *)dequant_ptr);
- shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
- }
-
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
- __m128i cmp_mask0, cmp_mask1;
- // Do DC and first 15 AC
- coeff0 = load_tran_low(coeff_ptr + n_coeffs);
- coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
-
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
- zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
- cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- round = _mm_unpackhi_epi64(round, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- quant = _mm_unpackhi_epi64(quant, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
- qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
- qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
- qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
- shift = _mm_unpackhi_epi64(shift, shift);
- qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
-
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- // Mask out zbin threshold coeffs
- qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
- qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
-
- store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
- store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
-
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- dequant = _mm_unpackhi_epi64(dequant, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
- store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
- store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
- }
-
- {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob = _mm_max_epi16(eob, eob1);
- }
- n_coeffs += 8 * 2;
- }
-
- // AC only loop
- while (n_coeffs < 0) {
- __m128i coeff0, coeff1;
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
- __m128i cmp_mask0, cmp_mask1;
-
- coeff0 = load_tran_low(coeff_ptr + n_coeffs);
- coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
-
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
- cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
- qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
- qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
- qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
- qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
-
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- // Mask out zbin threshold coeffs
- qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
- qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
-
- store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
- store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
-
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
- store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
- store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
- }
-
- {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob0, eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob0 = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob0 = _mm_max_epi16(eob0, eob1);
- eob = _mm_max_epi16(eob, eob0);
- }
- n_coeffs += 8 * 2;
- }
-
- // Accumulate EOB
- {
- __m128i eob_shuffled;
- eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
- eob = _mm_max_epi16(eob, eob_shuffled);
- *eob_ptr = _mm_extract_epi16(eob, 1);
- }
- } else {
- do {
- store_tran_low(zero, dqcoeff_ptr + n_coeffs);
- store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8);
- store_tran_low(zero, qcoeff_ptr + n_coeffs);
- store_tran_low(zero, qcoeff_ptr + n_coeffs + 8);
- n_coeffs += 8 * 2;
- } while (n_coeffs < 0);
- *eob_ptr = 0;
+ store_tran_low(coeff0, dqcoeff_ptr);
+ store_tran_low(coeff1, dqcoeff_ptr + 8);
+
+ eob =
+ scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_tran_low(coeff0, dqcoeff_ptr + index);
+ store_tran_low(coeff1, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
+ index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+
+ index += 16;
}
+
+ *eob_ptr = accumulate_eob(eob);
}
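The "poor man's abs()" above is the classic sign-mask trick: an arithmetic right shift by 15 smears the sign bit into an all-ones or all-zeros mask s, (x ^ s) - s then equals abs(x), and repeating the xor/subtract with the saved mask restores the sign after quantization. A scalar model (abs_via_sign_mask is a hypothetical name):

#include <stdint.h>

static int16_t abs_via_sign_mask(int16_t x) {
  const int16_t s = x >> 15;      // _mm_srai_epi16(x, 15): 0 or -1
  return (int16_t)((x ^ s) - s);  // invert_sign_sse2(x, s)
}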
diff --git a/libvpx/vpx_dsp/x86/quantize_ssse3.c b/libvpx/vpx_dsp/x86/quantize_ssse3.c
new file mode 100644
index 000000000..3f528e1a9
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/quantize_ssse3.c
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_x86.h"
+
+void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ const __m128i zero = _mm_setzero_si128();
+ int index = 16;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i eob, eob0;
+
+ (void)scan_ptr;
+ (void)skip_block;
+ assert(!skip_block);
+
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_tran_low(coeff0, dqcoeff_ptr);
+ store_tran_low(coeff1, dqcoeff_ptr + 8);
+
+ eob =
+ scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_tran_low(coeff0, dqcoeff_ptr + index);
+ store_tran_low(coeff1, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
+ index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+
+ index += 16;
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
+
+void vpx_quantize_b_32x32_ssse3(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ int index;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i eob = zero, eob0;
+
+ (void)scan_ptr;
+ (void)n_coeffs;
+ (void)skip_block;
+ assert(!skip_block);
+
+  // Set up global values.
+  // The 32x32 version halves zbin and round.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ // Shift with rounding.
+ zbin = _mm_add_epi16(zbin, one);
+ zbin = _mm_srli_epi16(zbin, 1);
+ // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
+ // it is a strict "greater" comparison.
+ zbin = _mm_sub_epi16(zbin, one);
+
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ round = _mm_add_epi16(round, one);
+ round = _mm_srli_epi16(round, 1);
+
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+  // This is probably not technically OK: quant_shift can be up to 1 << 16,
+  // and shifting it up again can overflow that range. The tests are not
+  // comprehensive enough to catch this, and it has been this way "forever".
+ shift = _mm_slli_epi16(shift, 1);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC.
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ // Mask out zbin threshold coeffs.
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ // Un-sign to bias rounding like C.
+ // dequant is almost always negative, so this is probably the backwards way
+ // to handle the sign. However, it matches the previous assembly.
+ coeff0 = _mm_abs_epi16(qcoeff0);
+ coeff1 = _mm_abs_epi16(qcoeff1);
+
+ coeff0 = calculate_dqcoeff(coeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = calculate_dqcoeff(coeff1, dequant);
+
+ // "Divide" by 2.
+ coeff0 = _mm_srli_epi16(coeff0, 1);
+ coeff1 = _mm_srli_epi16(coeff1, 1);
+
+ coeff0 = _mm_sign_epi16(coeff0, qcoeff0);
+ coeff1 = _mm_sign_epi16(coeff1, qcoeff1);
+
+ store_tran_low(coeff0, dqcoeff_ptr);
+ store_tran_low(coeff1, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0,
+ zero);
+ }
+
+ // AC only loop.
+ for (index = 16; index < 32 * 32; index += 16) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ continue;
+ }
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ coeff0 = _mm_abs_epi16(qcoeff0);
+ coeff1 = _mm_abs_epi16(qcoeff1);
+
+ coeff0 = calculate_dqcoeff(coeff0, dequant);
+ coeff1 = calculate_dqcoeff(coeff1, dequant);
+
+ coeff0 = _mm_srli_epi16(coeff0, 1);
+ coeff1 = _mm_srli_epi16(coeff1, 1);
+
+ coeff0 = _mm_sign_epi16(coeff0, qcoeff0);
+ coeff1 = _mm_sign_epi16(coeff1, qcoeff1);
+
+ store_tran_low(coeff0, dqcoeff_ptr + index);
+ store_tran_low(coeff1, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
+ index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
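
The 32x32 variant's parameter setup above reduces to the following scalar sketch, again assuming no 16-bit overflow; ROUND_POWER_OF_TWO mirrors the libvpx macro of that name, and the function name is illustrative. The matching dequant step is sign(qcoeff) * ((abs(qcoeff) * dequant) >> 1), which is what the abs/mullo/srli/sign sequence computes.

#include <stdint.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

/* Halve zbin and round with rounding, pre-decrement zbin so a strict '>'
 * can stand in for '>=', and double quant_shift (which, as noted in the
 * comment above, can overflow 16 bits). */
static void adjust_b_32x32_params(int16_t *zbin, int16_t *round,
                                  int16_t *shift) {
  *zbin = (int16_t)(ROUND_POWER_OF_TWO(*zbin, 1) - 1);
  *round = (int16_t)ROUND_POWER_OF_TWO(*round, 1);
  *shift = (int16_t)(*shift << 1);
}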
diff --git a/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm b/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm
deleted file mode 100644
index ec2cafb94..000000000
--- a/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm
+++ /dev/null
@@ -1,345 +0,0 @@
-;
-; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_1: times 8 dw 1
-
-SECTION .text
-
-; TODO(yunqingwang)fix quantize_b code for skip=1 case.
-%macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
- shift, qcoeff, dqcoeff, dequant, \
- eob, scan, iscan
- cmp dword skipm, 0
- jne .blank
-
- ; actual quantize loop - setup pointers, rounders, etc.
- movifnidn coeffq, coeffmp
- movifnidn ncoeffq, ncoeffmp
- mov r2, dequantmp
- movifnidn zbinq, zbinmp
- movifnidn roundq, roundmp
- movifnidn quantq, quantmp
- mova m0, [zbinq] ; m0 = zbin
- mova m1, [roundq] ; m1 = round
- mova m2, [quantq] ; m2 = quant
-%ifidn %1, b_32x32
- pcmpeqw m5, m5
- psrlw m5, 15
- paddw m0, m5
- paddw m1, m5
- psrlw m0, 1 ; m0 = (m0 + 1) / 2
- psrlw m1, 1 ; m1 = (m1 + 1) / 2
-%endif
- mova m3, [r2q] ; m3 = dequant
- psubw m0, [pw_1]
- mov r2, shiftmp
- mov r3, qcoeffmp
- mova m4, [r2] ; m4 = shift
- mov r4, dqcoeffmp
- mov r5, iscanmp
-%ifidn %1, b_32x32
- psllw m4, 1
-%endif
- pxor m5, m5 ; m5 = dedicated zero
- DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
-%if CONFIG_VP9_HIGHBITDEPTH
- lea coeffq, [ coeffq+ncoeffq*4]
- lea qcoeffq, [ qcoeffq+ncoeffq*4]
- lea dqcoeffq, [dqcoeffq+ncoeffq*4]
-%else
- lea coeffq, [ coeffq+ncoeffq*2]
- lea qcoeffq, [ qcoeffq+ncoeffq*2]
- lea dqcoeffq, [dqcoeffq+ncoeffq*2]
-%endif
- lea iscanq, [ iscanq+ncoeffq*2]
- neg ncoeffq
-
- ; get DC and first 15 AC coeffs
-%if CONFIG_VP9_HIGHBITDEPTH
- ; coeff stored as 32bit numbers & require 16bit numbers
- mova m9, [ coeffq+ncoeffq*4+ 0]
- packssdw m9, [ coeffq+ncoeffq*4+16]
- mova m10, [ coeffq+ncoeffq*4+32]
- packssdw m10, [ coeffq+ncoeffq*4+48]
-%else
- mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
- mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
-%endif
- pabsw m6, m9 ; m6 = abs(m9)
- pabsw m11, m10 ; m11 = abs(m10)
- pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
- punpckhqdq m0, m0
- pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
- paddsw m6, m1 ; m6 += round
- punpckhqdq m1, m1
- paddsw m11, m1 ; m11 += round
- pmulhw m8, m6, m2 ; m8 = m6*q>>16
- punpckhqdq m2, m2
- pmulhw m13, m11, m2 ; m13 = m11*q>>16
- paddw m8, m6 ; m8 += m6
- paddw m13, m11 ; m13 += m11
- pmulhw m8, m4 ; m8 = m8*qsh>>16
- punpckhqdq m4, m4
- pmulhw m13, m4 ; m13 = m13*qsh>>16
- psignw m8, m9 ; m8 = reinsert sign
- psignw m13, m10 ; m13 = reinsert sign
- pand m8, m7
- pand m13, m12
-%if CONFIG_VP9_HIGHBITDEPTH
- ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
- mova m11, m8
- mova m6, m8
- pcmpgtw m5, m8
- punpcklwd m11, m5
- punpckhwd m6, m5
- mova [qcoeffq+ncoeffq*4+ 0], m11
- mova [qcoeffq+ncoeffq*4+16], m6
- pxor m5, m5
- mova m11, m13
- mova m6, m13
- pcmpgtw m5, m13
- punpcklwd m11, m5
- punpckhwd m6, m5
- mova [qcoeffq+ncoeffq*4+32], m11
- mova [qcoeffq+ncoeffq*4+48], m6
- pxor m5, m5 ; reset m5 to zero register
-%else
- mova [qcoeffq+ncoeffq*2+ 0], m8
- mova [qcoeffq+ncoeffq*2+16], m13
-%endif
-%ifidn %1, b_32x32
- pabsw m8, m8
- pabsw m13, m13
-%endif
- pmullw m8, m3 ; dqc[i] = qc[i] * q
- punpckhqdq m3, m3
- pmullw m13, m3 ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
- psrlw m8, 1
- psrlw m13, 1
- psignw m8, m9
- psignw m13, m10
-%endif
-%if CONFIG_VP9_HIGHBITDEPTH
- ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
- mova m11, m8
- mova m6, m8
- pcmpgtw m5, m8
- punpcklwd m11, m5
- punpckhwd m6, m5
- mova [dqcoeffq+ncoeffq*4+ 0], m11
- mova [dqcoeffq+ncoeffq*4+16], m6
- pxor m5, m5
- mova m11, m13
- mova m6, m13
- pcmpgtw m5, m13
- punpcklwd m11, m5
- punpckhwd m6, m5
- mova [dqcoeffq+ncoeffq*4+32], m11
- mova [dqcoeffq+ncoeffq*4+48], m6
- pxor m5, m5 ; reset m5 to zero register
-%else
- mova [dqcoeffq+ncoeffq*2+ 0], m8
- mova [dqcoeffq+ncoeffq*2+16], m13
-%endif
- pcmpeqw m8, m5 ; m8 = c[i] == 0
- pcmpeqw m13, m5 ; m13 = c[i] == 0
- mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
- mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
- psubw m6, m7 ; m6 = scan[i] + 1
- psubw m11, m12 ; m11 = scan[i] + 1
- pandn m8, m6 ; m8 = max(eob)
- pandn m13, m11 ; m13 = max(eob)
- pmaxsw m8, m13
- add ncoeffq, mmsize
- jz .accumulate_eob
-
-.ac_only_loop:
-%if CONFIG_VP9_HIGHBITDEPTH
- ; pack coeff from 32bit to 16bit array
- mova m9, [ coeffq+ncoeffq*4+ 0]
- packssdw m9, [ coeffq+ncoeffq*4+16]
- mova m10, [ coeffq+ncoeffq*4+32]
- packssdw m10, [ coeffq+ncoeffq*4+48]
-%else
- mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
- mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
-%endif
- pabsw m6, m9 ; m6 = abs(m9)
- pabsw m11, m10 ; m11 = abs(m10)
- pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
- pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
-%ifidn %1, b_32x32
- pmovmskb r6d, m7
- pmovmskb r2d, m12
- or r6, r2
- jz .skip_iter
-%endif
- paddsw m6, m1 ; m6 += round
- paddsw m11, m1 ; m11 += round
- pmulhw m14, m6, m2 ; m14 = m6*q>>16
- pmulhw m13, m11, m2 ; m13 = m11*q>>16
- paddw m14, m6 ; m14 += m6
- paddw m13, m11 ; m13 += m11
- pmulhw m14, m4 ; m14 = m14*qsh>>16
- pmulhw m13, m4 ; m13 = m13*qsh>>16
- psignw m14, m9 ; m14 = reinsert sign
- psignw m13, m10 ; m13 = reinsert sign
- pand m14, m7
- pand m13, m12
-%if CONFIG_VP9_HIGHBITDEPTH
- ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
- mova m11, m14
- mova m6, m14
- pcmpgtw m5, m14
- punpcklwd m11, m5
- punpckhwd m6, m5
- mova [qcoeffq+ncoeffq*4+ 0], m11
- mova [qcoeffq+ncoeffq*4+16], m6
- pxor m5, m5
- mova m11, m13
- mova m6, m13
- pcmpgtw m5, m13
- punpcklwd m11, m5
- punpckhwd m6, m5
- mova [qcoeffq+ncoeffq*4+32], m11
- mova [qcoeffq+ncoeffq*4+48], m6
- pxor m5, m5 ; reset m5 to zero register
-%else
- mova [qcoeffq+ncoeffq*2+ 0], m14
- mova [qcoeffq+ncoeffq*2+16], m13
-%endif
-%ifidn %1, b_32x32
- pabsw m14, m14
- pabsw m13, m13
-%endif
- pmullw m14, m3 ; dqc[i] = qc[i] * q
- pmullw m13, m3 ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
- psrlw m14, 1
- psrlw m13, 1
- psignw m14, m9
- psignw m13, m10
-%endif
-%if CONFIG_VP9_HIGHBITDEPTH
- ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
- mova m11, m14
- mova m6, m14
- pcmpgtw m5, m14
- punpcklwd m11, m5
- punpckhwd m6, m5
- mova [dqcoeffq+ncoeffq*4+ 0], m11
- mova [dqcoeffq+ncoeffq*4+16], m6
- pxor m5, m5
- mova m11, m13
- mova m6, m13
- pcmpgtw m5, m13
- punpcklwd m11, m5
- punpckhwd m6, m5
- mova [dqcoeffq+ncoeffq*4+32], m11
- mova [dqcoeffq+ncoeffq*4+48], m6
- pxor m5, m5
-%else
- mova [dqcoeffq+ncoeffq*2+ 0], m14
- mova [dqcoeffq+ncoeffq*2+16], m13
-%endif
- pcmpeqw m14, m5 ; m14 = c[i] == 0
- pcmpeqw m13, m5 ; m13 = c[i] == 0
- mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
- mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
- psubw m6, m7 ; m6 = scan[i] + 1
- psubw m11, m12 ; m11 = scan[i] + 1
- pandn m14, m6 ; m14 = max(eob)
- pandn m13, m11 ; m13 = max(eob)
- pmaxsw m8, m14
- pmaxsw m8, m13
- add ncoeffq, mmsize
- jl .ac_only_loop
-
-%ifidn %1, b_32x32
- jmp .accumulate_eob
-.skip_iter:
-%if CONFIG_VP9_HIGHBITDEPTH
- mova [qcoeffq+ncoeffq*4+ 0], m5
- mova [qcoeffq+ncoeffq*4+16], m5
- mova [qcoeffq+ncoeffq*4+32], m5
- mova [qcoeffq+ncoeffq*4+48], m5
- mova [dqcoeffq+ncoeffq*4+ 0], m5
- mova [dqcoeffq+ncoeffq*4+16], m5
- mova [dqcoeffq+ncoeffq*4+32], m5
- mova [dqcoeffq+ncoeffq*4+48], m5
-%else
- mova [qcoeffq+ncoeffq*2+ 0], m5
- mova [qcoeffq+ncoeffq*2+16], m5
- mova [dqcoeffq+ncoeffq*2+ 0], m5
- mova [dqcoeffq+ncoeffq*2+16], m5
-%endif
- add ncoeffq, mmsize
- jl .ac_only_loop
-%endif
-
-.accumulate_eob:
- ; horizontally accumulate/max eobs and write into [eob] memory pointer
- mov r2, eobmp
- pshufd m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0x1
- pmaxsw m8, m7
- pextrw r6, m8, 0
- mov [r2], r6
- RET
-
- ; skip-block, i.e. just write all zeroes
-.blank:
- mov r0, dqcoeffmp
- movifnidn ncoeffq, ncoeffmp
- mov r2, qcoeffmp
- mov r3, eobmp
- DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
-%if CONFIG_VP9_HIGHBITDEPTH
- lea dqcoeffq, [dqcoeffq+ncoeffq*4]
- lea qcoeffq, [ qcoeffq+ncoeffq*4]
-%else
- lea dqcoeffq, [dqcoeffq+ncoeffq*2]
- lea qcoeffq, [ qcoeffq+ncoeffq*2]
-%endif
- neg ncoeffq
- pxor m7, m7
-.blank_loop:
-%if CONFIG_VP9_HIGHBITDEPTH
- mova [dqcoeffq+ncoeffq*4+ 0], m7
- mova [dqcoeffq+ncoeffq*4+16], m7
- mova [dqcoeffq+ncoeffq*4+32], m7
- mova [dqcoeffq+ncoeffq*4+48], m7
- mova [qcoeffq+ncoeffq*4+ 0], m7
- mova [qcoeffq+ncoeffq*4+16], m7
- mova [qcoeffq+ncoeffq*4+32], m7
- mova [qcoeffq+ncoeffq*4+48], m7
-%else
- mova [dqcoeffq+ncoeffq*2+ 0], m7
- mova [dqcoeffq+ncoeffq*2+16], m7
- mova [qcoeffq+ncoeffq*2+ 0], m7
- mova [qcoeffq+ncoeffq*2+16], m7
-%endif
- add ncoeffq, mmsize
- jl .blank_loop
- mov word [eobq], 0
- RET
-%endmacro
-
-INIT_XMM ssse3
-QUANTIZE_FN b, 7
-QUANTIZE_FN b_32x32, 7
diff --git a/libvpx/vpx_dsp/x86/quantize_x86.h b/libvpx/vpx_dsp/x86/quantize_x86.h
new file mode 100644
index 000000000..34928fbb5
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/quantize_x86.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+
+static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
+ const int16_t *round_ptr, __m128i *round,
+ const int16_t *quant_ptr, __m128i *quant,
+ const int16_t *dequant_ptr, __m128i *dequant,
+ const int16_t *shift_ptr, __m128i *shift) {
+ *zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ *round = _mm_load_si128((const __m128i *)round_ptr);
+ *quant = _mm_load_si128((const __m128i *)quant_ptr);
+ *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1));
+ *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ *shift = _mm_load_si128((const __m128i *)shift_ptr);
+}
+
+// With SSSE3 and later, _mm_abs_epi16() and _mm_sign_epi16() are preferred.
+static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
+ a = _mm_xor_si128(a, sign);
+ return _mm_sub_epi16(a, sign);
+}
+
+static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round,
+ const __m128i quant, const __m128i shift) {
+ __m128i tmp, qcoeff;
+ qcoeff = _mm_adds_epi16(*coeff, round);
+ tmp = _mm_mulhi_epi16(qcoeff, quant);
+ qcoeff = _mm_add_epi16(tmp, qcoeff);
+ *coeff = _mm_mulhi_epi16(qcoeff, shift);
+}
+
+static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) {
+ return _mm_mullo_epi16(qcoeff, dequant);
+}
+
+// Scan 16 values for the eob using the indices in scan_ptr. The masks (-1)
+// from the zbin comparison are used to add 1 to each index in 'scan'.
+static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
+ const __m128i zbin_mask0,
+ const __m128i zbin_mask1,
+ const int16_t *scan_ptr, const int index,
+ const __m128i zero) {
+ const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero);
+ const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero);
+ __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index));
+ __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8));
+ __m128i eob0, eob1;
+ // Add one to convert from indices to counts
+ scan0 = _mm_sub_epi16(scan0, zbin_mask0);
+ scan1 = _mm_sub_epi16(scan1, zbin_mask1);
+ eob0 = _mm_andnot_si128(zero_coeff0, scan0);
+ eob1 = _mm_andnot_si128(zero_coeff1, scan1);
+ return _mm_max_epi16(eob0, eob1);
+}
+
+static INLINE int16_t accumulate_eob(__m128i eob) {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ return _mm_extract_epi16(eob, 1);
+}
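
A scalar equivalent of the scan_for_eob()/accumulate_eob() pair above, for reference; eob_scalar is an illustrative name, and the sketch assumes iscan holds 0-based scan positions as 16-bit values.

#include <stdint.h>

static int16_t eob_scalar(const int16_t *qcoeff, const int16_t *iscan,
                          int n) {
  int16_t eob = 0;
  int i;
  for (i = 0; i < n; i++) {
    if (qcoeff[i] != 0) {
      /* +1 converts a 0-based scan position to a count. */
      const int16_t candidate = (int16_t)(iscan[i] + 1);
      if (candidate > eob) eob = candidate;
    }
  }
  return eob;
}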
diff --git a/libvpx/vpx_dsp/x86/sad4d_avx512.c b/libvpx/vpx_dsp/x86/sad4d_avx512.c
new file mode 100644
index 000000000..5f2ab6ea7
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/sad4d_avx512.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h> // AVX512
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4]) {
+ __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
+ __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
+ __m512i sum_mlow, sum_mhigh;
+ int i;
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+ sum_ref0 = _mm512_set1_epi16(0);
+ sum_ref1 = _mm512_set1_epi16(0);
+ sum_ref2 = _mm512_set1_epi16(0);
+ sum_ref3 = _mm512_set1_epi16(0);
+ for (i = 0; i < 64; i++) {
+ // load src and all refs
+ src_reg = _mm512_loadu_si512((const __m512i *)src);
+ ref0_reg = _mm512_loadu_si512((const __m512i *)ref0);
+ ref1_reg = _mm512_loadu_si512((const __m512i *)ref1);
+ ref2_reg = _mm512_loadu_si512((const __m512i *)ref2);
+ ref3_reg = _mm512_loadu_si512((const __m512i *)ref3);
+ // sum of the absolute differences between every ref-i to src
+ ref0_reg = _mm512_sad_epu8(ref0_reg, src_reg);
+ ref1_reg = _mm512_sad_epu8(ref1_reg, src_reg);
+ ref2_reg = _mm512_sad_epu8(ref2_reg, src_reg);
+ ref3_reg = _mm512_sad_epu8(ref3_reg, src_reg);
+ // sum every ref-i
+ sum_ref0 = _mm512_add_epi32(sum_ref0, ref0_reg);
+ sum_ref1 = _mm512_add_epi32(sum_ref1, ref1_reg);
+ sum_ref2 = _mm512_add_epi32(sum_ref2, ref2_reg);
+ sum_ref3 = _mm512_add_epi32(sum_ref3, ref3_reg);
+
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+ {
+ __m256i sum256;
+ __m128i sum128;
+    // In each sum_ref[i] the result is saved in the first 4 bytes of every
+    // 8-byte element; the other 4 bytes are zeroed.
+    // Shift sum_ref1 and sum_ref3 left by 4 bytes so they can be merged.
+ sum_ref1 = _mm512_bslli_epi128(sum_ref1, 4);
+ sum_ref3 = _mm512_bslli_epi128(sum_ref3, 4);
+
+ // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
+ sum_ref0 = _mm512_or_si512(sum_ref0, sum_ref1);
+ sum_ref2 = _mm512_or_si512(sum_ref2, sum_ref3);
+
+ // merge every 64 bit from each sum_ref-i
+ sum_mlow = _mm512_unpacklo_epi64(sum_ref0, sum_ref2);
+ sum_mhigh = _mm512_unpackhi_epi64(sum_ref0, sum_ref2);
+
+ // add the low 64 bit to the high 64 bit
+ sum_mlow = _mm512_add_epi32(sum_mlow, sum_mhigh);
+
+ // add the low 128 bit to the high 128 bit
+ sum256 = _mm256_add_epi32(_mm512_castsi512_si256(sum_mlow),
+ _mm512_extracti32x8_epi32(sum_mlow, 1));
+ sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum256),
+ _mm256_extractf128_si256(sum256, 1));
+
+ _mm_storeu_si128((__m128i *)(res), sum128);
+ }
+}
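
For comparison, a scalar sketch of the same four-reference SAD; the AVX-512 kernel above computes all four sums in a single pass over the source block. The function name is illustrative.

#include <stdint.h>
#include <stdlib.h>

static void sad64x64x4d_scalar(const uint8_t *src, int src_stride,
                               const uint8_t *const ref[4], int ref_stride,
                               uint32_t res[4]) {
  int r, i, j;
  for (r = 0; r < 4; ++r) {
    const uint8_t *s = src;
    const uint8_t *p = ref[r];
    uint32_t sad = 0;
    for (i = 0; i < 64; ++i) {
      /* accumulate absolute differences over one 64-pixel row */
      for (j = 0; j < 64; ++j) sad += (uint32_t)abs(s[j] - p[j]);
      s += src_stride;
      p += ref_stride;
    }
    res[r] = sad;
  }
}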
diff --git a/libvpx/vpx_dsp/x86/sad_sse3.asm b/libvpx/vpx_dsp/x86/sad_sse3.asm
index 18279bdb9..175dcc089 100644
--- a/libvpx/vpx_dsp/x86/sad_sse3.asm
+++ b/libvpx/vpx_dsp/x86/sad_sse3.asm
@@ -165,6 +165,8 @@
paddw mm7, mm3
%endmacro
+SECTION .text
+
;void int vpx_sad16x16x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
diff --git a/libvpx/vpx_dsp/x86/sad_sse4.asm b/libvpx/vpx_dsp/x86/sad_sse4.asm
index bc6744797..03999dfca 100644
--- a/libvpx/vpx_dsp/x86/sad_sse4.asm
+++ b/libvpx/vpx_dsp/x86/sad_sse4.asm
@@ -165,6 +165,8 @@
movdqa [rdi + 16], xmm2
%endmacro
+SECTION .text
+
;void vpx_sad16x16x8_sse4_1(
; const unsigned char *src_ptr,
; int src_stride,
diff --git a/libvpx/vpx_dsp/x86/sad_ssse3.asm b/libvpx/vpx_dsp/x86/sad_ssse3.asm
index 49f204fa0..7cf93cf51 100644
--- a/libvpx/vpx_dsp/x86/sad_ssse3.asm
+++ b/libvpx/vpx_dsp/x86/sad_ssse3.asm
@@ -146,6 +146,8 @@
%endmacro
+SECTION .text
+
;void int vpx_sad16x16x3_ssse3(
; unsigned char *src_ptr,
; int src_stride,
diff --git a/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm b/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm
index 6d58321e0..300fa8aab 100644
--- a/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm
+++ b/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm
@@ -44,6 +44,9 @@
paddd %1, xmm1
SUM_ACROSS_Q %1
%endmacro
+
+SECTION .text
+
;void ssim_parms_sse2(
; unsigned char *s,
; int sp,
diff --git a/libvpx/vpx_dsp/x86/transpose_sse2.h b/libvpx/vpx_dsp/x86/transpose_sse2.h
index a5e40245a..8a0119ca7 100644
--- a/libvpx/vpx_dsp/x86/transpose_sse2.h
+++ b/libvpx/vpx_dsp/x86/transpose_sse2.h
@@ -11,45 +11,357 @@
#ifndef VPX_DSP_X86_TRANSPOSE_SSE2_H_
#define VPX_DSP_X86_TRANSPOSE_SSE2_H_
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/x86/inv_txfm_sse2.h"
-#include "vpx_dsp/x86/txfm_common_sse2.h"
+#include <emmintrin.h> // SSE2
-static INLINE void transpose_16bit_4x4(__m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
- const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+#include "./vpx_config.h"
- res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
- res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
+static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+
+ // Unpack 32 bit elements resulting in:
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ return _mm_unpacklo_epi16(a0, a1);
+}
+
+static INLINE void transpose_8bit_8x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 8 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
+
+ // Unpack 16 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+ const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
+ const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
+ const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+
+ // Unpack 32 bit elements resulting in:
+ // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
+ const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
+ const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
+ const __m128i c3 = _mm_unpackhi_epi32(b1, b3);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ // out[4]: 04 14 24 34 44 54 64 74
+ // out[5]: 05 15 25 35 45 55 65 75
+ // out[6]: 06 16 26 36 46 56 66 76
+ // out[7]: 07 17 27 37 47 57 67 77
+ out[0] = _mm_unpacklo_epi64(c0, c0);
+ out[1] = _mm_unpackhi_epi64(c0, c0);
+ out[2] = _mm_unpacklo_epi64(c1, c1);
+ out[3] = _mm_unpackhi_epi64(c1, c1);
+ out[4] = _mm_unpacklo_epi64(c2, c2);
+ out[5] = _mm_unpackhi_epi64(c2, c2);
+ out[6] = _mm_unpacklo_epi64(c3, c3);
+ out[7] = _mm_unpackhi_epi64(c3, c3);
+}
+
+static INLINE void transpose_16bit_4x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 XX XX XX XX
+ // in[1]: 10 11 12 13 XX XX XX XX
+ // in[2]: 20 21 22 23 XX XX XX XX
+ // in[3]: 30 31 32 33 XX XX XX XX
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+
+ // Unpack 32 bit elements resulting in:
+ // out[0]: 00 10 20 30 01 11 21 31
+ // out[1]: 02 12 22 32 03 13 23 33
+ out[0] = _mm_unpacklo_epi32(a0, a1);
+ out[1] = _mm_unpackhi_epi32(a0, a1);
+}
+
+static INLINE void transpose_16bit_4x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 XX XX XX XX
+ // in[1]: 10 11 12 13 XX XX XX XX
+ // in[2]: 20 21 22 23 XX XX XX XX
+ // in[3]: 30 31 32 33 XX XX XX XX
+ // in[4]: 40 41 42 43 XX XX XX XX
+ // in[5]: 50 51 52 53 XX XX XX XX
+ // in[6]: 60 61 62 63 XX XX XX XX
+ // in[7]: 70 71 72 73 XX XX XX XX
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 02 12 22 32 03 13 23 33
+ // b3: 42 52 62 72 43 53 63 73
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b2, b3);
+ out[3] = _mm_unpackhi_epi64(b2, b3);
+}
+
+static INLINE void transpose_16bit_8x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ // a4: 04 14 05 15 06 16 07 17
+ // a5: 24 34 25 35 26 36 27 37
+ // a6: 44 54 45 55 46 56 47 57
+ // a7: 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 04 14 24 34 05 15 25 35
+ // b3: 44 54 64 74 45 55 65 75
+ // b4: 02 12 22 32 03 13 23 33
+ // b5: 42 52 62 72 43 53 63 73
+ // b6: 06 16 26 36 07 17 27 37
+ // b7: 46 56 66 76 47 57 67 77
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+ const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
+ const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
+ const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+ const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ // out[4]: 04 14 24 34 44 54 64 74
+ // out[5]: 05 15 25 35 45 55 65 75
+ // out[6]: 06 16 26 36 46 56 66 76
+ // out[7]: 07 17 27 37 47 57 67 77
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b4, b5);
+ out[3] = _mm_unpackhi_epi64(b4, b5);
+ out[4] = _mm_unpacklo_epi64(b2, b3);
+ out[5] = _mm_unpackhi_epi64(b2, b3);
+ out[6] = _mm_unpacklo_epi64(b6, b7);
+ out[7] = _mm_unpackhi_epi64(b6, b7);
+}
+
+// Transpose in-place
+static INLINE void transpose_16bit_16x16(__m128i *const left,
+ __m128i *const right) {
+ __m128i tbuf[8];
+ transpose_16bit_8x8(left, left);
+ transpose_16bit_8x8(right, tbuf);
+ transpose_16bit_8x8(left + 8, right);
+ transpose_16bit_8x8(right + 8, right + 8);
+
+ left[8] = tbuf[0];
+ left[9] = tbuf[1];
+ left[10] = tbuf[2];
+ left[11] = tbuf[3];
+ left[12] = tbuf[4];
+ left[13] = tbuf[5];
+ left[14] = tbuf[6];
+ left[15] = tbuf[7];
+}
+
+static INLINE void transpose_32bit_4x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 32 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // to:
+ // a0: 00 10 01 11
+ // a1: 20 30 21 31
+ // a2: 02 12 03 13
+ // a3: 22 32 23 33
+
+ const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+ const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+ const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ out[0] = _mm_unpacklo_epi64(a0, a1);
+ out[1] = _mm_unpackhi_epi64(a0, a1);
+ out[2] = _mm_unpacklo_epi64(a2, a3);
+ out[3] = _mm_unpackhi_epi64(a2, a3);
}
-static INLINE void transpose_32bit_4x4(__m128i *const a0, __m128i *const a1,
- __m128i *const a2, __m128i *const a3) {
+static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
+ __m128i *const out) {
// Unpack 32 bit elements. Goes from:
- // a0: 00 01 02 03
- // a1: 10 11 12 13
- // a2: 20 21 22 23
- // a3: 30 31 32 33
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // in[4]: 04 05 06 07
+ // in[5]: 14 15 16 17
+ // in[6]: 24 25 26 27
+ // in[7]: 34 35 36 37
// to:
- // b0: 00 10 01 11
- // b1: 20 30 21 31
- // b2: 02 12 03 13
- // b3: 22 32 23 33
+ // a0: 00 10 01 11
+ // a1: 20 30 21 31
+ // a2: 02 12 03 13
+ // a3: 22 32 23 33
+ // a4: 04 14 05 15
+ // a5: 24 34 25 35
+ // a6: 06 16 07 17
+ // a7: 26 36 27 37
+ const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+ const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+ const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+ const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
+ const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
+ const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
+ const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);
- const __m128i b0 = _mm_unpacklo_epi32(*a0, *a1);
- const __m128i b1 = _mm_unpacklo_epi32(*a2, *a3);
- const __m128i b2 = _mm_unpackhi_epi32(*a0, *a1);
- const __m128i b3 = _mm_unpackhi_epi32(*a2, *a3);
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ // out[4]: 04 14 24 34
+ // out[5]: 05 15 25 35
+ // out[6]: 06 16 26 36
+ // out[7]: 07 17 27 37
+ out[0] = _mm_unpacklo_epi64(a0, a1);
+ out[1] = _mm_unpackhi_epi64(a0, a1);
+ out[2] = _mm_unpacklo_epi64(a2, a3);
+ out[3] = _mm_unpackhi_epi64(a2, a3);
+ out[4] = _mm_unpacklo_epi64(a4, a5);
+ out[5] = _mm_unpackhi_epi64(a4, a5);
+ out[6] = _mm_unpacklo_epi64(a6, a7);
+ out[7] = _mm_unpackhi_epi64(a6, a7);
+}
+
+static INLINE void transpose_32bit_8x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 32 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 04 05 06 07
+ // in[2]: 10 11 12 13
+ // in[3]: 14 15 16 17
+ // in[4]: 20 21 22 23
+ // in[5]: 24 25 26 27
+ // in[6]: 30 31 32 33
+ // in[7]: 34 35 36 37
+ // to:
+ // a0: 00 10 01 11
+ // a1: 20 30 21 31
+ // a2: 02 12 03 13
+ // a3: 22 32 23 33
+ // a4: 04 14 05 15
+ // a5: 24 34 25 35
+ // a6: 06 16 07 17
+ // a7: 26 36 27 37
+ const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
+ const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
+ const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
+ const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
+ const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
+ const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
+ const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
+ const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);
// Unpack 64 bit elements resulting in:
- // a0: 00 10 20 30
- // a1: 01 11 21 31
- // a2: 02 12 22 32
- // a3: 03 13 23 33
- *a0 = _mm_unpacklo_epi64(b0, b1);
- *a1 = _mm_unpackhi_epi64(b0, b1);
- *a2 = _mm_unpacklo_epi64(b2, b3);
- *a3 = _mm_unpackhi_epi64(b2, b3);
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ // out[4]: 04 14 24 34
+ // out[5]: 05 15 25 35
+ // out[6]: 06 16 26 36
+ // out[7]: 07 17 27 37
+ out[0] = _mm_unpacklo_epi64(a0, a1);
+ out[1] = _mm_unpackhi_epi64(a0, a1);
+ out[2] = _mm_unpacklo_epi64(a2, a3);
+ out[3] = _mm_unpackhi_epi64(a2, a3);
+ out[4] = _mm_unpacklo_epi64(a4, a5);
+ out[5] = _mm_unpackhi_epi64(a4, a5);
+ out[6] = _mm_unpacklo_epi64(a6, a7);
+ out[7] = _mm_unpackhi_epi64(a6, a7);
}
#endif // VPX_DSP_X86_TRANSPOSE_SSE2_H_
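
A minimal usage sketch for the 16-bit 8x8 transpose above, assuming 16-byte-aligned row-major storage and that this header is included for transpose_16bit_8x8(); the wrapper name is illustrative.

#include <emmintrin.h>
#include <stdint.h>

static void transpose_block_16bit_8x8(int16_t *block /* 8x8, aligned */) {
  __m128i in[8], out[8];
  int i;
  for (i = 0; i < 8; ++i) {
    in[i] = _mm_load_si128((const __m128i *)(block + 8 * i));
  }
  transpose_16bit_8x8(in, out); /* rows become columns */
  for (i = 0; i < 8; ++i) {
    _mm_store_si128((__m128i *)(block + 8 * i), out[i]);
  }
}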
diff --git a/libvpx/vpx_dsp/x86/txfm_common_sse2.h b/libvpx/vpx_dsp/x86/txfm_common_sse2.h
index f8edb1b78..0a9542c85 100644
--- a/libvpx/vpx_dsp/x86/txfm_common_sse2.h
+++ b/libvpx/vpx_dsp/x86/txfm_common_sse2.h
@@ -18,6 +18,9 @@
_mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
(int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
+#define pair_set_epi32(a, b) \
+ _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))
+
#define dual_set_epi16(a, b) \
_mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
(int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
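
A small sketch of what the new pair_set_epi32 macro produces, with the macro restated locally so the example stands alone; the values are arbitrary.

#include <emmintrin.h>

/* Restated locally; matches the macro added above. */
#define pair_set_epi32(a, b) \
  _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))

static __m128i pair_set_epi32_demo(void) {
  /* Lanes from low to high are a, b, a, b: here { 7, 9, 7, 9 }. */
  return pair_set_epi32(7, 9);
}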
diff --git a/libvpx/vpx_dsp/x86/variance_avx2.c b/libvpx/vpx_dsp/x86/variance_avx2.c
index 8428e0520..d15a89c74 100644
--- a/libvpx/vpx_dsp/x86/variance_avx2.c
+++ b/libvpx/vpx_dsp/x86/variance_avx2.c
@@ -7,16 +7,592 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
+
+#include <immintrin.h> // AVX2
+
#include "./vpx_dsp_rtcd.h"
+/* clang-format off */
+DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+};
+
+DECLARE_ALIGNED(32, static const int8_t, adjacent_sub_avx2[32]) = {
+ 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1,
+ 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1
+};
+/* clang-format on */
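
Each 32-byte row of bilinear_filters_avx2 repeats the byte pair (16 - 2*o, 2*o) for offset o, so _mm256_maddubs_epi16 combined with the FILTER_SRC rounding defined further down is equivalent to the scalar tap sketched here; bilinear_tap is an illustrative name.

#include <stdint.h>

static uint8_t bilinear_tap(uint8_t a, uint8_t b, int offset /* 0..7 */) {
  const int f0 = 16 - 2 * offset; /* first byte of the table row */
  const int f1 = 2 * offset;      /* second byte of the table row */
  return (uint8_t)((a * f0 + b * f1 + 8) >> 4); /* add 8, divide by 16 */
}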
+
+void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
+ const unsigned char *ref_ptr, int recon_stride,
+ unsigned int *sse, int *sum) {
+ unsigned int i, src_2strides, ref_2strides;
+ __m256i sum_reg = _mm256_setzero_si256();
+ __m256i sse_reg = _mm256_setzero_si256();
+  // process two 16-byte rows per iteration in a 256-bit register
+ src_2strides = source_stride << 1;
+ ref_2strides = recon_stride << 1;
+ for (i = 0; i < 8; ++i) {
+    // zero-extend 8-bit values from 128-bit loads to 16 bits across both lanes
+ const __m256i src0 =
+ _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const *)(src_ptr)));
+ const __m256i src1 = _mm256_cvtepu8_epi16(
+ _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)));
+ const __m256i ref0 =
+ _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const *)(ref_ptr)));
+ const __m256i ref1 = _mm256_cvtepu8_epi16(
+ _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)));
+ const __m256i diff0 = _mm256_sub_epi16(src0, ref0);
+ const __m256i diff1 = _mm256_sub_epi16(src1, ref1);
+ const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
+ const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);
+
+ // add to the running totals
+ sum_reg = _mm256_add_epi16(sum_reg, _mm256_add_epi16(diff0, diff1));
+ sse_reg = _mm256_add_epi32(sse_reg, _mm256_add_epi32(madd0, madd1));
+
+ src_ptr += src_2strides;
+ ref_ptr += ref_2strides;
+ }
+ {
+ // extract the low lane and add it to the high lane
+ const __m128i sum_reg_128 = _mm_add_epi16(
+ _mm256_castsi256_si128(sum_reg), _mm256_extractf128_si256(sum_reg, 1));
+ const __m128i sse_reg_128 = _mm_add_epi32(
+ _mm256_castsi256_si128(sse_reg), _mm256_extractf128_si256(sse_reg, 1));
+
+ // sum upper and lower 64 bits together and convert up to 32 bit values
+ const __m128i sum_reg_64 =
+ _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8));
+ const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64);
+
+ // unpack sse and sum registers and add
+ const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, sum_int32);
+ const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, sum_int32);
+ const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
+
+ // perform the final summation and extract the results
+ const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
+ *((int *)sse) = _mm_cvtsi128_si32(res);
+ *((int *)sum) = _mm_extract_epi32(res, 1);
+ }
+}
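+
For reference, the scalar form of the sum/SSE accumulation that vpx_get16x16var_avx2 vectorizes; this sketch mirrors the generic C variance helpers, and the name is illustrative.

#include <stdint.h>

static void get16x16var_scalar(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j) {
      const int d = src[j] - ref[j]; /* signed difference */
      *sum += d;
      *sse += (unsigned int)(d * d);
    }
    src += src_stride;
    ref += ref_stride;
  }
}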
+
+static void get32x16var_avx2(const unsigned char *src_ptr, int source_stride,
+ const unsigned char *ref_ptr, int recon_stride,
+ unsigned int *sse, int *sum) {
+ unsigned int i, src_2strides, ref_2strides;
+ const __m256i adj_sub = _mm256_load_si256((__m256i const *)adjacent_sub_avx2);
+ __m256i sum_reg = _mm256_setzero_si256();
+ __m256i sse_reg = _mm256_setzero_si256();
+
+  // process 64 elements per iteration
+ src_2strides = source_stride << 1;
+ ref_2strides = recon_stride << 1;
+ for (i = 0; i < 8; i++) {
+ const __m256i src0 = _mm256_loadu_si256((__m256i const *)(src_ptr));
+ const __m256i src1 =
+ _mm256_loadu_si256((__m256i const *)(src_ptr + source_stride));
+ const __m256i ref0 = _mm256_loadu_si256((__m256i const *)(ref_ptr));
+ const __m256i ref1 =
+ _mm256_loadu_si256((__m256i const *)(ref_ptr + recon_stride));
+
+ // unpack into pairs of source and reference values
+ const __m256i src_ref0 = _mm256_unpacklo_epi8(src0, ref0);
+ const __m256i src_ref1 = _mm256_unpackhi_epi8(src0, ref0);
+ const __m256i src_ref2 = _mm256_unpacklo_epi8(src1, ref1);
+ const __m256i src_ref3 = _mm256_unpackhi_epi8(src1, ref1);
+
+ // subtract adjacent elements using src*1 + ref*-1
+ const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub);
+ const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub);
+ const __m256i diff2 = _mm256_maddubs_epi16(src_ref2, adj_sub);
+ const __m256i diff3 = _mm256_maddubs_epi16(src_ref3, adj_sub);
+ const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
+ const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);
+ const __m256i madd2 = _mm256_madd_epi16(diff2, diff2);
+ const __m256i madd3 = _mm256_madd_epi16(diff3, diff3);
+
+ // add to the running totals
+ sum_reg = _mm256_add_epi16(sum_reg, _mm256_add_epi16(diff0, diff1));
+ sum_reg = _mm256_add_epi16(sum_reg, _mm256_add_epi16(diff2, diff3));
+ sse_reg = _mm256_add_epi32(sse_reg, _mm256_add_epi32(madd0, madd1));
+ sse_reg = _mm256_add_epi32(sse_reg, _mm256_add_epi32(madd2, madd3));
+
+ src_ptr += src_2strides;
+ ref_ptr += ref_2strides;
+ }
+
+ {
+ // extract the low lane and add it to the high lane
+ const __m128i sum_reg_128 = _mm_add_epi16(
+ _mm256_castsi256_si128(sum_reg), _mm256_extractf128_si256(sum_reg, 1));
+ const __m128i sse_reg_128 = _mm_add_epi32(
+ _mm256_castsi256_si128(sse_reg), _mm256_extractf128_si256(sse_reg, 1));
+
+ // sum upper and lower 64 bits together and convert up to 32 bit values
+ const __m128i sum_reg_64 =
+ _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8));
+ const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64);
+
+ // unpack sse and sum registers and add
+ const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, sum_int32);
+ const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, sum_int32);
+ const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
+
+ // perform the final summation and extract the results
+ const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
+ *((int *)sse) = _mm_cvtsi128_si32(res);
+ *((int *)sum) = _mm_extract_epi32(res, 1);
+ }
+}
+
+#define FILTER_SRC(filter) \
+ /* filter the source */ \
+ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
+ exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
+ \
+ /* add 8 to source */ \
+ exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
+ exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
+ \
+ /* divide source by 16 */ \
+ exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
+ exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+#define CALC_SUM_SSE_INSIDE_LOOP \
+ /* expand each byte to 2 bytes */ \
+ exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
+ exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
+ /* source - dest */ \
+ exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
+ exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
+  /* calculate sum */ \
+ *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_lo); \
+ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
+ *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_hi); \
+ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
+ /* calculate sse */ \
+ *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_lo); \
+ *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_hi);
+
+// final reduction of sum and sse to scalars
+#define CALC_SUM_AND_SSE \
+ res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
+ sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
+ sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
+ \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
+ \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+
+static INLINE void spv32_x0_y0(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *sec, int sec_stride, int do_sec,
+ int height, __m256i *sum_reg, __m256i *sse_reg) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ int i;
+ for (i = 0; i < height; i++) {
+ const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+ const __m256i src_reg = _mm256_loadu_si256((__m256i const *)src);
+ if (do_sec) {
+ const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec);
+ const __m256i avg_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+ sec += sec_stride;
+ } else {
+ exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
+ }
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+// (x == 0, y == 4) or (x == 4, y == 0). sstep determines the direction.
+static INLINE void spv32_half_zero(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *sec, int sec_stride,
+ int do_sec, int height, __m256i *sum_reg,
+ __m256i *sse_reg, int sstep) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ int i;
+ for (i = 0; i < height; i++) {
+ const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+ const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep));
+ const __m256i src_avg = _mm256_avg_epu8(src_0, src_1);
+ if (do_sec) {
+ const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec);
+ const __m256i avg_reg = _mm256_avg_epu8(src_avg, sec_reg);
+ exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+ sec += sec_stride;
+ } else {
+ exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg);
+ }
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static INLINE void spv32_x0_y4(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *sec, int sec_stride, int do_sec,
+ int height, __m256i *sum_reg, __m256i *sse_reg) {
+ spv32_half_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec,
+ height, sum_reg, sse_reg, src_stride);
+}
+
+static INLINE void spv32_x4_y0(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *sec, int sec_stride, int do_sec,
+ int height, __m256i *sum_reg, __m256i *sse_reg) {
+ spv32_half_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec,
+ height, sum_reg, sse_reg, 1);
+}
+
+static INLINE void spv32_x4_y4(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *sec, int sec_stride, int do_sec,
+ int height, __m256i *sum_reg, __m256i *sse_reg) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ const __m256i src_a = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1));
+ __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b);
+ __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ int i;
+ src += src_stride;
+ for (i = 0; i < height; i++) {
+ const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+ const __m256i src_0 = _mm256_loadu_si256((__m256i const *)(src));
+ const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1));
+ const __m256i src_avg = _mm256_avg_epu8(src_0, src_1);
+ const __m256i current_avg = _mm256_avg_epu8(prev_src_avg, src_avg);
+ prev_src_avg = src_avg;
+
+ if (do_sec) {
+ const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec);
+ const __m256i avg_reg = _mm256_avg_epu8(current_avg, sec_reg);
+ exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+ sec += sec_stride;
+ } else {
+ exp_src_lo = _mm256_unpacklo_epi8(current_avg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(current_avg, zero_reg);
+ }
+    // accumulate sum and sse
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ src += src_stride;
+ }
+}
+
+// (x == 0, y == bil) or (x == 4, y == bil). sstep determines the direction.
+static INLINE void spv32_bilin_zero(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *sec, int sec_stride,
+ int do_sec, int height, __m256i *sum_reg,
+ __m256i *sse_reg, int offset, int sstep) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ const __m256i pw8 = _mm256_set1_epi16(8);
+ const __m256i filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + (offset << 5)));
+ __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ int i;
+ for (i = 0; i < height; i++) {
+ const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+ const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep));
+ exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1);
+ exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1);
+
+ FILTER_SRC(filter)
+ if (do_sec) {
+ const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec);
+ const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg);
+ sec += sec_stride;
+ exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+ }
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static INLINE void spv32_x0_yb(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *sec, int sec_stride, int do_sec,
+ int height, __m256i *sum_reg, __m256i *sse_reg,
+ int y_offset) {
+ spv32_bilin_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec,
+ height, sum_reg, sse_reg, y_offset, src_stride);
+}
+
+static INLINE void spv32_xb_y0(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *sec, int sec_stride, int do_sec,
+ int height, __m256i *sum_reg, __m256i *sse_reg,
+ int x_offset) {
+ spv32_bilin_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec,
+ height, sum_reg, sse_reg, x_offset, 1);
+}
+
+static INLINE void spv32_x4_yb(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *sec, int sec_stride, int do_sec,
+ int height, __m256i *sum_reg, __m256i *sse_reg,
+ int y_offset) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ const __m256i pw8 = _mm256_set1_epi16(8);
+ const __m256i filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5)));
+ const __m256i src_a = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1));
+ __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b);
+ __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ int i;
+ src += src_stride;
+ for (i = 0; i < height; i++) {
+ const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+ const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1));
+ const __m256i src_avg = _mm256_avg_epu8(src_0, src_1);
+ exp_src_lo = _mm256_unpacklo_epi8(prev_src_avg, src_avg);
+ exp_src_hi = _mm256_unpackhi_epi8(prev_src_avg, src_avg);
+ prev_src_avg = src_avg;
+
+ FILTER_SRC(filter)
+ if (do_sec) {
+ const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec);
+ const __m256i exp_src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ const __m256i avg_reg = _mm256_avg_epu8(exp_src_avg, sec_reg);
+ exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+ sec += sec_stride;
+ }
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ src += src_stride;
+ }
+}
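
A note on the offset-4 fast paths used above: row 4 of the filter table holds the weights (8, 8), for which the bilinear expression collapses exactly to the rounding average that _mm256_avg_epu8 computes, so the half-pel variants skip FILTER_SRC entirely. A minimal check of that identity (hypothetical helper):

#include <assert.h>
#include <stdint.h>

static void check_halfpel_identity(uint8_t a, uint8_t b) {
  // (a*8 + b*8 + 8) >> 4  ==  (8 * (a + b + 1)) >> 4  ==  (a + b + 1) >> 1
  assert(((a * 8 + b * 8 + 8) >> 4) == ((a + b + 1) >> 1));
}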
+
+static INLINE void spv32_xb_y4(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *sec, int sec_stride, int do_sec,
+ int height, __m256i *sum_reg, __m256i *sse_reg,
+ int x_offset) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ const __m256i pw8 = _mm256_set1_epi16(8);
+ const __m256i filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5)));
+ const __m256i src_a = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1));
+ __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ __m256i src_reg, src_pack;
+ int i;
+ exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b);
+ exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b);
+ FILTER_SRC(filter)
+ // pack the 16-bit results back to 8 bits in each low and high lane
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+
+ src += src_stride;
+ for (i = 0; i < height; i++) {
+ const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+ const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1));
+ exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1);
+ exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1);
+
+ FILTER_SRC(filter)
+
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ // average the previous pack with the current one
+ src_pack = _mm256_avg_epu8(src_pack, src_reg);
+
+ if (do_sec) {
+ const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec);
+ const __m256i avg_pack = _mm256_avg_epu8(src_pack, sec_reg);
+ exp_src_lo = _mm256_unpacklo_epi8(avg_pack, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(avg_pack, zero_reg);
+ sec += sec_stride;
+ } else {
+ exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg);
+ }
+ CALC_SUM_SSE_INSIDE_LOOP
+ src_pack = src_reg;
+ dst += dst_stride;
+ src += src_stride;
+ }
+}
+
+static INLINE void spv32_xb_yb(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *sec, int sec_stride, int do_sec,
+ int height, __m256i *sum_reg, __m256i *sse_reg,
+ int x_offset, int y_offset) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ const __m256i pw8 = _mm256_set1_epi16(8);
+ const __m256i xfilter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5)));
+ const __m256i yfilter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5)));
+ const __m256i src_a = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1));
+ __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ __m256i prev_src_pack, src_pack;
+ int i;
+ exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b);
+ exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b);
+ FILTER_SRC(xfilter)
+ // pack the 16-bit results back to 8 bits in each low and high lane
+ prev_src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ src += src_stride;
+
+ for (i = 0; i < height; i++) {
+ const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+ const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1));
+ exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1);
+ exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1);
+
+ FILTER_SRC(xfilter)
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+
+ // interleave the previous pack with the current one
+ exp_src_lo = _mm256_unpacklo_epi8(prev_src_pack, src_pack);
+ exp_src_hi = _mm256_unpackhi_epi8(prev_src_pack, src_pack);
+
+ FILTER_SRC(yfilter)
+ if (do_sec) {
+ const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec);
+ const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg);
+ exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+ sec += sec_stride;
+ }
+
+ prev_src_pack = src_pack;
+
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ src += src_stride;
+ }
+}
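
In scalar terms the fully bilinear case is a separable two-pass pipeline, with the intermediate rows rounded back to 8 bits by _mm256_packus_epi16 between the passes. A sketch reusing the hypothetical bilin_pixel helper from above:

static inline uint8_t xb_yb_pixel(const uint8_t *prev_row,
                                  const uint8_t *cur_row, int x,
                                  int x_offset, int y_offset) {
  // pass 1: eighth-pel blend in x on the previous and the current row
  const uint8_t h[2] = { bilin_pixel(prev_row + x, 1, x_offset),
                         bilin_pixel(cur_row + x, 1, x_offset) };
  // pass 2: eighth-pel blend in y between the two filtered rows
  return bilin_pixel(h, 1, y_offset);
}

prev_src_pack carries the pass-1 result of the previous row across loop iterations, so each source row is horizontally filtered only once.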
+
+static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *sec, int sec_stride,
+ int do_sec, int height, unsigned int *sse) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ __m256i sum_reg = _mm256_setzero_si256();
+ __m256i sse_reg = _mm256_setzero_si256();
+ __m256i sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+ int sum;
+ // x_offset = 0 and y_offset = 0
+ if (x_offset == 0) {
+ if (y_offset == 0) {
+ spv32_x0_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec,
+ height, &sum_reg, &sse_reg);
+ // x_offset = 0 and y_offset = 4
+ } else if (y_offset == 4) {
+ spv32_x0_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec,
+ height, &sum_reg, &sse_reg);
+ // x_offset = 0 and y_offset = bilin interpolation
+ } else {
+ spv32_x0_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec,
+ height, &sum_reg, &sse_reg, y_offset);
+ }
+ // x_offset = 4 and y_offset = 0
+ } else if (x_offset == 4) {
+ if (y_offset == 0) {
+ spv32_x4_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec,
+ height, &sum_reg, &sse_reg);
+ // x_offset = 4 and y_offset = 4
+ } else if (y_offset == 4) {
+ spv32_x4_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec,
+ height, &sum_reg, &sse_reg);
+ // x_offset = 4 and y_offset = bilin interpolation
+ } else {
+ spv32_x4_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec,
+ height, &sum_reg, &sse_reg, y_offset);
+ }
+ // x_offset = bilin interpolation and y_offset = 0
+ } else {
+ if (y_offset == 0) {
+ spv32_xb_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec,
+ height, &sum_reg, &sse_reg, x_offset);
+ // x_offset = bilin interpolation and y_offset = 4
+ } else if (y_offset == 4) {
+ spv32_xb_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec,
+ height, &sum_reg, &sse_reg, x_offset);
+ // x_offset = bilin interpolation and y_offset = bilin interpolation
+ } else {
+ spv32_xb_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec,
+ height, &sum_reg, &sse_reg, x_offset, y_offset);
+ }
+ }
+ CALC_SUM_AND_SSE
+ return sum;
+}
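
CALC_SUM_AND_SSE reduces the vector accumulators to the scalar sum and *sse; the callers then finish with the usual variance identity. A sketch of that last step, with log2_count = log2(32 * height):

static inline unsigned int variance_final(unsigned int sse, int sum,
                                          int log2_count) {
  // variance = sse - sum^2 / N, where N = 32 * height is a power of two
  return sse - (uint32_t)(((int64_t)sum * sum) >> log2_count);
}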
+
+static unsigned int sub_pixel_variance32xh_avx2(
+ const uint8_t *src, int src_stride, int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride, int height, unsigned int *sse) {
+ return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride,
+ NULL, 0, 0, height, sse);
+}
+
+static unsigned int sub_pixel_avg_variance32xh_avx2(
+ const uint8_t *src, int src_stride, int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
+ int height, unsigned int *sse) {
+ return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride,
+ sec, sec_stride, 1, height, sse);
+}
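
In the avg variant, do_sec folds in the compound-prediction step: before differencing against dst, the filtered prediction is averaged with the second predictor via _mm256_avg_epu8, which rounds up. Per pixel (sketch):

static inline uint8_t comp_avg(uint8_t pred, uint8_t sec) {
  return (uint8_t)((pred + sec + 1) >> 1);  // rounding average, as vpavgb
}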
+
typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum);
-void vpx_get32x32var_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride, unsigned int *sse,
- int *sum);
-
static void variance_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride, int w, int h,
unsigned int *sse, int *sum, get_var_avx2 var_fn,
@@ -44,7 +620,7 @@ unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride,
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
vpx_get16x16var_avx2, 16);
- return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 8);
}
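
Moving the uint32_t cast outside the shift is not cosmetic: the product is now shifted while still 64 bits wide and truncated only afterwards. For 16x16 the square happens to fit (|sum| <= 255 * 256 = 65280, and 65280^2 is just under 2^32), but in the 64x64 path below |sum| can reach 255 * 4096 = 1044480, whose square (about 1.09e12) overflows 32 bits; shifting by 12 first brings it down to roughly 2.7e8, which a uint32_t holds safely.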
unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride,
@@ -60,7 +636,7 @@ unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
- vpx_get32x32var_avx2, 32);
+ get32x16var_avx2, 32);
return *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
}
@@ -69,7 +645,7 @@ unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
- vpx_get32x32var_avx2, 32);
+ get32x16var_avx2, 32);
return *sse - (uint32_t)(((int64_t)sum * sum) >> 10);
}
@@ -78,7 +654,7 @@ unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
- vpx_get32x32var_avx2, 32);
+ get32x16var_avx2, 32);
return *sse - (uint32_t)(((int64_t)sum * sum) >> 12);
}
@@ -87,32 +663,22 @@ unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
- vpx_get32x32var_avx2, 32);
+ get32x16var_avx2, 32);
return *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
}
-unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
- int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride,
- int height, unsigned int *sse);
-
-unsigned int vpx_sub_pixel_avg_variance32xh_avx2(
- const uint8_t *src, int src_stride, int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
- int height, unsigned int *sseptr);
-
unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src,
int src_stride, int x_offset,
int y_offset, const uint8_t *dst,
int dst_stride,
unsigned int *sse) {
unsigned int sse1;
- const int se1 = vpx_sub_pixel_variance32xh_avx2(
+ const int se1 = sub_pixel_variance32xh_avx2(
src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1);
unsigned int sse2;
const int se2 =
- vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset,
- dst + 32, dst_stride, 64, &sse2);
+ sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset,
+ dst + 32, dst_stride, 64, &sse2);
const int se = se1 + se2;
*sse = sse1 + sse2;
return *sse - (uint32_t)(((int64_t)se * se) >> 12);
@@ -123,7 +689,7 @@ unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src,
int y_offset, const uint8_t *dst,
int dst_stride,
unsigned int *sse) {
- const int se = vpx_sub_pixel_variance32xh_avx2(
+ const int se = sub_pixel_variance32xh_avx2(
src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
return *sse - (uint32_t)(((int64_t)se * se) >> 10);
}
@@ -132,10 +698,10 @@ unsigned int vpx_sub_pixel_avg_variance64x64_avx2(
const uint8_t *src, int src_stride, int x_offset, int y_offset,
const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
unsigned int sse1;
- const int se1 = vpx_sub_pixel_avg_variance32xh_avx2(
+ const int se1 = sub_pixel_avg_variance32xh_avx2(
src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1);
unsigned int sse2;
- const int se2 = vpx_sub_pixel_avg_variance32xh_avx2(
+ const int se2 = sub_pixel_avg_variance32xh_avx2(
src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32,
64, 64, &sse2);
const int se = se1 + se2;
@@ -149,7 +715,7 @@ unsigned int vpx_sub_pixel_avg_variance32x32_avx2(
const uint8_t *src, int src_stride, int x_offset, int y_offset,
const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
// Process 32 elements in parallel.
- const int se = vpx_sub_pixel_avg_variance32xh_avx2(
+ const int se = sub_pixel_avg_variance32xh_avx2(
src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse);
return *sse - (uint32_t)(((int64_t)se * se) >> 10);
}
diff --git a/libvpx/vpx_dsp/x86/variance_impl_avx2.c b/libvpx/vpx_dsp/x86/variance_impl_avx2.c
deleted file mode 100644
index 51e6b19ad..000000000
--- a/libvpx/vpx_dsp/x86/variance_impl_avx2.c
+++ /dev/null
@@ -1,708 +0,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <immintrin.h> // AVX2
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_ports/mem.h"
-
-/* clang-format off */
-DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
- 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
- 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
- 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
- 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
- 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
- 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
- 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
- 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
- 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
- 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
- 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
- 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
- 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
-};
-/* clang-format on */
-
-void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
- const unsigned char *ref_ptr, int recon_stride,
- unsigned int *SSE, int *Sum) {
- __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
- __m256i ref_expand_high, madd_low, madd_high;
- unsigned int i, src_2strides, ref_2strides;
- __m256i zero_reg = _mm256_set1_epi16(0);
- __m256i sum_ref_src = _mm256_set1_epi16(0);
- __m256i madd_ref_src = _mm256_set1_epi16(0);
-
- // processing two strides in a 256 bit register halves the loop count
- // compared to the sse2 code
- src_2strides = source_stride << 1;
- ref_2strides = recon_stride << 1;
- for (i = 0; i < 8; i++) {
- src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr)));
- src = _mm256_inserti128_si256(
- src, _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1);
-
- ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr)));
- ref = _mm256_inserti128_si256(
- ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1);
-
- // expanding to 16 bit each lane
- src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
- src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
-
- ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
- ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
-
- // src-ref
- src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
- src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
-
- // madd low (src - ref)
- madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
-
- // add high to low
- src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
-
- // madd high (src - ref)
- madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
-
- sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
-
- // add high to low
- madd_ref_src =
- _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));
-
- src_ptr += src_2strides;
- ref_ptr += ref_2strides;
- }
-
- {
- __m128i sum_res, madd_res;
- __m128i expand_sum_low, expand_sum_high, expand_sum;
- __m128i expand_madd_low, expand_madd_high, expand_madd;
- __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
-
- // extract the low lane and add it to the high lane
- sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
- _mm256_extractf128_si256(sum_ref_src, 1));
-
- madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
- _mm256_extractf128_si256(madd_ref_src, 1));
-
- // padding each 2 bytes with another 2 zeroed bytes
- expand_sum_low =
- _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
- expand_sum_high =
- _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
-
- // shifting the sign 16 bits right
- expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
- expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);
-
- expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);
-
- // expand each 32 bits of the madd result to 64 bits
- expand_madd_low =
- _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
- expand_madd_high =
- _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
-
- expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);
-
- ex_expand_sum_low =
- _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
- ex_expand_sum_high =
- _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
-
- ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
-
- // shift right by 8 bytes
- madd_res = _mm_srli_si128(expand_madd, 8);
- sum_res = _mm_srli_si128(ex_expand_sum, 8);
-
- madd_res = _mm_add_epi32(madd_res, expand_madd);
- sum_res = _mm_add_epi32(sum_res, ex_expand_sum);
-
- *((int *)SSE) = _mm_cvtsi128_si32(madd_res);
-
- *((int *)Sum) = _mm_cvtsi128_si32(sum_res);
- }
-}
-
-void vpx_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
- const unsigned char *ref_ptr, int recon_stride,
- unsigned int *SSE, int *Sum) {
- __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
- __m256i ref_expand_high, madd_low, madd_high;
- unsigned int i;
- __m256i zero_reg = _mm256_set1_epi16(0);
- __m256i sum_ref_src = _mm256_set1_epi16(0);
- __m256i madd_ref_src = _mm256_set1_epi16(0);
-
- // processing 32 elements in parallel
- for (i = 0; i < 16; i++) {
- src = _mm256_loadu_si256((__m256i const *)(src_ptr));
-
- ref = _mm256_loadu_si256((__m256i const *)(ref_ptr));
-
- // expanding to 16 bit each lane
- src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
- src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
-
- ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
- ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
-
- // src-ref
- src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
- src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
-
- // madd low (src - ref)
- madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
-
- // add high to low
- src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
-
- // madd high (src - ref)
- madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
-
- sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
-
- // add high to low
- madd_ref_src =
- _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));
-
- src_ptr += source_stride;
- ref_ptr += recon_stride;
- }
-
- {
- __m256i expand_sum_low, expand_sum_high, expand_sum;
- __m256i expand_madd_low, expand_madd_high, expand_madd;
- __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
-
- // padding each 2 bytes with another 2 zeroed bytes
- expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
- expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);
-
- // shifting the sign 16 bits right
- expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
- expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);
-
- expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);
-
- // expand each 32 bits of the madd result to 64 bits
- expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
- expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);
-
- expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);
-
- ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
- ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);
-
- ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
-
- // shift right by 8 bytes
- madd_ref_src = _mm256_srli_si256(expand_madd, 8);
- sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);
-
- madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
- sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);
-
- // extract the low lane and the high lane and add the results
- *((int *)SSE) =
- _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
- _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1));
-
- *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
- _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
- }
-}
-
-#define FILTER_SRC(filter) \
- /* filter the source */ \
- exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
- exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
- \
- /* add 8 to source */ \
- exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
- exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
- \
- /* divide source by 16 */ \
- exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
- exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
-
-#define MERGE_WITH_SRC(src_reg, reg) \
- exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
- exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
-
-#define LOAD_SRC_DST \
- /* load source and destination */ \
- src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
- dst_reg = _mm256_loadu_si256((__m256i const *)(dst));
-
-#define AVG_NEXT_SRC(src_reg, size_stride) \
- src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
- /* average between current and next stride source */ \
- src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
-
-#define MERGE_NEXT_SRC(src_reg, size_stride) \
- src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
- MERGE_WITH_SRC(src_reg, src_next_reg)
-
-#define CALC_SUM_SSE_INSIDE_LOOP \
- /* expand each byte to 2 bytes */ \
- exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
- exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
- /* source - dest */ \
- exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
- exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
- /* calculate sum */ \
- sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
- exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
- sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
- exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
- /* calculate sse */ \
- sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
- sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
-
-// final calculation to sum and sse
-#define CALC_SUM_AND_SSE \
- res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
- sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
- sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
- sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
- sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
- sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
- \
- sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
- sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
- \
- sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
- sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
- *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
- _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
- sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
- sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
- sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
- _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
-
-unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
- int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride,
- int height, unsigned int *sse) {
- __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
- __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
- __m256i zero_reg;
- int i, sum;
- sum_reg = _mm256_set1_epi16(0);
- sse_reg = _mm256_set1_epi16(0);
- zero_reg = _mm256_set1_epi16(0);
-
- // x_offset = 0 and y_offset = 0
- if (x_offset == 0) {
- if (y_offset == 0) {
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- // x_offset = 0 and y_offset = 8
- } else if (y_offset == 8) {
- __m256i src_next_reg;
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, src_stride)
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- // x_offset = 0 and y_offset = bilin interpolation
- } else {
- __m256i filter, pw8, src_next_reg;
-
- y_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, src_stride)
- FILTER_SRC(filter)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- }
- // x_offset = 8 and y_offset = 0
- } else if (x_offset == 8) {
- if (y_offset == 0) {
- __m256i src_next_reg;
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- // x_offset = 8 and y_offset = 8
- } else if (y_offset == 8) {
- __m256i src_next_reg, src_avg;
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height; i++) {
- src_avg = src_reg;
- src += src_stride;
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- // average between previous average to current average
- src_avg = _mm256_avg_epu8(src_avg, src_reg);
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_avg, zero_reg)
- // save current source average
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride;
- }
- // x_offset = 8 and y_offset = bilin interpolation
- } else {
- __m256i filter, pw8, src_next_reg, src_avg;
- y_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height; i++) {
- // save current source average
- src_avg = src_reg;
- src += src_stride;
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- MERGE_WITH_SRC(src_avg, src_reg)
- FILTER_SRC(filter)
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride;
- }
- }
- // x_offset = bilin interpolation and y_offset = 0
- } else {
- if (y_offset == 0) {
- __m256i filter, pw8, src_next_reg;
- x_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- // x_offset = bilin interpolation and y_offset = 8
- } else if (y_offset == 8) {
- __m256i filter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- // convert each 16 bit to 8 bit to each low and high lane source
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height; i++) {
- src += src_stride;
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- // average between previous pack to the current
- src_pack = _mm256_avg_epu8(src_pack, src_reg);
- MERGE_WITH_SRC(src_pack, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src_pack = src_reg;
- dst += dst_stride;
- }
- // x_offset = bilin interpolation and y_offset = bilin interpolation
- } else {
- __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- xfilter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + x_offset));
- y_offset <<= 5;
- yfilter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- MERGE_NEXT_SRC(src_reg, 1)
-
- FILTER_SRC(xfilter)
- // convert each 16 bit to 8 bit to each low and high lane source
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height; i++) {
- src += src_stride;
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(xfilter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- // merge previous pack to current pack source
- MERGE_WITH_SRC(src_pack, src_reg)
- // filter the source
- FILTER_SRC(yfilter)
- src_pack = src_reg;
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride;
- }
- }
- }
- CALC_SUM_AND_SSE
- return sum;
-}
-
-unsigned int vpx_sub_pixel_avg_variance32xh_avx2(
- const uint8_t *src, int src_stride, int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
- int height, unsigned int *sse) {
- __m256i sec_reg;
- __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
- __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
- __m256i zero_reg;
- int i, sum;
- sum_reg = _mm256_set1_epi16(0);
- sse_reg = _mm256_set1_epi16(0);
- zero_reg = _mm256_set1_epi16(0);
-
- // x_offset = 0 and y_offset = 0
- if (x_offset == 0) {
- if (y_offset == 0) {
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec += sec_stride;
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- } else if (y_offset == 8) {
- __m256i src_next_reg;
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, src_stride)
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec += sec_stride;
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- // x_offset = 0 and y_offset = bilin interpolation
- } else {
- __m256i filter, pw8, src_next_reg;
-
- y_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, src_stride)
- FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec += sec_stride;
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- }
- // x_offset = 8 and y_offset = 0
- } else if (x_offset == 8) {
- if (y_offset == 0) {
- __m256i src_next_reg;
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec += sec_stride;
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- // x_offset = 8 and y_offset = 8
- } else if (y_offset == 8) {
- __m256i src_next_reg, src_avg;
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height; i++) {
- // save current source average
- src_avg = src_reg;
- src += src_stride;
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- // average between previous average to current average
- src_avg = _mm256_avg_epu8(src_avg, src_reg);
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_avg = _mm256_avg_epu8(src_avg, sec_reg);
- sec += sec_stride;
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_avg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride;
- }
- // x_offset = 8 and y_offset = bilin interpolation
- } else {
- __m256i filter, pw8, src_next_reg, src_avg;
- y_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height; i++) {
- // save current source average
- src_avg = src_reg;
- src += src_stride;
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- MERGE_WITH_SRC(src_avg, src_reg)
- FILTER_SRC(filter)
- src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_avg = _mm256_avg_epu8(src_avg, sec_reg);
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_avg, zero_reg)
- sec += sec_stride;
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride;
- }
- }
- // x_offset = bilin interpolation and y_offset = 0
- } else {
- if (y_offset == 0) {
- __m256i filter, pw8, src_next_reg;
- x_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- MERGE_WITH_SRC(src_reg, zero_reg)
- sec += sec_stride;
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- // x_offset = bilin interpolation and y_offset = 8
- } else if (y_offset == 8) {
- __m256i filter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- // convert each 16 bit to 8 bit to each low and high lane source
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height; i++) {
- src += src_stride;
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- // average between previous pack to the current
- src_pack = _mm256_avg_epu8(src_pack, src_reg);
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_pack = _mm256_avg_epu8(src_pack, sec_reg);
- sec += sec_stride;
- MERGE_WITH_SRC(src_pack, zero_reg)
- src_pack = src_reg;
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride;
- }
- // x_offset = bilin interpolation and y_offset = bilin interpolation
- } else {
- __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- xfilter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + x_offset));
- y_offset <<= 5;
- yfilter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- MERGE_NEXT_SRC(src_reg, 1)
-
- FILTER_SRC(xfilter)
- // convert each 16 bit to 8 bit to each low and high lane source
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height; i++) {
- src += src_stride;
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(xfilter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- // merge previous pack to current pack source
- MERGE_WITH_SRC(src_pack, src_reg)
- // filter the source
- FILTER_SRC(yfilter)
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_pack = _mm256_avg_epu8(src_pack, sec_reg);
- MERGE_WITH_SRC(src_pack, zero_reg)
- src_pack = src_reg;
- sec += sec_stride;
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride;
- }
- }
- }
- CALC_SUM_AND_SSE
- return sum;
-}
diff --git a/libvpx/vpx_dsp/x86/variance_sse2.c b/libvpx/vpx_dsp/x86/variance_sse2.c
index 1161da491..8d8bf183b 100644
--- a/libvpx/vpx_dsp/x86/variance_sse2.c
+++ b/libvpx/vpx_dsp/x86/variance_sse2.c
@@ -222,7 +222,7 @@ unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride,
unsigned int *sse) {
int sum;
vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
- return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 8);
}
unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride,
diff --git a/libvpx/vpx_dsp/x86/vpx_asm_stubs.c b/libvpx/vpx_dsp/x86/vpx_asm_stubs.c
index 727d9d115..4f164afeb 100644
--- a/libvpx/vpx_dsp/x86/vpx_asm_stubs.c
+++ b/libvpx/vpx_dsp/x86/vpx_asm_stubs.c
@@ -41,38 +41,38 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2;
// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4,
+// int y_step_q4, int w, int h);
// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2);
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2);
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2);
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2);
// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
FUN_CONV_2D(, sse2);
FUN_CONV_2D(avg_, sse2);
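
These stubs now follow the updated convolve prototype shown in the comments above: the two per-axis filter pointers are replaced by a single InterpKernel table plus an initial sub-pel phase (x0_q4, y0_q4) and a per-output-pixel step (x_step_q4, y_step_q4), all in Q4 sixteenth-pel units. A hypothetical unscaled call, assuming filter points at a valid kernel table:

// phase 0 and step 16 (one full pel per output pixel) mean no scaling
vpx_convolve8_sse2(src, src_stride, dst, dst_stride, filter,
                   0 /* x0_q4 */, 16 /* x_step_q4 */,
                   0 /* y0_q4 */, 16 /* y_step_q4 */, 64, 64);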
@@ -140,22 +140,22 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
// const int16_t *filter_y,
// int y_step_q4,
// int w, int h, int bd);
-HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
-HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
-HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2);
+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2);
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2);
+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,
sse2);
// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h, int bd);
// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h, int bd);
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4,
+// int y_step_q4, int w, int h, int bd);
HIGH_FUN_CONV_2D(, sse2);
HIGH_FUN_CONV_2D(avg_, sse2);
#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
diff --git a/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
index 389a692db..3f444e2e6 100644
--- a/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ b/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -20,14 +20,14 @@ SECTION .text
%endif
%ifidn %2, highbd
%define pavg pavgw
-cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+cglobal %2_convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \
dst, dst_stride, \
- fx, fxs, fy, fys, w, h, bd
+ f, fxo, fxs, fyo, fys, w, h, bd
%else
%define pavg pavgb
-cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+cglobal convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \
dst, dst_stride, \
- fx, fxs, fy, fys, w, h
+ f, fxo, fxs, fyo, fys, w, h
%endif
mov r4d, dword wm
%ifidn %2, highbd
diff --git a/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm b/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
index bfc816f23..d83507dc9 100644
--- a/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
+++ b/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
@@ -197,6 +197,8 @@
movdqu [rdi + %2], xmm0
%endm
+SECTION .text
+
;void vpx_filter_block1d4_v8_sse2
;(
; unsigned char *src_ptr,
diff --git a/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm b/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm
index 72f2ff71d..9bffe504b 100644
--- a/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm
+++ b/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm
@@ -171,6 +171,8 @@
%endm
%endif
+SECTION .text
+
global sym(vpx_highbd_filter_block1d4_v2_sse2) PRIVATE
sym(vpx_highbd_filter_block1d4_v2_sse2):
push rbp
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 7c1ecc014..d0919695c 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -12,9 +12,10 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/convolve.h"
+#include "vpx_dsp/x86/convolve_avx2.h"
#include "vpx_ports/mem.h"
-// filters for 16_h8 and 16_v8
+// filters for 16_h8
DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
@@ -35,493 +36,296 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
-#if defined(__clang__)
-#if (__clang_major__ > 0 && __clang_major__ < 3) || \
- (__clang_major__ == 3 && __clang_minor__ <= 3) || \
- (defined(__APPLE__) && defined(__apple_build_version__) && \
- ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
- (__clang_major__ == 5 && __clang_minor__ == 0)))
-#define MM256_BROADCASTSI128_SI256(x) \
- _mm_broadcastsi128_si256((__m128i const *)&(x))
-#else // clang > 3.3, and not 5.0 on macosx.
-#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-#endif // clang <= 3.3
-#elif defined(__GNUC__)
-#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
-#define MM256_BROADCASTSI128_SI256(x) \
- _mm_broadcastsi128_si256((__m128i const *)&(x))
-#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
-#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
-#else // gcc > 4.7
-#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-#endif // gcc <= 4.6
-#else // !(gcc || clang)
-#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-#endif // __clang__
-
-static void vpx_filter_block1d16_h8_avx2(
+static INLINE void vpx_filter_block1d16_h8_x_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
- ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i filtersReg;
- __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
- __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
- __m256i srcReg32b1, srcReg32b2, filtersReg32;
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter,
+ const int avg) {
+ __m128i outReg1, outReg2;
+ __m256i outReg32b1, outReg32b2;
unsigned int i;
ptrdiff_t src_stride, dst_stride;
+ __m256i f[4], filt[4], s[4];
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
- // have the same data in both lanes of a 256 bit register
- filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
- // duplicate only the first 16 bits (first and second byte)
- // across 256 bit register
- firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
- // duplicate only the second 16 bits (third and fourth byte)
- // across 256 bit register
- secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 256 bit register
- thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
- // duplicate only the fourth 16 bits (seventh and eighth byte)
- // across 256 bit register
- forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
-
- filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
- filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
- filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
- filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+ shuffle_filter_avx2(filter, f);
+ filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+ filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+ filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
  // multiply the source and destination strides by two
src_stride = src_pixels_per_line << 1;
dst_stride = output_pitch << 1;
for (i = output_height; i > 1; i -= 2) {
+ __m256i srcReg;
+
// load the 2 strides of source
- srcReg32b1 =
+ srcReg =
_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3)));
- srcReg32b1 = _mm256_inserti128_si256(
- srcReg32b1,
+ srcReg = _mm256_inserti128_si256(
+ srcReg,
_mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
1);
// filter the source buffer
- srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
- srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
-
- // add and saturate the results together
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
-
- // filter the source buffer
- srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
- srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
- // add and saturate the results together
- srcRegFilt32b1_1 = _mm256_adds_epi16(
- srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
+ s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
+ s[1] = _mm256_shuffle_epi8(srcReg, filt[1]);
+ s[2] = _mm256_shuffle_epi8(srcReg, filt[2]);
+ s[3] = _mm256_shuffle_epi8(srcReg, filt[3]);
+ outReg32b1 = convolve8_16_avx2(s, f);
// reading 2 strides of the next 16 bytes
// (part of it was being read by earlier read)
- srcReg32b2 =
+ srcReg =
_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5)));
- srcReg32b2 = _mm256_inserti128_si256(
- srcReg32b2,
+ srcReg = _mm256_inserti128_si256(
+ srcReg,
_mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
1);
- // add and saturate the results together
- srcRegFilt32b1_1 = _mm256_adds_epi16(
- srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
-
- // filter the source buffer
- srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
- srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
-
- // add and saturate the results together
- srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
-
// filter the source buffer
- srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
- srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+ s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
+ s[1] = _mm256_shuffle_epi8(srcReg, filt[1]);
+ s[2] = _mm256_shuffle_epi8(srcReg, filt[2]);
+ s[3] = _mm256_shuffle_epi8(srcReg, filt[3]);
+ outReg32b2 = convolve8_16_avx2(s, f);
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
- // add and saturate the results together
- srcRegFilt32b2_1 = _mm256_adds_epi16(
- srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
- srcRegFilt32b2_1 = _mm256_adds_epi16(
- srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
-
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
-
- srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);
-
- // shift by 7 bit each 16 bit
- srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
- srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+ // pack each 16-bit value to 8 bits; within each lane the low and high
+ // 64 bits hold the first and second convolve results respectively
+ outReg32b1 = _mm256_packus_epi16(outReg32b1, outReg32b2);
src_ptr += src_stride;
+ // average if necessary
+ outReg1 = _mm256_castsi256_si128(outReg32b1);
+ outReg2 = _mm256_extractf128_si256(outReg32b1, 1);
+ if (avg) {
+ outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
+ outReg2 = _mm_avg_epu8(
+ outReg2, _mm_load_si128((__m128i *)(output_ptr + output_pitch)));
+ }
+
// save 16 bytes
- _mm_store_si128((__m128i *)output_ptr,
- _mm256_castsi256_si128(srcRegFilt32b1_1));
+ _mm_store_si128((__m128i *)output_ptr, outReg1);
  // save the next 16 bytes
- _mm_store_si128((__m128i *)(output_ptr + output_pitch),
- _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
+ _mm_store_si128((__m128i *)(output_ptr + output_pitch), outReg2);
+
output_ptr += dst_stride;
}
// if the number of strides is odd.
// process only 16 bytes
if (i > 0) {
- __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
- __m128i srcRegFilt2, srcRegFilt3;
+ __m128i srcReg;
- srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+ // load the first 16 bytes of the last row
+ srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
// filter the source buffer
- srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
- srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1_1 =
- _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
- srcRegFilt2 =
- _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
-
- // add and saturate the results together
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-
- // filter the source buffer
- srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
- srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 =
- _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
- srcRegFilt2 =
- _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
-
- // add and saturate the results together
- srcRegFilt1_1 =
- _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+ s[0] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])));
+ s[1] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])));
+ s[2] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])));
+ s[3] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])));
+ outReg1 = convolve8_8_avx2(s, f);
// reading the next 16 bytes
// (part of it was being read by earlier read)
- srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
-
- // add and saturate the results together
- srcRegFilt1_1 =
- _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+ srcReg = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
// filter the source buffer
- srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
- srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt2_1 =
- _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
- srcRegFilt2 =
- _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
-
- // add and saturate the results together
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
-
- // filter the source buffer
- srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
- srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 =
- _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
- srcRegFilt2 =
- _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
-
- // add and saturate the results together
- srcRegFilt2_1 =
- _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
- srcRegFilt2_1 =
- _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
- srcRegFilt1_1 =
- _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64));
-
- srcRegFilt2_1 =
- _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64));
-
- // shift by 7 bit each 16 bit
- srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
- srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+ s[0] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])));
+ s[1] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])));
+ s[2] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])));
+ s[3] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])));
+ outReg2 = convolve8_8_avx2(s, f);
+
+ // pack each 16-bit value to 8 bits; the low and high 64 bits hold the
+ // first and second convolve results respectively
+ outReg1 = _mm_packus_epi16(outReg1, outReg2);
+
+ // average if necessary
+ if (avg) {
+ outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
+ }
// save 16 bytes
- _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
+ _mm_store_si128((__m128i *)output_ptr, outReg1);
}
}
-static void vpx_filter_block1d16_v8_avx2(
+static void vpx_filter_block1d16_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr,
+ ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) {
+ vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride,
+ output_height, filter, 0);
+}
+
+static void vpx_filter_block1d16_h8_avg_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr,
+ ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) {
+ vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride,
+ output_height, filter, 1);
+}
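+
+// Editorial sketch (assumption, not upstream code): shuffle_filter_avx2() and
+// convolve8_8_avx2() come from convolve_avx2.h and are presumed to wrap the
+// arithmetic the removed lines spelled out inline: four maddubs products,
+// saturating sums ordered through min/max so the intermediates cannot wrap,
+// then a +64 rounding bias and an arithmetic shift by FILTER_BITS (7). A
+// 128-bit sketch consistent with that pattern:
+static INLINE __m128i convolve8_8_sketch(const __m128i *const s,
+                                         const __m128i *const f) {
+  const __m128i k_64 = _mm_set1_epi16(1 << 6);  // rounding offset
+  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+  // sum the outer taps first, then fold in the inner taps in min/max order
+  // so the saturating additions cannot overflow on valid filters
+  __m128i sum = _mm_adds_epi16(x0, x3);
+  sum = _mm_adds_epi16(sum, _mm_min_epi16(x1, x2));
+  sum = _mm_adds_epi16(sum, _mm_max_epi16(x1, x2));
+  sum = _mm_adds_epi16(sum, k_64);
+  return _mm_srai_epi16(sum, 7);  // FILTER_BITS
+}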
+
+static INLINE void vpx_filter_block1d16_v8_x_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
- ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i filtersReg;
- __m256i addFilterReg64;
- __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
- __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
- __m256i srcReg32b11, srcReg32b12, filtersReg32;
- __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter,
+ const int avg) {
+ __m128i outReg1, outReg2;
+ __m256i srcRegHead1;
unsigned int i;
ptrdiff_t src_stride, dst_stride;
+ __m256i f[4], s1[4], s2[4];
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the
- // same data in both lanes of 128 bit register.
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
- // have the same data in both lanes of a 256 bit register
- filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
- // duplicate only the first 16 bits (first and second byte)
- // across 256 bit register
- firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
- // duplicate only the second 16 bits (third and forth byte)
- // across 256 bit register
- secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 256 bit register
- thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
- // duplicate only the forth 16 bits (seventh and eighth byte)
- // across 256 bit register
- forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+ shuffle_filter_avx2(filter, f);
  // multiply the source and destination strides by two
src_stride = src_pitch << 1;
dst_stride = out_pitch << 1;
- // load 16 bytes 7 times in stride of src_pitch
- srcReg32b1 =
- _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr)));
- srcReg32b2 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
- srcReg32b3 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
- srcReg32b4 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
- srcReg32b5 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
- srcReg32b6 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
- srcReg32b7 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
-
- // have each consecutive loads on the same 256 register
- srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
- _mm256_castsi256_si128(srcReg32b2), 1);
- srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
- _mm256_castsi256_si128(srcReg32b3), 1);
- srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
- _mm256_castsi256_si128(srcReg32b4), 1);
- srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
- _mm256_castsi256_si128(srcReg32b5), 1);
- srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
- _mm256_castsi256_si128(srcReg32b6), 1);
- srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
- _mm256_castsi256_si128(srcReg32b7), 1);
-
- // merge every two consecutive registers except the last one
- srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
- srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
-
- // save
- srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
-
- // save
- srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
-
- // save
- srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
-
- // save
- srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
+ {
+ __m128i s[6];
+ __m256i s32b[6];
+
+    // load 16 bytes 7 times, one src_pitch apart
+ s[0] = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_pitch));
+ s[1] = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_pitch));
+ s[2] = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_pitch));
+ s[3] = _mm_loadu_si128((const __m128i *)(src_ptr + 3 * src_pitch));
+ s[4] = _mm_loadu_si128((const __m128i *)(src_ptr + 4 * src_pitch));
+ s[5] = _mm_loadu_si128((const __m128i *)(src_ptr + 5 * src_pitch));
+ srcRegHead1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + 6 * src_pitch)));
+
+    // place each pair of consecutive loads in the same 256 bit register
+ s32b[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1);
+ s32b[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1);
+ s32b[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1);
+ s32b[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1);
+ s32b[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1);
+ s32b[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]),
+ _mm256_castsi256_si128(srcRegHead1), 1);
+
+ // merge every two consecutive registers except the last one
+ // the first lanes contain values for filtering odd rows (1,3,5...) and
+ // the second lanes contain values for filtering even rows (2,4,6...)
+ s1[0] = _mm256_unpacklo_epi8(s32b[0], s32b[1]);
+ s2[0] = _mm256_unpackhi_epi8(s32b[0], s32b[1]);
+ s1[1] = _mm256_unpacklo_epi8(s32b[2], s32b[3]);
+ s2[1] = _mm256_unpackhi_epi8(s32b[2], s32b[3]);
+ s1[2] = _mm256_unpacklo_epi8(s32b[4], s32b[5]);
+ s2[2] = _mm256_unpackhi_epi8(s32b[4], s32b[5]);
+ }
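+
+  // (Editorial note) Concretely, the low lane of s1[0] now interleaves rows 0
+  // and 1 byte-wise while its high lane interleaves rows 1 and 2, so each
+  // maddubs in the convolve kernel applies one tap pair to two consecutive
+  // output rows at once.
+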
for (i = output_height; i > 1; i -= 2) {
- // load the last 2 loads of 16 bytes and have every two
+ __m256i srcRegHead2, srcRegHead3;
+
+    // load the next two rows of 16 bytes and have every two
// consecutive loads in the same 256 bit register
- srcReg32b8 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
- srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
- _mm256_castsi256_si128(srcReg32b8), 1);
- srcReg32b9 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
- srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
- _mm256_castsi256_si128(srcReg32b9), 1);
-
- // merge every two consecutive registers
- // save
- srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
- srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
- srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
-
- // add and saturate the results together
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
- srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
-
- // add and saturate the results together
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
- _mm256_min_epi16(srcReg32b8, srcReg32b12));
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
- _mm256_max_epi16(srcReg32b8, srcReg32b12));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
- srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
-
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
- srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
-
- // add and saturate the results together
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
- _mm256_min_epi16(srcReg32b8, srcReg32b12));
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
- _mm256_max_epi16(srcReg32b8, srcReg32b12));
-
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
-
- // shift by 7 bit each 16 bit
- srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
- srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
+ srcRegHead2 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + 7 * src_pitch)));
+ srcRegHead1 = _mm256_inserti128_si256(
+ srcRegHead1, _mm256_castsi256_si128(srcRegHead2), 1);
+ srcRegHead3 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + 8 * src_pitch)));
+ srcRegHead2 = _mm256_inserti128_si256(
+ srcRegHead2, _mm256_castsi256_si128(srcRegHead3), 1);
+
+ // merge the two new consecutive registers
+    // the first lane contains values for filtering odd rows (1,3,5...) and
+    // the second lane contains values for filtering even rows (2,4,6...)
+ s1[3] = _mm256_unpacklo_epi8(srcRegHead1, srcRegHead2);
+ s2[3] = _mm256_unpackhi_epi8(srcRegHead1, srcRegHead2);
+
+ s1[0] = convolve8_16_avx2(s1, f);
+ s2[0] = convolve8_16_avx2(s2, f);
+
+    // shrink to 8 bit each 16 bits, the low and high 64 bits of each lane
+    // contain the first and second convolve results respectively
+ s1[0] = _mm256_packus_epi16(s1[0], s2[0]);
src_ptr += src_stride;
+ // average if necessary
+ outReg1 = _mm256_castsi256_si128(s1[0]);
+ outReg2 = _mm256_extractf128_si256(s1[0], 1);
+ if (avg) {
+ outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
+ outReg2 = _mm_avg_epu8(
+ outReg2, _mm_load_si128((__m128i *)(output_ptr + out_pitch)));
+ }
+
// save 16 bytes
- _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1));
+ _mm_store_si128((__m128i *)output_ptr, outReg1);
    // save the next 16 bytes
- _mm_store_si128((__m128i *)(output_ptr + out_pitch),
- _mm256_extractf128_si256(srcReg32b1, 1));
+ _mm_store_si128((__m128i *)(output_ptr + out_pitch), outReg2);
output_ptr += dst_stride;
- // save part of the registers for next strides
- srcReg32b10 = srcReg32b11;
- srcReg32b1 = srcReg32b3;
- srcReg32b11 = srcReg32b2;
- srcReg32b3 = srcReg32b5;
- srcReg32b2 = srcReg32b4;
- srcReg32b5 = srcReg32b7;
- srcReg32b7 = srcReg32b9;
+ // shift down by two rows
+ s1[0] = s1[1];
+ s2[0] = s2[1];
+ s1[1] = s1[2];
+ s2[1] = s2[2];
+ s1[2] = s1[3];
+ s2[2] = s2[3];
+ srcRegHead1 = srcRegHead3;
}
+
+  // if the number of rows is odd,
+  // process only the last 16 bytes
if (i > 0) {
- __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
- __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
// load the last 16 bytes
- srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
+ const __m128i srcRegHead2 =
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
    // merge the last two rows together
- srcRegFilt4 =
- _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
- srcRegFilt7 =
- _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
- _mm256_castsi256_si128(firstFilters));
- srcRegFilt4 =
- _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
- srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
- _mm256_castsi256_si128(firstFilters));
- srcRegFilt7 =
- _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));
-
- // add and saturate the results together
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
- srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
- _mm256_castsi256_si128(secondFilters));
- srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
- _mm256_castsi256_si128(secondFilters));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
- _mm256_castsi256_si128(thirdFilters));
- srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
- _mm256_castsi256_si128(thirdFilters));
-
- // add and saturate the results together
- srcRegFilt1 =
- _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6));
- srcRegFilt3 =
- _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7));
-
- // add and saturate the results together
- srcRegFilt1 =
- _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6));
- srcRegFilt3 =
- _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7));
-
- srcRegFilt1 =
- _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64));
- srcRegFilt3 =
- _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64));
-
- // shift by 7 bit each 16 bit
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
- srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
+ s1[0] = _mm256_castsi128_si256(
+ _mm_unpacklo_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2));
+ s2[0] = _mm256_castsi128_si256(
+ _mm_unpackhi_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2));
+
+ outReg1 = convolve8_8_avx2(s1, f);
+ outReg2 = convolve8_8_avx2(s2, f);
+
+    // shrink to 8 bit each 16 bits, the low and high 64 bits of the result
+    // contain the first and second convolve results respectively
+ outReg1 = _mm_packus_epi16(outReg1, outReg2);
+
+ // average if necessary
+ if (avg) {
+ outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
+ }
// save 16 bytes
- _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
+ _mm_store_si128((__m128i *)output_ptr, outReg1);
}
}
+static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *filter) {
+ vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+ height, filter, 0);
+}
+
+static void vpx_filter_block1d16_v8_avg_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *filter) {
+ vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+ height, filter, 1);
+}
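+
+// Editorial sketch: the avg paths above rely on _mm_avg_epu8(), which forms
+// the rounded byte-wise average of the new convolve result and the pixels
+// already in dst. A scalar model of that update:
+static INLINE uint8_t avg_round_sketch(uint8_t pred, uint8_t dst) {
+  return (uint8_t)((pred + dst + 1) >> 1);
+}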
+
#if HAVE_AVX2 && HAVE_SSSE3
filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
#if ARCH_X86_64
@@ -539,6 +343,14 @@ filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3
#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3
#endif // ARCH_X86_64
+filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
+#define vpx_filter_block1d8_v8_avg_avx2 vpx_filter_block1d8_v8_avg_ssse3
+#define vpx_filter_block1d8_h8_avg_avx2 vpx_filter_block1d8_h8_avg_ssse3
+#define vpx_filter_block1d4_v8_avg_avx2 vpx_filter_block1d4_v8_avg_ssse3
+#define vpx_filter_block1d4_h8_avg_avx2 vpx_filter_block1d4_h8_avg_ssse3
filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
@@ -552,23 +364,53 @@ filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3
#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3
#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3
+filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
+#define vpx_filter_block1d16_v2_avg_avx2 vpx_filter_block1d16_v2_avg_ssse3
+#define vpx_filter_block1d16_h2_avg_avx2 vpx_filter_block1d16_h2_avg_ssse3
+#define vpx_filter_block1d8_v2_avg_avx2 vpx_filter_block1d8_v2_avg_ssse3
+#define vpx_filter_block1d8_h2_avg_avx2 vpx_filter_block1d8_h2_avg_ssse3
+#define vpx_filter_block1d4_v2_avg_avx2 vpx_filter_block1d4_v2_avg_ssse3
+#define vpx_filter_block1d4_h2_avg_avx2 vpx_filter_block1d4_h2_avg_ssse3
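+// Note: only the 16-wide 8-tap kernels have true AVX2 implementations in this
+// file; the narrower 8-tap variants and all bilinear (2-tap) variants are
+// remapped onto their SSSE3 counterparts by the defines above.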
// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
// void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+// void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4,
+// int y_step_q4, int w, int h);
+// void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4,
+// int y_step_q4, int w, int h);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2);
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2);
// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
+// void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
FUN_CONV_2D(, avx2);
+FUN_CONV_2D(avg_, avx2);
#endif // HAVE_AVX2 && HAVE_SSSE3
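+
+// Editorial sketch (hypothetical call site; vp9_filter_kernels and EIGHTTAP
+// are assumed from vp9/common/vp9_filter.h): with the reworked prototype,
+// callers pass a whole InterpKernel table plus the initial subpel phases
+// instead of two raw 8-tap arrays; a step of 16 (q4 units) means unscaled.
+static void example_avg_vert_call(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  int y0_q4, int w, int h) {
+  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
+  vpx_convolve8_avg_vert_avx2(src, src_stride, dst, dst_stride, kernel,
+                              0 /* x0_q4 */, 16 /* x_step_q4 */, y0_q4,
+                              16 /* y_step_q4 */, w, h);
+}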
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 09c75d455..e4f992780 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -8,52 +8,37 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <tmmintrin.h>
+#include <tmmintrin.h> // SSSE3
+
+#include <string.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_dsp/x86/convolve.h"
+#include "vpx_dsp/x86/convolve_ssse3.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
-#include "vpx_ports/emmintrin_compat.h"
-
-// filters only for the 4_h8 convolution
-DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
- 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
- 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-// filters for 8_h8 and 16_h8
-DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
- 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
- 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
- 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
-};
// These are reused by the avx2 intrinsics.
-filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
+// vpx_filter_block1d8_v8_intrin_ssse3()
+// vpx_filter_block1d8_h8_intrin_ssse3()
+// vpx_filter_block1d4_h8_intrin_ssse3()
+
+static INLINE __m128i shuffle_filter_convolve8_8_ssse3(
+ const __m128i *const s, const int16_t *const filter) {
+ __m128i f[4];
+ shuffle_filter_ssse3(filter, f);
+ return convolve8_8_ssse3(s, f);
+}
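+
+// Editorial sketch (assumption): shuffle_filter_ssse3() from convolve_ssse3.h
+// is presumed to mirror the tap setup that the removed scaled-convolve code
+// below spelled out inline; since the 8 taps fit in int8, their low bytes are
+// picked out pairwise and each pair is broadcast across the register for use
+// with maddubs:
+static INLINE void shuffle_filter_sketch(const int16_t *const filter,
+                                         __m128i *const f) {
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+}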
void vpx_filter_block1d4_h8_intrin_ssse3(
- const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
__m128i firstFilters, secondFilters, shuffle1, shuffle2;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
- __m128i addFilterReg64, filtersReg, srcReg, minReg;
+ __m128i srcRegFilt1, srcRegFilt2;
+ __m128i addFilterReg64, filtersReg, srcReg;
unsigned int i;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
@@ -75,8 +60,8 @@ void vpx_filter_block1d4_h8_intrin_ssse3(
secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
// loading the local filters
- shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
- shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
+ shuffle1 = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6);
+ shuffle2 = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10);
for (i = 0; i < output_height; i++) {
srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
@@ -89,25 +74,23 @@ void vpx_filter_block1d4_h8_intrin_ssse3(
srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
- // extract the higher half of the lane
- srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
- srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
+ // sum the results together, saturating only on the final step
+    // the specific order of the additions keeps the intermediate sums in range
+ srcRegFilt1 = _mm_add_epi16(srcRegFilt1, srcRegFilt2);
- minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
+ // extract the higher half of the register
+ srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8);
- // add and saturate all the results together
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
- srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+ // add the rounding offset early to avoid another saturated add
+ srcRegFilt1 = _mm_add_epi16(srcRegFilt1, addFilterReg64);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
// shift by 7 bit each 16 bits
srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
// shrink to 8 bit each 16 bits
srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
- src_ptr += src_pixels_per_line;
+ src_ptr += src_pitch;
// save only 4 bytes
*((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);
@@ -117,77 +100,35 @@ void vpx_filter_block1d4_h8_intrin_ssse3(
}
void vpx_filter_block1d8_h8_intrin_ssse3(
- const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
- __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
- __m128i addFilterReg64, filtersReg, minReg;
unsigned int i;
+ __m128i f[4], filt[4], s[4];
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits (first and second byte)
- // across 128 bit register
- firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
- // duplicate only the second 16 bits (third and forth byte)
- // across 128 bit register
- secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 128 bit register
- thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
- // duplicate only the forth 16 bits (seventh and eighth byte)
- // across 128 bit register
- forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
- filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
- filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
- filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
- filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+ shuffle_filter_ssse3(filter, f);
+ filt[0] = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ filt[1] = _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+ filt[2] = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12);
+ filt[3] =
+ _mm_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14);
for (i = 0; i < output_height; i++) {
- srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+ const __m128i srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
// filter the source buffer
- srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
- srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
- // filter the source buffer
- srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
- srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
- srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
-
- // add and saturate all the results together
- minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
- srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
- // shift by 7 bit each 16 bits
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+ s[0] = _mm_shuffle_epi8(srcReg, filt[0]);
+ s[1] = _mm_shuffle_epi8(srcReg, filt[1]);
+ s[2] = _mm_shuffle_epi8(srcReg, filt[2]);
+ s[3] = _mm_shuffle_epi8(srcReg, filt[3]);
+ s[0] = convolve8_8_ssse3(s, f);
// shrink to 8 bit each 16 bits
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+ s[0] = _mm_packus_epi16(s[0], s[0]);
- src_ptr += src_pixels_per_line;
+ src_ptr += src_pitch;
// save only 8 bytes
- _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
+ _mm_storel_epi64((__m128i *)&output_ptr[0], s[0]);
output_ptr += output_pitch;
}
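+
+// Editorial sketch: a scalar model of what the shuffle + maddubs pipeline in
+// vpx_filter_block1d8_h8_intrin_ssse3() computes; src points at the first tap
+// position (src_ptr - 3 above) and FILTER_BITS is 7:
+static void filter_block1d8_h8_model(const uint8_t *src,
+                                     const int16_t *filter, uint8_t *dst) {
+  int x;
+  for (x = 0; x < 8; ++x) {
+    int k;
+    int sum = 1 << 6;  // rounding offset, half of 1 << FILTER_BITS
+    for (k = 0; k < 8; ++k) sum += src[x + k] * filter[k];
+    sum >>= 7;  // FILTER_BITS
+    // clamp to the 8-bit range, as the saturating pack does
+    dst[x] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
+  }
+}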
@@ -196,83 +137,49 @@ void vpx_filter_block1d8_h8_intrin_ssse3(
void vpx_filter_block1d8_v8_intrin_ssse3(
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i addFilterReg64, filtersReg, minReg;
- __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
- __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
- __m128i srcReg8;
unsigned int i;
+ __m128i f[4], s[8], ss[4];
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits in the filter
- firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
- // duplicate only the second 16 bits in the filter
- secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
- // duplicate only the third 16 bits in the filter
- thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
- // duplicate only the forth 16 bits in the filter
- forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+ shuffle_filter_ssse3(filter, f);
// load the first 7 rows of 8 bytes
- srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
- srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
- srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
- srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
- srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
- srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
- srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+ s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch));
+ s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch));
+ s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch));
+ s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch));
+ s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
+ s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
+ s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
for (i = 0; i < output_height; i++) {
// load the last 8 bytes
- srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+ s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
    // merge the rows together
- srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
- srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
    // merge the rows together
- srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
- srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
- srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
-
- // add and saturate the results together
- minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
- srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
- // shift by 7 bit each 16 bit
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+ ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
+ ss[0] = convolve8_8_ssse3(ss, f);
// shrink to 8 bit each 16 bits
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+ ss[0] = _mm_packus_epi16(ss[0], ss[0]);
src_ptr += src_pitch;
// shift down a row
- srcReg1 = srcReg2;
- srcReg2 = srcReg3;
- srcReg3 = srcReg4;
- srcReg4 = srcReg5;
- srcReg5 = srcReg6;
- srcReg6 = srcReg7;
- srcReg7 = srcReg8;
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+ s[3] = s[4];
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
// save only 8 bytes convolve result
- _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
+ _mm_storel_epi64((__m128i *)&output_ptr[0], ss[0]);
output_ptr += out_pitch;
}
@@ -306,149 +213,69 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4,
+// int y_step_q4, int w, int h);
// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
- ssse3);
-
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3, out4, out5, out6, out7) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \
- const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \
- const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \
- \
- const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \
- const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \
- const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \
- const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \
- \
- const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \
- const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \
- const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \
- const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \
- \
- out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \
- out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \
- out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \
- out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \
- out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \
- out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \
- out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \
- out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \
- }
-
-static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
- uint8_t *dst, const int16_t *x_filter) {
- const __m128i k_256 = _mm_set1_epi16(1 << 8);
- const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
- // pack and duplicate the filter values
- const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
- const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
- const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
- const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
- const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
- const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
- const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
- const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
- const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
- const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
- const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
- const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
- // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
- const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
- // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
- const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
- // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
- const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
- // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
- const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
- // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
- const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
- const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
- const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
- const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
- const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
- const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
- // multiply 2 adjacent elements with the filter and add the result
- const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
- const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
- const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
- const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
- // add and saturate the results together
- const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
- const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
- __m128i temp = _mm_adds_epi16(x0, x3);
- temp = _mm_adds_epi16(temp, min_x2x1);
- temp = _mm_adds_epi16(temp, max_x2x1);
- // round and shift by 7 bit each 16 bit
- temp = _mm_mulhrs_epi16(temp, k_256);
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4,
+// int y_step_q4, int w, int h);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3);
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , ssse3);
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3);
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, ssse3);
+
+static void filter_horiz_w8_ssse3(const uint8_t *const src,
+ const ptrdiff_t src_stride,
+ uint8_t *const dst,
+ const int16_t *const x_filter) {
+ __m128i s[8], ss[4], temp;
+
+ load_8bit_8x8(src, src_stride, s);
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
+ transpose_16bit_4x8(s, ss);
+ temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter);
// shrink to 8 bit each 16 bits
temp = _mm_packus_epi16(temp, temp);
// save only 8 bytes convolve result
_mm_storel_epi64((__m128i *)dst, temp);
}
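+
+// (Editorial note) The scaled horizontal path works on transposed tiles:
+// transpose_16bit_4x8() lines up the taps of each output pixel as a row, so
+// the same pairwise-interleaved maddubs kernel as the vertical path can be
+// reused; the filtered tile is then transposed back on store (see
+// transpose8x8_to_dst() below).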
-static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride) {
- __m128i A, B, C, D, E, F, G, H;
-
- A = _mm_loadl_epi64((const __m128i *)src);
- B = _mm_loadl_epi64((const __m128i *)(src + src_stride));
- C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
- D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
- E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
- F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5));
- G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6));
- H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7));
-
- TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H);
-
- _mm_storel_epi64((__m128i *)dst, A);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H);
+static void transpose8x8_to_dst(const uint8_t *const src,
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride) {
+ __m128i s[8];
+
+ load_8bit_8x8(src, src_stride, s);
+ transpose_8bit_8x8(s, s);
+ store_8bit_8x8(s, dst, dst_stride);
}
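+
+// Editorial sketch (assumption): the load/store helpers from mem_sse2.h used
+// above are presumed to be thin wrappers over the per-row moves the removed
+// code issued by hand, e.g.:
+static INLINE void load_8bit_8x8_sketch(const uint8_t *src,
+                                        const ptrdiff_t stride,
+                                        __m128i *const s) {
+  int i;
+  for (i = 0; i < 8; ++i) {
+    s[i] = _mm_loadl_epi64((const __m128i *)(src + i * stride));
+  }
+}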
-static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *x_filters, int x0_q4,
- int x_step_q4, int w, int h) {
+static void scaledconvolve_horiz_w8(const uint8_t *src,
+ const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const InterpKernel *const x_filters,
+ const int x0_q4, const int x_step_q4,
+ const int w, const int h) {
DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
int x, y, z;
src -= SUBPEL_TAPS / 2 - 1;
- // This function processes 8x8 areas. The intermediate height is not always
+ // This function processes 8x8 areas. The intermediate height is not always
// a multiple of 8, so force it to be a multiple of 8 here.
y = h + (8 - (h & 0x7));
@@ -479,93 +306,50 @@ static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
} while (y -= 8);
}
-static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
- uint8_t *dst, const int16_t *filter) {
- const __m128i k_256 = _mm_set1_epi16(1 << 8);
- const __m128i f_values = _mm_load_si128((const __m128i *)filter);
- // pack and duplicate the filter values
- const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
- const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
- const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
- const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
- const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
- const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
- const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
- const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
- // TRANSPOSE...
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- //
- // TO
- //
- // 00 10 20 30
- // 01 11 21 31
- // 02 12 22 32
- // 03 13 23 33
- // 04 14 24 34
- // 05 15 25 35
- // 06 16 26 36
- // 07 17 27 37
- //
- // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
- const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
- // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
- const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
- // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
- const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
- const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+static void filter_horiz_w4_ssse3(const uint8_t *const src,
+ const ptrdiff_t src_stride,
+ uint8_t *const dst,
+ const int16_t *const filter) {
+ __m128i s[4], ss[2];
+ __m128i temp;
+
+ load_8bit_8x4(src, src_stride, s);
+ transpose_16bit_4x4(s, ss);
+ // 00 01 10 11 20 21 30 31
+ s[0] = ss[0];
// 02 03 12 13 22 23 32 33
- const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
+ s[1] = _mm_srli_si128(ss[0], 8);
+ // 04 05 14 15 24 25 34 35
+ s[2] = ss[1];
// 06 07 16 17 26 27 36 37
- const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
- // multiply 2 adjacent elements with the filter and add the result
- const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
- const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
- const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
- const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
- // add and saturate the results together
- const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
- const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
- __m128i temp = _mm_adds_epi16(x0, x3);
- temp = _mm_adds_epi16(temp, min_x2x1);
- temp = _mm_adds_epi16(temp, max_x2x1);
- // round and shift by 7 bit each 16 bit
- temp = _mm_mulhrs_epi16(temp, k_256);
+ s[3] = _mm_srli_si128(ss[1], 8);
+
+ temp = shuffle_filter_convolve8_8_ssse3(s, filter);
// shrink to 8 bit each 16 bits
temp = _mm_packus_epi16(temp, temp);
// save only 4 bytes
*(int *)dst = _mm_cvtsi128_si32(temp);
}
-static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride) {
- __m128i A = _mm_cvtsi32_si128(*(const int *)src);
- __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
- __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
- __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
- // 00 10 01 11 02 12 03 13
- const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
- // 20 30 21 31 22 32 23 33
- const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
- // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- A = _mm_unpacklo_epi16(tr0_0, tr0_1);
- B = _mm_srli_si128(A, 4);
- C = _mm_srli_si128(A, 8);
- D = _mm_srli_si128(A, 12);
-
- *(int *)(dst) = _mm_cvtsi128_si32(A);
- *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B);
- *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C);
- *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D);
+static void transpose4x4_to_dst(const uint8_t *const src,
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride) {
+ __m128i s[4];
+
+ load_8bit_4x4(src, src_stride, s);
+ s[0] = transpose_8bit_4x4(s);
+ s[1] = _mm_srli_si128(s[0], 4);
+ s[2] = _mm_srli_si128(s[0], 8);
+ s[3] = _mm_srli_si128(s[0], 12);
+ store_8bit_4x4(s, dst, dst_stride);
}
-static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *x_filters, int x0_q4,
- int x_step_q4, int w, int h) {
+static void scaledconvolve_horiz_w4(const uint8_t *src,
+ const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const InterpKernel *const x_filters,
+ const int x0_q4, const int x_step_q4,
+ const int w, const int h) {
DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
int x, y, z;
src -= SUBPEL_TAPS / 2 - 1;
@@ -597,50 +381,41 @@ static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
}
}
-static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
- uint8_t *dst, const int16_t *filter) {
- const __m128i k_256 = _mm_set1_epi16(1 << 8);
- const __m128i f_values = _mm_load_si128((const __m128i *)filter);
- // pack and duplicate the filter values
- const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
- const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
- const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
- const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
- const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr);
- const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch));
- const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2));
- const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3));
- const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4));
- const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5));
- const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6));
- const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7));
- const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
- const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
- const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
- const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
- // multiply 2 adjacent elements with the filter and add the result
- const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
- const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
- const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
- const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
- // add and saturate the results together
- const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
- const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
- __m128i temp = _mm_adds_epi16(x0, x3);
- temp = _mm_adds_epi16(temp, min_x2x1);
- temp = _mm_adds_epi16(temp, max_x2x1);
- // round and shift by 7 bit each 16 bit
- temp = _mm_mulhrs_epi16(temp, k_256);
+static __m128i filter_vert_kernel(const __m128i *const s,
+ const int16_t *const filter) {
+ __m128i ss[4];
+ __m128i temp;
+
+ // 00 10 01 11 02 12 03 13
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ // 20 30 21 31 22 32 23 33
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ // 40 50 41 51 42 52 43 53
+ ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ // 60 70 61 71 62 72 63 73
+ ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
+
+ temp = shuffle_filter_convolve8_8_ssse3(ss, filter);
// shrink to 8 bit each 16 bits
- temp = _mm_packus_epi16(temp, temp);
+ return _mm_packus_epi16(temp, temp);
+}
+
+static void filter_vert_w4_ssse3(const uint8_t *const src,
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const int16_t *const filter) {
+ __m128i s[8];
+ __m128i temp;
+
+ load_8bit_4x8(src, src_stride, s);
+ temp = filter_vert_kernel(s, filter);
// save only 4 bytes
*(int *)dst = _mm_cvtsi128_si32(temp);
}
-static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters, int y0_q4,
- int y_step_q4, int w, int h) {
+static void scaledconvolve_vert_w4(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
int y;
int y_q4 = y0_q4;
@@ -659,50 +434,21 @@ static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
}
}
-static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
- uint8_t *dst, const int16_t *filter) {
- const __m128i k_256 = _mm_set1_epi16(1 << 8);
- const __m128i f_values = _mm_load_si128((const __m128i *)filter);
- // pack and duplicate the filter values
- const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
- const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
- const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
- const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
- const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
- const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
- const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
- const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
- const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
- const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
- const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
- const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
- const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
- const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
- const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
- const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
- // multiply 2 adjacent elements with the filter and add the result
- const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
- const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
- const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
- const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
- // add and saturate the results together
- const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
- const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
- __m128i temp = _mm_adds_epi16(x0, x3);
- temp = _mm_adds_epi16(temp, min_x2x1);
- temp = _mm_adds_epi16(temp, max_x2x1);
- // round and shift by 7 bit each 16 bit
- temp = _mm_mulhrs_epi16(temp, k_256);
- // shrink to 8 bit each 16 bits
- temp = _mm_packus_epi16(temp, temp);
+static void filter_vert_w8_ssse3(const uint8_t *const src,
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const int16_t *const filter) {
+ __m128i s[8], temp;
+
+ load_8bit_8x8(src, src_stride, s);
+ temp = filter_vert_kernel(s, filter);
// save only 8 bytes convolve result
_mm_storel_epi64((__m128i *)dst, temp);
}
-static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters, int y0_q4,
- int y_step_q4, int w, int h) {
+static void scaledconvolve_vert_w8(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
int y;
int y_q4 = y0_q4;
@@ -719,81 +465,44 @@ static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
}
}
-static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
- uint8_t *dst, const int16_t *filter, int w) {
- const __m128i k_256 = _mm_set1_epi16(1 << 8);
- const __m128i f_values = _mm_load_si128((const __m128i *)filter);
- // pack and duplicate the filter values
- const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
- const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
- const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
- const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+static void filter_vert_w16_ssse3(const uint8_t *src,
+ const ptrdiff_t src_stride,
+ uint8_t *const dst,
+ const int16_t *const filter, const int w) {
int i;
+ __m128i f[4];
+ shuffle_filter_ssse3(filter, f);
for (i = 0; i < w; i += 16) {
- const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
- const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
- const __m128i C =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
- const __m128i D =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
- const __m128i E =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
- const __m128i F =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
- const __m128i G =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
- const __m128i H =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
- // merge the result together
- const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
- const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
- const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
- const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
- // multiply 2 adjacent elements with the filter and add the result
- const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
- const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
- const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
- const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
- // add and saturate the results together
- const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
- const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
- // merge the result together
- const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
- const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
- // multiply 2 adjacent elements with the filter and add the result
- const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
- const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
+ __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi;
+
+ loadu_8bit_16x8(src, src_stride, s);
+
    // merge the rows together
- const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
- const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
- // multiply 2 adjacent elements with the filter and add the result
- const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
- const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
- // add and saturate the results together
- __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
- __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));
-
- // add and saturate the results together
- temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
- temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
- // round and shift by 7 bit each 16 bit
- temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
- temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
+ s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]);
+ s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]);
+ s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]);
+ s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]);
+ s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]);
+ temp_lo = convolve8_8_ssse3(s_lo, f);
+ temp_hi = convolve8_8_ssse3(s_hi, f);
+
+ // shrink each 16-bit value to 8 bits; the first lane contains the first
+ // convolve result and the second lane contains the second convolve result
temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
- src_ptr += 16;
+ src += 16;
 // save the 16-byte convolve result
_mm_store_si128((__m128i *)&dst[i], temp_hi);
}
}
-static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters, int y0_q4,
- int y_step_q4, int w, int h) {
+static void scaledconvolve_vert_w16(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
int y;
int y_q4 = y0_q4;
@@ -811,11 +520,10 @@ static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
}
}
-static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *const x_filters, int x0_q4,
- int x_step_q4, const InterpKernel *const y_filters,
- int y0_q4, int y_step_q4, int w, int h) {
+void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
// Note: Fixed size intermediate buffer, temp, places limits on parameters.
// 2d filtering proceeds in 2 steps:
// (1) Interpolate horizontally into an intermediate buffer, temp.
@@ -829,60 +537,49 @@ static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
// --Require an additional 8 rows for the horiz_w8 transpose tail.
+ // When called from the frame scaling function, the smallest scaling factor
+ // is 1/4 ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer
+ // is still big enough.
DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
const int intermediate_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
assert(w <= 64);
assert(h <= 64);
- assert(y_step_q4 <= 32);
- assert(x_step_q4 <= 32);
+ assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+ assert(x_step_q4 <= 64);
if (w >= 8) {
scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
- w, intermediate_height);
+ src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+ intermediate_height);
} else {
scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
- w, intermediate_height);
+ src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+ intermediate_height);
}
if (w >= 16) {
scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
} else if (w == 8) {
scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
} else {
scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
}
}
-void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x,
- int x_step_q4, const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4,
- x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
-}
-
// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
FUN_CONV_2D(, ssse3);
FUN_CONV_2D(avg_, ssse3);
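The scaled 2-D convolution above runs in two passes: a horizontal filter writes into a fixed 64-byte-stride temp buffer (with an 8-tap margin above and below), then a vertical filter reads temp into the destination. A minimal C sketch of that structure, with hypothetical horiz_pass/vert_pass helpers standing in for the SSSE3 kernels:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    #define SUBPEL_BITS 4 /* q4 subpel precision, as in libvpx */
    #define SUBPEL_TAPS 8 /* 8-tap interpolation filters */

    /* hypothetical stand-ins for the SSSE3 horizontal/vertical kernels */
    void horiz_pass(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                    ptrdiff_t dst_stride, int x0_q4, int x_step_q4, int w, int h);
    void vert_pass(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                   ptrdiff_t dst_stride, int y0_q4, int y_step_q4, int w, int h);

    void scaled_2d_sketch(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                          ptrdiff_t dst_stride, int x0_q4, int x_step_q4,
                          int y0_q4, int y_step_q4, int w, int h) {
      uint8_t temp[(135 + 8) * 64]; /* worst case from the comment above */
      /* last vertically sampled row plus the 8-tap filter tail */
      const int intermediate_height =
          (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
      assert(w <= 64 && h <= 64);
      /* (1) horizontal pass into temp, starting SUBPEL_TAPS/2-1 rows early */
      horiz_pass(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
                 x0_q4, x_step_q4, w, intermediate_height);
      /* (2) vertical pass from temp into dst */
      vert_pass(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, y0_q4,
                y_step_q4, w, h);
    }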
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
index 08f3d6a6c..8497e1721 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
@@ -176,6 +176,8 @@
movq [rdi + %2], xmm0
%endm
+SECTION .text
+
;void vpx_filter_block1d4_v8_sse2
;(
; unsigned char *src_ptr,
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
index c1a6f23ab..952d9307d 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
@@ -327,12 +327,12 @@ cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
%endm
INIT_XMM ssse3
-SUBPIX_HFILTER16 h8
-SUBPIX_HFILTER16 h8_avg
-SUBPIX_HFILTER8 h8
-SUBPIX_HFILTER8 h8_avg
-SUBPIX_HFILTER4 h8
-SUBPIX_HFILTER4 h8_avg
+SUBPIX_HFILTER16 h8 ; vpx_filter_block1d16_h8_ssse3
+SUBPIX_HFILTER16 h8_avg ; vpx_filter_block1d16_h8_avg_ssse3
+SUBPIX_HFILTER8 h8 ; vpx_filter_block1d8_h8_ssse3
+SUBPIX_HFILTER8 h8_avg ; vpx_filter_block1d8_h8_avg_ssse3
+SUBPIX_HFILTER4 h8 ; vpx_filter_block1d4_h8_ssse3
+SUBPIX_HFILTER4 h8_avg ; vpx_filter_block1d4_h8_avg_ssse3
;-------------------------------------------------------------------------------
@@ -795,9 +795,9 @@ cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
%endm
INIT_XMM ssse3
-SUBPIX_VFILTER16 v8
-SUBPIX_VFILTER16 v8_avg
-SUBPIX_VFILTER v8, 8
-SUBPIX_VFILTER v8_avg, 8
-SUBPIX_VFILTER v8, 4
-SUBPIX_VFILTER v8_avg, 4
+SUBPIX_VFILTER16 v8 ; vpx_filter_block1d16_v8_ssse3
+SUBPIX_VFILTER16 v8_avg ; vpx_filter_block1d16_v8_avg_ssse3
+SUBPIX_VFILTER v8, 8 ; vpx_filter_block1d8_v8_ssse3
+SUBPIX_VFILTER v8_avg, 8 ; vpx_filter_block1d8_v8_avg_ssse3
+SUBPIX_VFILTER v8, 4 ; vpx_filter_block1d4_v8_ssse3
+SUBPIX_VFILTER v8_avg, 4 ; vpx_filter_block1d4_v8_avg_ssse3
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm b/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
index a378dd040..6d79492e4 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
@@ -131,6 +131,8 @@
dec rcx
%endm
+SECTION .text
+
global sym(vpx_filter_block1d4_v2_sse2) PRIVATE
sym(vpx_filter_block1d4_v2_sse2):
push rbp
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm b/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
index 538b2129d..8c9c817be 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
@@ -105,6 +105,8 @@
dec rcx
%endm
+SECTION .text
+
global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE
sym(vpx_filter_block1d4_v2_ssse3):
push rbp
diff --git a/libvpx/vpx_mem/vpx_mem.c b/libvpx/vpx_mem/vpx_mem.c
index a9be08680..eeba34c37 100644
--- a/libvpx/vpx_mem/vpx_mem.c
+++ b/libvpx/vpx_mem/vpx_mem.c
@@ -82,12 +82,3 @@ void vpx_free(void *memblk) {
free(addr);
}
}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void *vpx_memset16(void *dest, int val, size_t length) {
- size_t i;
- uint16_t *dest16 = (uint16_t *)dest;
- for (i = 0; i < length; i++) *dest16++ = val;
- return dest;
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libvpx/vpx_mem/vpx_mem.h b/libvpx/vpx_mem/vpx_mem.h
index 733aff488..a4274b885 100644
--- a/libvpx/vpx_mem/vpx_mem.h
+++ b/libvpx/vpx_mem/vpx_mem.h
@@ -19,6 +19,8 @@
#include <stdlib.h>
#include <stddef.h>
+#include "vpx/vpx_integer.h"
+
#if defined(__cplusplus)
extern "C" {
#endif
@@ -29,7 +31,12 @@ void *vpx_calloc(size_t num, size_t size);
void vpx_free(void *memblk);
#if CONFIG_VP9_HIGHBITDEPTH
-void *vpx_memset16(void *dest, int val, size_t length);
+static INLINE void *vpx_memset16(void *dest, int val, size_t length) {
+ size_t i;
+ uint16_t *dest16 = (uint16_t *)dest;
+ for (i = 0; i < length; i++) *dest16++ = val;
+ return dest;
+}
#endif
#include <string.h>
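vpx_memset16 now lives in the header as a static INLINE function so callers can inline the fill loop. A small standalone usage sketch (note the length is in 16-bit elements, not bytes; names and values are illustrative):

    #include <stddef.h>
    #include <stdint.h>

    /* same loop as the header version above */
    static inline void *memset16_sketch(void *dest, int val, size_t length) {
      size_t i;
      uint16_t *dest16 = (uint16_t *)dest;
      for (i = 0; i < length; i++) *dest16++ = val;
      return dest;
    }

    int main(void) {
      uint16_t plane[64];
      /* fill a high-bitdepth plane with mid-gray; 64 elements, not 128 bytes */
      memset16_sketch(plane, 512, 64);
      return plane[63] == 512 ? 0 : 1;
    }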
diff --git a/libvpx/vpx_ports/asmdefs_mmi.h b/libvpx/vpx_ports/asmdefs_mmi.h
new file mode 100644
index 000000000..a9a49745a
--- /dev/null
+++ b/libvpx/vpx_ports/asmdefs_mmi.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_PORTS_ASMDEFS_MMI_H_
+#define VPX_PORTS_ASMDEFS_MMI_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#if HAVE_MMI
+
+#if HAVE_MIPS64
+#define mips_reg int64_t
+#define MMI_ADDU(reg1, reg2, reg3) \
+ "daddu " #reg1 ", " #reg2 ", " #reg3 " \n\t"
+
+#define MMI_ADDIU(reg1, reg2, immediate) \
+ "daddiu " #reg1 ", " #reg2 ", " #immediate " \n\t"
+
+#define MMI_ADDI(reg1, reg2, immediate) \
+ "daddi " #reg1 ", " #reg2 ", " #immediate " \n\t"
+
+#define MMI_SUBU(reg1, reg2, reg3) \
+ "dsubu " #reg1 ", " #reg2 ", " #reg3 " \n\t"
+
+#define MMI_L(reg, addr, bias) \
+ "ld " #reg ", " #bias "(" #addr ") \n\t"
+
+#define MMI_SRL(reg1, reg2, shift) \
+ "dsrl " #reg1 ", " #reg2 ", " #shift " \n\t"
+
+#define MMI_SLL(reg1, reg2, shift) \
+ "dsll " #reg1 ", " #reg2 ", " #shift " \n\t"
+
+#define MMI_MTC1(reg, fp) \
+ "dmtc1 " #reg ", " #fp " \n\t"
+
+#define MMI_LI(reg, immediate) \
+ "dli " #reg ", " #immediate " \n\t"
+
+#else
+#define mips_reg int32_t
+#define MMI_ADDU(reg1, reg2, reg3) \
+ "addu " #reg1 ", " #reg2 ", " #reg3 " \n\t"
+
+#define MMI_ADDIU(reg1, reg2, immediate) \
+ "addiu " #reg1 ", " #reg2 ", " #immediate " \n\t"
+
+#define MMI_ADDI(reg1, reg2, immediate) \
+ "addi " #reg1 ", " #reg2 ", " #immediate " \n\t"
+
+#define MMI_SUBU(reg1, reg2, reg3) \
+ "subu " #reg1 ", " #reg2 ", " #reg3 " \n\t"
+
+#define MMI_L(reg, addr, bias) \
+ "lw " #reg ", " #bias "(" #addr ") \n\t"
+
+#define MMI_SRL(reg1, reg2, shift) \
+ "srl " #reg1 ", " #reg2 ", " #shift " \n\t"
+
+#define MMI_SLL(reg1, reg2, shift) \
+ "sll " #reg1 ", " #reg2 ", " #shift " \n\t"
+
+#define MMI_MTC1(reg, fp) \
+ "mtc1 " #reg ", " #fp " \n\t"
+
+#define MMI_LI(reg, immediate) \
+ "li " #reg ", " #immediate " \n\t"
+
+#endif /* HAVE_MIPS64 */
+
+#endif /* HAVE_MMI */
+
+#endif /* VPX_PORTS_ASMDEFS_MMI_H_ */
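The MMI_* macros stringize their operands and paste them into a GCC inline-asm template, so one C source selects 64-bit (daddu/ld/dsll) or 32-bit (addu/lw/sll) opcodes at preprocessing time. A hedged sketch of how a macro composes inside an asm block, using a hypothetical pointer-advance helper:

    #include <stdint.h>
    #include "vpx_ports/asmdefs_mmi.h"

    #if HAVE_MMI
    /* hypothetical helper: MMI_ADDU(%[row], %[row], %[stride]) expands to the
     * template "daddu %[row], %[row], %[stride] \n\t" on mips64 ("addu" on
     * mips32), advancing the row pointer by a stride held in a mips_reg */
    static INLINE const uint8_t *advance_row(const uint8_t *row, mips_reg stride) {
      __asm__ volatile(
          MMI_ADDU(%[row], %[row], %[stride])
          : [row] "+r"(row)
          : [stride] "r"(stride));
      return row;
    }
    #endif  /* HAVE_MMI */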
diff --git a/libvpx/vpx_ports/vpx_ports.mk b/libvpx/vpx_ports/vpx_ports.mk
index fc0a783b7..e17145e6c 100644
--- a/libvpx/vpx_ports/vpx_ports.mk
+++ b/libvpx/vpx_ports/vpx_ports.mk
@@ -28,3 +28,7 @@ PORTS_SRCS-$(ARCH_ARM) += arm.h
PORTS_SRCS-$(ARCH_PPC) += ppc_cpudetect.c
PORTS_SRCS-$(ARCH_PPC) += ppc.h
+
+ifeq ($(ARCH_MIPS), yes)
+PORTS_SRCS-yes += asmdefs_mmi.h
+endif
diff --git a/libvpx/vpx_ports/x86.h b/libvpx/vpx_ports/x86.h
index 5aabb9e3a..ced65ac05 100644
--- a/libvpx/vpx_ports/x86.h
+++ b/libvpx/vpx_ports/x86.h
@@ -151,16 +151,17 @@ static INLINE uint64_t xgetbv(void) {
#endif
#endif
-#define HAS_MMX 0x01
-#define HAS_SSE 0x02
-#define HAS_SSE2 0x04
-#define HAS_SSE3 0x08
-#define HAS_SSSE3 0x10
-#define HAS_SSE4_1 0x20
-#define HAS_AVX 0x40
-#define HAS_AVX2 0x80
+#define HAS_MMX 0x001
+#define HAS_SSE 0x002
+#define HAS_SSE2 0x004
+#define HAS_SSE3 0x008
+#define HAS_SSSE3 0x010
+#define HAS_SSE4_1 0x020
+#define HAS_AVX 0x040
+#define HAS_AVX2 0x080
+#define HAS_AVX512 0x100
#ifndef BIT
-#define BIT(n) (1 << n)
+#define BIT(n) (1u << n)
#endif
static INLINE int x86_simd_caps(void) {
@@ -209,6 +210,12 @@ static INLINE int x86_simd_caps(void) {
cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
+
+ // bits 16 (AVX-512F) & 17 (AVX-512DQ) & 28 (AVX-512CD) &
+ // 30 (AVX-512BW) & 31 (AVX-512VL)
+ if ((reg_ebx & (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))) ==
+ (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31)))
+ flags |= HAS_AVX512;
}
}
}
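HAS_AVX512 reports only when the F, DQ, CD, BW and VL feature bits are all present, so a single flag gates kernels that assume the full Skylake-style subset. A minimal caller sketch, assuming the libvpx build environment:

    #include <stdio.h>
    #include "vpx_ports/x86.h"

    int main(void) {
      const int caps = x86_simd_caps();
      if (caps & HAS_AVX512)
        printf("AVX-512 (F/DQ/CD/BW/VL) available\n");
      else if (caps & HAS_AVX2)
        printf("falling back to AVX2\n");
      return 0;
    }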
diff --git a/libvpx/vpx_scale/generic/yv12config.c b/libvpx/vpx_scale/generic/yv12config.c
index a674eac84..9c7ca42c7 100644
--- a/libvpx/vpx_scale/generic/yv12config.c
+++ b/libvpx/vpx_scale/generic/yv12config.c
@@ -9,6 +9,7 @@
*/
#include <assert.h>
+#include <limits.h>
#include "vpx_scale/yv12config.h"
#include "vpx_mem/vpx_mem.h"
@@ -165,6 +166,12 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
uint8_t *buf = NULL;
+ // frame_size is stored in buffer_alloc_sz, which is an int. If it won't
+ // fit, fail early.
+ if (frame_size > INT_MAX) {
+ return -1;
+ }
+
if (cb != NULL) {
const int align_addr_extra_size = 31;
const uint64_t external_frame_size = frame_size + align_addr_extra_size;
@@ -193,8 +200,6 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
vpx_free(ybf->buffer_alloc);
ybf->buffer_alloc = NULL;
- if (frame_size != (size_t)frame_size) return -1;
-
ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, (size_t)frame_size);
if (!ybf->buffer_alloc) return -1;
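The early return added above guards the later narrowing of frame_size (a uint64_t) into buffer_alloc_sz (an int). The pattern in isolation (helper name hypothetical):

    #include <limits.h>
    #include <stdint.h>

    /* hypothetical helper: reject sizes that cannot be stored in an int */
    static int frame_size_fits_int(uint64_t frame_size) {
      return frame_size <= INT_MAX;
    }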
diff --git a/libvpx/vpx_scale/generic/yv12extend.c b/libvpx/vpx_scale/generic/yv12extend.c
index a6aaff95a..e23180650 100644
--- a/libvpx/vpx_scale/generic/yv12extend.c
+++ b/libvpx/vpx_scale/generic/yv12extend.c
@@ -111,25 +111,6 @@ void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
assert(ybf->y_height - ybf->y_crop_height >= 0);
assert(ybf->y_width - ybf->y_crop_width >= 0);
-#if CONFIG_VP9_HIGHBITDEPTH
- if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
- extend_plane_high(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
- ybf->y_crop_height, ybf->border, ybf->border,
- ybf->border + ybf->y_height - ybf->y_crop_height,
- ybf->border + ybf->y_width - ybf->y_crop_width);
-
- extend_plane_high(ybf->u_buffer, ybf->uv_stride, ybf->uv_crop_width,
- ybf->uv_crop_height, uv_border, uv_border,
- uv_border + ybf->uv_height - ybf->uv_crop_height,
- uv_border + ybf->uv_width - ybf->uv_crop_width);
-
- extend_plane_high(ybf->v_buffer, ybf->uv_stride, ybf->uv_crop_width,
- ybf->uv_crop_height, uv_border, uv_border,
- uv_border + ybf->uv_height - ybf->uv_crop_height,
- uv_border + ybf->uv_width - ybf->uv_crop_width);
- return;
- }
-#endif
extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
ybf->y_crop_height, ybf->border, ybf->border,
ybf->border + ybf->y_height - ybf->y_crop_height,
@@ -208,6 +189,7 @@ static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
// Copies the source image into the destination image and updates the
// destination's UMV borders.
// Note: The frames are assumed to be identical in size.
+
void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
YV12_BUFFER_CONFIG *dst_ybc) {
int row;
@@ -222,6 +204,48 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
assert(src_ybc->y_height == dst_ybc->y_height);
#endif
+ for (row = 0; row < src_ybc->y_height; ++row) {
+ memcpy(dst, src, src_ybc->y_width);
+ src += src_ybc->y_stride;
+ dst += dst_ybc->y_stride;
+ }
+
+ src = src_ybc->u_buffer;
+ dst = dst_ybc->u_buffer;
+
+ for (row = 0; row < src_ybc->uv_height; ++row) {
+ memcpy(dst, src, src_ybc->uv_width);
+ src += src_ybc->uv_stride;
+ dst += dst_ybc->uv_stride;
+ }
+
+ src = src_ybc->v_buffer;
+ dst = dst_ybc->v_buffer;
+
+ for (row = 0; row < src_ybc->uv_height; ++row) {
+ memcpy(dst, src, src_ybc->uv_width);
+ src += src_ybc->uv_stride;
+ dst += dst_ybc->uv_stride;
+ }
+
+ vp8_yv12_extend_frame_borders_c(dst_ybc);
+}
+
+#if CONFIG_VP9
+void vpx_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc) {
+ int row;
+ const uint8_t *src = src_ybc->y_buffer;
+ uint8_t *dst = dst_ybc->y_buffer;
+
+#if 0
+ /* These assertions are valid in the codec, but the libvpx-tester uses
+ * this code slightly differently.
+ */
+ assert(src_ybc->y_width == dst_ybc->y_width);
+ assert(src_ybc->y_height == dst_ybc->y_height);
+#endif
+
#if CONFIG_VP9_HIGHBITDEPTH
if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
assert(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH);
@@ -249,7 +273,7 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
dst += dst_ybc->uv_stride;
}
- vp8_yv12_extend_frame_borders_c(dst_ybc);
+ vpx_extend_frame_borders_c(dst_ybc);
return;
} else {
assert(!(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH));
@@ -280,8 +304,9 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
dst += dst_ybc->uv_stride;
}
- vp8_yv12_extend_frame_borders_c(dst_ybc);
+ vpx_extend_frame_borders_c(dst_ybc);
}
+#endif // CONFIG_VP9
void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc,
YV12_BUFFER_CONFIG *dst_ybc) {
diff --git a/libvpx/vpx_scale/vpx_scale_rtcd.pl b/libvpx/vpx_scale/vpx_scale_rtcd.pl
index 44b115c7e..1281071a7 100644
--- a/libvpx/vpx_scale/vpx_scale_rtcd.pl
+++ b/libvpx/vpx_scale/vpx_scale_rtcd.pl
@@ -1,3 +1,13 @@
+##
+## Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
sub vpx_scale_forward_decls() {
print <<EOF
struct yv12_buffer_config;
@@ -23,6 +33,8 @@ add_proto qw/void vp8_yv12_copy_frame/, "const struct yv12_buffer_config *src_yb
add_proto qw/void vpx_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
if (vpx_config("CONFIG_VP9") eq "yes") {
+ add_proto qw/void vpx_yv12_copy_frame/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
+
add_proto qw/void vpx_extend_frame_borders/, "struct yv12_buffer_config *ybf";
specialize qw/vpx_extend_frame_borders dspr2/;
diff --git a/libvpx/vpx_util/vpx_atomics.h b/libvpx/vpx_util/vpx_atomics.h
new file mode 100644
index 000000000..b8cf80dae
--- /dev/null
+++ b/libvpx/vpx_util/vpx_atomics.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_UTIL_VPX_ATOMICS_H_
+#define VPX_UTIL_VPX_ATOMICS_H_
+
+#include "./vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
+
+// Look for built-in atomic support. We cannot use <stdatomic.h> or <atomic>
+// since neither is guaranteed to exist on both C and C++ platforms, and we need
+// to back the atomic type with the same type (g++ needs to be able to use
+// gcc-built code). g++ 6 doesn't support _Atomic as a keyword and can't use the
+// stdatomic.h header. Even if both <stdatomic.h> and <atomic> existed it's not
+// guaranteed that atomic_int is the same type as std::atomic_int.
+// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=60932#c13.
+#if !defined(__has_builtin)
+#define __has_builtin(x) 0 // Compatibility with non-clang compilers.
+#endif // !defined(__has_builtin)
+
+#if (__has_builtin(__atomic_load_n)) || \
+ (defined(__GNUC__) && \
+ (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)))
+// For GCC >= 4.7 and Clang versions that support __atomic builtins, use those.
+#define VPX_USE_ATOMIC_BUILTINS
+#else
+// Use platform-specific asm barriers.
+#if defined(_MSC_VER)
+// TODO(pbos): This assumes that newer versions of MSVC are building with the
+// default /volatile:ms (or older, where this is always true). Consider adding
+// support for using <atomic> instead of stdatomic.h when building C++11 under
+// MSVC. It's unclear what to do for plain C under /volatile:iso (inline asm?);
+// there are no explicit Interlocked* functions for only storing or loading
+// (presumably because volatile has historically implied that on MSVC).
+//
+// For earlier versions of MSVC, or with the default /volatile:ms, volatile int
+// accesses are acquire/release and require no barrier.
+#define vpx_atomic_memory_barrier() \
+ do { \
+ } while (0)
+#else
+#if ARCH_X86 || ARCH_X86_64
+// Use a compiler barrier on x86, no runtime penalty.
+#define vpx_atomic_memory_barrier() __asm__ __volatile__("" ::: "memory")
+#elif ARCH_ARM
+#define vpx_atomic_memory_barrier() __asm__ __volatile__("dmb ish" ::: "memory")
+#elif ARCH_MIPS
+#define vpx_atomic_memory_barrier() __asm__ __volatile__("sync" ::: "memory")
+#else
+#error Unsupported architecture!
+#endif // ARCH_X86 || ARCH_X86_64
+#endif // defined(_MSC_VER)
+#endif // atomic builtin availability check
+
+// These are wrapped in a struct so that they are not easily accessed directly
+// on any platform (to discourage programmer errors by setting values directly).
+// This primitive MUST be initialized using vpx_atomic_init or VPX_ATOMIC_INIT
+// (NOT memset) and accessed through vpx_atomic_ functions.
+typedef struct vpx_atomic_int { volatile int value; } vpx_atomic_int;
+
+#define VPX_ATOMIC_INIT(num) \
+ { num }
+
+// Initialization of an atomic int, not thread safe.
+static INLINE void vpx_atomic_init(vpx_atomic_int *atomic, int value) {
+ atomic->value = value;
+}
+
+static INLINE void vpx_atomic_store_release(vpx_atomic_int *atomic, int value) {
+#if defined(VPX_USE_ATOMIC_BUILTINS)
+ __atomic_store_n(&atomic->value, value, __ATOMIC_RELEASE);
+#else
+ vpx_atomic_memory_barrier();
+ atomic->value = value;
+#endif // defined(VPX_USE_ATOMIC_BUILTINS)
+}
+
+static INLINE int vpx_atomic_load_acquire(const vpx_atomic_int *atomic) {
+#if defined(VPX_USE_ATOMIC_BUILTINS)
+ return __atomic_load_n(&atomic->value, __ATOMIC_ACQUIRE);
+#else
+ int v = atomic->value;
+ vpx_atomic_memory_barrier();
+ return v;
+#endif // defined(VPX_USE_ATOMIC_BUILTINS)
+}
+
+#undef VPX_USE_ATOMIC_BUILTINS
+#undef vpx_atomic_memory_barrier
+
+#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // VPX_UTIL_VPX_ATOMICS_H_
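The intended pairing is a release-store on the writer side and an acquire-load on the reader side: writes made before the store become visible after a load that observes the flag. A two-function sketch, assuming the libvpx build environment (thread creation elided):

    #include "vpx_util/vpx_atomics.h"

    static int payload;                               /* plain data to hand off */
    static vpx_atomic_int ready = VPX_ATOMIC_INIT(0); /* publication flag */

    void producer(void) {
      payload = 42;                        /* 1: write the data */
      vpx_atomic_store_release(&ready, 1); /* 2: publish with release order */
    }

    int consumer(void) {
      if (vpx_atomic_load_acquire(&ready)) /* 3: observe flag with acquire order */
        return payload;                    /* 4: the data write is now visible */
      return -1;                           /* not published yet */
    }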
diff --git a/libvpx/vpx_util/vpx_util.mk b/libvpx/vpx_util/vpx_util.mk
index c0ef8d336..86d3ece3c 100644
--- a/libvpx/vpx_util/vpx_util.mk
+++ b/libvpx/vpx_util/vpx_util.mk
@@ -8,7 +8,10 @@
## be found in the AUTHORS file in the root of the source tree.
##
+UTIL_SRCS-yes += vpx_atomics.h
UTIL_SRCS-yes += vpx_util.mk
UTIL_SRCS-yes += vpx_thread.c
UTIL_SRCS-yes += vpx_thread.h
UTIL_SRCS-yes += endian_inl.h
+UTIL_SRCS-yes += vpx_write_yuv_frame.h
+UTIL_SRCS-yes += vpx_write_yuv_frame.c
diff --git a/libvpx/vpx_util/vpx_write_yuv_frame.c b/libvpx/vpx_util/vpx_write_yuv_frame.c
new file mode 100644
index 000000000..ab6855811
--- /dev/null
+++ b/libvpx/vpx_util/vpx_write_yuv_frame.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/skin_detection.h"
+#include "vpx_util/vpx_write_yuv_frame.h"
+
+void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) {
+#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED) || \
+ defined(OUTPUT_YUV_SKINMAP)
+
+ unsigned char *src = s->y_buffer;
+ int h = s->y_crop_height;
+
+ do {
+ fwrite(src, s->y_width, 1, yuv_file);
+ src += s->y_stride;
+ } while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_crop_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_crop_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_file);
+ src += s->uv_stride;
+ } while (--h);
+
+#else
+ (void)yuv_file;
+ (void)s;
+#endif
+}
diff --git a/libvpx/vpx_util/vpx_write_yuv_frame.h b/libvpx/vpx_util/vpx_write_yuv_frame.h
new file mode 100644
index 000000000..1cb702981
--- /dev/null
+++ b/libvpx/vpx_util/vpx_write_yuv_frame.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_UTIL_VPX_WRITE_YUV_FRAME_H_
+#define VPX_UTIL_VPX_WRITE_YUV_FRAME_H_
+
+#include <stdio.h>
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_UTIL_VPX_WRITE_YUV_FRAME_H_
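vpx_write_yuv_frame writes raw planar YUV only when one of the OUTPUT_YUV_* debug macros is defined at build time; otherwise it is a no-op. A minimal caller sketch (file name illustrative, frame assumed allocated):

    #include <stdio.h>
    #include "vpx_scale/yv12config.h"
    #include "vpx_util/vpx_write_yuv_frame.h"

    /* build with e.g. -DOUTPUT_YUV_SRC, or the call writes nothing */
    void dump_frame(YV12_BUFFER_CONFIG *frame) {
      FILE *f = fopen("debug.yuv", "ab"); /* append successive frames */
      if (f == NULL) return;
      vpx_write_yuv_frame(f, frame);
      fclose(f);
    }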
diff --git a/libvpx/vpxdec.c b/libvpx/vpxdec.c
index 6db2afb4a..ff20e6a3c 100644
--- a/libvpx/vpxdec.c
+++ b/libvpx/vpxdec.c
@@ -47,6 +47,8 @@ struct VpxDecInputContext {
struct WebmInputContext *webm_ctx;
};
+static const arg_def_t help =
+ ARG_DEF(NULL, "help", 0, "Show usage options and exit");
static const arg_def_t looparg =
ARG_DEF(NULL, "loops", 1, "Number of times to decode the file");
static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use");
@@ -98,17 +100,17 @@ static const arg_def_t framestatsarg =
ARG_DEF(NULL, "framestats", 1, "Output per-frame stats (.csv format)");
static const arg_def_t *all_args[] = {
- &codecarg, &use_yv12, &use_i420,
- &flipuvarg, &rawvideo, &noblitarg,
- &progressarg, &limitarg, &skiparg,
- &postprocarg, &summaryarg, &outputfile,
- &threadsarg, &frameparallelarg, &verbosearg,
- &scalearg, &fb_arg, &md5arg,
- &error_concealment, &continuearg,
+ &help, &codecarg, &use_yv12,
+ &use_i420, &flipuvarg, &rawvideo,
+ &noblitarg, &progressarg, &limitarg,
+ &skiparg, &postprocarg, &summaryarg,
+ &outputfile, &threadsarg, &frameparallelarg,
+ &verbosearg, &scalearg, &fb_arg,
+ &md5arg, &error_concealment, &continuearg,
#if CONFIG_VP9_HIGHBITDEPTH
&outbitdeptharg,
#endif
- &svcdecodingarg, &framestatsarg, NULL
+ &svcdecodingarg, &framestatsarg, NULL
};
#if CONFIG_VP8_DECODER
@@ -152,41 +154,47 @@ static INLINE int libyuv_scale(vpx_image_t *src, vpx_image_t *dst,
dst->d_h, mode);
}
#endif
-
-void usage_exit(void) {
+void show_help(FILE *fout, int shorthelp) {
int i;
- fprintf(stderr,
- "Usage: %s <options> filename\n\n"
- "Options:\n",
- exec_name);
- arg_show_usage(stderr, all_args);
+ fprintf(fout, "Usage: %s <options> filename\n\n", exec_name);
+
+ if (shorthelp) {
+ fprintf(fout, "Use --help to see the full list of options.\n");
+ return;
+ }
+
+ fprintf(fout, "Options:\n");
+ arg_show_usage(fout, all_args);
#if CONFIG_VP8_DECODER
- fprintf(stderr, "\nVP8 Postprocessing Options:\n");
- arg_show_usage(stderr, vp8_pp_args);
+ fprintf(fout, "\nVP8 Postprocessing Options:\n");
+ arg_show_usage(fout, vp8_pp_args);
#endif
- fprintf(stderr,
+ fprintf(fout,
"\nOutput File Patterns:\n\n"
" The -o argument specifies the name of the file(s) to "
"write to. If the\n argument does not include any escape "
"characters, the output will be\n written to a single file. "
"Otherwise, the filename will be calculated by\n expanding "
"the following escape characters:\n");
- fprintf(stderr,
+ fprintf(fout,
"\n\t%%w - Frame width"
"\n\t%%h - Frame height"
"\n\t%%<n> - Frame number, zero padded to <n> places (1..9)"
"\n\n Pattern arguments are only supported in conjunction "
"with the --yv12 and\n --i420 options. If the -o option is "
"not specified, the output will be\n directed to stdout.\n");
- fprintf(stderr, "\nIncluded decoders:\n\n");
+ fprintf(fout, "\nIncluded decoders:\n\n");
for (i = 0; i < get_vpx_decoder_count(); ++i) {
const VpxInterface *const decoder = get_vpx_decoder_by_index(i);
- fprintf(stderr, " %-6s - %s\n", decoder->name,
+ fprintf(fout, " %-6s - %s\n", decoder->name,
vpx_codec_iface_name(decoder->codec_interface()));
}
+}
+void usage_exit(void) {
+ show_help(stderr, 1);
exit(EXIT_FAILURE);
}
@@ -554,7 +562,10 @@ static int main_loop(int argc, const char **argv_) {
memset(&arg, 0, sizeof(arg));
arg.argv_step = 1;
- if (arg_match(&arg, &codecarg, argi)) {
+ if (arg_match(&arg, &help, argi)) {
+ show_help(stdout, 0);
+ exit(EXIT_SUCCESS);
+ } else if (arg_match(&arg, &codecarg, argi)) {
interface = get_vpx_decoder_by_name(arg.val);
if (!interface)
die("Error: Unrecognized argument (%s) to --codec\n", arg.val);
@@ -651,6 +662,7 @@ static int main_loop(int argc, const char **argv_) {
if (!fn) {
free(argv);
+ fprintf(stderr, "No input file specified!\n");
usage_exit();
}
/* Open file */
diff --git a/libvpx/vpxenc.c b/libvpx/vpxenc.c
index 6c887dfeb..4db7eccc3 100644
--- a/libvpx/vpxenc.c
+++ b/libvpx/vpxenc.c
@@ -123,6 +123,8 @@ static int fourcc_is_ivf(const char detect[4]) {
return 0;
}
+static const arg_def_t help =
+ ARG_DEF(NULL, "help", 0, "Show usage options and exit");
static const arg_def_t debugmode =
ARG_DEF("D", "debug", 0, "Debug mode (makes output deterministic)");
static const arg_def_t outputfile =
@@ -199,7 +201,8 @@ static const arg_def_t test16bitinternalarg = ARG_DEF(
NULL, "test-16bit-internal", 0, "Force use of 16 bit internal buffer");
#endif
-static const arg_def_t *main_args[] = { &debugmode,
+static const arg_def_t *main_args[] = { &help,
+ &debugmode,
&outputfile,
&codecarg,
&passes,
@@ -321,8 +324,11 @@ static const arg_def_t minsection_pct =
ARG_DEF(NULL, "minsection-pct", 1, "GOP min bitrate (% of target)");
static const arg_def_t maxsection_pct =
ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)");
-static const arg_def_t *rc_twopass_args[] = { &bias_pct, &minsection_pct,
- &maxsection_pct, NULL };
+static const arg_def_t corpus_complexity =
+ ARG_DEF(NULL, "corpus-complexity", 1, "Corpus VBR complexity midpoint");
+static const arg_def_t *rc_twopass_args[] = {
+ &bias_pct, &minsection_pct, &maxsection_pct, &corpus_complexity, NULL
+};
static const arg_def_t kf_min_dist =
ARG_DEF(NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)");
@@ -441,8 +447,8 @@ static const struct arg_enum_list color_space_enum[] = {
};
static const arg_def_t input_color_space =
- ARG_DEF_ENUM(NULL, "color-space", 1, "The color space of input content:",
- color_space_enum);
+ ARG_DEF_ENUM(NULL, "color-space", 1,
+ "The color space of input content:", color_space_enum);
#if CONFIG_VP9_HIGHBITDEPTH
static const struct arg_enum_list bitdepth_enum[] = {
@@ -460,6 +466,7 @@ static const arg_def_t inbitdeptharg =
static const struct arg_enum_list tune_content_enum[] = {
{ "default", VP9E_CONTENT_DEFAULT },
{ "screen", VP9E_CONTENT_SCREEN },
+ { "film", VP9E_CONTENT_FILM },
{ NULL, 0 }
};
@@ -468,8 +475,14 @@ static const arg_def_t tune_content = ARG_DEF_ENUM(
static const arg_def_t target_level = ARG_DEF(
NULL, "target-level", 1,
- "Target level (255: off (default); 0: only keep level stats; 10: level 1.0;"
- " 11: level 1.1; ... 62: level 6.2)");
+ "Target level\n"
+ " 255: off (default)\n"
+ " 0: only keep level stats\n"
+ " 1: adaptively set alt-ref "
+ "distance and column tile limit based on picture size, and keep"
+ " level stats\n"
+ " 10: level 1.0 11: level 1.1 "
+ "... 62: level 6.2");
static const arg_def_t row_mt =
ARG_DEF(NULL, "row-mt", 1,
@@ -539,46 +552,54 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED,
static const arg_def_t *no_args[] = { NULL };
-void usage_exit(void) {
+void show_help(FILE *fout, int shorthelp) {
int i;
const int num_encoder = get_vpx_encoder_count();
- fprintf(stderr, "Usage: %s <options> -o dst_filename src_filename \n",
+ fprintf(fout, "Usage: %s <options> -o dst_filename src_filename \n",
exec_name);
- fprintf(stderr, "\nOptions:\n");
- arg_show_usage(stderr, main_args);
- fprintf(stderr, "\nEncoder Global Options:\n");
- arg_show_usage(stderr, global_args);
- fprintf(stderr, "\nRate Control Options:\n");
- arg_show_usage(stderr, rc_args);
- fprintf(stderr, "\nTwopass Rate Control Options:\n");
- arg_show_usage(stderr, rc_twopass_args);
- fprintf(stderr, "\nKeyframe Placement Options:\n");
- arg_show_usage(stderr, kf_args);
+ if (shorthelp) {
+ fprintf(fout, "Use --help to see the full list of options.\n");
+ return;
+ }
+
+ fprintf(fout, "\nOptions:\n");
+ arg_show_usage(fout, main_args);
+ fprintf(fout, "\nEncoder Global Options:\n");
+ arg_show_usage(fout, global_args);
+ fprintf(fout, "\nRate Control Options:\n");
+ arg_show_usage(fout, rc_args);
+ fprintf(fout, "\nTwopass Rate Control Options:\n");
+ arg_show_usage(fout, rc_twopass_args);
+ fprintf(fout, "\nKeyframe Placement Options:\n");
+ arg_show_usage(fout, kf_args);
#if CONFIG_VP8_ENCODER
- fprintf(stderr, "\nVP8 Specific Options:\n");
- arg_show_usage(stderr, vp8_args);
+ fprintf(fout, "\nVP8 Specific Options:\n");
+ arg_show_usage(fout, vp8_args);
#endif
#if CONFIG_VP9_ENCODER
- fprintf(stderr, "\nVP9 Specific Options:\n");
- arg_show_usage(stderr, vp9_args);
+ fprintf(fout, "\nVP9 Specific Options:\n");
+ arg_show_usage(fout, vp9_args);
#endif
- fprintf(stderr,
+ fprintf(fout,
"\nStream timebase (--timebase):\n"
" The desired precision of timestamps in the output, expressed\n"
" in fractional seconds. Default is 1/1000.\n");
- fprintf(stderr, "\nIncluded encoders:\n\n");
+ fprintf(fout, "\nIncluded encoders:\n\n");
for (i = 0; i < num_encoder; ++i) {
const VpxInterface *const encoder = get_vpx_encoder_by_index(i);
const char *defstr = (i == (num_encoder - 1)) ? "(default)" : "";
- fprintf(stderr, " %-6s - %s %s\n", encoder->name,
+ fprintf(fout, " %-6s - %s %s\n", encoder->name,
vpx_codec_iface_name(encoder->codec_interface()), defstr);
}
- fprintf(stderr, "\n ");
- fprintf(stderr, "Use --codec to switch to a non-default encoder.\n\n");
+ fprintf(fout, "\n ");
+ fprintf(fout, "Use --codec to switch to a non-default encoder.\n\n");
+}
+void usage_exit(void) {
+ show_help(stderr, 1);
exit(EXIT_FAILURE);
}
@@ -893,7 +914,10 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) {
for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
arg.argv_step = 1;
- if (arg_match(&arg, &codecarg, argi)) {
+ if (arg_match(&arg, &help, argi)) {
+ show_help(stdout, 0);
+ exit(EXIT_SUCCESS);
+ } else if (arg_match(&arg, &codecarg, argi)) {
global->codec = get_vpx_encoder_by_name(arg.val);
if (!global->codec)
die("Error: Unrecognized argument (%s) to --codec\n", arg.val);
@@ -1229,6 +1253,11 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
if (global->passes < 2)
warn("option %s ignored in one-pass mode.\n", arg.name);
+ } else if (arg_match(&arg, &corpus_complexity, argi)) {
+ config->cfg.rc_2pass_vbr_corpus_complexity = arg_parse_uint(&arg);
+
+ if (global->passes < 2)
+ warn("option %s ignored in one-pass mode.\n", arg.name);
} else if (arg_match(&arg, &kf_min_dist, argi)) {
config->cfg.kf_min_dist = arg_parse_uint(&arg);
} else if (arg_match(&arg, &kf_max_dist, argi)) {
@@ -1425,6 +1454,7 @@ static void show_stream_config(struct stream_state *stream,
SHOW(rc_2pass_vbr_bias_pct);
SHOW(rc_2pass_vbr_minsection_pct);
SHOW(rc_2pass_vbr_maxsection_pct);
+ SHOW(rc_2pass_vbr_corpus_complexity);
SHOW(kf_mode);
SHOW(kf_min_dist);
SHOW(kf_max_dist);
@@ -1889,8 +1919,6 @@ int main(int argc, const char **argv_) {
memset(&input, 0, sizeof(input));
exec_name = argv_[0];
- if (argc < 3) usage_exit();
-
/* Setup default input stream settings */
input.framerate.numerator = 30;
input.framerate.denominator = 1;
@@ -1904,6 +1932,8 @@ int main(int argc, const char **argv_) {
argv = argv_dup(argc - 1, argv_ + 1);
parse_global_config(&global, argv);
+ if (argc < 3) usage_exit();
+
switch (global.color_type) {
case I420: input.fmt = VPX_IMG_FMT_I420; break;
case I422: input.fmt = VPX_IMG_FMT_I422; break;
@@ -1937,7 +1967,10 @@ int main(int argc, const char **argv_) {
/* Handle non-option arguments */
input.filename = argv[0];
- if (!input.filename) usage_exit();
+ if (!input.filename) {
+ fprintf(stderr, "No input file specified!\n");
+ usage_exit();
+ }
/* Decide if other chroma subsamplings than 4:2:0 are supported */
if (global.codec->fourcc == VP9_FOURCC) input.only_i420 = 0;
diff --git a/libvpx/y4minput.c b/libvpx/y4minput.c
index acf7d69fe..1de636cc0 100644
--- a/libvpx/y4minput.c
+++ b/libvpx/y4minput.c
@@ -195,26 +195,29 @@ static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst,
window.*/
for (x = 0; x < OC_MINI(_c_w, 2); x++) {
_dst[x] = (unsigned char)OC_CLAMPI(
- 0, (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] +
- 35 * _src[OC_MINI(x + 1, _c_w - 1)] -
- 9 * _src[OC_MINI(x + 2, _c_w - 1)] +
- _src[OC_MINI(x + 3, _c_w - 1)] + 64) >>
- 7,
+ 0,
+ (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] +
+ 35 * _src[OC_MINI(x + 1, _c_w - 1)] -
+ 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[OC_MINI(x + 3, _c_w - 1)] +
+ 64) >>
+ 7,
255);
}
for (; x < _c_w - 3; x++) {
_dst[x] = (unsigned char)OC_CLAMPI(
- 0, (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] +
- 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >>
- 7,
+ 0,
+ (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] +
+ 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >>
+ 7,
255);
}
for (; x < _c_w; x++) {
_dst[x] = (unsigned char)OC_CLAMPI(
- 0, (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] +
- 35 * _src[OC_MINI(x + 1, _c_w - 1)] -
- 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >>
- 7,
+ 0,
+ (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] +
+ 35 * _src[OC_MINI(x + 1, _c_w - 1)] -
+ 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >>
+ 7,
255);
}
_dst += _c_w;
@@ -314,28 +317,31 @@ static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst,
for (x = 0; x < c_w; x++) {
for (y = 0; y < OC_MINI(c_h, 3); y++) {
_dst[y * c_w] = (unsigned char)OC_CLAMPI(
- 0, (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] +
- 35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] -
- 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] +
- 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >>
- 7,
+ 0,
+ (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] +
+ 35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] -
+ 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] +
+ 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >>
+ 7,
255);
}
for (; y < c_h - 2; y++) {
_dst[y * c_w] = (unsigned char)OC_CLAMPI(
- 0, (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] +
- 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] -
- 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >>
- 7,
+ 0,
+ (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] +
+ 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] -
+ 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >>
+ 7,
255);
}
for (; y < c_h; y++) {
_dst[y * c_w] = (unsigned char)OC_CLAMPI(
- 0, (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] +
- 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] -
- 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] +
- 4 * tmp[(c_h - 1) * c_w] + 64) >>
- 7,
+ 0,
+ (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] +
+ 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] -
+ 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] +
+ 4 * tmp[(c_h - 1) * c_w] + 64) >>
+ 7,
255);
}
_dst++;
@@ -361,10 +367,11 @@ static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst,
}
for (; y < c_h - 3; y++) {
_dst[y * c_w] = (unsigned char)OC_CLAMPI(
- 0, (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] +
- 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] -
- 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >>
- 7,
+ 0,
+ (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] +
+ 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] -
+ 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >>
+ 7,
255);
}
for (; y < c_h; y++) {
@@ -404,18 +411,20 @@ static void y4m_422jpeg_420jpeg_helper(unsigned char *_dst,
for (x = 0; x < _c_w; x++) {
for (y = 0; y < OC_MINI(_c_h, 2); y += 2) {
_dst[(y >> 1) * _c_w] =
- OC_CLAMPI(0, (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] -
- 17 * _src[OC_MINI(2, _c_h - 1) * _c_w] +
- 3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >>
- 7,
+ OC_CLAMPI(0,
+ (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] -
+ 17 * _src[OC_MINI(2, _c_h - 1) * _c_w] +
+ 3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >>
+ 7,
255);
}
for (; y < _c_h - 3; y += 2) {
_dst[(y >> 1) * _c_w] =
- OC_CLAMPI(0, (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) -
- 17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) +
- 78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >>
- 7,
+ OC_CLAMPI(0,
+ (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) -
+ 17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) +
+ 78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >>
+ 7,
255);
}
for (; y < _c_h; y += 2) {
@@ -642,33 +651,38 @@ static void y4m_convert_411_420jpeg(y4m_input *_y4m, unsigned char *_dst,
4-tap Mitchell window.*/
for (x = 0; x < OC_MINI(c_w, 1); x++) {
tmp[x << 1] = (unsigned char)OC_CLAMPI(
- 0, (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] -
- _aux[OC_MINI(2, c_w - 1)] + 64) >>
- 7,
+ 0,
+ (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] -
+ _aux[OC_MINI(2, c_w - 1)] + 64) >>
+ 7,
255);
tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(
- 0, (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] -
- 5 * _aux[OC_MINI(2, c_w - 1)] + 64) >>
- 7,
+ 0,
+ (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] -
+ 5 * _aux[OC_MINI(2, c_w - 1)] + 64) >>
+ 7,
255);
}
for (; x < c_w - 2; x++) {
tmp[x << 1] =
- (unsigned char)OC_CLAMPI(0, (_aux[x - 1] + 110 * _aux[x] +
- 18 * _aux[x + 1] - _aux[x + 2] + 64) >>
- 7,
+ (unsigned char)OC_CLAMPI(0,
+ (_aux[x - 1] + 110 * _aux[x] +
+ 18 * _aux[x + 1] - _aux[x + 2] + 64) >>
+ 7,
255);
tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(
- 0, (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] -
- 5 * _aux[x + 2] + 64) >>
- 7,
+ 0,
+ (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] -
+ 5 * _aux[x + 2] + 64) >>
+ 7,
255);
}
for (; x < c_w; x++) {
tmp[x << 1] = (unsigned char)OC_CLAMPI(
- 0, (_aux[x - 1] + 110 * _aux[x] +
- 18 * _aux[OC_MINI(x + 1, c_w - 1)] - _aux[c_w - 1] + 64) >>
- 7,
+ 0,
+ (_aux[x - 1] + 110 * _aux[x] + 18 * _aux[OC_MINI(x + 1, c_w - 1)] -
+ _aux[c_w - 1] + 64) >>
+ 7,
255);
if ((x << 1 | 1) < dst_c_w) {
tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(
@@ -718,27 +732,29 @@ static void y4m_convert_444_420jpeg(y4m_input *_y4m, unsigned char *_dst,
/*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/
for (y = 0; y < c_h; y++) {
for (x = 0; x < OC_MINI(c_w, 2); x += 2) {
- tmp[x >> 1] =
- OC_CLAMPI(0, (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] -
- 17 * _aux[OC_MINI(2, c_w - 1)] +
- 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >>
- 7,
- 255);
+ tmp[x >> 1] = OC_CLAMPI(0,
+ (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] -
+ 17 * _aux[OC_MINI(2, c_w - 1)] +
+ 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >>
+ 7,
+ 255);
}
for (; x < c_w - 3; x += 2) {
- tmp[x >> 1] = OC_CLAMPI(0, (3 * (_aux[x - 2] + _aux[x + 3]) -
- 17 * (_aux[x - 1] + _aux[x + 2]) +
- 78 * (_aux[x] + _aux[x + 1]) + 64) >>
- 7,
+ tmp[x >> 1] = OC_CLAMPI(0,
+ (3 * (_aux[x - 2] + _aux[x + 3]) -
+ 17 * (_aux[x - 1] + _aux[x + 2]) +
+ 78 * (_aux[x] + _aux[x + 1]) + 64) >>
+ 7,
255);
}
for (; x < c_w; x += 2) {
- tmp[x >> 1] = OC_CLAMPI(
- 0, (3 * (_aux[x - 2] + _aux[c_w - 1]) -
- 17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) +
- 78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >>
- 7,
- 255);
+ tmp[x >> 1] =
+ OC_CLAMPI(0,
+ (3 * (_aux[x - 2] + _aux[c_w - 1]) -
+ 17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) +
+ 78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >>
+ 7,
+ 255);
}
tmp += dst_c_w;
_aux += c_w;
diff --git a/libwebm/Android.bp b/libwebm/Android.bp
index a1d335ebc..753746e84 100644
--- a/libwebm/Android.bp
+++ b/libwebm/Android.bp
@@ -1,6 +1,7 @@
cc_library_static {
name: "libwebm",
srcs: ["mkvparser/mkvparser.cc"],
+ cflags: ["-Wall", "-Werror"],
export_include_dirs: ["."],
sanitize: {
cfi: true,