author     Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2024-02-02 23:48:58 +0000
committer  Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2024-02-02 23:48:58 +0000
commit     9e49d764caa999e09859b9898dd728bf86bddac9 (patch)
tree       2b266e9c9a69553c63db3d631894f734ef67e471
parent     606eda181f66a9e272a1786ec00e12a816c97a7a (diff)
parent     d02668953208ce505cb83cfab205a870fecfa0ad (diff)
download   libaom-simpleperf-release.tar.gz

Snap for 11400057 from d02668953208ce505cb83cfab205a870fecfa0ad to simpleperf-release

Change-Id: I30e31dd9e1990ad21056575f05e4ed02b460adf9
-rw-r--r--  .mailmap | 24
-rw-r--r--  AUTHORS | 11
-rw-r--r--  Android.bp | 72
-rw-r--r--  Android.bp.in | 4
-rw-r--r--  Android.mk | 1
-rw-r--r--  CHANGELOG | 84
-rw-r--r--  CMakeLists.txt | 40
-rw-r--r--  METADATA | 8
-rw-r--r--  README.android | 8
-rw-r--r--  README.md | 1
-rw-r--r--  README.version | 3
-rw-r--r--  aom/aom_encoder.h | 6
-rw-r--r--  aom/aomcx.h | 25
-rw-r--r--  aom_dsp/aom_dsp.cmake | 57
-rw-r--r--  aom_dsp/aom_dsp_common.h | 4
-rwxr-xr-x  aom_dsp/aom_dsp_rtcd_defs.pl | 1321
-rw-r--r--  aom_dsp/arm/aom_convolve8_neon.c | 843
-rw-r--r--  aom_dsp/arm/aom_convolve8_neon_dotprod.c | 464
-rw-r--r--  aom_dsp/arm/aom_convolve8_neon_i8mm.c | 413
-rw-r--r--  aom_dsp/arm/aom_convolve_copy_neon.c | 101
-rw-r--r--  aom_dsp/arm/avg_neon.c | 214
-rw-r--r--  aom_dsp/arm/avg_pred_neon.c | 100
-rw-r--r--  aom_dsp/arm/blend_a64_mask_neon.c | 805
-rw-r--r--  aom_dsp/arm/blend_neon.h | 125
-rw-r--r--  aom_dsp/arm/blk_sse_sum_neon.c | 124
-rw-r--r--  aom_dsp/arm/dist_wtd_avg_neon.h | 65
-rw-r--r--  aom_dsp/arm/fwd_txfm_neon.c | 16
-rw-r--r--  aom_dsp/arm/hadamard_neon.c | 138
-rw-r--r--  aom_dsp/arm/highbd_avg_pred_neon.c | 190
-rw-r--r--  aom_dsp/arm/highbd_blend_a64_hmask_neon.c | 97
-rw-r--r--  aom_dsp/arm/highbd_blend_a64_mask_neon.c | 473
-rw-r--r--  aom_dsp/arm/highbd_blend_a64_vmask_neon.c | 105
-rw-r--r--  aom_dsp/arm/highbd_convolve8_neon.c | 363
-rw-r--r--  aom_dsp/arm/highbd_hadamard_neon.c | 2
-rw-r--r--  aom_dsp/arm/highbd_intrapred_neon.c | 3
-rw-r--r--  aom_dsp/arm/highbd_loopfilter_neon.c | 40
-rw-r--r--  aom_dsp/arm/highbd_masked_sad_neon.c | 354
-rw-r--r--  aom_dsp/arm/highbd_obmc_sad_neon.c | 211
-rw-r--r--  aom_dsp/arm/highbd_obmc_variance_neon.c | 369
-rw-r--r--  aom_dsp/arm/highbd_quantize_neon.c | 13
-rw-r--r--  aom_dsp/arm/highbd_sad_neon.c | 224
-rw-r--r--  aom_dsp/arm/highbd_sadxd_neon.c (renamed from aom_dsp/arm/highbd_sad4d_neon.c) | 257
-rw-r--r--  aom_dsp/arm/highbd_sse_neon.c | 284
-rw-r--r--  aom_dsp/arm/highbd_subpel_variance_neon.c | 1497
-rw-r--r--  aom_dsp/arm/highbd_variance_neon.c | 117
-rw-r--r--  aom_dsp/arm/highbd_variance_neon_dotprod.c | 92
-rw-r--r--  aom_dsp/arm/intrapred_neon.c | 54
-rw-r--r--  aom_dsp/arm/loopfilter_neon.c | 56
-rw-r--r--  aom_dsp/arm/masked_sad4d_neon.c | 25
-rw-r--r--  aom_dsp/arm/masked_sad_neon.c | 25
-rw-r--r--  aom_dsp/arm/mem_neon.h | 244
-rw-r--r--  aom_dsp/arm/sad_neon.c | 503
-rw-r--r--  aom_dsp/arm/sad_neon_dotprod.c | 530
-rw-r--r--  aom_dsp/arm/sadxd_neon.c | 174
-rw-r--r--  aom_dsp/arm/sadxd_neon_dotprod.c | 289
-rw-r--r--  aom_dsp/arm/sse_neon.c | 444
-rw-r--r--  aom_dsp/arm/sse_neon_dotprod.c | 223
-rw-r--r--  aom_dsp/arm/subpel_variance_neon.c | 460
-rw-r--r--  aom_dsp/arm/sum_neon.h | 98
-rw-r--r--  aom_dsp/arm/sum_squares_neon.c | 126
-rw-r--r--  aom_dsp/arm/sum_squares_neon_dotprod.c | 154
-rw-r--r--  aom_dsp/arm/transpose_neon.h | 440
-rw-r--r--  aom_dsp/arm/variance_neon.c | 370
-rw-r--r--  aom_dsp/arm/variance_neon_dotprod.c | 314
-rw-r--r--  aom_dsp/avg.c | 6
-rw-r--r--  aom_dsp/entenc.h | 7
-rw-r--r--  aom_dsp/fft.c | 1
-rw-r--r--  aom_dsp/fft_common.h | 6
-rw-r--r--  aom_dsp/flow_estimation/arm/disflow_neon.c | 368
-rw-r--r--  aom_dsp/flow_estimation/corner_detect.c | 28
-rw-r--r--  aom_dsp/flow_estimation/corner_detect.h | 6
-rw-r--r--  aom_dsp/flow_estimation/corner_match.c | 46
-rw-r--r--  aom_dsp/flow_estimation/corner_match.h | 11
-rw-r--r--  aom_dsp/flow_estimation/disflow.c | 93
-rw-r--r--  aom_dsp/flow_estimation/disflow.h | 3
-rw-r--r--  aom_dsp/flow_estimation/flow_estimation.c | 12
-rw-r--r--  aom_dsp/flow_estimation/flow_estimation.h | 2
-rw-r--r--  aom_dsp/flow_estimation/ransac.c | 9
-rw-r--r--  aom_dsp/flow_estimation/ransac.h | 2
-rw-r--r--  aom_dsp/flow_estimation/x86/disflow_sse4.c | 112
-rw-r--r--  aom_dsp/pyramid.c | 25
-rw-r--r--  aom_dsp/pyramid.h | 2
-rw-r--r--  aom_dsp/quantize.c | 1
-rw-r--r--  aom_dsp/sad.c | 48
-rw-r--r--  aom_dsp/simd/v64_intrinsics_arm.h | 17
-rw-r--r--  aom_dsp/sse.c | 7
-rw-r--r--  aom_dsp/variance.c | 8
-rw-r--r--  aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c | 261
-rw-r--r--  aom_dsp/x86/avg_intrin_sse2.c | 24
-rw-r--r--  aom_dsp/x86/fft_sse2.c | 6
-rw-r--r--  aom_dsp/x86/highbd_sad_avx2.c | 30
-rw-r--r--  aom_dsp/x86/highbd_variance_avx2.c | 12
-rw-r--r--  aom_dsp/x86/obmc_variance_sse4.c | 12
-rw-r--r--  aom_dsp/x86/sse_avx2.c | 5
-rw-r--r--  aom_dsp/x86/sse_sse4.c | 2
-rw-r--r--  aom_dsp/x86/synonyms_avx2.h | 4
-rw-r--r--  aom_ports/aarch32_cpudetect.c | 86
-rw-r--r--  aom_ports/aarch64_cpudetect.c | 188
-rw-r--r--  aom_ports/aom_ports.cmake | 11
-rw-r--r--  aom_ports/arm.h | 16
-rw-r--r--  aom_ports/arm_cpudetect.c | 158
-rw-r--r--  aom_ports/arm_cpudetect.h | 54
-rw-r--r--  aom_ports/bitops.h | 35
-rw-r--r--  aom_ports/mem.h | 10
-rw-r--r--  aom_scale/generic/yv12config.c | 2
-rw-r--r--  aom_util/aom_thread.c | 22
-rw-r--r--  aom_util/aom_thread.h | 14
-rw-r--r--  apps/aomenc.c | 5
-rw-r--r--  av1/arg_defs.c | 2
-rw-r--r--  av1/arg_defs.h | 1
-rw-r--r--  av1/av1.cmake | 106
-rw-r--r--  av1/av1_cx_iface.c | 98
-rw-r--r--  av1/av1_dx_iface.c | 1
-rw-r--r--  av1/common/alloccommon.c | 22
-rw-r--r--  av1/common/arm/av1_inv_txfm_neon.c | 62
-rw-r--r--  av1/common/arm/blend_a64_hmask_neon.c | 152
-rw-r--r--  av1/common/arm/blend_a64_vmask_neon.c | 159
-rw-r--r--  av1/common/arm/cdef_block_neon.c | 873
-rw-r--r--  av1/common/arm/compound_convolve_neon.c | 2731
-rw-r--r--  av1/common/arm/compound_convolve_neon.h | 1172
-rw-r--r--  av1/common/arm/compound_convolve_neon_dotprod.c | 679
-rw-r--r--  av1/common/arm/compound_convolve_neon_i8mm.c | 618
-rw-r--r--  av1/common/arm/convolve_neon.c | 4726
-rw-r--r--  av1/common/arm/convolve_neon.h | 1027
-rw-r--r--  av1/common/arm/convolve_neon_dotprod.c | 797
-rw-r--r--  av1/common/arm/convolve_neon_i8mm.c | 706
-rw-r--r--  av1/common/arm/highbd_compound_convolve_neon.c | 2031
-rw-r--r--  av1/common/arm/highbd_convolve_horiz_rs_neon.c | 273
-rw-r--r--  av1/common/arm/highbd_convolve_neon.c | 3169
-rw-r--r--  av1/common/arm/highbd_convolve_neon.h | 418
-rw-r--r--  av1/common/arm/highbd_convolve_scale_neon.c | 552
-rw-r--r--  av1/common/arm/highbd_inv_txfm_neon.c | 2
-rw-r--r--  av1/common/arm/highbd_reconinter_neon.c | 330
-rw-r--r--  av1/common/arm/highbd_reconintra_neon.c | 241
-rw-r--r--  av1/common/arm/highbd_warp_plane_neon.c | 560
-rw-r--r--  av1/common/arm/highbd_wiener_convolve_neon.c | 403
-rw-r--r--  av1/common/arm/jnt_convolve_neon.c | 5336
-rw-r--r--  av1/common/arm/reconinter_neon.c | 231
-rw-r--r--  av1/common/arm/reconintra_neon.c | 184
-rw-r--r--  av1/common/arm/resize_neon.c | 378
-rw-r--r--  av1/common/arm/selfguided_neon.c | 35
-rw-r--r--  av1/common/arm/warp_plane_neon.c | 929
-rw-r--r--  av1/common/arm/warp_plane_neon.h | 378
-rw-r--r--  av1/common/arm/warp_plane_neon_i8mm.c | 291
-rw-r--r--  av1/common/arm/warp_plane_sve.c | 297
-rw-r--r--  av1/common/arm/wiener_convolve_neon.c | 582
-rw-r--r--  av1/common/av1_loopfilter.h | 3
-rw-r--r--  av1/common/av1_rtcd_defs.pl | 108
-rw-r--r--  av1/common/av1_txfm.c | 167
-rw-r--r--  av1/common/av1_txfm.h | 28
-rw-r--r--  av1/common/cdef.c | 19
-rw-r--r--  av1/common/cdef.h | 3
-rw-r--r--  av1/common/cdef_block.h | 3
-rw-r--r--  av1/common/cdef_block_simd.h | 8
-rw-r--r--  av1/common/cfl.c | 4
-rw-r--r--  av1/common/cfl.h | 4
-rw-r--r--  av1/common/convolve.c | 147
-rw-r--r--  av1/common/convolve.h | 15
-rw-r--r--  av1/common/enums.h | 4
-rw-r--r--  av1/common/reconintra.c | 298
-rw-r--r--  av1/common/resize.c | 103
-rw-r--r--  av1/common/resize.h | 12
-rw-r--r--  av1/common/restoration.c | 504
-rw-r--r--  av1/common/restoration.h | 121
-rw-r--r--  av1/common/thread_common.c | 325
-rw-r--r--  av1/common/thread_common.h | 33
-rw-r--r--  av1/common/tile_common.c | 4
-rw-r--r--  av1/common/tile_common.h | 4
-rw-r--r--  av1/common/warped_motion.c | 167
-rw-r--r--  av1/common/warped_motion.h | 88
-rw-r--r--  av1/common/x86/highbd_wiener_convolve_avx2.c | 2
-rw-r--r--  av1/common/x86/highbd_wiener_convolve_ssse3.c | 2
-rw-r--r--  av1/common/x86/intra_edge_sse4.c | 128
-rw-r--r--  av1/common/x86/selfguided_avx2.c | 14
-rw-r--r--  av1/common/x86/selfguided_sse4.c | 14
-rw-r--r--  av1/common/x86/warp_plane_avx2.c | 110
-rw-r--r--  av1/common/x86/warp_plane_sse2.c | 88
-rw-r--r--  av1/common/x86/wiener_convolve_avx2.c | 2
-rw-r--r--  av1/common/x86/wiener_convolve_sse2.c | 2
-rw-r--r--  av1/decoder/decodeframe.c | 11
-rw-r--r--  av1/decoder/decodemv.c | 18
-rw-r--r--  av1/encoder/allintra_vis.c | 39
-rw-r--r--  av1/encoder/allintra_vis.h | 2
-rw-r--r--  av1/encoder/aq_cyclicrefresh.c | 13
-rw-r--r--  av1/encoder/arm/crc32/hash_arm_crc32.c (renamed from av1/encoder/arm/crc32/hash_crc32.c) | 9
-rw-r--r--  av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c | 5853
-rw-r--r--  av1/encoder/arm/neon/encodetxb_neon.c | 17
-rw-r--r--  av1/encoder/arm/neon/highbd_fwd_txfm_neon.c | 5354
-rw-r--r--  av1/encoder/arm/neon/highbd_pickrst_neon.c | 741
-rw-r--r--  av1/encoder/arm/neon/highbd_rdopt_neon.c | 49
-rw-r--r--  av1/encoder/arm/neon/highbd_temporal_filter_neon.c | 562
-rw-r--r--  av1/encoder/arm/neon/pickrst_neon.c | 1261
-rw-r--r--  av1/encoder/arm/neon/pickrst_neon.h | 281
-rw-r--r--  av1/encoder/arm/neon/picksrt_neon.c | 150
-rw-r--r--  av1/encoder/arm/neon/reconinter_enc_neon.c | 167
-rw-r--r--  av1/encoder/arm/neon/shift_neon.h | 49
-rw-r--r--  av1/encoder/arm/neon/temporal_filter_neon.c | 473
-rw-r--r--  av1/encoder/arm/neon/temporal_filter_neon_dotprod.c | 299
-rw-r--r--  av1/encoder/arm/neon/txfm_neon.h | 26
-rw-r--r--  av1/encoder/arm/neon/wedge_utils_neon.c | 54
-rw-r--r--  av1/encoder/av1_noise_estimate.c | 11
-rw-r--r--  av1/encoder/bitstream.c | 48
-rw-r--r--  av1/encoder/bitstream.h | 3
-rw-r--r--  av1/encoder/block.h | 35
-rw-r--r--  av1/encoder/compound_type.c | 10
-rw-r--r--  av1/encoder/context_tree.c | 6
-rw-r--r--  av1/encoder/encode_strategy.c | 10
-rw-r--r--  av1/encoder/encodeframe.c | 181
-rw-r--r--  av1/encoder/encodeframe_utils.c | 32
-rw-r--r--  av1/encoder/encodeframe_utils.h | 36
-rw-r--r--  av1/encoder/encodemb.c | 30
-rw-r--r--  av1/encoder/encodemb.h | 1
-rw-r--r--  av1/encoder/encoder.c | 197
-rw-r--r--  av1/encoder/encoder.h | 99
-rw-r--r--  av1/encoder/encoder_alloc.h | 173
-rw-r--r--  av1/encoder/encoder_utils.c | 40
-rw-r--r--  av1/encoder/encoder_utils.h | 4
-rw-r--r--  av1/encoder/ethread.c | 731
-rw-r--r--  av1/encoder/ethread.h | 14
-rw-r--r--  av1/encoder/firstpass.c | 90
-rw-r--r--  av1/encoder/firstpass.h | 2
-rw-r--r--  av1/encoder/global_motion.c | 384
-rw-r--r--  av1/encoder/global_motion.h | 74
-rw-r--r--  av1/encoder/global_motion_facade.c | 201
-rw-r--r--  av1/encoder/global_motion_facade.h | 30
-rw-r--r--  av1/encoder/hash_motion.c | 17
-rw-r--r--  av1/encoder/hybrid_fwd_txfm.c | 46
-rw-r--r--  av1/encoder/interp_search.c | 3
-rw-r--r--  av1/encoder/interp_search.h | 5
-rw-r--r--  av1/encoder/intra_mode_search.c | 32
-rw-r--r--  av1/encoder/intra_mode_search_utils.h | 11
-rw-r--r--  av1/encoder/mcomp.c | 399
-rw-r--r--  av1/encoder/mcomp.h | 23
-rw-r--r--  av1/encoder/motion_search_facade.c | 151
-rw-r--r--  av1/encoder/motion_search_facade.h | 61
-rw-r--r--  av1/encoder/nonrd_opt.h | 36
-rw-r--r--  av1/encoder/nonrd_pickmode.c | 331
-rw-r--r--  av1/encoder/palette.c | 19
-rw-r--r--  av1/encoder/partition_search.c | 291
-rw-r--r--  av1/encoder/partition_search.h | 3
-rw-r--r--  av1/encoder/partition_strategy.c | 63
-rw-r--r--  av1/encoder/pass2_strategy.c | 66
-rw-r--r--  av1/encoder/pass2_strategy.h | 8
-rw-r--r--  av1/encoder/pickcdef.c | 109
-rw-r--r--  av1/encoder/pickcdef.h | 29
-rw-r--r--  av1/encoder/pickrst.c | 667
-rw-r--r--  av1/encoder/pickrst.h | 28
-rw-r--r--  av1/encoder/ratectrl.c | 68
-rw-r--r--  av1/encoder/ratectrl.h | 2
-rw-r--r--  av1/encoder/rd.c | 7
-rw-r--r--  av1/encoder/rdopt.c | 117
-rw-r--r--  av1/encoder/rdopt_utils.h | 6
-rw-r--r--  av1/encoder/saliency_map.c | 2
-rw-r--r--  av1/encoder/speed_features.c | 143
-rw-r--r--  av1/encoder/speed_features.h | 87
-rw-r--r--  av1/encoder/svc_layercontext.c | 30
-rw-r--r--  av1/encoder/svc_layercontext.h | 8
-rw-r--r--  av1/encoder/temporal_filter.c | 247
-rw-r--r--  av1/encoder/temporal_filter.h | 30
-rw-r--r--  av1/encoder/tpl_model.c | 314
-rw-r--r--  av1/encoder/tpl_model.h | 64
-rw-r--r--  av1/encoder/tune_butteraugli.c | 7
-rw-r--r--  av1/encoder/tune_vmaf.c | 13
-rw-r--r--  av1/encoder/tx_search.c | 28
-rw-r--r--  av1/encoder/tx_search.h | 21
-rw-r--r--  av1/encoder/var_based_part.c | 86
-rw-r--r--  av1/encoder/x86/ml_avx2.c | 240
-rw-r--r--  av1/encoder/x86/ml_sse3.c | 36
-rw-r--r--  av1/encoder/x86/ml_sse3.h | 29
-rw-r--r--  av1/ratectrl_rtc.cc | 48
-rw-r--r--  av1/ratectrl_rtc.h | 26
-rw-r--r--  build/cmake/aom_config_defaults.cmake | 23
-rw-r--r--  build/cmake/aom_configure.cmake | 9
-rw-r--r--  build/cmake/aom_install.cmake | 2
-rw-r--r--  build/cmake/aom_optimization.cmake | 2
-rw-r--r--  build/cmake/cpu.cmake | 67
-rw-r--r--  build/cmake/pkg_config.cmake | 7
-rwxr-xr-x  build/cmake/rtcd.pl | 5
-rw-r--r--  build/cmake/toolchains/android.cmake | 2
-rw-r--r--  build/cmake/toolchains/arm-ios-common.cmake | 3
-rw-r--r--  build/cmake/toolchains/arm64-linux-clang.cmake | 30
-rw-r--r--  build/cmake/toolchains/arm64-linux-gcc.cmake | 3
-rw-r--r--  build/cmake/toolchains/arm64-mingw-gcc.cmake | 3
-rw-r--r--  build/cmake/toolchains/armv7-linux-gcc.cmake | 3
-rw-r--r--  build/cmake/toolchains/i686-linux-gcc.cmake | 34
-rw-r--r--  common/tools_common.c | 4
-rw-r--r--  config/arm/config/aom_config.asm | 5
-rw-r--r--  config/arm/config/aom_config.c | 4
-rw-r--r--  config/arm/config/aom_config.h | 5
-rw-r--r--  config/arm/config/aom_dsp_rtcd.h | 2331
-rw-r--r--  config/arm/config/av1_rtcd.h | 151
-rw-r--r--  config/arm64/config/aom_config.asm | 5
-rw-r--r--  config/arm64/config/aom_config.c | 4
-rw-r--r--  config/arm64/config/aom_config.h | 5
-rw-r--r--  config/arm64/config/aom_dsp_rtcd.h | 2331
-rw-r--r--  config/arm64/config/av1_rtcd.h | 151
-rw-r--r--  config/config/aom_version.h | 12
-rw-r--r--  config/riscv64/config/aom_config.asm | 5
-rw-r--r--  config/riscv64/config/aom_config.c | 2
-rw-r--r--  config/riscv64/config/aom_config.h | 5
-rw-r--r--  config/riscv64/config/aom_dsp_rtcd.h | 564
-rw-r--r--  config/riscv64/config/av1_rtcd.h | 63
-rw-r--r--  config/x86/config/aom_config.asm | 3
-rw-r--r--  config/x86/config/aom_config.c | 4
-rw-r--r--  config/x86/config/aom_config.h | 5
-rw-r--r--  config/x86/config/aom_dsp_rtcd.h | 736
-rw-r--r--  config/x86/config/av1_rtcd.h | 72
-rw-r--r--  config/x86_64/config/aom_config.asm | 3
-rw-r--r--  config/x86_64/config/aom_config.c | 2
-rw-r--r--  config/x86_64/config/aom_config.h | 5
-rw-r--r--  config/x86_64/config/aom_dsp_rtcd.h | 736
-rw-r--r--  config/x86_64/config/av1_rtcd.h | 72
-rw-r--r--  docs.cmake | 2
-rw-r--r--  examples/lightfield_tile_list_decoder.c | 2
-rw-r--r--  examples/resize_util.c | 132
-rw-r--r--  examples/svc_encoder_rtc.cc | 271
-rwxr-xr-x  generate_config.sh | 13
-rw-r--r--  libaom_blocklist.txt | 2
-rw-r--r--  test/active_map_test.cc | 8
-rw-r--r--  test/allintra_end_to_end_test.cc | 12
-rw-r--r--  test/altref_test.cc | 26
-rw-r--r--  test/aq_segment_test.cc | 8
-rw-r--r--  test/arf_freq_test.cc | 12
-rw-r--r--  test/av1_c_vs_simd_encode.sh | 535
-rw-r--r--  test/av1_convolve_scale_test.cc | 17
-rw-r--r--  test/av1_convolve_test.cc | 563
-rw-r--r--  test/av1_encoder_parms_get_to_decoder.cc | 18
-rw-r--r--  test/av1_ext_tile_test.cc | 14
-rw-r--r--  test/av1_external_partition_test.cc | 28
-rw-r--r--  test/av1_fwd_txfm1d_test.cc | 4
-rw-r--r--  test/av1_fwd_txfm2d_test.cc | 4
-rw-r--r--  test/av1_highbd_iht_test.cc | 8
-rw-r--r--  test/av1_horz_only_frame_superres_test.cc | 17
-rw-r--r--  test/av1_inv_txfm2d_test.cc | 6
-rw-r--r--  test/av1_k_means_test.cc | 20
-rw-r--r--  test/av1_nn_predict_test.cc | 33
-rw-r--r--  test/av1_quantize_test.cc | 6
-rw-r--r--  test/av1_round_shift_array_test.cc | 9
-rw-r--r--  test/av1_softmax_test.cc | 2
-rw-r--r--  test/av1_temporal_denoiser_test.cc | 6
-rw-r--r--  test/av1_wedge_utils_test.cc | 10
-rw-r--r--  test/avg_test.cc | 28
-rw-r--r--  test/blend_a64_mask_1d_test.cc | 88
-rw-r--r--  test/blend_a64_mask_test.cc | 232
-rw-r--r--  test/block_test.cc | 14
-rw-r--r--  test/borders_test.cc | 10
-rw-r--r--  test/cdef_test.cc | 253
-rw-r--r--  test/cfl_test.cc | 28
-rw-r--r--  test/cnn_test.cc | 2
-rw-r--r--  test/codec_factory.h | 24
-rw-r--r--  test/coding_path_sync.cc | 26
-rw-r--r--  test/comp_avg_pred_test.cc | 164
-rw-r--r--  test/comp_avg_pred_test.h | 209
-rw-r--r--  test/comp_mask_pred_test.cc | 174
-rw-r--r--  test/convolve_round_test.cc | 186
-rw-r--r--  test/convolve_test.cc | 51
-rw-r--r--  test/corner_match_test.cc | 9
-rw-r--r--  test/cpu_speed_test.cc | 12
-rw-r--r--  test/cpu_used_firstpass_test.cc | 12
-rw-r--r--  test/datarate_test.cc | 108
-rw-r--r--  test/datarate_test.h | 10
-rw-r--r--  test/decode_multithreaded_test.cc | 10
-rw-r--r--  test/decode_perf_test.cc | 21
-rw-r--r--  test/decode_scalability_test.cc | 2
-rw-r--r--  test/decode_test_driver.h | 2
-rw-r--r--  test/disflow_test.cc | 122
-rw-r--r--  test/dr_prediction_test.cc | 2
-rw-r--r--  test/dropframe_encode_test.cc | 6
-rw-r--r--  test/encode_api_test.cc | 336
-rw-r--r--  test/encode_perf_test.cc | 17
-rw-r--r--  test/encode_small_width_height_test.cc | 96
-rw-r--r--  test/encode_test_driver.h | 2
-rw-r--r--  test/encodemb_test.cc | 2
-rw-r--r--  test/encodetxb_test.cc | 9
-rw-r--r--  test/end_to_end_psnr_test.cc | 12
-rw-r--r--  test/end_to_end_qmpsnr_test.cc | 2
-rw-r--r--  test/end_to_end_ssim_test.cc | 2
-rw-r--r--  test/error_block_test.cc | 14
-rw-r--r--  test/error_resilience_test.cc | 32
-rw-r--r--  test/ethread_test.cc | 36
-rwxr-xr-x  test/examples.sh | 2
-rw-r--r--  test/external_frame_buffer_test.cc | 17
-rw-r--r--  test/fdct4x4_test.cc | 9
-rw-r--r--  test/fft_test.cc | 8
-rw-r--r--  test/film_grain_table_test.cc | 85
-rw-r--r--  test/filterintra_test.cc | 6
-rw-r--r--  test/forced_max_frame_width_height_test.cc | 5
-rw-r--r--  test/frame_error_test.cc | 164
-rw-r--r--  test/frame_parallel_enc_test.cc | 12
-rw-r--r--  test/frame_size_tests.cc | 30
-rw-r--r--  test/function_equivalence_test.h | 6
-rw-r--r--  test/fwht4x4_test.cc | 9
-rw-r--r--  test/gf_pyr_height_test.cc | 12
-rw-r--r--  test/hadamard_test.cc | 36
-rw-r--r--  test/hash_test.cc | 8
-rw-r--r--  test/hbd_metrics_test.cc | 5
-rw-r--r--  test/hiprec_convolve_test.cc | 8
-rw-r--r--  test/hiprec_convolve_test_util.cc | 63
-rw-r--r--  test/hiprec_convolve_test_util.h | 16
-rw-r--r--  test/horver_correlation_test.cc | 4
-rw-r--r--  test/horz_superres_test.cc | 36
-rw-r--r--  test/intra_edge_test.cc | 228
-rw-r--r--  test/intrapred_test.cc | 14
-rw-r--r--  test/invalid_file_test.cc | 19
-rw-r--r--  test/ivf_video_source.h | 14
-rw-r--r--  test/kf_test.cc | 26
-rw-r--r--  test/level_test.cc | 8
-rw-r--r--  test/loopfilter_control_test.cc | 12
-rw-r--r--  test/lossless_test.cc | 16
-rw-r--r--  test/lpf_test.cc | 6
-rw-r--r--  test/masked_sad_test.cc | 87
-rw-r--r--  test/masked_variance_test.cc | 156
-rw-r--r--  test/metadata_test.cc | 14
-rw-r--r--  test/minmax_test.cc | 2
-rw-r--r--  test/monochrome_test.cc | 14
-rw-r--r--  test/motion_vector_test.cc | 8
-rw-r--r--  test/mv_cost_test.cc | 17
-rw-r--r--  test/noise_model_test.cc | 10
-rw-r--r--  test/obmc_sad_test.cc | 32
-rw-r--r--  test/obmc_variance_test.cc | 228
-rw-r--r--  test/pickrst_test.cc | 63
-rw-r--r--  test/postproc_filters_test.cc | 10
-rw-r--r--  test/quant_test.cc | 22
-rw-r--r--  test/quantize_func_test.cc | 10
-rw-r--r--  test/ratectrl_rtc_test.cc | 85
-rw-r--r--  test/reconinter_test.cc | 429
-rw-r--r--  test/resize_test.cc | 113
-rw-r--r--  test/rt_end_to_end_test.cc | 12
-rw-r--r--  test/sad_test.cc | 506
-rw-r--r--  test/sb_multipass_test.cc | 10
-rw-r--r--  test/scalability_test.cc | 8
-rw-r--r--  test/screen_content_test.cc | 14
-rw-r--r--  test/selfguided_filter_test.cc | 42
-rw-r--r--  test/sharpness_test.cc | 2
-rw-r--r--  test/simd_impl.h | 6
-rw-r--r--  test/sse_sum_test.cc | 6
-rw-r--r--  test/still_picture_test.cc | 14
-rw-r--r--  test/subtract_test.cc | 8
-rw-r--r--  test/sum_squares_test.cc | 205
-rw-r--r--  test/svc_datarate_test.cc | 573
-rw-r--r--  test/temporal_filter_test.cc | 147
-rw-r--r--  test/test-data.sha1 | 2
-rw-r--r--  test/test.cmake | 67
-rw-r--r--  test/test_data_util.cmake | 4
-rw-r--r--  test/test_libaom.cc | 35
-rw-r--r--  test/test_vector_test.cc | 11
-rw-r--r--  test/tile_config_test.cc | 42
-rw-r--r--  test/tile_independence_test.cc | 12
-rw-r--r--  test/time_stamp_test.cc | 10
-rw-r--r--  test/transform_test_base.h | 7
-rw-r--r--  test/variance_test.cc | 869
-rw-r--r--  test/video_source.h | 26
-rw-r--r--  test/warp_filter_test.cc | 33
-rw-r--r--  test/warp_filter_test_util.cc | 87
-rw-r--r--  test/warp_filter_test_util.h | 12
-rw-r--r--  test/webm_video_source.h | 14
-rw-r--r--  test/wiener_test.cc | 31
-rw-r--r--  test/y4m_test.cc | 6
-rw-r--r--  test/y4m_video_source.h | 18
-rw-r--r--  test/yuv_video_source.h | 18
-rw-r--r--  third_party/fastfeat/README.libaom | 1
-rw-r--r--  third_party/fastfeat/fast.c | 14
-rw-r--r--  third_party/fastfeat/fast.h | 6
-rw-r--r--  third_party/fastfeat/fast_9.c | 15
-rw-r--r--  third_party/fastfeat/nonmax.c | 3
465 files changed, 54145 insertions, 33653 deletions
diff --git a/.mailmap b/.mailmap
index 7d31a708e..6d6e6302b 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1,12 +1,16 @@
+Aasaipriya Chandran <aasaipriya.c@ittiam.com>
+Aasaipriya Chandran <aasaipriya.c@ittiam.com> Aasaipriya C <100778@ittiam.com>
Adrian Grange <agrange@google.com>
-Aℓex Converse <aconverse@google.com>
-Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com>
+Adrian Grange <agrange@google.com> <agrange@agrange-macbookpro.roam.corp.google.com>
+Alexander Bokov <alexanderbokov@google.com>
Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
Alpha Lam <hclam@google.com> <hclam@chromium.org>
Andrey Norkin <anorkin@netflix.com>
Angie Chiang <angiebird@google.com>
Arild Fuldseth <arilfuld@cisco.com> <arild.fuldseth@gmail.com>
Arild Fuldseth <arilfuld@cisco.com> <arilfuld@cisco.com>
+Aℓex Converse <aconverse@google.com>
+Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com>
Aasaipriya Chandran <aasaipriya.c@ittiam.com>
Aasaipriya Chandran <aasaipriya.c@ittiam.com> Aasaipriya C <100778@ittiam.com>
Apurve Pandey <apurve.pandey@ittiam.com>
@@ -27,9 +31,10 @@ Fyodor Kyslov <kyslov@google.com>
Grant Hsu <grant.hsu@cidana.com> <grant.hsu@gmail.com>
Guillaume Martres <smarter@ubuntu.com>
Guillaume Martres <smarter@ubuntu.com> <gmartres@google.com>
-Guillaume Martres <smarter@ubuntu.com> <smarter3@gmail.com>
Guillaume Martres <smarter@ubuntu.com> <gmartres@mozilla.com>
+Guillaume Martres <smarter@ubuntu.com> <smarter3@gmail.com>
Hangyu Kuang <hkuang@google.com>
+Hangyu Kuang <hkuang@google.com> <hkuang@hkuang-macbookpro.roam.corp.google.com>
Hui Su <huisu@google.com>
Iole Moccagatta <iole.moccagatta@gmail.com>
Jacky Chen <jackychen@google.com>
@@ -40,13 +45,14 @@ Johann Koenig <johannkoenig@google.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com>
Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org>
+Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
John Koleszar <jkoleszar@google.com>
Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
Kyle Siefring <siekyleb@amazon.com>
Kyle Siefring <siekyleb@amazon.com> <kylesiefring@gmail.com>
Lin Zheng <linzhen@google.com>
-Lokeshwar Reddy B <lokeshwar.reddy@ittiam.com>
Logan Goldberg <logangw@google.com>
+Lokeshwar Reddy B <lokeshwar.reddy@ittiam.com>
Luc Trudeau <luc@trud.ca>
Luc Trudeau <luc@trud.ca> <ltrudeau@mozilla.com>
Marco Paniconi <marpan@google.com>
@@ -56,6 +62,7 @@ Michael Horowitz <mhoro@webrtc.org> <mhoro@google.com>
Mingliang Chen <mlchen@google.com>
Monty Montgomery <cmontgomery@mozilla.com>
Mudassir Galaganath <mudassir.galaganath@ittiam.com>
+Narayan Kalaburgi <narayan.kalaburgi@ittiam.com>
Mudassir Galaganath <mudassir.galaganath@ittiam.com> Mudassir Galagnath
Nathan E. Egge <negge@mozilla.com>
Nathan E. Egge <negge@mozilla.com> <negge@dgql.org>
@@ -72,13 +79,14 @@ Ralph Giles <giles@xiph.org> <giles@mozilla.com>
Remya Prakasan <remya.prakasan@ittiam.com>
Roger Zhou <youzhou@microsoft.com>
Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
-Ryan Lei <ryanlei@fb.com> <ryan.z.lei@intel.com>
Ryan Lei <ryanlei@fb.com> <ryan.lei@intel.com>
+Ryan Lei <ryanlei@fb.com> <ryan.z.lei@intel.com>
Ryan Lei <ryanlei@fb.com> <zlei3@ZLEI3-DESK.amr.corp.intel.com>
Sachin Kumar Garg <sachin.kumargarg@ittiam.com>
Sai Deng <sdeng@google.com>
Sami Pietilä <samipietila@google.com>
Sarah Parker <sarahparker@google.com>
+Susanna D'Souza <susannad@google.com>
Tamar Levy <tamar.levy@intel.com>
Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
@@ -90,14 +98,16 @@ Tom Finegan <tomfinegan@google.com>
Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
Tristan Matthews <tmatth@videolan.org> <le.businessman@gmail.com>
Venkat Sanampudi <sanampudi.venkatarao@ittiam.com>
+Vitalii Dziumenko <vdziumenko@luxoft.com> <vdziumenko@luxoft.corp-partner.google.com>
Wei-Ting Lin <weitinglin@google.com>
Wei-Ting Lin <weitinglin@google.com> <weitingco@gmail.com>
Wenyao Liu <wenyao.liu@cidana.com>
Will Bresnahan <bill.wresnahan@gmail.com>
+Yaowu Xu <yaowu@google.com> <Yaowu Xu>
Yaowu Xu <yaowu@google.com> <adam@xuyaowu.com>
+Yaowu Xu <yaowu@google.com> <yaowu.google.com>
+Yaowu Xu <yaowu@google.com> <yaowu@YAOWU2-W.ad.corp.google.com>
Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
Yaowu Xu <yaowu@google.com> <yaowu@yaowu-macbookpro.roam.corp.google.com>
-Yaowu Xu <yaowu@google.com> <Yaowu Xu>
-Yaowu Xu <yaowu@google.com> <yaowu.google.com>
Zhipin Deng <zhipin.deng@intel.com>
Zoe Liu <zoeliu@gmail.com> <zoeliu@google.com>
diff --git a/AUTHORS b/AUTHORS
index 79056a15c..509c0d1c9 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -29,6 +29,7 @@ Anupam Pandey <anupam.pandey@ittiam.com>
Apurve Kumar Pandey <apurve.pandey@ittiam.com>
Arild Fuldseth <arilfuld@cisco.com>
Aron Rosenberg <arosenberg@logitech.com>
+Arpad Panyik <Arpad.Panyik@arm.com>
Arun Singh Negi <arun.negi@ittiam.com>
Attila Nagy <attilanagy@google.com>
Balaji Anandapadmanaban <balaji.anandapadmanaban@arm.com>
@@ -96,6 +97,7 @@ Hanno Böck <hanno@hboeck.de>
Harish Mahendrakar <harish.mahendrakar@ittiam.com>
Henrik Lundin <hlundin@google.com>
Hien Ho <hienho@google.com>
+Hirokazu Honda <hiroh@google.com>
Hui Su <huisu@google.com>
Ilie Halip <ilie.halip@gmail.com>
Ilya Brailovskiy <brailovs@lab126.com>
@@ -150,6 +152,7 @@ Larisa Markeeva <lmarkeeva@google.com>
Lauren Partin <lpartin@google.com>
Lawrence Velázquez <larryv@macports.org>
leolzhao <leolzhao@tencent.com>
+Leon Kollar <Leon.Kollar@arm.com>
L. E. Segovia <amy@amyspark.me>
Lester Lu <kslu@google.com>
liang zhao <leolzhao@tencent.com>
@@ -191,7 +194,6 @@ Morton Jonuschat <yabawock@gmail.com>
Mudassir Galaganath <mudassir.galaganath@ittiam.com>
Mufaddal Chakera <mufaddal.chakera@ittiam.com>
Narayan Kalaburgi <narayan.kalaburgi@ittiam.com>
-Narayan <narayan.kalaburgi@ittiam.com>
Nathan E. Egge <negge@mozilla.com>
Neeraj Gadgil <neeraj.gadgil@ittiam.com>
Neha Mary Thomas <neha.marythomas@ittiam.com>
@@ -233,6 +235,7 @@ Ronald S. Bultje <rsbultje@gmail.com>
Rostislav Pehlivanov <rpehlivanov@mozilla.com>
Ruiling Song <ruiling.song@intel.com>
Rui Ueyama <ruiu@google.com>
+Ruoyu Zhong <zhongruoyu@outlook.com>
Rupert Swarbrick <rupert.swarbrick@argondesign.com>
Ryan Lei <ryanlei@fb.com>
Ryan Overbeck <rover@google.com>
@@ -242,8 +245,10 @@ Salome Thirot <salome.thirot@arm.com>
Sami Boukortt <sboukortt@google.com>
Sami Pietilä <samipietila@google.com>
Samuel Thibault <samuel.thibault@ens-lyon.org>
+Samuthirika S <samuthirika.s@ittiam.com>
Sarah Parker <sarahparker@google.com>
Sasi Inguva <isasi@google.com>
+Satheesh Kumar <satheesh.kumar@ittiam.com>
Satish Kumar Suman <satish.suman@ittiam.com>
Scott Graham <scottmg@chromium.org>
Scott LaVarnway <slavarnway@google.com>
@@ -261,8 +266,9 @@ Soo-Chul Han <shan@vidyo.com>
Stanislav Vitvitskyy <vitvitskyy@google.com>
Stefan Holmer <holmer@google.com>
Steinar Midtskogen <stemidts@cisco.com>
+Steve Lhomme <robux4@gmail.com>
Suman Sunkara <sunkaras@google.com>
-susannad <susannad@google.com>
+Susanna D'Souza <susannad@google.com>
Taekhyun Kim <takim@nvidia.com>
Takanori MATSUURA <t.matsuu@gmail.com>
Tamar Levy <tamar.levy@intel.com>
@@ -290,7 +296,6 @@ Vincent Rabaud <vrabaud@google.com>
Vishesh <vishesh.garg@ittiam.com>
Vishnu Teja Manyam <vishnu.teja@ittiam.com>
Vitalii Dziumenko <vdziumenko@luxoft.com>
-Vitalii Dziumenko <vdziumenko@luxoft.corp-partner.google.com>
Wan-Teh Chang <wtc@google.com>
Wei-Ting Lin <weitinglin@google.com>
Wenyao Liu <wenyao.liu@cidana.com>
diff --git a/Android.bp b/Android.bp
index c7cb62131..bf42082c6 100644
--- a/Android.bp
+++ b/Android.bp
@@ -29,10 +29,17 @@ aom_av1_common_intrin_neon = [
"av1/common/arm/blend_a64_vmask_neon.c",
"av1/common/arm/cdef_block_neon.c",
"av1/common/arm/cfl_neon.c",
+ "av1/common/arm/compound_convolve_neon.c",
"av1/common/arm/convolve_neon.c",
+ "av1/common/arm/highbd_compound_convolve_neon.c",
+ "av1/common/arm/highbd_convolve_horiz_rs_neon.c",
"av1/common/arm/highbd_convolve_neon.c",
+ "av1/common/arm/highbd_convolve_scale_neon.c",
"av1/common/arm/highbd_inv_txfm_neon.c",
- "av1/common/arm/jnt_convolve_neon.c",
+ "av1/common/arm/highbd_reconinter_neon.c",
+ "av1/common/arm/highbd_reconintra_neon.c",
+ "av1/common/arm/highbd_warp_plane_neon.c",
+ "av1/common/arm/highbd_wiener_convolve_neon.c",
"av1/common/arm/reconinter_neon.c",
"av1/common/arm/reconintra_neon.c",
"av1/common/arm/resize_neon.c",
@@ -41,13 +48,23 @@ aom_av1_common_intrin_neon = [
"av1/common/arm/wiener_convolve_neon.c",
]
+aom_av1_common_intrin_neon_dotprod = [
+ "av1/common/arm/compound_convolve_neon_dotprod.c",
+ "av1/common/arm/convolve_neon_dotprod.c",
+]
+
+aom_av1_common_intrin_neon_i8mm = [
+ "av1/common/arm/compound_convolve_neon_i8mm.c",
+ "av1/common/arm/convolve_neon_i8mm.c",
+ "av1/common/arm/warp_plane_neon_i8mm.c",
+]
+
aom_av1_common_intrin_sse2 = [
"av1/common/x86/cdef_block_sse2.c",
"av1/common/x86/cfl_sse2.c",
"av1/common/x86/convolve_2d_sse2.c",
"av1/common/x86/convolve_sse2.c",
"av1/common/x86/jnt_convolve_sse2.c",
- "av1/common/x86/warp_plane_sse2.c",
"av1/common/x86/wiener_convolve_sse2.c",
]
@@ -78,6 +95,10 @@ aom_av1_common_intrin_ssse3 = [
"av1/common/x86/resize_ssse3.c",
]
+aom_av1_common_intrin_sve = [
+ "av1/common/arm/warp_plane_sve.c",
+]
+
aom_av1_common_intrin_vsx = [
"av1/common/ppc/cfl_ppc.c",
]
@@ -141,7 +162,7 @@ aom_av1_encoder_asm_ssse3_x86_64 = [
]
aom_av1_encoder_intrin_arm_crc32 = [
- "av1/encoder/arm/crc32/hash_crc32.c",
+ "av1/encoder/arm/crc32/hash_arm_crc32.c",
]
aom_av1_encoder_intrin_avx2 = [
@@ -155,6 +176,7 @@ aom_av1_encoder_intrin_avx2 = [
"av1/encoder/x86/highbd_block_error_intrin_avx2.c",
"av1/encoder/x86/highbd_fwd_txfm_avx2.c",
"av1/encoder/x86/highbd_temporal_filter_avx2.c",
+ "av1/encoder/x86/ml_avx2.c",
"av1/encoder/x86/pickrst_avx2.c",
"av1/encoder/x86/rdopt_avx2.c",
"av1/encoder/x86/temporal_filter_avx2.c",
@@ -168,9 +190,12 @@ aom_av1_encoder_intrin_neon = [
"av1/encoder/arm/neon/av1_k_means_neon.c",
"av1/encoder/arm/neon/encodetxb_neon.c",
"av1/encoder/arm/neon/highbd_fwd_txfm_neon.c",
+ "av1/encoder/arm/neon/highbd_pickrst_neon.c",
+ "av1/encoder/arm/neon/highbd_rdopt_neon.c",
+ "av1/encoder/arm/neon/highbd_temporal_filter_neon.c",
"av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c",
"av1/encoder/arm/neon/ml_neon.c",
- "av1/encoder/arm/neon/picksrt_neon.c",
+ "av1/encoder/arm/neon/pickrst_neon.c",
"av1/encoder/arm/neon/quantize_neon.c",
"av1/encoder/arm/neon/rdopt_neon.c",
"av1/encoder/arm/neon/reconinter_enc_neon.c",
@@ -178,6 +203,10 @@ aom_av1_encoder_intrin_neon = [
"av1/encoder/arm/neon/wedge_utils_neon.c",
]
+aom_av1_encoder_intrin_neon_dotprod = [
+ "av1/encoder/arm/neon/temporal_filter_neon_dotprod.c",
+]
+
aom_av1_encoder_intrin_sse2 = [
"av1/encoder/x86/av1_fwd_txfm_sse2.c",
"av1/encoder/x86/av1_k_means_sse2.c",
@@ -339,6 +368,10 @@ aom_dsp_common_intrin_neon = [
"aom_dsp/arm/avg_pred_neon.c",
"aom_dsp/arm/blend_a64_mask_neon.c",
"aom_dsp/arm/fwd_txfm_neon.c",
+ "aom_dsp/arm/highbd_blend_a64_hmask_neon.c",
+ "aom_dsp/arm/highbd_blend_a64_mask_neon.c",
+ "aom_dsp/arm/highbd_blend_a64_vmask_neon.c",
+ "aom_dsp/arm/highbd_convolve8_neon.c",
"aom_dsp/arm/highbd_intrapred_neon.c",
"aom_dsp/arm/highbd_loopfilter_neon.c",
"aom_dsp/arm/intrapred_neon.c",
@@ -346,6 +379,14 @@ aom_dsp_common_intrin_neon = [
"aom_dsp/arm/subtract_neon.c",
]
+aom_dsp_common_intrin_neon_dotprod = [
+ "aom_dsp/arm/aom_convolve8_neon_dotprod.c",
+]
+
+aom_dsp_common_intrin_neon_i8mm = [
+ "aom_dsp/arm/aom_convolve8_neon_i8mm.c",
+]
+
aom_dsp_common_intrin_sse2 = [
"aom_dsp/x86/aom_asm_stubs.c",
"aom_dsp/x86/aom_convolve_copy_sse2.c",
@@ -441,12 +482,19 @@ aom_dsp_encoder_intrin_avx2 = [
aom_dsp_encoder_intrin_neon = [
"aom_dsp/arm/avg_neon.c",
+ "aom_dsp/arm/blk_sse_sum_neon.c",
"aom_dsp/arm/hadamard_neon.c",
"aom_dsp/arm/highbd_avg_neon.c",
+ "aom_dsp/arm/highbd_avg_pred_neon.c",
"aom_dsp/arm/highbd_hadamard_neon.c",
+ "aom_dsp/arm/highbd_masked_sad_neon.c",
+ "aom_dsp/arm/highbd_obmc_sad_neon.c",
+ "aom_dsp/arm/highbd_obmc_variance_neon.c",
"aom_dsp/arm/highbd_quantize_neon.c",
- "aom_dsp/arm/highbd_sad4d_neon.c",
"aom_dsp/arm/highbd_sad_neon.c",
+ "aom_dsp/arm/highbd_sadxd_neon.c",
+ "aom_dsp/arm/highbd_sse_neon.c",
+ "aom_dsp/arm/highbd_subpel_variance_neon.c",
"aom_dsp/arm/highbd_variance_neon.c",
"aom_dsp/arm/masked_sad4d_neon.c",
"aom_dsp/arm/masked_sad_neon.c",
@@ -458,6 +506,16 @@ aom_dsp_encoder_intrin_neon = [
"aom_dsp/arm/subpel_variance_neon.c",
"aom_dsp/arm/sum_squares_neon.c",
"aom_dsp/arm/variance_neon.c",
+ "aom_dsp/flow_estimation/arm/disflow_neon.c",
+]
+
+aom_dsp_encoder_intrin_neon_dotprod = [
+ "aom_dsp/arm/highbd_variance_neon_dotprod.c",
+ "aom_dsp/arm/sad_neon_dotprod.c",
+ "aom_dsp/arm/sadxd_neon_dotprod.c",
+ "aom_dsp/arm/sse_neon_dotprod.c",
+ "aom_dsp/arm/sum_squares_neon_dotprod.c",
+ "aom_dsp/arm/variance_neon_dotprod.c",
]
aom_dsp_encoder_intrin_sse2 = [
@@ -658,7 +716,7 @@ cc_library_static {
aom_dsp_common_sources +
aom_dsp_decoder_sources +
aom_mem_sources +
- ["aom_ports/arm_cpudetect.c"] +
+ ["aom_ports/aarch64_cpudetect.c"] +
aom_rtcd_sources +
aom_scale_sources +
aom_sources +
@@ -684,7 +742,7 @@ cc_library_static {
aom_dsp_decoder_sources +
aom_mem_sources +
aom_rtcd_sources +
- ["aom_ports/arm_cpudetect.c"] +
+ ["aom_ports/aarch32_cpudetect.c"] +
aom_scale_sources +
aom_sources +
aom_util_sources +
diff --git a/Android.bp.in b/Android.bp.in
index 2295101e7..f5b4939cd 100644
--- a/Android.bp.in
+++ b/Android.bp.in
@@ -64,7 +64,7 @@ cc_library_static {
aom_dsp_common_sources +
aom_dsp_decoder_sources +
aom_mem_sources +
- ["aom_ports/arm_cpudetect.c"] +
+ ["aom_ports/aarch64_cpudetect.c"] +
aom_rtcd_sources +
aom_scale_sources +
aom_sources +
@@ -90,7 +90,7 @@ cc_library_static {
aom_dsp_decoder_sources +
aom_mem_sources +
aom_rtcd_sources +
- ["aom_ports/arm_cpudetect.c"] +
+ ["aom_ports/aarch32_cpudetect.c"] +
aom_scale_sources +
aom_sources +
aom_util_sources +
diff --git a/Android.mk b/Android.mk
deleted file mode 100644
index 612c879b4..000000000
--- a/Android.mk
+++ /dev/null
@@ -1 +0,0 @@
-#Empty Android.mk to shadow third_party/libwebm/Android.mk
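The Android.bp and Android.bp.in changes above swap the old unified
aom_ports/arm_cpudetect.c for per-architecture detectors:
aarch64_cpudetect.c on 64-bit targets and aarch32_cpudetect.c on 32-bit
targets. The capability bits these files report drive libaom's run-time
dispatch, which is generated by rtcd.pl rather than written by hand; the C
sketch below only illustrates the shape of that mechanism, and the flag
values are made up for the example rather than copied from aom_ports/arm.h.

  #include <stddef.h>
  #include <stdint.h>

  /* Capability bits reported by aarch{32,64}_cpudetect.c (illustrative
   * values, not the real definitions). */
  #define HAS_NEON (1 << 0)
  #define HAS_NEON_DOTPROD (1 << 1)
  #define HAS_NEON_I8MM (1 << 2)

  int aom_arm_cpu_caps(void); /* implemented by the new cpudetect files */

  /* One kernel with the variants wired up in this change (see the
   * aom_convolve8_horiz specialize line in aom_dsp_rtcd_defs.pl below). */
  typedef void convolve8_fn(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h);
  convolve8_fn aom_convolve8_horiz_c, aom_convolve8_horiz_neon,
      aom_convolve8_horiz_neon_dotprod, aom_convolve8_horiz_neon_i8mm;

  /* Start at the C fallback; upgrade to the best variant the CPU supports. */
  convolve8_fn *aom_convolve8_horiz = aom_convolve8_horiz_c;

  static void rtcd_setup(void) {
    const int caps = aom_arm_cpu_caps();
    if (caps & HAS_NEON) aom_convolve8_horiz = aom_convolve8_horiz_neon;
    if (caps & HAS_NEON_DOTPROD)
      aom_convolve8_horiz = aom_convolve8_horiz_neon_dotprod;
    if (caps & HAS_NEON_I8MM)
      aom_convolve8_horiz = aom_convolve8_horiz_neon_i8mm;
  }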
diff --git a/CHANGELOG b/CHANGELOG
index b573eda2b..daa83a1a6 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,85 @@
+2024-01-17 v3.8.1
+ This release includes several bug fixes. This release is ABI
+ compatible with the last release. See
+ https://aomedia.googlesource.com/aom/+log/v3.8.0..v3.8.1 for all the
+ commits in this release.
+
+ - Bug Fixes
+ * aomedia:3520: get_cubic_kernel_dbl: Assertion `0 <= x && x < 1'
+ failed.
+ * aomedia:3526: alloc_compressor_data() is called during every
+ aom_codec_control() call on the encoder.
+ * aomedia:3527: aom/av1/encoder/mcomp.c:1810: av1_full_pixel_search:
+ Assertion `ms_params->ms_buffers.ref->width ==
+ ms_params->ms_buffers.src->width' failed.
+ * aomedia:3534: libaom encoder crashed by AOM_USAGE_ALL_INTRA and
+ AOM_EFLAG_NO_REF_LAST flags.
+ * b/310455204: Recreate workers if necessary.
+ * b/310548198: Update frame size in actual encoding.
+ * b/314858909: Do not use adaptive error estimate.
+ * Fix a hang of cmake on arm64 macOS with cmake 3.27.0 or later.
+
+2023-11-30 v3.8.0
+ This release includes new codec interfaces, compression efficiency and
+ perceptual improvements, speedup and memory optimizations and many bug
+ fixes. This release is ABI compatible with the last release.
+
+ - New Features
+ * New codec controls:
+ * AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR: Set the maximum number of
+ consecutive frame drops allowed for the frame dropper in 1 pass
+ CBR mode.
+ * Run-time CPU feature detection for all Arm platforms:
+ CRC, DotProd, I8MM and SVE CPU feature presence is detected at run
+ time and code paths making use of these features are selected
+ dynamically. These code paths provide meaningful performance gains
+ for standard bitdepth RTC and VoD encoding: up to 10% and 20%
+ respectively, over the Armv8.0-A baseline build.
+ * RTC: Frame-dropper support added to the rate control library.
+ * RTC Rate control improvements for low bitrate and for SVC.
+
+ - Compression Efficiency Improvements
+ * Improved accuracy of cost estimation for loop restoration and
+ global motion.
+ * Improved selection of loop restoration unit size - full search up
+ to (non-realtime) speed 2, retuned static selection at higher
+ speeds.
+ * RTC Screen content mode: 3-5% bdrate gains across speeds 7 - 10.
+ * Good-quality mode: 0.2 - 0.5% bdrate gains across speeds 1 - 4.
+
+ - Perceptual Quality Improvements
+ * RTC Screen: Improved visual quality for scrolling.
+ * RTC: Improved color quality for both screen and video mode.
+
+ - Speedup and Memory Optimizations
+ * Good-quality, single-thread encoder speedups:
+ o 15% improvement for speed 5.
+ o 12% improvement for speed 6.
+ * Arm standard bitdepth VoD (--good):
+ o 8% speedup for speeds 0 and 1.
+ o 20% speedup for speed 2.
+ o 27% speedup for speed 3.
+ o 30% speedup for speed 4.
+ o 38% speedup for speeds 5 and 6.
+ * Arm high bitdepth VoD (--good):
+ o 206% speedup for speeds 0 and 1.
+ o 180% speedup for speed 2.
+ o 51% speedup for speeds 3 and 4.
+ o 68% speedup for speed 5.
+ o 72% speedup for speed 6.
+ * RTC Screen content: 2-6% speedup across speeds 7-10.
+ * RTC: 2-3% speedup for temporal layers.
+ * RTC: Speedups to reference scaling in nonrd pickmode.
+ * Good-quality mode: Simplified global motion estimation, saving
+ ~1200 lines of code and 1KB of tables while improving quality.
+
+ - Bug Fixes
+ * Fixes to improve libaom stability in case of memory allocation
+ failures.
+ * Fixes to SIMD functions (x86 AVX2/SSE2 and ARM Neon).
+ * b/310457427, b/310766628: Bug fixes to only use rec_sse in CBR
+ mode.
+
2023-11-17 v3.7.1
This release includes several bug fixes. This release is ABI
compatible with the last release. See
@@ -9,7 +91,7 @@
* aomedia:3478: GCC 12.2.0 emits a -Wstringop-overflow warning on
aom/av1/encoder/motion_search_facade.c
* aomedia:3489: Detect encoder and image high bit depth mismatch
- * aomedia:3491: heap-buffer-overflow on frame size change
+ * aomedia:3491: heap-buffer-overflow on frame size change (CVE-2023-6879)
* b/303023614: Segfault at encoding time for high bit depth images
2023-08-10 v3.7.0
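One usage note on the AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR entry above: the
control takes an int and is issued through a regular aom_codec_control()
call, as the aomcx.h hunk later in this change confirms. A minimal sketch,
with encoder setup and error handling elided:

  #include <aom/aomcx.h>

  /* Allow at most two consecutive frame drops in 1-pass CBR mode; per the
   * control's documentation, a value of zero has no effect. */
  static aom_codec_err_t limit_consecutive_drops(aom_codec_ctx_t *ctx) {
    return aom_codec_control(ctx, AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, 2);
  }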
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8e0b65fc9..308a93d19 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,9 +58,9 @@ endif()
# passed to libtool.
#
# We set SO_FILE_VERSION = [c-a].a.r
-set(LT_CURRENT 10)
+set(LT_CURRENT 11)
set(LT_REVISION 1)
-set(LT_AGE 7)
+set(LT_AGE 8)
math(EXPR SO_VERSION "${LT_CURRENT} - ${LT_AGE}")
set(SO_FILE_VERSION "${SO_VERSION}.${LT_AGE}.${LT_REVISION}")
unset(LT_CURRENT)
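Worked out with the new values above, using the SO_FILE_VERSION = [c-a].a.r
formula from the comment at the top of this hunk:

  SO_VERSION = LT_CURRENT - LT_AGE = 11 - 8 = 3
  SO_FILE_VERSION = ${SO_VERSION}.${LT_AGE}.${LT_REVISION} = 3.8.1

which matches the v3.8.1 release this snapshot pulls in.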
@@ -288,9 +288,9 @@ if(BUILD_SHARED_LIBS)
add_library(aom_static STATIC ${target_objs_aom} $<TARGET_OBJECTS:aom_rtcd>)
set_target_properties(aom_static PROPERTIES OUTPUT_NAME aom)
if(MSVC OR (WIN32 AND NOT MINGW))
- # Fix race condition on the export library file between the two versions.
- # Affects MSVC in all three flavors (stock, Clang/CL, LLVM-- the latter sets
- # MSVC and MINGW both to FALSE).
+ # Fix race condition between the import library and the static library.
+ # Affects MSVC in all three flavors (stock, clang-cl, LLVM -- the latter
+ # sets MSVC and MINGW both to FALSE).
set_target_properties(aom PROPERTIES ARCHIVE_OUTPUT_NAME "aom_dll")
endif()
@@ -323,7 +323,7 @@ if(NOT WIN32 AND NOT APPLE)
endif()
endif()
-if(CONFIG_AV1_ENCODER AND NOT CONFIG_REALTIME_ONLY AND NOT BUILD_SHARED_LIBS)
+if(CONFIG_AV1_ENCODER AND NOT BUILD_SHARED_LIBS)
list(APPEND AOM_AV1_RC_SOURCES "${AOM_ROOT}/av1/ratectrl_rtc.h"
"${AOM_ROOT}/av1/ratectrl_rtc.cc")
add_library(aom_av1_rc ${AOM_AV1_RC_SOURCES})
@@ -336,7 +336,7 @@ endif()
# List of object and static library targets.
set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_rtcd aom_mem aom_scale aom)
-if(CONFIG_AV1_ENCODER AND NOT CONFIG_REALTIME_ONLY AND NOT BUILD_SHARED_LIBS)
+if(CONFIG_AV1_ENCODER AND NOT BUILD_SHARED_LIBS)
set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_rc)
endif()
if(BUILD_SHARED_LIBS)
@@ -387,13 +387,6 @@ if(ENABLE_EXAMPLES OR ENABLE_TESTS OR ENABLE_TOOLS)
endif()
endif()
-if((CONFIG_AV1_DECODER OR CONFIG_AV1_ENCODER) AND ENABLE_EXAMPLES)
- add_executable(resize_util "${AOM_ROOT}/examples/resize_util.c"
- $<TARGET_OBJECTS:aom_common_app_util>)
- set_property(TARGET ${example} PROPERTY FOLDER examples)
- list(APPEND AOM_APP_TARGETS resize_util)
-endif()
-
if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES)
add_executable(aomdec "${AOM_ROOT}/apps/aomdec.c"
$<TARGET_OBJECTS:aom_common_app_util>
@@ -494,14 +487,18 @@ if(CONFIG_AV1_ENCODER)
$<TARGET_OBJECTS:aom_common_app_util>
$<TARGET_OBJECTS:aom_encoder_app_util>)
- add_executable(svc_encoder_rtc "${AOM_ROOT}/examples/svc_encoder_rtc.cc"
- $<TARGET_OBJECTS:aom_common_app_util>
- $<TARGET_OBJECTS:aom_encoder_app_util>)
-
# Maintain a list of encoder example targets.
list(APPEND AOM_ENCODER_EXAMPLE_TARGETS aomenc lossless_encoder noise_model
photon_noise_table set_maps simple_encoder scalable_encoder
- twopass_encoder svc_encoder_rtc)
+ twopass_encoder)
+
+ if(NOT BUILD_SHARED_LIBS)
+ add_executable(svc_encoder_rtc "${AOM_ROOT}/examples/svc_encoder_rtc.cc"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+ target_link_libraries(svc_encoder_rtc ${AOM_LIB_LINK_TYPE} aom_av1_rc)
+ list(APPEND AOM_ENCODER_EXAMPLE_TARGETS svc_encoder_rtc)
+ endif()
endif()
if(ENABLE_TOOLS)
@@ -852,7 +849,7 @@ set_user_flags()
# Aomedia documentation rule.
set(DOXYGEN_VERSION_VALUE 0)
if(ENABLE_DOCS)
- include(FindDoxygen)
+ find_package(Doxygen)
if(DOXYGEN_FOUND)
# Check if Doxygen version is >= minimum required version(i.e. 1.8.10).
set(MINIMUM_DOXYGEN_VERSION 1008010)
@@ -942,7 +939,8 @@ endif()
get_cmake_property(all_cmake_vars VARIABLES)
foreach(var ${all_cmake_vars})
if("${var}" MATCHES "SOURCES$\|_INTRIN_\|_ASM_"
- AND NOT "${var}" MATCHES "DOXYGEN\|LIBYUV\|_PKG_\|TEST")
+ AND NOT "${var}" MATCHES "DOXYGEN\|LIBYUV\|_PKG_\|TEST"
+ AND NOT "${var}" MATCHES "_ASM_NASM\|_ASM_COMPILER_")
list(APPEND aom_source_vars ${var})
endif()
endforeach()
diff --git a/METADATA b/METADATA
index 8e6345485..35df8b861 100644
--- a/METADATA
+++ b/METADATA
@@ -20,10 +20,10 @@ third_party {
type: GIT
value: "https://aomedia.googlesource.com/aom/"
}
- version: "v3.7.1"
+ version: "v3.8.1"
last_upgrade_date {
- year: 2023
- month: 11
- day: 21
+ year: 2024
+ month: 1
+ day: 22
}
}
diff --git a/README.android b/README.android
index d8a109ec2..50860bfd4 100644
--- a/README.android
+++ b/README.android
@@ -1,12 +1,12 @@
Name: libaom
URL: https://aomedia.org
-Version: v3.7.1
+Version: v3.8.1
License: BSD
License File: libaom/LICENSE
-Date: Tuesday November 21 2023
-Branch: v3.7.1
-Commit: aca387522ccc0a1775716923d5489dd2d4b1e628
+Date: Monday January 22 2024
+Branch: v3.8.1
+Commit: bb6430482199eaefbeaaa396600935082bc43f66
Description:
Contains the sources used to compile libaom.
diff --git a/README.md b/README.md
index d7b66e028..4e2eb2756 100644
--- a/README.md
+++ b/README.md
@@ -159,6 +159,7 @@ cross compiling via the use of toolchain files included in the AV1 repository.
The toolchain files available at the time of this writing are:
- arm64-ios.cmake
+ - arm64-linux-clang.cmake
- arm64-linux-gcc.cmake
- arm64-mingw-gcc.cmake
- armv7-ios.cmake
diff --git a/README.version b/README.version
deleted file mode 100644
index 5b03ffdf8..000000000
--- a/README.version
+++ /dev/null
@@ -1,3 +0,0 @@
-URL: https://aomedia.googlesource.com/aom/
-Version: v3.7.1
-Local Modifications:
diff --git a/aom/aom_encoder.h b/aom/aom_encoder.h
index e3d8d2993..5d0bbe103 100644
--- a/aom/aom_encoder.h
+++ b/aom/aom_encoder.h
@@ -1006,11 +1006,11 @@ aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx,
aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx);
/*!\brief usage parameter analogous to AV1 GOOD QUALITY mode. */
-#define AOM_USAGE_GOOD_QUALITY (0)
+#define AOM_USAGE_GOOD_QUALITY 0u
/*!\brief usage parameter analogous to AV1 REALTIME mode. */
-#define AOM_USAGE_REALTIME (1)
+#define AOM_USAGE_REALTIME 1u
/*!\brief usage parameter analogous to AV1 all intra mode. */
-#define AOM_USAGE_ALL_INTRA (2)
+#define AOM_USAGE_ALL_INTRA 2u
/*!\brief Encode a frame
*
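The switch above from parenthesized int literals to 0u/1u/2u matches how
these macros are consumed: the usage argument of
aom_codec_enc_config_default() and the g_usage field it initializes are both
unsigned int. A minimal sketch of that call:

  #include <aom/aomcx.h>

  /* Fetch default realtime-usage encoder parameters; AOM_USAGE_REALTIME is
   * now an unsigned literal matching the unsigned usage parameter. */
  static aom_codec_err_t default_rt_cfg(aom_codec_enc_cfg_t *cfg) {
    return aom_codec_enc_config_default(aom_codec_av1_cx(), cfg,
                                        AOM_USAGE_REALTIME);
  }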
diff --git a/aom/aomcx.h b/aom/aomcx.h
index a5db0a52f..f061be3f9 100644
--- a/aom/aomcx.h
+++ b/aom/aomcx.h
@@ -208,14 +208,14 @@ enum aome_enc_control_id {
* encoding process, values greater than 0 will increase encoder speed at
* the expense of quality.
*
- * Valid range: 0..10. 0 runs the slowest, and 10 runs the fastest;
+ * Valid range: 0..11. 0 runs the slowest, and 11 runs the fastest;
* quality improves as speed decreases (since more compression
* possibilities are explored).
*
- * NOTE: 10 is only allowed in AOM_USAGE_REALTIME. In AOM_USAGE_GOOD_QUALITY
- * and AOM_USAGE_ALL_INTRA, 9 is the highest allowed value. However,
- * AOM_USAGE_GOOD_QUALITY treats 7..9 the same as 6. Also, AOM_USAGE_REALTIME
- * treats 0..4 the same as 5.
+ * NOTE: 10 and 11 are only allowed in AOM_USAGE_REALTIME. In
+ * AOM_USAGE_GOOD_QUALITY and AOM_USAGE_ALL_INTRA, 9 is the highest allowed
+ * value. However, AOM_USAGE_GOOD_QUALITY treats 7..9 the same as 6. Also,
+ * AOM_USAGE_REALTIME treats 0..4 the same as 5.
*/
AOME_SET_CPUUSED = 13,
@@ -1527,6 +1527,12 @@ enum aome_enc_control_id {
*/
AV1E_SET_BITRATE_ONE_PASS_CBR = 163,
+ /*!\brief Codec control to set the maximum number of consecutive frame drops
+ * allowed for the frame dropper in 1 pass CBR mode, int parameter. Value of
+ * zero has no effect.
+ */
+ AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR = 164,
+
// Any new encoder control IDs should be added above.
// Maximum allowed encoder control ID is 229.
// No encoder control ID should be added below.
@@ -1678,10 +1684,10 @@ typedef struct aom_svc_params {
/*!brief Parameters for setting ref frame config */
typedef struct aom_svc_ref_frame_config {
- // 7 references: LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2),
- // GOLDEN_FRAME(3), BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+ // 7 references: The index 0 - 6 refers to the references:
+ // last(0), last2(1), last3(2), golden(3), bwdref(4), altref2(5), altref(6).
int reference[7]; /**< Reference flag for each of the 7 references. */
- /*! Buffer slot index for each of 7 references. */
+ /*! Buffer slot index for each of 7 references indexed above. */
int ref_idx[7];
int refresh[8]; /**< Refresh flag for each of the 8 slots. */
} aom_svc_ref_frame_config_t;
@@ -2172,6 +2178,9 @@ AOM_CTRL_USE_TYPE(AV1E_GET_LUMA_CDEF_STRENGTH, int *)
AOM_CTRL_USE_TYPE(AV1E_SET_BITRATE_ONE_PASS_CBR, unsigned int)
#define AOM_CTRL_AV1E_SET_BITRATE_ONE_PASS_CBR
+AOM_CTRL_USE_TYPE(AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, int)
+#define AOM_CTRL_AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR
+
/*!\endcond */
/*! @} - end defgroup aom_encoder */
#ifdef __cplusplus
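To make the clarified reference[]/ref_idx[] comments above concrete, here is
a sketch of a configuration that encodes against LAST_FRAME only, passed
through the pre-existing AV1E_SET_SVC_REF_FRAME_CONFIG control (which is not
itself part of this diff):

  #include <aom/aomcx.h>

  static aom_codec_err_t use_last_frame_only(aom_codec_ctx_t *ctx) {
    aom_svc_ref_frame_config_t ref_cfg = { { 0 } };
    ref_cfg.reference[0] = 1; /* index 0 is LAST_FRAME per the comment above */
    ref_cfg.ref_idx[0] = 0;   /* LAST_FRAME reads from buffer slot 0 */
    ref_cfg.refresh[0] = 1;   /* refresh slot 0 with the newly coded frame */
    return aom_codec_control(ctx, AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_cfg);
  }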
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 4c60e5c5e..f8f2cbba5 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -115,12 +115,17 @@ list(APPEND AOM_DSP_COMMON_INTRIN_NEON
"${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c"
"${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
"${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c"
- "${AOM_ROOT}/aom_dsp/arm/highbd_intrapred_neon.c"
"${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
"${AOM_ROOT}/aom_dsp/arm/subtract_neon.c"
"${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c"
"${AOM_ROOT}/aom_dsp/arm/avg_pred_neon.c")
+list(APPEND AOM_DSP_COMMON_INTRIN_NEON_DOTPROD
+ "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_dotprod.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_NEON_I8MM
+ "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_i8mm.c")
+
if(CONFIG_AV1_HIGHBITDEPTH)
list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
"${AOM_ROOT}/aom_dsp/x86/highbd_convolve_sse2.c"
@@ -134,6 +139,11 @@ if(CONFIG_AV1_HIGHBITDEPTH)
"${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c")
list(APPEND AOM_DSP_COMMON_INTRIN_NEON
+ "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_hmask_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_mask_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_vmask_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_convolve8_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_intrapred_neon.c"
"${AOM_ROOT}/aom_dsp/arm/highbd_loopfilter_neon.c")
endif()
@@ -191,6 +201,9 @@ if(CONFIG_AV1_ENCODER)
list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
"${AOM_ROOT}/aom_dsp/flow_estimation/x86/corner_match_avx2.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
+ "${AOM_ROOT}/aom_dsp/flow_estimation/arm/disflow_neon.c")
endif()
list(APPEND AOM_DSP_ENCODER_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm"
@@ -269,7 +282,15 @@ if(CONFIG_AV1_ENCODER)
"${AOM_ROOT}/aom_dsp/arm/obmc_variance_neon.c"
"${AOM_ROOT}/aom_dsp/arm/obmc_sad_neon.c"
"${AOM_ROOT}/aom_dsp/arm/sse_neon.c"
- "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon.c")
+ "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/blk_sse_sum_neon.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD
+ "${AOM_ROOT}/aom_dsp/arm/sad_neon_dotprod.c"
+ "${AOM_ROOT}/aom_dsp/arm/sadxd_neon_dotprod.c"
+ "${AOM_ROOT}/aom_dsp/arm/sse_neon_dotprod.c"
+ "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon_dotprod.c"
+ "${AOM_ROOT}/aom_dsp/arm/variance_neon_dotprod.c")
if(CONFIG_AV1_HIGHBITDEPTH)
list(APPEND AOM_DSP_ENCODER_ASM_SSE2
@@ -292,11 +313,20 @@ if(CONFIG_AV1_ENCODER)
list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
"${AOM_ROOT}/aom_dsp/arm/highbd_avg_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_avg_pred_neon.c"
"${AOM_ROOT}/aom_dsp/arm/highbd_hadamard_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_masked_sad_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_sad_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_variance_neon.c"
"${AOM_ROOT}/aom_dsp/arm/highbd_quantize_neon.c"
"${AOM_ROOT}/aom_dsp/arm/highbd_sad_neon.c"
- "${AOM_ROOT}/aom_dsp/arm/highbd_sad4d_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_sadxd_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_sse_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_subpel_variance_neon.c"
"${AOM_ROOT}/aom_dsp/arm/highbd_variance_neon.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD
+ "${AOM_ROOT}/aom_dsp/arm/highbd_variance_neon_dotprod.c")
endif()
if(CONFIG_INTERNAL_STATS)
@@ -326,6 +356,10 @@ if(CONFIG_AV1_ENCODER)
list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE2
"${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c")
+
+ list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_NEON
+ "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_variance_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/obmc_variance_neon.c")
endif()
endif()
@@ -433,6 +467,23 @@ function(setup_aom_dsp_targets)
endif()
endif()
+ if(HAVE_NEON_DOTPROD)
+ add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod"
+ "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_NEON_DOTPROD")
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod"
+ "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD")
+ endif()
+ endif()
+
+ if(HAVE_NEON_I8MM)
+ add_intrinsics_object_library("${AOM_NEON_I8MM_FLAG}" "neon_i8mm"
+ "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_NEON_I8MM")
+ endif()
+
target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp>)
if(BUILD_SHARED_LIBS)
target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_dsp>)
diff --git a/aom_dsp/aom_dsp_common.h b/aom_dsp/aom_dsp_common.h
index efb634ac9..85dc0052e 100644
--- a/aom_dsp/aom_dsp_common.h
+++ b/aom_dsp/aom_dsp_common.h
@@ -23,10 +23,6 @@ extern "C" {
#define PI 3.141592653589793238462643383279502884
-#ifndef MAX_SB_SIZE
-#define MAX_SB_SIZE 128
-#endif // ndef MAX_SB_SIZE
-
#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
#define AOMSIGN(x) ((x) < 0 ? -1 : 0)
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index e738971c6..c9b268267 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -497,22 +497,22 @@ add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t
add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/aom_convolve_copy neon sse2 avx2/;
-specialize qw/aom_convolve8_horiz neon sse2 ssse3/, "$avx2_ssse3";
-specialize qw/aom_convolve8_vert neon sse2 ssse3/, "$avx2_ssse3";
+specialize qw/aom_convolve_copy neon sse2 avx2/;
+specialize qw/aom_convolve8_horiz neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3";
+specialize qw/aom_convolve8_vert neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3";
add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
specialize qw/aom_scaled_2d ssse3 neon/;
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h";
- specialize qw/aom_highbd_convolve_copy sse2 avx2/;
+ specialize qw/aom_highbd_convolve_copy sse2 avx2 neon/;
add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd";
- specialize qw/aom_highbd_convolve8_horiz sse2 avx2/;
+ specialize qw/aom_highbd_convolve8_horiz sse2 avx2 neon/;
add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd";
- specialize qw/aom_highbd_convolve8_vert sse2 avx2/;
+ specialize qw/aom_highbd_convolve8_vert sse2 avx2 neon/;
}
#
@@ -750,7 +750,7 @@ specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 avx2 neon/;
add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh";
add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
-specialize "aom_blend_a64_mask", qw/sse4_1 avx2/;
+specialize "aom_blend_a64_mask", qw/sse4_1 neon avx2/;
specialize "aom_blend_a64_hmask", qw/sse4_1 neon/;
specialize "aom_blend_a64_vmask", qw/sse4_1 neon/;
@@ -759,10 +759,10 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd";
- specialize "aom_highbd_blend_a64_mask", qw/sse4_1/;
- specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/;
- specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/;
- specialize "aom_highbd_blend_a64_d16_mask", qw/sse4_1 avx2/;
+ specialize "aom_highbd_blend_a64_mask", qw/sse4_1 neon/;
+ specialize "aom_highbd_blend_a64_hmask", qw/sse4_1 neon/;
+ specialize "aom_highbd_blend_a64_vmask", qw/sse4_1 neon/;
+ specialize "aom_highbd_blend_a64_d16_mask", qw/sse4_1 neon avx2/;
}
if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
@@ -773,35 +773,33 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_subtract_block neon sse2 avx2/;
add_proto qw/int64_t/, "aom_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height";
- specialize qw/aom_sse sse4_1 avx2 neon/;
+ specialize qw/aom_sse sse4_1 avx2 neon neon_dotprod/;
add_proto qw/void/, "aom_get_blk_sse_sum", "const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum";
- specialize qw/aom_get_blk_sse_sum sse2 avx2/;
+ specialize qw/aom_get_blk_sse_sum sse2 avx2 neon/;
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
specialize qw/aom_highbd_subtract_block sse2 neon/;
add_proto qw/int64_t/, "aom_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height";
- specialize qw/aom_highbd_sse sse4_1 avx2 neon/;
+ specialize qw/aom_highbd_sse sse4_1 avx2 neon/;
}
- if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
- #
- # Sum of Squares
- #
- add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height";
- specialize qw/aom_sum_squares_2d_i16 sse2 avx2 neon/;
+ #
+ # Sum of Squares
+ #
+ add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height";
+ specialize qw/aom_sum_squares_2d_i16 sse2 avx2 neon/;
- add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N";
- specialize qw/aom_sum_squares_i16 sse2 neon/;
+ add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N";
+ specialize qw/aom_sum_squares_i16 sse2 neon/;
- add_proto qw/uint64_t aom_var_2d_u8/, "uint8_t *src, int src_stride, int width, int height";
- specialize qw/aom_var_2d_u8 sse2 avx2 neon/;
+ add_proto qw/uint64_t aom_var_2d_u8/, "uint8_t *src, int src_stride, int width, int height";
+ specialize qw/aom_var_2d_u8 sse2 avx2 neon neon_dotprod/;
- add_proto qw/uint64_t aom_var_2d_u16/, "uint8_t *src, int src_stride, int width, int height";
- specialize qw/aom_var_2d_u16 sse2 avx2 neon/;
- }
+ add_proto qw/uint64_t aom_var_2d_u16/, "uint8_t *src, int src_stride, int width, int height";
+ specialize qw/aom_var_2d_u16 sse2 avx2 neon/;
#
# Single block SAD / Single block Avg SAD
@@ -816,65 +814,65 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/uint64_t aom_sum_sse_2d_i16/, "const int16_t *src, int src_stride, int width, int height, int *sum";
specialize qw/aom_sum_sse_2d_i16 avx2 neon sse2/;
- specialize qw/aom_sad128x128 avx2 neon sse2/;
- specialize qw/aom_sad128x64 avx2 neon sse2/;
- specialize qw/aom_sad64x128 avx2 neon sse2/;
- specialize qw/aom_sad64x64 avx2 neon sse2/;
- specialize qw/aom_sad64x32 avx2 neon sse2/;
- specialize qw/aom_sad32x64 avx2 neon sse2/;
- specialize qw/aom_sad32x32 avx2 neon sse2/;
- specialize qw/aom_sad32x16 avx2 neon sse2/;
- specialize qw/aom_sad16x32 neon sse2/;
- specialize qw/aom_sad16x16 neon sse2/;
- specialize qw/aom_sad16x8 neon sse2/;
- specialize qw/aom_sad8x16 neon sse2/;
- specialize qw/aom_sad8x8 neon sse2/;
- specialize qw/aom_sad8x4 neon sse2/;
- specialize qw/aom_sad4x8 neon sse2/;
- specialize qw/aom_sad4x4 neon sse2/;
-
- specialize qw/aom_sad4x16 neon sse2/;
- specialize qw/aom_sad16x4 neon sse2/;
- specialize qw/aom_sad8x32 neon sse2/;
- specialize qw/aom_sad32x8 neon sse2/;
- specialize qw/aom_sad16x64 neon sse2/;
- specialize qw/aom_sad64x16 neon sse2/;
-
- specialize qw/aom_sad_skip_128x128 avx2 sse2 neon/;
- specialize qw/aom_sad_skip_128x64 avx2 sse2 neon/;
- specialize qw/aom_sad_skip_64x128 avx2 sse2 neon/;
- specialize qw/aom_sad_skip_64x64 avx2 sse2 neon/;
- specialize qw/aom_sad_skip_64x32 avx2 sse2 neon/;
- specialize qw/aom_sad_skip_32x64 avx2 sse2 neon/;
- specialize qw/aom_sad_skip_32x32 avx2 sse2 neon/;
- specialize qw/aom_sad_skip_32x16 avx2 sse2 neon/;
- specialize qw/aom_sad_skip_16x32 sse2 neon/;
- specialize qw/aom_sad_skip_16x16 sse2 neon/;
- specialize qw/aom_sad_skip_16x8 sse2 neon/;
- specialize qw/aom_sad_skip_8x16 sse2 neon/;
- specialize qw/aom_sad_skip_8x8 sse2 neon/;
- specialize qw/aom_sad_skip_8x4 neon/;
- specialize qw/aom_sad_skip_4x8 sse2 neon/;
- specialize qw/aom_sad_skip_4x4 neon/;
-
- specialize qw/aom_sad_skip_4x16 sse2 neon/;
- specialize qw/aom_sad_skip_16x4 neon/;
- specialize qw/aom_sad_skip_8x32 sse2 neon/;
- specialize qw/aom_sad_skip_32x8 sse2 neon/;
- specialize qw/aom_sad_skip_16x64 sse2 neon/;
- specialize qw/aom_sad_skip_64x16 sse2 neon/;
-
- specialize qw/aom_sad128x128_avg avx2 sse2 neon/;
- specialize qw/aom_sad128x64_avg avx2 sse2 neon/;
- specialize qw/aom_sad64x128_avg avx2 sse2 neon/;
- specialize qw/aom_sad64x64_avg avx2 sse2 neon/;
- specialize qw/aom_sad64x32_avg avx2 sse2 neon/;
- specialize qw/aom_sad32x64_avg avx2 sse2 neon/;
- specialize qw/aom_sad32x32_avg avx2 sse2 neon/;
- specialize qw/aom_sad32x16_avg avx2 sse2 neon/;
- specialize qw/aom_sad16x32_avg sse2 neon/;
- specialize qw/aom_sad16x16_avg sse2 neon/;
- specialize qw/aom_sad16x8_avg sse2 neon/;
+ specialize qw/aom_sad128x128 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad128x64 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x128 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x64 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x32 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x64 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x32 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x16 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x32 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x16 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad8x16 sse2 neon/;
+ specialize qw/aom_sad8x8 sse2 neon/;
+ specialize qw/aom_sad8x4 sse2 neon/;
+ specialize qw/aom_sad4x8 sse2 neon/;
+ specialize qw/aom_sad4x4 sse2 neon/;
+
+ specialize qw/aom_sad4x16 sse2 neon/;
+ specialize qw/aom_sad16x4 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad8x32 sse2 neon/;
+ specialize qw/aom_sad32x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x64 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x16 sse2 neon neon_dotprod/;
+
+ specialize qw/aom_sad_skip_128x128 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_128x64 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x128 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x64 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x32 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_32x64 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_32x32 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_32x16 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x32 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x16 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_8x16 sse2 neon/;
+ specialize qw/aom_sad_skip_8x8 sse2 neon/;
+ specialize qw/aom_sad_skip_8x4 neon/;
+ specialize qw/aom_sad_skip_4x8 sse2 neon/;
+ specialize qw/aom_sad_skip_4x4 neon/;
+
+ specialize qw/aom_sad_skip_4x16 sse2 neon/;
+ specialize qw/aom_sad_skip_16x4 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_8x32 sse2 neon/;
+ specialize qw/aom_sad_skip_32x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x64 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x16 sse2 neon neon_dotprod/;
+
+ specialize qw/aom_sad128x128_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad128x64_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x128_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x64_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x32_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x64_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x32_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x16_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x32_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x16_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x8_avg sse2 neon neon_dotprod/;
specialize qw/aom_sad8x16_avg sse2 neon/;
specialize qw/aom_sad8x8_avg sse2 neon/;
specialize qw/aom_sad8x4_avg sse2 neon/;
@@ -882,36 +880,36 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_sad4x4_avg sse2 neon/;
specialize qw/aom_sad4x16_avg sse2 neon/;
- specialize qw/aom_sad16x4_avg sse2 neon/;
+ specialize qw/aom_sad16x4_avg sse2 neon neon_dotprod/;
specialize qw/aom_sad8x32_avg sse2 neon/;
- specialize qw/aom_sad32x8_avg sse2 neon/;
- specialize qw/aom_sad16x64_avg sse2 neon/;
- specialize qw/aom_sad64x16_avg sse2 neon/;
-
- specialize qw/aom_dist_wtd_sad128x128_avg sse2/;
- specialize qw/aom_dist_wtd_sad128x64_avg sse2/;
- specialize qw/aom_dist_wtd_sad64x128_avg sse2/;
- specialize qw/aom_dist_wtd_sad64x64_avg sse2/;
- specialize qw/aom_dist_wtd_sad64x32_avg sse2/;
- specialize qw/aom_dist_wtd_sad32x64_avg sse2/;
- specialize qw/aom_dist_wtd_sad32x32_avg sse2/;
- specialize qw/aom_dist_wtd_sad32x16_avg sse2/;
- specialize qw/aom_dist_wtd_sad16x32_avg sse2/;
- specialize qw/aom_dist_wtd_sad16x16_avg sse2/;
- specialize qw/aom_dist_wtd_sad16x8_avg sse2/;
- specialize qw/aom_dist_wtd_sad8x16_avg sse2/;
- specialize qw/aom_dist_wtd_sad8x8_avg sse2/;
- specialize qw/aom_dist_wtd_sad8x4_avg sse2/;
- specialize qw/aom_dist_wtd_sad4x8_avg sse2/;
- specialize qw/aom_dist_wtd_sad4x4_avg sse2/;
+ specialize qw/aom_sad32x8_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x64_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x16_avg sse2 neon neon_dotprod/;
+
+ specialize qw/aom_dist_wtd_sad128x128_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad128x64_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad64x128_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad64x64_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad64x32_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad32x64_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad32x32_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad32x16_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad16x32_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad16x16_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad16x8_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad8x16_avg sse2 neon/;
+ specialize qw/aom_dist_wtd_sad8x8_avg sse2 neon/;
+ specialize qw/aom_dist_wtd_sad8x4_avg sse2 neon/;
+ specialize qw/aom_dist_wtd_sad4x8_avg sse2 neon/;
+ specialize qw/aom_dist_wtd_sad4x4_avg sse2 neon/;
if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
- specialize qw/aom_dist_wtd_sad4x16_avg sse2/;
- specialize qw/aom_dist_wtd_sad16x4_avg sse2/;
- specialize qw/aom_dist_wtd_sad8x32_avg sse2/;
- specialize qw/aom_dist_wtd_sad32x8_avg sse2/;
- specialize qw/aom_dist_wtd_sad16x64_avg sse2/;
- specialize qw/aom_dist_wtd_sad64x16_avg sse2/;
+ specialize qw/aom_dist_wtd_sad4x16_avg sse2 neon/;
+ specialize qw/aom_dist_wtd_sad16x4_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad8x32_avg sse2 neon/;
+ specialize qw/aom_dist_wtd_sad32x8_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad16x64_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad64x16_avg sse2 neon neon_dotprod/;
}
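#
# The dist_wtd (distance-weighted) *_avg kernels take SAD against an unequally
# weighted compound prediction rather than a plain average. A sketch, assuming
# the C reference aom_dist_wtd_comp_avg_pred_c, where fwd_offset + bck_offset
# == 16 (DIST_PRECISION_BITS == 4):
#
#   const int tmp = pred[x] * bck_offset + ref[x] * fwd_offset;
#   comp_pred[x] = (uint8_t)ROUND_POWER_OF_TWO(tmp, 4);
#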
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
@@ -974,27 +972,29 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_highbd_sad_skip_16x64 avx2 sse2 neon/;
specialize qw/aom_highbd_sad_skip_64x16 avx2 sse2 neon/;
- specialize qw/aom_highbd_sad128x128_avg avx2/;
- specialize qw/aom_highbd_sad128x64_avg avx2/;
- specialize qw/aom_highbd_sad64x128_avg avx2/;
- specialize qw/aom_highbd_sad64x64_avg avx2 sse2/;
- specialize qw/aom_highbd_sad64x32_avg avx2 sse2/;
- specialize qw/aom_highbd_sad32x64_avg avx2 sse2/;
- specialize qw/aom_highbd_sad32x32_avg avx2 sse2/;
- specialize qw/aom_highbd_sad32x16_avg avx2 sse2/;
- specialize qw/aom_highbd_sad16x32_avg avx2 sse2/;
- specialize qw/aom_highbd_sad16x16_avg avx2 sse2/;
- specialize qw/aom_highbd_sad16x8_avg avx2 sse2/;
- specialize qw/aom_highbd_sad8x4_avg sse2/;
- specialize qw/aom_highbd_sad4x8_avg sse2/;
- specialize qw/aom_highbd_sad4x4_avg sse2/;
-
- specialize qw/aom_highbd_sad4x16_avg sse2/;
- specialize qw/aom_highbd_sad16x4_avg avx2 sse2/;
- specialize qw/aom_highbd_sad8x32_avg sse2/;
- specialize qw/aom_highbd_sad32x8_avg avx2 sse2/;
- specialize qw/aom_highbd_sad16x64_avg avx2 sse2/;
- specialize qw/aom_highbd_sad64x16_avg avx2 sse2/;
+ specialize qw/aom_highbd_sad128x128_avg avx2 neon/;
+ specialize qw/aom_highbd_sad128x64_avg avx2 neon/;
+ specialize qw/aom_highbd_sad64x128_avg avx2 neon/;
+ specialize qw/aom_highbd_sad64x64_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad64x32_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad32x64_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad32x32_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad32x16_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad16x32_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad16x16_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad16x8_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad8x16_avg neon/;
+ specialize qw/aom_highbd_sad8x8_avg neon/;
+ specialize qw/aom_highbd_sad8x4_avg sse2 neon/;
+ specialize qw/aom_highbd_sad4x8_avg sse2 neon/;
+ specialize qw/aom_highbd_sad4x4_avg sse2 neon/;
+
+ specialize qw/aom_highbd_sad4x16_avg sse2 neon/;
+ specialize qw/aom_highbd_sad8x32_avg sse2 neon/;
+ specialize qw/aom_highbd_sad16x4_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad16x64_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad32x8_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad64x16_avg avx2 sse2 neon/;
}
#
# Masked SAD
@@ -1009,7 +1009,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
foreach (@encoder_block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask";
- specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2/;
+ specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2 neon/;
}
}
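#
# Masked SAD first builds a per-pixel a64 blend of the two predictors, then
# sums absolute differences against the source. A sketch, assuming the C
# reference in aom_dsp/sad.c (invert_mask swaps which predictor receives the
# mask weight):
#
#   const uint8_t pred = AOM_BLEND_A64(msk[x], ref[x], second_pred[x]);
#   sad += abs(pred - src[x]);
#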
@@ -1030,7 +1030,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
- specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
+ specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2 neon/;
}
}
}
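#
# OBMC SAD compares an overlap-weighted source against a mask-weighted
# predictor. A sketch, assuming the C reference in aom_dsp/sad.c, where wsrc
# and mask carry 12 bits of fixed-point precision:
#
#   sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
#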
@@ -1047,47 +1047,47 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void/, "aom_masked_sad${w}x${h}x4d", "const uint8_t *src, int src_stride, const uint8_t *ref[4], int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned sads[4]";
}
- specialize qw/aom_sad128x128x4d avx2 neon sse2/;
- specialize qw/aom_sad128x64x4d avx2 neon sse2/;
- specialize qw/aom_sad64x128x4d avx2 neon sse2/;
- specialize qw/aom_sad64x64x4d avx2 neon sse2/;
- specialize qw/aom_sad64x32x4d avx2 neon sse2/;
- specialize qw/aom_sad32x64x4d avx2 neon sse2/;
- specialize qw/aom_sad32x32x4d avx2 neon sse2/;
- specialize qw/aom_sad32x16x4d avx2 neon sse2/;
- specialize qw/aom_sad16x32x4d avx2 neon sse2/;
- specialize qw/aom_sad16x16x4d avx2 neon sse2/;
- specialize qw/aom_sad16x8x4d avx2 neon sse2/;
-
- specialize qw/aom_sad8x16x4d neon sse2/;
- specialize qw/aom_sad8x8x4d neon sse2/;
- specialize qw/aom_sad8x4x4d neon sse2/;
- specialize qw/aom_sad4x8x4d neon sse2/;
- specialize qw/aom_sad4x4x4d neon sse2/;
-
- specialize qw/aom_sad64x16x4d avx2 neon sse2/;
- specialize qw/aom_sad32x8x4d avx2 neon sse2/;
- specialize qw/aom_sad16x64x4d avx2 neon sse2/;
- specialize qw/aom_sad16x4x4d avx2 neon sse2/;
- specialize qw/aom_sad8x32x4d neon sse2/;
- specialize qw/aom_sad4x16x4d neon sse2/;
-
- specialize qw/aom_sad_skip_128x128x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_128x64x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_64x128x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_64x64x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_64x32x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_64x16x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_32x64x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_32x32x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_32x16x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_32x8x4d avx2 sse2 neon/;
-
- specialize qw/aom_sad_skip_16x64x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_16x32x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_16x16x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_16x8x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_16x4x4d neon/;
+ specialize qw/aom_sad128x128x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad128x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x128x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x32x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x32x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x16x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x32x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x16x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x8x4d avx2 sse2 neon neon_dotprod/;
+
+ specialize qw/aom_sad8x16x4d sse2 neon/;
+ specialize qw/aom_sad8x8x4d sse2 neon/;
+ specialize qw/aom_sad8x4x4d sse2 neon/;
+ specialize qw/aom_sad4x8x4d sse2 neon/;
+ specialize qw/aom_sad4x4x4d sse2 neon/;
+
+ specialize qw/aom_sad64x16x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x8x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x4x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad8x32x4d sse2 neon/;
+ specialize qw/aom_sad4x16x4d sse2 neon/;
+
+ specialize qw/aom_sad_skip_128x128x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_128x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x128x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x32x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x16x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_32x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_32x32x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_32x16x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_32x8x4d avx2 sse2 neon neon_dotprod/;
+
+ specialize qw/aom_sad_skip_16x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x32x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x16x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x8x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x4x4d neon neon_dotprod/;
specialize qw/aom_sad_skip_8x32x4d sse2 neon/;
specialize qw/aom_sad_skip_8x16x4d sse2 neon/;
specialize qw/aom_sad_skip_8x8x4d sse2 neon/;
@@ -1096,29 +1096,29 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_sad_skip_4x8x4d sse2 neon/;
specialize qw/aom_sad_skip_4x4x4d neon/;
- specialize qw/aom_sad128x128x3d neon avx2/;
- specialize qw/aom_sad128x64x3d neon avx2/;
- specialize qw/aom_sad64x128x3d neon avx2/;
- specialize qw/aom_sad64x64x3d neon avx2/;
- specialize qw/aom_sad64x32x3d neon avx2/;
- specialize qw/aom_sad32x64x3d neon avx2/;
- specialize qw/aom_sad32x32x3d neon avx2/;
- specialize qw/aom_sad32x16x3d neon avx2/;
- specialize qw/aom_sad16x32x3d neon avx2/;
- specialize qw/aom_sad16x16x3d neon avx2/;
- specialize qw/aom_sad16x8x3d neon avx2/;
- specialize qw/aom_sad8x16x3d neon/;
- specialize qw/aom_sad8x8x3d neon/;
- specialize qw/aom_sad8x4x3d neon/;
- specialize qw/aom_sad4x8x3d neon/;
- specialize qw/aom_sad4x4x3d neon/;
-
- specialize qw/aom_sad64x16x3d neon avx2/;
- specialize qw/aom_sad32x8x3d neon avx2/;
- specialize qw/aom_sad16x64x3d neon avx2/;
- specialize qw/aom_sad16x4x3d neon/;
- specialize qw/aom_sad8x32x3d neon/;
- specialize qw/aom_sad4x16x3d neon/;
+ specialize qw/aom_sad128x128x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad128x64x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad64x128x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad64x64x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad64x32x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad32x64x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad32x32x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad32x16x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad16x32x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad16x16x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad16x8x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad8x16x3d neon/;
+ specialize qw/aom_sad8x8x3d neon/;
+ specialize qw/aom_sad8x4x3d neon/;
+ specialize qw/aom_sad4x8x3d neon/;
+ specialize qw/aom_sad4x4x3d neon/;
+
+ specialize qw/aom_sad64x16x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad32x8x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad16x64x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad16x4x3d neon neon_dotprod/;
+ specialize qw/aom_sad8x32x3d neon/;
+ specialize qw/aom_sad4x16x3d neon/;
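#
# The x4d/x3d forms evaluate one source block against four (respectively
# three) candidate references per call, keeping motion-search traversal to a
# single pass. A sketch of the contract, with illustrative pointer names:
#
#   const uint8_t *refs[4] = { ref0, ref1, ref2, ref3 };
#   uint32_t sads[4];
#   aom_sad16x16x4d(src, src_stride, refs, ref_stride, sads);
#   /* sads[k] matches aom_sad16x16(src, src_stride, refs[k], ref_stride) */
#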
specialize qw/aom_masked_sad128x128x4d ssse3 neon/;
specialize qw/aom_masked_sad128x64x4d ssse3 neon/;
@@ -1153,9 +1153,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
foreach (@encoder_block_sizes) {
($w, $h) = @$_;
- add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
- add_proto qw/void/, "aom_highbd_sad${w}x${h}x3d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
- add_proto qw/void/, "aom_highbd_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
+ add_proto qw/void/, "aom_highbd_sad${w}x${h}x3d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
+ add_proto qw/void/, "aom_highbd_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
if ($w != 128 && $h != 128) {
specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
}
@@ -1208,22 +1208,29 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_highbd_sad_skip_16x64x4d avx2 sse2 neon/;
specialize qw/aom_highbd_sad_skip_64x16x4d avx2 sse2 neon/;
- specialize qw/aom_highbd_sad128x128x3d avx2/;
- specialize qw/aom_highbd_sad128x64x3d avx2/;
- specialize qw/aom_highbd_sad64x128x3d avx2/;
- specialize qw/aom_highbd_sad64x64x3d avx2/;
- specialize qw/aom_highbd_sad64x32x3d avx2/;
- specialize qw/aom_highbd_sad32x64x3d avx2/;
- specialize qw/aom_highbd_sad32x32x3d avx2/;
- specialize qw/aom_highbd_sad32x16x3d avx2/;
- specialize qw/aom_highbd_sad16x32x3d avx2/;
- specialize qw/aom_highbd_sad16x16x3d avx2/;
- specialize qw/aom_highbd_sad16x8x3d avx2/;
-
- specialize qw/aom_highbd_sad16x4x3d avx2/;
- specialize qw/aom_highbd_sad32x8x3d avx2/;
- specialize qw/aom_highbd_sad16x64x3d avx2/;
- specialize qw/aom_highbd_sad64x16x3d avx2/;
+ specialize qw/aom_highbd_sad128x128x3d avx2 neon/;
+ specialize qw/aom_highbd_sad128x64x3d avx2 neon/;
+ specialize qw/aom_highbd_sad64x128x3d avx2 neon/;
+ specialize qw/aom_highbd_sad64x64x3d avx2 neon/;
+ specialize qw/aom_highbd_sad64x32x3d avx2 neon/;
+ specialize qw/aom_highbd_sad32x64x3d avx2 neon/;
+ specialize qw/aom_highbd_sad32x32x3d avx2 neon/;
+ specialize qw/aom_highbd_sad32x16x3d avx2 neon/;
+ specialize qw/aom_highbd_sad16x32x3d avx2 neon/;
+ specialize qw/aom_highbd_sad16x16x3d avx2 neon/;
+ specialize qw/aom_highbd_sad16x8x3d avx2 neon/;
+ specialize qw/aom_highbd_sad8x16x3d neon/;
+ specialize qw/aom_highbd_sad8x8x3d neon/;
+ specialize qw/aom_highbd_sad8x4x3d neon/;
+ specialize qw/aom_highbd_sad4x8x3d neon/;
+ specialize qw/aom_highbd_sad4x4x3d neon/;
+
+ specialize qw/aom_highbd_sad64x16x3d avx2 neon/;
+ specialize qw/aom_highbd_sad32x8x3d avx2 neon/;
+ specialize qw/aom_highbd_sad16x64x3d avx2 neon/;
+ specialize qw/aom_highbd_sad16x4x3d avx2 neon/;
+ specialize qw/aom_highbd_sad8x32x3d neon/;
+ specialize qw/aom_highbd_sad4x16x3d neon/;
}
#
# Avg
@@ -1323,20 +1330,20 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
# Specialty Variance
#
add_proto qw/void aom_get_var_sse_sum_8x8_quad/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8";
- specialize qw/aom_get_var_sse_sum_8x8_quad avx2 sse2 neon/;
+ specialize qw/aom_get_var_sse_sum_8x8_quad avx2 sse2 neon neon_dotprod/;
add_proto qw/void aom_get_var_sse_sum_16x16_dual/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16";
- specialize qw/aom_get_var_sse_sum_16x16_dual avx2 sse2 neon/;
+ specialize qw/aom_get_var_sse_sum_16x16_dual avx2 sse2 neon neon_dotprod/;
add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_mse16x16 sse2 avx2 neon/;
- specialize qw/aom_mse16x8 sse2 neon/;
- specialize qw/aom_mse8x16 sse2 neon/;
- specialize qw/aom_mse8x8 sse2 neon/;
+ specialize qw/aom_mse16x16 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_mse16x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_mse8x16 sse2 neon neon_dotprod/;
+ specialize qw/aom_mse8x8 sse2 neon neon_dotprod/;
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
foreach $bd (8, 10, 12) {
@@ -1345,31 +1352,32 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize "aom_highbd_${bd}_mse16x16", qw/sse2/;
- specialize "aom_highbd_${bd}_mse8x8", qw/sse2/;
+ specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_mse16x8", qw/neon/;
+ specialize "aom_highbd_${bd}_mse8x16", qw/neon/;
+ specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon/;
}
+
+ specialize "aom_highbd_8_mse16x16", qw/neon_dotprod/;
+ specialize "aom_highbd_8_mse16x8", qw/neon_dotprod/;
+ specialize "aom_highbd_8_mse8x16", qw/neon_dotprod/;
+ specialize "aom_highbd_8_mse8x8", qw/neon_dotprod/;
}
#
#
#
add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *";
- specialize qw/aom_get_mb_ss sse2/;
+ specialize qw/aom_get_mb_ss sse2 neon/;
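#
# aom_get_mb_ss returns the plain sum of squares over a 16x16 macroblock of
# residuals. A sketch, assuming the C reference in aom_dsp/variance.c:
#
#   unsigned int ss = 0;
#   for (int i = 0; i < 256; ++i) ss += src[i] * src[i];
#   return ss;
#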
#
# Variance / Subpixel Variance / Subpixel Avg Variance
#
- add_proto qw/unsigned int/, "aom_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
- add_proto qw/unsigned int/, "aom_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
- add_proto qw/unsigned int/, "aom_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
add_proto qw/uint64_t/, "aom_mse_wxh_16bit", "uint8_t *dst, int dstride,uint16_t *src, int sstride, int w, int h";
specialize qw/aom_mse_wxh_16bit sse2 avx2 neon/;
add_proto qw/uint64_t/, "aom_mse_16xh_16bit", "uint8_t *dst, int dstride,uint16_t *src, int w, int h";
- specialize qw/aom_mse_16xh_16bit sse2 avx2/;
+ specialize qw/aom_mse_16xh_16bit sse2 avx2 neon/;
foreach (@encoder_block_sizes) {
($w, $h) = @$_;
@@ -1378,22 +1386,22 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
add_proto qw/uint32_t/, "aom_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param";
}
- specialize qw/aom_variance128x128 sse2 avx2 neon/;
- specialize qw/aom_variance128x64 sse2 avx2 neon/;
- specialize qw/aom_variance64x128 sse2 avx2 neon/;
- specialize qw/aom_variance64x64 sse2 avx2 neon/;
- specialize qw/aom_variance64x32 sse2 avx2 neon/;
- specialize qw/aom_variance32x64 sse2 avx2 neon/;
- specialize qw/aom_variance32x32 sse2 avx2 neon/;
- specialize qw/aom_variance32x16 sse2 avx2 neon/;
- specialize qw/aom_variance16x32 sse2 avx2 neon/;
- specialize qw/aom_variance16x16 sse2 avx2 neon/;
- specialize qw/aom_variance16x8 sse2 avx2 neon/;
- specialize qw/aom_variance8x16 sse2 neon/;
- specialize qw/aom_variance8x8 sse2 neon/;
- specialize qw/aom_variance8x4 sse2 neon/;
- specialize qw/aom_variance4x8 sse2 neon/;
- specialize qw/aom_variance4x4 sse2 neon/;
+ specialize qw/aom_variance128x128 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance128x64 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance64x128 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance64x64 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance64x32 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance32x64 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance32x32 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance32x16 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance16x32 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance16x16 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance16x8 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance8x16 sse2 neon neon_dotprod/;
+ specialize qw/aom_variance8x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_variance8x4 sse2 neon neon_dotprod/;
+ specialize qw/aom_variance4x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_variance4x4 sse2 neon neon_dotprod/;
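#
# Unlike the aom_mse kernels above, which return the raw sum of squared
# differences, the variance kernels also remove the squared mean. A sketch,
# assuming the C reference in aom_dsp/variance.c:
#
#   variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);
#   return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));
#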
specialize qw/aom_sub_pixel_variance128x128 avx2 neon sse2 ssse3/;
specialize qw/aom_sub_pixel_variance128x64 avx2 neon sse2 ssse3/;
@@ -1430,12 +1438,12 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_sub_pixel_avg_variance4x4 neon sse2 ssse3/;
if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
- specialize qw/aom_variance4x16 neon sse2/;
- specialize qw/aom_variance16x4 neon sse2 avx2/;
- specialize qw/aom_variance8x32 neon sse2/;
- specialize qw/aom_variance32x8 neon sse2 avx2/;
- specialize qw/aom_variance16x64 neon sse2 avx2/;
- specialize qw/aom_variance64x16 neon sse2 avx2/;
+ specialize qw/aom_variance4x16 neon neon_dotprod sse2/;
+ specialize qw/aom_variance16x4 neon neon_dotprod sse2 avx2/;
+ specialize qw/aom_variance8x32 neon neon_dotprod sse2/;
+ specialize qw/aom_variance32x8 neon neon_dotprod sse2 avx2/;
+ specialize qw/aom_variance16x64 neon neon_dotprod sse2 avx2/;
+ specialize qw/aom_variance64x16 neon neon_dotprod sse2 avx2/;
specialize qw/aom_sub_pixel_variance4x16 neon sse2 ssse3/;
specialize qw/aom_sub_pixel_variance16x4 neon avx2 sse2 ssse3/;
@@ -1450,80 +1458,257 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_sub_pixel_avg_variance16x64 neon sse2 ssse3/;
specialize qw/aom_sub_pixel_avg_variance64x16 neon sse2 ssse3/;
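#
# The sub_pixel variants take eighth-pel xoffset/yoffset phases and run a
# horizontal then a vertical 2-tap bilinear pass before the plain variance.
# A sketch, assuming the helper and table names used by the C reference in
# aom_dsp/variance.c:
#
#   uint16_t tmp[(H + 1) * W];
#   uint8_t out[H * W];
#   var_filter_block2d_bil_first_pass_c(src, tmp, src_stride, 1, H + 1, W,
#                                       bilinear_filters_2t[xoffset]);
#   var_filter_block2d_bil_second_pass_c(tmp, out, W, W, H, W,
#                                        bilinear_filters_2t[yoffset]);
#   return aom_varianceWxH(out, W, ref, ref_stride, sse);
#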
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x32 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x8 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x64 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x16 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x32 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x8 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x64 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x16 neon ssse3/;
}
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x64 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x32 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x64 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x32 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x16 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x32 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x16 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x8 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x16 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x8 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x4 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x8 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x4 ssse3/;
-
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x128 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x64 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x128 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x64 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x32 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x64 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x32 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x16 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x32 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x16 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x8 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x16 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x8 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x4 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x8 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x4 neon ssse3/;
+
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x128 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x64 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x128 neon ssse3/;
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
foreach $bd (8, 10, 12) {
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
foreach (@encoder_block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
- if ($bd == 10) {
- specialize "aom_highbd_${bd}_variance${w}x${h}", qw/sse2 neon/;
- } else {
- specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2";
- }
- }
+ add_proto qw/uint32_t/, "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param";
+ }
+ }
- if ($w == 4 || $h == 4) {
- # TODO(rachelbarker): When ext-partition-types is enabled, we currently
- # don't have vectorized 4x16 highbd variance functions
- if ($w == 4 && $h == 4) {
- if ($bd == 10) {
- specialize "aom_highbd_${bd}_variance${w}x${h}", qw/sse4_1 neon/;
- } else {
- specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1";
- }
- } else {
- if ($bd == 10) {
- specialize "aom_highbd_${bd}_variance${w}x${h}", qw/neon/;
- }
- }
- }
+ specialize qw/aom_highbd_12_variance128x128 sse2 neon/;
+ specialize qw/aom_highbd_12_variance128x64 sse2 neon/;
+ specialize qw/aom_highbd_12_variance64x128 sse2 neon/;
+ specialize qw/aom_highbd_12_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_12_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_12_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_12_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_12_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_12_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_12_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_12_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_12_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_12_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_12_variance8x4 neon/;
+ specialize qw/aom_highbd_12_variance4x8 neon/;
+ specialize qw/aom_highbd_12_variance4x4 sse4_1 neon/;
+ specialize qw/aom_highbd_10_variance128x128 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance128x64 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance64x128 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance64x64 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance64x32 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance32x64 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance32x32 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance32x16 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance16x32 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance16x16 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance16x8 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance8x16 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance8x8 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance8x4 neon/;
+ specialize qw/aom_highbd_10_variance4x8 neon/;
+ specialize qw/aom_highbd_10_variance4x4 sse4_1 neon/;
- if ($w != 128 && $h != 128 && $w != 4) {
- specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/;
- specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/;
- }
- if ($w == 4 && $h == 4) {
- specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1";
- specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1";
- }
+ specialize qw/aom_highbd_8_variance128x128 sse2 neon/;
+ specialize qw/aom_highbd_8_variance128x64 sse2 neon/;
+ specialize qw/aom_highbd_8_variance64x128 sse2 neon/;
+ specialize qw/aom_highbd_8_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_8_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_8_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_8_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_8_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_8_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_8_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_8_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_8_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_8_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_8_variance8x4 neon/;
+ specialize qw/aom_highbd_8_variance4x8 neon/;
+ specialize qw/aom_highbd_8_variance4x4 sse4_1 neon/;
- add_proto qw/uint32_t/, "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param";
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ foreach $bd (8, 10, 12) {
+ my $avx2 = ($bd == 10) ? "avx2" : "";
+ specialize "aom_highbd_${bd}_variance64x16" , $avx2, qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_variance32x8" , $avx2, qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_variance16x64" , $avx2, qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_variance16x4" , qw/neon/;
+ specialize "aom_highbd_${bd}_variance8x32" , $avx2, qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_variance4x16" , qw/neon/;
+ }
+ }
+
+ specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance4x8 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance4x4 sse4_1 neon/;
+
+ specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance4x8 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance4x4 sse4_1 neon/;
+
+ specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance4x8 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance4x4 sse4_1 neon/;
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ foreach $bd (8, 10, 12) {
+ specialize "aom_highbd_${bd}_sub_pixel_variance64x16" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_variance32x8" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_variance16x64" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_variance16x4" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_variance8x32" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_variance4x16" , qw/neon/;
+ }
+ }
+
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance128x128 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance128x64 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance64x128 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance4x8 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance4x4 sse4_1 neon/;
+
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance128x128 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance128x64 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance64x128 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance4x8 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance4x4 sse4_1 neon/;
+
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance128x128 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance128x64 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance64x128 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance4x8 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance4x4 sse4_1 neon/;
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ foreach $bd (8, 10, 12) {
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance64x16" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance32x8" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance16x64" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance16x4" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance8x32" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance4x16" , qw/neon/;
+ }
+ }
+
+ foreach $bd (8, 10, 12) {
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance128x128", qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance128x64" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x128" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x64" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x32" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x64" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x32" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x16" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x32" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x16" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x8" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x16" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x8" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x4" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance4x8" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance4x4" , qw/neon/;
+ }
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ foreach $bd (8, 10, 12) {
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x16", qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x8" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x64", qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x4" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x32" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance4x16" , qw/neon/;
}
}
}
@@ -1541,7 +1726,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
foreach (@encoder_block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
- specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
+ specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3 neon/;
}
}
}
@@ -1559,56 +1744,18 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
}
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
- foreach $bd ("_", "_10_", "_12_") {
+ foreach $bd ("_8_", "_10_", "_12_") {
foreach (@encoder_block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
- specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/;
+ specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1 neon/;
+ specialize "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", qw/neon/;
}
}
}
}
- add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance64x64 avx2 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance64x32 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance32x64 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance32x32 avx2 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance32x16 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance16x32 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance16x16 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance16x8 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance8x16 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance8x8 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance8x4 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance4x8 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance4x4 sse2 ssse3/;
-
#
# Comp Avg
#
@@ -1616,469 +1763,25 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_comp_avg_pred avx2 neon/;
add_proto qw/void aom_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param";
- specialize qw/aom_dist_wtd_comp_avg_pred ssse3/;
+ specialize qw/aom_dist_wtd_comp_avg_pred ssse3 neon/;
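#
# aom_comp_avg_pred is the unweighted counterpart of the dist_wtd average.
# A sketch, assuming the C reference in aom_dsp/variance.c:
#
#   comp_pred[x] = ROUND_POWER_OF_TWO(pred[x] + ref[x], 1); /* (p + r + 1) >> 1 */
#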
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-
- add_proto qw/unsigned int aom_highbd_12_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance128x128 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance128x64 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance64x128 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance64x64 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance64x32 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance32x64 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance32x32 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance32x16 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance16x32 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance16x16 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance16x8 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance8x16 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance8x8 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance8x4 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance4x8 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance4x4 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance128x128 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance128x64 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance64x128 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance64x64 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance64x32 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance32x64 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance32x32 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance32x16 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance16x32 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance16x16 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance16x8 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance8x16 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance8x8 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance8x4 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance4x8 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance4x4 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance128x128 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance128x64 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance64x128 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance64x64 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance64x32 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance32x64 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance32x32 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance32x16 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance16x32 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance16x16 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance16x8 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance8x16 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance8x8 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance8x4 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance4x8 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance4x4 neon/;
-
- if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
- foreach $bd (8, 10, 12) {
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance64x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize "aom_highbd_${bd}_variance64x16" , qw/neon/;
-
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance32x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize "aom_highbd_${bd}_variance32x8" , qw/neon/;
-
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance16x64", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize "aom_highbd_${bd}_variance16x64" , qw/neon/;
-
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance16x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize "aom_highbd_${bd}_variance16x4" , qw/neon/;
-
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance8x32", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize "aom_highbd_${bd}_variance8x32" , qw/neon/;
-
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize "aom_highbd_${bd}_variance4x16" , qw/neon/;
- }
- }
-
- add_proto qw/unsigned int aom_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_mse16x16 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_mse16x8 neon/;
- add_proto qw/unsigned int aom_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_mse8x16 neon/;
- add_proto qw/unsigned int aom_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_mse8x8 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_mse16x16 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_mse16x8 neon/;
- add_proto qw/unsigned int aom_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_mse8x16 neon/;
- add_proto qw/unsigned int aom_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_mse8x8 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_mse16x16 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_mse16x8 neon/;
- add_proto qw/unsigned int aom_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_mse8x16 neon/;
- add_proto qw/unsigned int aom_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_mse8x8 sse2 neon/;
-
add_proto qw/void aom_highbd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+ specialize qw/aom_highbd_comp_avg_pred neon/;
add_proto qw/void aom_highbd_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param";
- specialize qw/aom_highbd_dist_wtd_comp_avg_pred sse2/;
+ specialize qw/aom_highbd_dist_wtd_comp_avg_pred sse2 neon/;
add_proto qw/uint64_t/, "aom_mse_wxh_16bit_highbd", "uint16_t *dst, int dstride,uint16_t *src, int sstride, int w, int h";
- specialize qw/aom_mse_wxh_16bit_highbd sse2 avx2/;
+ specialize qw/aom_mse_wxh_16bit_highbd sse2 avx2 neon/;
}
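
As background for the specialize changes in this hunk: rtcd.pl expands each add_proto/specialize pair into per-ISA declarations plus a single dispatched entry point. Below is a rough sketch of what the generated aom_dsp_rtcd.h looks like for aom_highbd_dist_wtd_comp_avg_pred above; the exact shape, and names such as RTCD_EXTERN, x86_simd_caps() and HAS_SSE2, follow libaom's usual conventions and are shown only as an assumed illustration, not the literal generated output.

void aom_highbd_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred8,
                                         const uint8_t *pred8, int width,
                                         int height, const uint8_t *ref8,
                                         int ref_stride,
                                         const DIST_WTD_COMP_PARAMS *jcp_param);
void aom_highbd_dist_wtd_comp_avg_pred_sse2(uint8_t *comp_pred8,
                                            const uint8_t *pred8, int width,
                                            int height, const uint8_t *ref8,
                                            int ref_stride,
                                            const DIST_WTD_COMP_PARAMS *jcp_param);
void aom_highbd_dist_wtd_comp_avg_pred_neon(uint8_t *comp_pred8,
                                            const uint8_t *pred8, int width,
                                            int height, const uint8_t *ref8,
                                            int ref_stride,
                                            const DIST_WTD_COMP_PARAMS *jcp_param);

/* On x86 the public symbol is a pointer bound once at init time; on AArch64
 * the neon specialization added here is typically selected at compile time. */
RTCD_EXTERN void (*aom_highbd_dist_wtd_comp_avg_pred)(
    uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
    const uint8_t *ref8, int ref_stride,
    const DIST_WTD_COMP_PARAMS *jcp_param);

static void setup_rtcd_internal(void) {
  const int flags = x86_simd_caps();
  aom_highbd_dist_wtd_comp_avg_pred = aom_highbd_dist_wtd_comp_avg_pred_c;
  if (flags & HAS_SSE2)
    aom_highbd_dist_wtd_comp_avg_pred = aom_highbd_dist_wtd_comp_avg_pred_sse2;
}
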
- #
- # Subpixel Variance
- #
- if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- }
-
add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
specialize qw/aom_comp_mask_pred ssse3 avx2 neon/;
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_highbd_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
- specialize qw/aom_highbd_comp_mask_pred sse2 avx2/;
+ specialize qw/aom_highbd_comp_mask_pred sse2 avx2 neon/;
}
# Flow estimation library
@@ -2087,7 +1790,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/av1_compute_cross_correlation sse4_1 avx2/;
add_proto qw/void aom_compute_flow_at_point/, "const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v";
- specialize qw/aom_compute_flow_at_point sse4_1/;
+ specialize qw/aom_compute_flow_at_point sse4_1 neon/;
}
} # CONFIG_AV1_ENCODER
diff --git a/aom_dsp/arm/aom_convolve8_neon.c b/aom_dsp/arm/aom_convolve8_neon.c
index 3d07a0f16..c8ee780b5 100644
--- a/aom_dsp/arm/aom_convolve8_neon.c
+++ b/aom_dsp/arm/aom_convolve8_neon.c
@@ -24,826 +24,6 @@
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
-#if AOM_ARCH_AARCH64 && \
- (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
-
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
- 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
- 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
- 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
- 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
- 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
- /* Shift left and insert new last column in transposed 4x4 block. */
- 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
- /* Shift left and insert two new columns in transposed 4x4 block. */
- 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
- /* Shift left and insert three new columns in transposed 4x4 block. */
- 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
-};
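
The three lookup tables above drive the tbl-based sample permutes: dot_prod_permute_tbl builds the overlapping 4-sample windows each dot-product instruction consumes, while the other two serve the transposed vertical path. What every variant in this file ultimately computes per output pixel is the plain 8-tap FIR sketched below. This is an illustrative reference only, not library code; FILTER_BITS is 7 in libaom, matching the vqrshrun_n_s16(sum, FILTER_BITS) narrowing used by the kernels.

#include <stdint.h>

/* Reference for one output pixel of the 8-tap convolution: multiply and
 * accumulate, apply a rounding shift by FILTER_BITS, saturate to [0, 255]. */
static uint8_t convolve8_pixel_ref(const uint8_t src[8],
                                   const int16_t filter[8]) {
  int32_t sum = 0;
  for (int k = 0; k < 8; ++k) sum += filter[k] * src[k];
  sum = (sum + 64) >> 7;    /* rounding shift, FILTER_BITS == 7 */
  if (sum < 0) sum = 0;     /* unsigned saturation, as vqrshrun_n_s16 does */
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}
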
-
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE int16x4_t convolve8_4_usdot(uint8x16_t samples,
- const int8x8_t filter,
- const uint8x16x2_t permute_tbl) {
- uint8x16_t permuted_samples[2];
- int32x4_t sum;
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-
- /* Accumulate the dot product into a zero-initialized vector. Unlike the
-  * sdot path below, usdot consumes unsigned samples directly, so no
-  * range-clamp correction term is needed here. */
- sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0);
- sum = vusdotq_lane_s32(sum, permuted_samples[1], filter, 1);
-
- /* Further narrowing and packing is performed by the caller. */
- return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples,
- const int8x8_t filter,
- const uint8x16x3_t permute_tbl) {
- uint8x16_t permuted_samples[3];
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
-
- /* First 4 output values. */
- sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0);
- sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filter, 1);
- /* Second 4 output values. */
- sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filter, 0);
- sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filter, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
- uint8x16_t s0, s1, s2, s3;
-
- assert((intptr_t)dst % 4 == 0);
- assert(dst_stride % 4 == 0);
-
- (void)x_step_q4;
- (void)filter_y;
- (void)y_step_q4;
-
- src -= ((SUBPEL_TAPS / 2) - 1);
-
- if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23;
-
- load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- t0 = convolve8_4_usdot(s0, filter, perm_tbl);
- t1 = convolve8_4_usdot(s1, filter, perm_tbl);
- t2 = convolve8_4_usdot(s2, filter, perm_tbl);
- t3 = convolve8_4_usdot(s3, filter, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
-
- store_u8_4x1(dst + 0 * dst_stride, d01, 0);
- store_u8_4x1(dst + 1 * dst_stride, d01, 1);
- store_u8_4x1(dst + 2 * dst_stride, d23, 0);
- store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 0);
- } else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
-
- do {
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_8_usdot(s0, filter, perm_tbl);
- d1 = convolve8_8_usdot(s1, filter, perm_tbl);
- d2 = convolve8_8_usdot(s2, filter, perm_tbl);
- d3 = convolve8_8_usdot(s3, filter, perm_tbl);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 0);
- }
-}
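
A hypothetical call into the function above, for a 4x4 block (sketch only: plane, stride, row and col are assumed to exist, and the taps are made up). x_step_q4, filter_y and y_step_q4 are ignored by this fixed-step kernel, as the (void) casts show, and the function rewinds src by SUBPEL_TAPS / 2 - 1 = 3 samples, so the source pointer needs three valid samples to its left.

DECLARE_ALIGNED(4, uint8_t, dst[4 * 4]); /* asserts require 4-byte alignment */
const int16_t filter_x[8] = { 0, 1, -5, 126, 8, -3, 1, 0 }; /* example taps */
const uint8_t *src = plane + row * stride + col; /* hypothetical input */
aom_convolve8_horiz_neon(src, stride, dst, /*dst_stride=*/4, filter_x,
                         /*x_step_q4=*/16, /*filter_y=*/NULL,
                         /*y_step_q4=*/16, /*w=*/4, /*h=*/4);
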
-
-static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
- uint8x8_t a2, uint8x8_t a3,
- uint8x16_t *b,
- const uint8x16_t permute_tbl) {
- /* Transpose 8-bit elements and concatenate result rows as follows:
- * a0: 00, 01, 02, 03, XX, XX, XX, XX
- * a1: 10, 11, 12, 13, XX, XX, XX, XX
- * a2: 20, 21, 22, 23, XX, XX, XX, XX
- * a3: 30, 31, 32, 33, XX, XX, XX, XX
- *
- * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- *
- * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
- * as an argument is preferable to loading it directly from memory as this
- * inline helper is called many times from the same parent function.
- */
-
- uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
- *b = vqtbl2q_u8(samples, permute_tbl);
-}
-
-static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
- uint8x8_t a2, uint8x8_t a3,
- uint8x16_t *b0, uint8x16_t *b1,
- const uint8x16x2_t permute_tbl) {
- /* Transpose 8-bit elements and concatenate result rows as follows:
- * a0: 00, 01, 02, 03, 04, 05, 06, 07
- * a1: 10, 11, 12, 13, 14, 15, 16, 17
- * a2: 20, 21, 22, 23, 24, 25, 26, 27
- * a3: 30, 31, 32, 33, 34, 35, 36, 37
- *
- * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
- *
- * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
- * as an argument is preferable to loading it directly from memory as this
- * inline helper is called many times from the same parent function.
- */
-
- uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
- *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]);
- *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]);
-}
-
-static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
- const uint8x16_t samples_hi,
- const int8x8_t filter) {
- /* Sample permutation is performed by the caller. */
- int32x4_t sum;
-
- sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filter, 0);
- sum = vusdotq_lane_s32(sum, samples_hi, filter, 1);
-
- /* Further narrowing and packing is performed by the caller. */
- return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
- const uint8x16_t samples0_hi,
- const uint8x16_t samples1_lo,
- const uint8x16_t samples1_hi,
- const int8x8_t filter) {
- /* Sample permutation is performed by the caller. */
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* First 4 output values. */
- sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filter, 0);
- sum0 = vusdotq_lane_s32(sum0, samples0_hi, filter, 1);
- /* Second 4 output values. */
- sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filter, 0);
- sum1 = vusdotq_lane_s32(sum1, samples1_hi, filter, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
- const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint8x16x2_t samples_LUT;
-
- assert((intptr_t)dst % 4 == 0);
- assert(dst_stride % 4 == 0);
-
- (void)filter_x;
- (void)x_step_q4;
- (void)y_step_q4;
-
- src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
-
- if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
- load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- src += 7 * src_stride;
-
- s7 = vdup_n_u8(0);
- s8 = vdup_n_u8(0);
- s9 = vdup_n_u8(0);
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
- transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
- transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
- transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
-
- do {
- load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
-
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
-
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-
- d0 = convolve8_4_usdot_partial(s0123, s4567, filter);
- d1 = convolve8_4_usdot_partial(s1234, s5678, filter);
- d2 = convolve8_4_usdot_partial(s2345, s6789, filter);
- d3 = convolve8_4_usdot_partial(s3456, s78910, filter);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
-
- store_u8_4x1(dst + 0 * dst_stride, d01, 0);
- store_u8_4x1(dst + 1 * dst_stride, d01, 1);
- store_u8_4x1(dst + 2 * dst_stride, d23, 0);
- store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
- s0123 = s4567;
- s1234 = s5678;
- s2345 = s6789;
- s3456 = s78910;
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h != 0);
- } else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
- s6789_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
- do {
- height = h;
- s = src;
- d = dst;
-
- load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- s += 7 * src_stride;
-
- s7 = vdup_n_u8(0);
- s8 = vdup_n_u8(0);
- s9 = vdup_n_u8(0);
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
- tran_concat_tbl);
-
- do {
- load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
-
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-
- samples_LUT.val[0] = s3456_hi;
- samples_LUT.val[1] = s78910_hi;
- s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-
- d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- filter);
- d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- filter);
- d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- filter);
- d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- filter);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
- s0123_lo = s4567_lo;
- s0123_hi = s4567_hi;
- s1234_lo = s5678_lo;
- s1234_hi = s5678_hi;
- s2345_lo = s6789_lo;
- s2345_hi = s6789_hi;
- s3456_lo = s78910_lo;
- s3456_hi = s78910_hi;
-
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
- } while (height != 0);
- src += 8;
- dst += 8;
- w -= 8;
- } while (w != 0);
- }
-}
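
The "merge new data" step in the vertical kernel above is the interesting part: instead of redoing four transposes per iteration, vqtbl2q_u8 with dot_prod_merge_block_tbl shifts the previous transposed block left by one, two or three columns and splices in the freshly loaded rows. The standalone plain-C model below reproduces the first table row (indices under 16 select the first vector, 16 and up the second, mimicking vqtbl2q_u8 on a two-vector table); it is an illustration, not library code.

#include <assert.h>
#include <stdint.h>

/* First row of dot_prod_merge_block_tbl: shift one column, insert one new. */
static const uint8_t merge_tbl0[16] = { 1, 2,  3,  16, 5,  6,  7,  20,
                                        9, 10, 11, 24, 13, 14, 15, 28 };

int main(void) {
  uint8_t s3456[16], s78910[16], s4567[16];
  /* Encode row r, column c as r * 16 + c. Layout after transpose_concat:
   * four consecutive bytes per column, rows ascending. */
  for (int c = 0; c < 4; ++c) {
    for (int k = 0; k < 4; ++k) {
      s3456[4 * c + k] = (uint8_t)((3 + k) * 16 + c);
      s78910[4 * c + k] = (uint8_t)((7 + k) * 16 + c);
    }
  }
  /* Emulate vqtbl2q_u8 on the { s3456, s78910 } pair. */
  for (int i = 0; i < 16; ++i)
    s4567[i] = merge_tbl0[i] < 16 ? s3456[merge_tbl0[i]]
                                  : s78910[merge_tbl0[i] - 16];
  /* The result spans rows 4..7 for every column, as the name promises. */
  for (int c = 0; c < 4; ++c)
    for (int k = 0; k < 4; ++k)
      assert(s4567[4 * c + k] == (4 + k) * 16 + c);
  return 0;
}
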
-
-#else // !defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples,
- const int8x8_t filter,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x2_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[2];
- int32x4_t sum;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- sum = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
- sum = vdotq_lane_s32(sum, permuted_samples[1], filter, 1);
-
- /* Further narrowing and packing is performed by the caller. */
- return vqmovn_s32(sum);
-}
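
The correction seed used above deserves one line of algebra: the range clamp replaces each sample s with s - 128, and since sum_k f[k] * (s[k] - 128) = sum_k f[k] * s[k] - 128 * sum_k f[k], starting the accumulator at 128 * sum_k f[k] restores the true convolution. Below is a standalone check of that identity; the taps and samples are arbitrary illustrative values (libaom's subpel taps happen to sum to 128, but the identity does not depend on that).

#include <assert.h>
#include <stdint.h>

int main(void) {
  const int8_t f[8] = { -1, 3, -11, 68, 77, -13, 4, -1 }; /* example taps */
  const uint8_t s[8] = { 250, 3, 128, 7, 99, 200, 0, 45 };
  int32_t direct = 0, shifted = 0, correction = 0;
  for (int k = 0; k < 8; ++k) {
    direct += f[k] * s[k];                   /* what we actually want */
    shifted += f[k] * ((int32_t)s[k] - 128); /* what sdot sees after the
                                              * [-128, 127] range clamp */
    correction += 128 * f[k];                /* the precomputed seed */
  }
  assert(direct == shifted + correction);    /* the clamp is exactly undone */
  return 0;
}
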
-
-static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
- const int8x8_t filter,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x3_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[3];
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- sum0 = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
- sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filter, 1);
- /* Second 4 output values. */
- sum1 = vdotq_lane_s32(correction, permuted_samples[1], filter, 0);
- sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filter, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
- const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_x), 128);
- const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
- const uint8x16_t range_limit = vdupq_n_u8(128);
- uint8x16_t s0, s1, s2, s3;
-
- assert((intptr_t)dst % 4 == 0);
- assert(dst_stride % 4 == 0);
-
- (void)x_step_q4;
- (void)filter_y;
- (void)y_step_q4;
-
- src -= ((SUBPEL_TAPS / 2) - 1);
-
- if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23;
-
- load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
- t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
- t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
- t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
-
- store_u8_4x1(dst + 0 * dst_stride, d01, 0);
- store_u8_4x1(dst + 1 * dst_stride, d01, 1);
- store_u8_4x1(dst + 2 * dst_stride, d23, 0);
- store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 0);
- } else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
-
- do {
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
- d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 0);
- }
-}
-
-static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
- int8x8_t a3, int8x16_t *b,
- const uint8x16_t permute_tbl) {
- /* Transpose 8-bit elements and concatenate result rows as follows:
- * a0: 00, 01, 02, 03, XX, XX, XX, XX
- * a1: 10, 11, 12, 13, XX, XX, XX, XX
- * a2: 20, 21, 22, 23, XX, XX, XX, XX
- * a3: 30, 31, 32, 33, XX, XX, XX, XX
- *
- * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- *
- * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
- * as an argument is preferable to loading it directly from memory as this
- * inline helper is called many times from the same parent function.
- */
-
- int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
- *b = vqtbl2q_s8(samples, permute_tbl);
-}
-
-static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
- int8x8_t a3, int8x16_t *b0,
- int8x16_t *b1,
- const uint8x16x2_t permute_tbl) {
- /* Transpose 8-bit elements and concatenate result rows as follows:
- * a0: 00, 01, 02, 03, 04, 05, 06, 07
- * a1: 10, 11, 12, 13, 14, 15, 16, 17
- * a2: 20, 21, 22, 23, 24, 25, 26, 27
- * a3: 30, 31, 32, 33, 34, 35, 36, 37
- *
- * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
- *
- * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
- * as an argument is preferable to loading it directly from memory as this
- * inline helper is called many times from the same parent function.
- */
-
- int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
- *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
- *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
-}
-
-static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
- const int8x16_t samples_hi,
- const int32x4_t correction,
- const int8x8_t filter) {
- /* Sample range-clamping and permutation are performed by the caller. */
- int32x4_t sum;
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- sum = vdotq_lane_s32(correction, samples_lo, filter, 0);
- sum = vdotq_lane_s32(sum, samples_hi, filter, 1);
-
- /* Further narrowing and packing is performed by the caller. */
- return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
- const int8x16_t samples0_hi,
- const int8x16_t samples1_lo,
- const int8x16_t samples1_hi,
- const int32x4_t correction,
- const int8x8_t filter) {
- /* Sample range-clamping and permutation are performed by the caller. */
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- sum0 = vdotq_lane_s32(correction, samples0_lo, filter, 0);
- sum0 = vdotq_lane_s32(sum0, samples0_hi, filter, 1);
- /* Second 4 output values. */
- sum1 = vdotq_lane_s32(correction, samples1_lo, filter, 0);
- sum1 = vdotq_lane_s32(sum1, samples1_hi, filter, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
- const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_y), 128);
- const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
- const uint8x8_t range_limit = vdup_n_u8(128);
- const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t t0, t1, t2, t3, t4, t5, t6;
- int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int8x16x2_t samples_LUT;
-
- assert((intptr_t)dst % 4 == 0);
- assert(dst_stride % 4 == 0);
-
- (void)filter_x;
- (void)x_step_q4;
- (void)y_step_q4;
-
- src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
-
- if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
- load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- src += 7 * src_stride;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
- s7 = vdup_n_s8(0);
- s8 = vdup_n_s8(0);
- s9 = vdup_n_s8(0);
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
- transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
- transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
- transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
-
- do {
- uint8x8_t t7, t8, t9, t10;
-
- load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
-
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
-
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
-
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
- d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter);
- d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter);
- d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter);
- d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
-
- store_u8_4x1(dst + 0 * dst_stride, d01, 0);
- store_u8_4x1(dst + 1 * dst_stride, d01, 1);
- store_u8_4x1(dst + 2 * dst_stride, d23, 0);
- store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
- s0123 = s4567;
- s1234 = s5678;
- s2345 = s6789;
- s3456 = s78910;
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h != 0);
- } else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
- s6789_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
- do {
- height = h;
- s = src;
- d = dst;
-
- load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- s += 7 * src_stride;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
- s7 = vdup_n_s8(0);
- s8 = vdup_n_s8(0);
- s9 = vdup_n_s8(0);
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
- tran_concat_tbl);
-
- do {
- uint8x8_t t7, t8, t9, t10;
-
- load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
-
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
-
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
-
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
- samples_LUT.val[0] = s3456_hi;
- samples_LUT.val[1] = s78910_hi;
- s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
- d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- correction, filter);
- d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- correction, filter);
- d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- correction, filter);
- d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- correction, filter);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
- s0123_lo = s4567_lo;
- s0123_hi = s4567_hi;
- s1234_lo = s5678_lo;
- s1234_hi = s5678_hi;
- s2345_lo = s6789_lo;
- s2345_hi = s6789_hi;
- s3456_lo = s78910_lo;
- s3456_hi = s78910_hi;
-
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
- } while (height != 0);
- src += 8;
- dst += 8;
- w -= 8;
- } while (w != 0);
- }
-}
-
-#endif // defined(__ARM_FEATURE_MATMUL_INT8)
-
-#else // !(AOM_ARCH_AARCH64 &&
- // (defined(__ARM_FEATURE_DOTPROD) ||
- // defined(__ARM_FEATURE_MATMUL_INT8)))
-
static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
const int16x4_t s4, const int16x4_t s5,
@@ -905,7 +85,7 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
@@ -918,7 +98,7 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
do {
load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
@@ -931,7 +111,7 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
- transpose_u8_4x4(&d01, &d23);
+ transpose_elems_inplace_u8_4x4(&d01, &d23);
store_u8_4x1(dst + 0 * dst_stride, d01, 0);
store_u8_4x1(dst + 1 * dst_stride, d23, 0);
@@ -956,7 +136,7 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
if (w == 4) {
do {
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
@@ -967,7 +147,8 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
&t7);
- transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
+ transpose_elems_u8_4x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2,
+ &t3);
s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
@@ -978,7 +159,7 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
- transpose_u8_8x4(&d0, &d1, &d2, &d3);
+ transpose_elems_inplace_u8_8x4(&d0, &d1, &d2, &d3);
store_u8_4x1(dst + 0 * dst_stride, d0, 0);
store_u8_4x1(dst + 1 * dst_stride, d1, 0);
@@ -1002,7 +183,7 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
do {
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
@@ -1017,7 +198,8 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
do {
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6,
+ &t7);
s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
@@ -1036,7 +218,8 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter);
d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter);
- transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+ transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6,
+ &d7);
store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
@@ -1172,5 +355,3 @@ void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
} while (w != 0);
}
}
-
-#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
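
The dot-product kernels removed above (and re-landed in the new files below) all lean on one identity to feed unsigned pixels to the signed SDOT instruction: subtract 128 from every sample so it fits in [-128, 127], then fold the bias back in as a per-output constant of 128 * sum(filter). A minimal scalar sketch of that identity (helper names here are hypothetical, for illustration only):

    #include <stdint.h>

    /* f[k]*s[k] == f[k]*(s[k] - 128) + 128*f[k], summed over the taps. */
    static int32_t convolve_ref(const uint8_t *s, const int8_t *f, int taps) {
      int32_t sum = 0;
      for (int k = 0; k < taps; ++k) sum += f[k] * s[k];
      return sum;
    }

    static int32_t convolve_clamped(const uint8_t *s, const int8_t *f,
                                    int taps) {
      int32_t correction = 0;
      for (int k = 0; k < taps; ++k) correction += 128 * f[k];
      /* Accumulate the signed dot product into the correction term, exactly
       * as the NEON code seeds its accumulator with 'correction'. */
      int32_t sum = correction;
      for (int k = 0; k < taps; ++k) sum += f[k] * (int8_t)(s[k] - 128);
      return sum;
    }

For any filter and pixel row, convolve_clamped() returns the same value as convolve_ref(), which is why the NEON code can start from 'correction' and never widen the unsigned samples.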
diff --git a/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/aom_dsp/arm/aom_convolve8_neon_dotprod.c
new file mode 100644
index 000000000..e56541406
--- /dev/null
+++ b/aom_dsp/arm/aom_convolve8_neon_dotprod.c
@@ -0,0 +1,464 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+ /* Shift left and insert new last column in transposed 4x4 block. */
+ 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+ /* Shift left and insert two new columns in transposed 4x4 block. */
+ 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+ /* Shift left and insert three new columns in transposed 4x4 block. */
+ 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
+
+static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples,
+ const int8x8_t filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x2_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[2];
+ int32x4_t sum;
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ sum = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
+ sum = vdotq_lane_s32(sum, permuted_samples[1], filter, 1);
+
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
+ const int8x8_t filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[3];
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+ /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ /* First 4 output values. */
+ sum0 = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
+ sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filter, 1);
+ /* Second 4 output values. */
+ sum1 = vdotq_lane_s32(correction, permuted_samples[1], filter, 0);
+ sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filter, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
+ const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_x), 128);
+ const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+ uint8x16_t s0, s1, s2, s3;
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ (void)x_step_q4;
+ (void)filter_y;
+ (void)y_step_q4;
+
+ src -= ((SUBPEL_TAPS / 2) - 1);
+
+ if (w == 4) {
+ const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int16x4_t t0, t1, t2, t3;
+ uint8x8_t d01, d23;
+
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
+ t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
+ t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
+ t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl);
+ d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+
+ store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+ store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+ store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+ store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
+ d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
+ d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
+ d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
+
+static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+ int8x8_t a3, int8x16_t *b,
+ const uint8x16_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, XX, XX, XX, XX
+ * a1: 10, 11, 12, 13, XX, XX, XX, XX
+ * a2: 20, 21, 22, 23, XX, XX, XX, XX
+ * a3: 30, 31, 32, 33, XX, XX, XX, XX
+ *
+ * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+ *b = vqtbl2q_s8(samples, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+ int8x8_t a3, int8x16_t *b0,
+ int8x16_t *b1,
+ const uint8x16x2_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, 04, 05, 06, 07
+ * a1: 10, 11, 12, 13, 14, 15, 16, 17
+ * a2: 20, 21, 22, 23, 24, 25, 26, 27
+ * a3: 30, 31, 32, 33, 34, 35, 36, 37
+ *
+ * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+ *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
+ *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
+}
+
+static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
+ const int8x16_t samples_hi,
+ const int32x4_t correction,
+ const int8x8_t filter) {
+ /* Sample range-clamping and permutation are performed by the caller. */
+ int32x4_t sum;
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ sum = vdotq_lane_s32(correction, samples_lo, filter, 0);
+ sum = vdotq_lane_s32(sum, samples_hi, filter, 1);
+
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
+ const int8x16_t samples0_hi,
+ const int8x16_t samples1_lo,
+ const int8x16_t samples1_hi,
+ const int32x4_t correction,
+ const int8x8_t filter) {
+ /* Sample range-clamping and permutation are performed by the caller. */
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ /* First 4 output values. */
+ sum0 = vdotq_lane_s32(correction, samples0_lo, filter, 0);
+ sum0 = vdotq_lane_s32(sum0, samples0_hi, filter, 1);
+ /* Second 4 output values. */
+ sum1 = vdotq_lane_s32(correction, samples1_lo, filter, 0);
+ sum1 = vdotq_lane_s32(sum1, samples1_hi, filter, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
+ const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_y), 128);
+ const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+ const uint8x8_t range_limit = vdup_n_u8(128);
+ const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+ int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int8x16x2_t samples_LUT;
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ (void)filter_x;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
+
+ if (w == 4) {
+ const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+ int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+ int16x4_t d0, d1, d2, d3;
+ uint8x8_t d01, d23;
+
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ src += 7 * src_stride;
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+ s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+ s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+ s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+ s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+ s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+ s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+ s7 = vdup_n_s8(0);
+ s8 = vdup_n_s8(0);
+ s9 = vdup_n_s8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+ do {
+ uint8x8_t t7, t8, t9, t10;
+
+ load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+ s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+ s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+ s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+ s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456;
+ samples_LUT.val[1] = s78910;
+ s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter);
+ d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter);
+ d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter);
+ d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+ store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+ store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+ store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+ store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123 = s4567;
+ s1234 = s5678;
+ s2345 = s6789;
+ s3456 = s78910;
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+ int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+ s6789_hi, s78910_lo, s78910_hi;
+ uint8x8_t d0, d1, d2, d3;
+ const uint8_t *s;
+ uint8_t *d;
+ int height;
+
+ do {
+ height = h;
+ s = src;
+ d = dst;
+
+ load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s += 7 * src_stride;
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+ s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+ s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+ s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+ s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+ s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+ s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+ s7 = vdup_n_s8(0);
+ s8 = vdup_n_s8(0);
+ s9 = vdup_n_s8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+ tran_concat_tbl);
+
+ do {
+ uint8x8_t t7, t8, t9, t10;
+
+ load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+ s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+ s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+ s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+ s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+ tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456_lo;
+ samples_LUT.val[1] = s78910_lo;
+ s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ samples_LUT.val[0] = s3456_hi;
+ samples_LUT.val[1] = s78910_hi;
+ s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ correction, filter);
+ d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ correction, filter);
+ d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ correction, filter);
+ d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ correction, filter);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123_lo = s4567_lo;
+ s0123_hi = s4567_hi;
+ s1234_lo = s5678_lo;
+ s1234_hi = s5678_hi;
+ s2345_lo = s6789_lo;
+ s2345_hi = s6789_hi;
+ s3456_lo = s78910_lo;
+ s3456_hi = s78910_hi;
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
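
The i8mm file that follows reuses the same structure but swaps vdotq_lane_s32 for vusdotq_lane_s32, which multiplies unsigned 8-bit samples by signed 8-bit filter taps directly; the range clamp and the correction constant therefore disappear and accumulation starts from zero. A scalar model of one 32-bit USDOT lane, assuming the usual Armv8.6-A semantics:

    #include <stdint.h>

    /* One output lane of USDOT: four unsigned bytes times four signed bytes,
     * widened and accumulated into a 32-bit lane. */
    static int32_t usdot_lane(int32_t acc, const uint8_t s[4],
                              const int8_t f[4]) {
      for (int k = 0; k < 4; ++k) acc += (int32_t)s[k] * f[k];
      return acc;
    }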
diff --git a/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/aom_dsp/arm/aom_convolve8_neon_i8mm.c
new file mode 100644
index 000000000..d778e8aed
--- /dev/null
+++ b/aom_dsp/arm/aom_convolve8_neon_i8mm.c
@@ -0,0 +1,413 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+ /* Shift left and insert new last column in transposed 4x4 block. */
+ 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+ /* Shift left and insert two new columns in transposed 4x4 block. */
+ 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+ /* Shift left and insert three new columns in transposed 4x4 block. */
+ 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
+
+static INLINE int16x4_t convolve8_4_usdot(uint8x16_t samples,
+ const int8x8_t filter,
+ const uint8x16x2_t permute_tbl) {
+ uint8x16_t permuted_samples[2];
+ int32x4_t sum;
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+
+ /* Accumulate the dot product. */
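+  /* vusdot consumes unsigned samples directly, so accumulation starts from
+   * zero and no range-clamp correction is needed (compare the sdot path). */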
+ sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0);
+ sum = vusdotq_lane_s32(sum, permuted_samples[1], filter, 1);
+
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples,
+ const int8x8_t filter,
+ const uint8x16x3_t permute_tbl) {
+ uint8x16_t permuted_samples[3];
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+ /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+ permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+ /* First 4 output values. */
+ sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0);
+ sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filter, 1);
+ /* Second 4 output values. */
+ sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filter, 0);
+ sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filter, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
+ uint8x16_t s0, s1, s2, s3;
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ (void)x_step_q4;
+ (void)filter_y;
+ (void)y_step_q4;
+
+ src -= ((SUBPEL_TAPS / 2) - 1);
+
+ if (w == 4) {
+ const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int16x4_t t0, t1, t2, t3;
+ uint8x8_t d01, d23;
+
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_usdot(s0, filter, perm_tbl);
+ t1 = convolve8_4_usdot(s1, filter, perm_tbl);
+ t2 = convolve8_4_usdot(s2, filter, perm_tbl);
+ t3 = convolve8_4_usdot(s3, filter, perm_tbl);
+ d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+
+ store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+ store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+ store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+ store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 = convolve8_8_usdot(s0, filter, perm_tbl);
+ d1 = convolve8_8_usdot(s1, filter, perm_tbl);
+ d2 = convolve8_8_usdot(s2, filter, perm_tbl);
+ d3 = convolve8_8_usdot(s3, filter, perm_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
+
+static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x16_t *b,
+ const uint8x16_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, XX, XX, XX, XX
+ * a1: 10, 11, 12, 13, XX, XX, XX, XX
+ * a2: 20, 21, 22, 23, XX, XX, XX, XX
+ * a3: 30, 31, 32, 33, XX, XX, XX, XX
+ *
+ * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+ *b = vqtbl2q_u8(samples, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x16_t *b0, uint8x16_t *b1,
+ const uint8x16x2_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, 04, 05, 06, 07
+ * a1: 10, 11, 12, 13, 14, 15, 16, 17
+ * a2: 20, 21, 22, 23, 24, 25, 26, 27
+ * a3: 30, 31, 32, 33, 34, 35, 36, 37
+ *
+ * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+ *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]);
+ *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]);
+}
+
+static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
+ const uint8x16_t samples_hi,
+ const int8x8_t filter) {
+ /* Sample permutation is performed by the caller. */
+ int32x4_t sum;
+
+ sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filter, 0);
+ sum = vusdotq_lane_s32(sum, samples_hi, filter, 1);
+
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
+ const uint8x16_t samples0_hi,
+ const uint8x16_t samples1_lo,
+ const uint8x16_t samples1_hi,
+ const int8x8_t filter) {
+ /* Sample permutation is performed by the caller. */
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* First 4 output values. */
+ sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filter, 0);
+ sum0 = vusdotq_lane_s32(sum0, samples0_hi, filter, 1);
+ /* Second 4 output values. */
+ sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filter, 0);
+ sum1 = vusdotq_lane_s32(sum1, samples1_hi, filter, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
+ const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint8x16x2_t samples_LUT;
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ (void)filter_x;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
+
+ if (w == 4) {
+ const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+ int16x4_t d0, d1, d2, d3;
+ uint8x8_t d01, d23;
+
+ load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ src += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+ do {
+ load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456;
+ samples_LUT.val[1] = s78910;
+ s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_4_usdot_partial(s0123, s4567, filter);
+ d1 = convolve8_4_usdot_partial(s1234, s5678, filter);
+ d2 = convolve8_4_usdot_partial(s2345, s6789, filter);
+ d3 = convolve8_4_usdot_partial(s3456, s78910, filter);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+ store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+ store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+ store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+ store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123 = s4567;
+ s1234 = s5678;
+ s2345 = s6789;
+ s3456 = s78910;
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+ s6789_hi, s78910_lo, s78910_hi;
+ uint8x8_t d0, d1, d2, d3;
+ const uint8_t *s;
+ uint8_t *d;
+ int height;
+
+ do {
+ height = h;
+ s = src;
+ d = dst;
+
+ load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+ tran_concat_tbl);
+
+ do {
+ load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+ tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456_lo;
+ samples_LUT.val[1] = s78910_lo;
+ s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ samples_LUT.val[0] = s3456_hi;
+ samples_LUT.val[1] = s78910_hi;
+ s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ filter);
+ d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ filter);
+ d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ filter);
+ d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ filter);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123_lo = s4567_lo;
+ s0123_hi = s4567_hi;
+ s1234_lo = s5678_lo;
+ s1234_hi = s5678_hi;
+ s2345_lo = s6789_lo;
+ s2345_hi = s6789_hi;
+ s3456_lo = s78910_lo;
+ s3456_hi = s78910_hi;
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
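
Both vertical kernels keep a transposed four-row window live across loop iterations: after loading four new source rows, dot_prod_merge_block_tbl shifts each transposed group along by one, two or three rows and splices in the new data with TBL, instead of re-running the transpose. A scalar sketch of the first merge pattern (the array layout and names are illustrative only):

    #include <stdint.h>

    /* Sketch of merge_block_tbl.val[0]: blk[p][i] holds source row (base + i)
     * at pixel position p. Dropping the oldest row and appending the first
     * new one turns the rows 3..6 block into the rows 4..7 block. */
    static void merge_shift1(int8_t out[4][4], const int8_t prev[4][4],
                             const int8_t next[4][4]) {
      for (int p = 0; p < 4; ++p) {
        out[p][0] = prev[p][1];
        out[p][1] = prev[p][2];
        out[p][2] = prev[p][3];
        out[p][3] = next[p][0];
      }
    }

val[1] and val[2] apply the same idea shifted by two and three rows, which is what lets the loop rebuild s4567/s5678/s6789 from s3456 and s78910 with three table lookups.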
diff --git a/aom_dsp/arm/aom_convolve_copy_neon.c b/aom_dsp/arm/aom_convolve_copy_neon.c
index 583d83211..d746f9e4d 100644
--- a/aom_dsp/arm/aom_convolve_copy_neon.c
+++ b/aom_dsp/arm/aom_convolve_copy_neon.c
@@ -50,3 +50,104 @@ void aom_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
}
}
}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride, int w,
+ int h) {
+ if (w < 8) { // copy4
+ uint16x4_t s0, s1;
+ do {
+ s0 = vld1_u16(src);
+ src += src_stride;
+ s1 = vld1_u16(src);
+ src += src_stride;
+
+ vst1_u16(dst, s0);
+ dst += dst_stride;
+ vst1_u16(dst, s1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 8) { // copy8
+ uint16x8_t s0, s1;
+ do {
+ s0 = vld1q_u16(src);
+ src += src_stride;
+ s1 = vld1q_u16(src);
+ src += src_stride;
+
+ vst1q_u16(dst, s0);
+ dst += dst_stride;
+ vst1q_u16(dst, s1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w < 32) { // copy16
+ uint16x8_t s0, s1, s2, s3;
+ do {
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ src += src_stride;
+ s2 = vld1q_u16(src);
+ s3 = vld1q_u16(src + 8);
+ src += src_stride;
+
+ vst1q_u16(dst, s0);
+ vst1q_u16(dst + 8, s1);
+ dst += dst_stride;
+ vst1q_u16(dst, s2);
+ vst1q_u16(dst + 8, s3);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 32) { // copy32
+ uint16x8_t s0, s1, s2, s3;
+ do {
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ s2 = vld1q_u16(src + 16);
+ s3 = vld1q_u16(src + 24);
+ src += src_stride;
+
+ vst1q_u16(dst, s0);
+ vst1q_u16(dst + 8, s1);
+ vst1q_u16(dst + 16, s2);
+ vst1q_u16(dst + 24, s3);
+ dst += dst_stride;
+ } while (--h != 0);
+ } else { // copy64
+ uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ do {
+ const uint16_t *s = src;
+ uint16_t *d = dst;
+ int width = w;
+ do {
+ s0 = vld1q_u16(s);
+ s1 = vld1q_u16(s + 8);
+ s2 = vld1q_u16(s + 16);
+ s3 = vld1q_u16(s + 24);
+ s4 = vld1q_u16(s + 32);
+ s5 = vld1q_u16(s + 40);
+ s6 = vld1q_u16(s + 48);
+ s7 = vld1q_u16(s + 56);
+
+ vst1q_u16(d, s0);
+ vst1q_u16(d + 8, s1);
+ vst1q_u16(d + 16, s2);
+ vst1q_u16(d + 24, s3);
+ vst1q_u16(d + 32, s4);
+ vst1q_u16(d + 40, s5);
+ vst1q_u16(d + 48, s6);
+ vst1q_u16(d + 56, s7);
+ s += 64;
+ d += 64;
+ width -= 64;
+ } while (width > 0);
+ src += src_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ }
+}
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
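
Behaviourally, the high-bitdepth copy added above is a row-wise memcpy; the width classes only choose vector widths and unroll factors. A scalar reference it should match bit-for-bit (a test sketch, not library code; note that the strides are in uint16_t units):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void highbd_copy_ref(const uint16_t *src, ptrdiff_t src_stride,
                                uint16_t *dst, ptrdiff_t dst_stride, int w,
                                int h) {
      for (int y = 0; y < h; ++y) {
        memcpy(dst, src, (size_t)w * sizeof(*src));  /* One row of samples. */
        src += src_stride;
        dst += dst_stride;
      }
    }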
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index ef2f3af0f..2e79b2ef6 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -10,6 +10,7 @@
#include <arm_neon.h>
#include <assert.h>
+#include <stdlib.h>
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
@@ -19,75 +20,68 @@
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
-#if !AOM_ARCH_AARCH64
-static INLINE uint32x2_t horizontal_add_u16x8_v(const uint16x8_t a) {
- const uint32x4_t b = vpaddlq_u16(a);
- const uint64x2_t c = vpaddlq_u32(b);
- return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
- vreinterpret_u32_u64(vget_high_u64(c)));
-}
-#endif
+unsigned int aom_avg_4x4_neon(const uint8_t *p, int stride) {
+ const uint8x8_t s0 = load_unaligned_u8(p, stride);
+ const uint8x8_t s1 = load_unaligned_u8(p + 2 * stride, stride);
-unsigned int aom_avg_4x4_neon(const uint8_t *a, int a_stride) {
- const uint8x16_t b = load_unaligned_u8q(a, a_stride);
- const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b));
-#if AOM_ARCH_AARCH64
- const uint32_t d = vaddlvq_u16(c);
- return (d + 8) >> 4;
-#else
- const uint32x2_t d = horizontal_add_u16x8_v(c);
- return vget_lane_u32(vrshr_n_u32(d, 4), 0);
-#endif
+ const uint32_t sum = horizontal_add_u16x8(vaddl_u8(s0, s1));
+ return (sum + (1 << 3)) >> 4;
}
-unsigned int aom_avg_8x8_neon(const uint8_t *a, int a_stride) {
- uint16x8_t sum;
- uint8x8_t b = vld1_u8(a);
- a += a_stride;
- uint8x8_t c = vld1_u8(a);
- a += a_stride;
- sum = vaddl_u8(b, c);
-
- for (int i = 0; i < 6; ++i) {
- const uint8x8_t e = vld1_u8(a);
- a += a_stride;
- sum = vaddw_u8(sum, e);
- }
+unsigned int aom_avg_8x8_neon(const uint8_t *p, int stride) {
+ uint8x8_t s0 = vld1_u8(p);
+ p += stride;
+ uint8x8_t s1 = vld1_u8(p);
+ p += stride;
+ uint16x8_t acc = vaddl_u8(s0, s1);
-#if AOM_ARCH_AARCH64
- const uint32_t d = vaddlvq_u16(sum);
- return (d + 32) >> 6;
-#else
- const uint32x2_t d = horizontal_add_u16x8_v(sum);
- return vget_lane_u32(vrshr_n_u32(d, 6), 0);
-#endif
+ int i = 0;
+ do {
+ const uint8x8_t si = vld1_u8(p);
+ p += stride;
+ acc = vaddw_u8(acc, si);
+ } while (++i < 6);
+
+ const uint32_t sum = horizontal_add_u16x8(acc);
+ return (sum + (1 << 5)) >> 6;
}
void aom_avg_8x8_quad_neon(const uint8_t *s, int p, int x16_idx, int y16_idx,
int *avg) {
- for (int k = 0; k < 4; k++) {
- const int x8_idx = x16_idx + ((k & 1) << 3);
- const int y8_idx = y16_idx + ((k >> 1) << 3);
- const uint8_t *s_tmp = s + y8_idx * p + x8_idx;
- avg[k] = aom_avg_8x8_neon(s_tmp, p);
- }
+ avg[0] = aom_avg_8x8_neon(s + y16_idx * p + x16_idx, p);
+ avg[1] = aom_avg_8x8_neon(s + y16_idx * p + (x16_idx + 8), p);
+ avg[2] = aom_avg_8x8_neon(s + (y16_idx + 8) * p + x16_idx, p);
+ avg[3] = aom_avg_8x8_neon(s + (y16_idx + 8) * p + (x16_idx + 8), p);
}
int aom_satd_lp_neon(const int16_t *coeff, int length) {
- const int16x4_t zero = vdup_n_s16(0);
- int32x4_t accum = vdupq_n_s32(0);
+ int16x8_t s0 = vld1q_s16(coeff);
+ int16x8_t s1 = vld1q_s16(coeff + 8);
+
+ int16x8_t abs0 = vabsq_s16(s0);
+ int16x8_t abs1 = vabsq_s16(s1);
+
+ int32x4_t acc0 = vpaddlq_s16(abs0);
+ int32x4_t acc1 = vpaddlq_s16(abs1);
+
+ length -= 16;
+ coeff += 16;
+
+ while (length != 0) {
+ s0 = vld1q_s16(coeff);
+ s1 = vld1q_s16(coeff + 8);
+
+ abs0 = vabsq_s16(s0);
+ abs1 = vabsq_s16(s1);
+
+ acc0 = vpadalq_s16(acc0, abs0);
+ acc1 = vpadalq_s16(acc1, abs1);
- do {
- const int16x8_t src0 = vld1q_s16(coeff);
- const int16x8_t src8 = vld1q_s16(coeff + 8);
- accum = vabal_s16(accum, vget_low_s16(src0), zero);
- accum = vabal_s16(accum, vget_high_s16(src0), zero);
- accum = vabal_s16(accum, vget_low_s16(src8), zero);
- accum = vabal_s16(accum, vget_high_s16(src8), zero);
length -= 16;
coeff += 16;
- } while (length != 0);
+ }
+ int32x4_t accum = vaddq_s32(acc0, acc1);
return horizontal_add_s32x4(accum);
}
@@ -180,56 +174,84 @@ void aom_int_pro_col_neon(int16_t *vbuf, const uint8_t *ref,
} while (h < height);
}
-// coeff: 16 bits, dynamic range [-32640, 32640].
-// length: value range {16, 64, 256, 1024}.
+// coeff: 20 bits, dynamic range [-524287, 524287].
+// length: value range {16, 32, 64, 128, 256, 512, 1024}.
int aom_satd_neon(const tran_low_t *coeff, int length) {
const int32x4_t zero = vdupq_n_s32(0);
- int32x4_t accum = zero;
- do {
- const int32x4_t src0 = vld1q_s32(&coeff[0]);
- const int32x4_t src8 = vld1q_s32(&coeff[4]);
- const int32x4_t src16 = vld1q_s32(&coeff[8]);
- const int32x4_t src24 = vld1q_s32(&coeff[12]);
- accum = vabaq_s32(accum, src0, zero);
- accum = vabaq_s32(accum, src8, zero);
- accum = vabaq_s32(accum, src16, zero);
- accum = vabaq_s32(accum, src24, zero);
+
+ int32x4_t s0 = vld1q_s32(&coeff[0]);
+ int32x4_t s1 = vld1q_s32(&coeff[4]);
+ int32x4_t s2 = vld1q_s32(&coeff[8]);
+ int32x4_t s3 = vld1q_s32(&coeff[12]);
+
+ int32x4_t accum0 = vabsq_s32(s0);
+ int32x4_t accum1 = vabsq_s32(s2);
+ accum0 = vabaq_s32(accum0, s1, zero);
+ accum1 = vabaq_s32(accum1, s3, zero);
+
+ length -= 16;
+ coeff += 16;
+
+ while (length != 0) {
+ s0 = vld1q_s32(&coeff[0]);
+ s1 = vld1q_s32(&coeff[4]);
+ s2 = vld1q_s32(&coeff[8]);
+ s3 = vld1q_s32(&coeff[12]);
+
+ accum0 = vabaq_s32(accum0, s0, zero);
+ accum1 = vabaq_s32(accum1, s1, zero);
+ accum0 = vabaq_s32(accum0, s2, zero);
+ accum1 = vabaq_s32(accum1, s3, zero);
+
length -= 16;
coeff += 16;
- } while (length != 0);
+ }
- // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
- return horizontal_add_s32x4(accum);
+ // satd: 30 bits, dynamic range [-524287 * 1024, 524287 * 1024]
+ return horizontal_add_s32x4(vaddq_s32(accum0, accum1));
}
int aom_vector_var_neon(const int16_t *ref, const int16_t *src, int bwl) {
- int32x4_t v_mean = vdupq_n_s32(0);
- int32x4_t v_sse = v_mean;
- int16x8_t v_ref, v_src;
- int16x4_t v_low;
-
- int i, width = 4 << bwl;
- for (i = 0; i < width; i += 8) {
- v_ref = vld1q_s16(&ref[i]);
- v_src = vld1q_s16(&src[i]);
- const int16x8_t diff = vsubq_s16(v_ref, v_src);
- // diff: dynamic range [-510, 510], 10 bits.
- v_mean = vpadalq_s16(v_mean, diff);
- v_low = vget_low_s16(diff);
- v_sse = vmlal_s16(v_sse, v_low, v_low);
-#if AOM_ARCH_AARCH64
- v_sse = vmlal_high_s16(v_sse, diff, diff);
-#else
- const int16x4_t v_high = vget_high_s16(diff);
- v_sse = vmlal_s16(v_sse, v_high, v_high);
-#endif
- }
- const int mean = horizontal_add_s32x4(v_mean);
- const int sse = horizontal_add_s32x4(v_sse);
- const unsigned int mean_abs = mean >= 0 ? mean : -mean;
- // (mean * mean): dynamic range 31 bits.
- const int var = sse - ((mean_abs * mean_abs) >> (bwl + 2));
- return var;
+ assert(bwl >= 2 && bwl <= 5);
+ int width = 4 << bwl;
+
+ int16x8_t r = vld1q_s16(ref);
+ int16x8_t s = vld1q_s16(src);
+
+ // diff: dynamic range [-510, 510] 10 (signed) bits.
+ int16x8_t diff = vsubq_s16(r, s);
+ // v_mean: dynamic range 16 * diff -> [-8160, 8160], 14 (signed) bits.
+ int16x8_t v_mean = diff;
+ // v_sse: dynamic range 2 * 16 * diff^2 -> [0, 8,323,200], 24 (signed) bits.
+ int32x4_t v_sse[2];
+ v_sse[0] = vmull_s16(vget_low_s16(diff), vget_low_s16(diff));
+ v_sse[1] = vmull_s16(vget_high_s16(diff), vget_high_s16(diff));
+
+ ref += 8;
+ src += 8;
+ width -= 8;
+
+ do {
+ r = vld1q_s16(ref);
+ s = vld1q_s16(src);
+
+ diff = vsubq_s16(r, s);
+ v_mean = vaddq_s16(v_mean, diff);
+
+ v_sse[0] = vmlal_s16(v_sse[0], vget_low_s16(diff), vget_low_s16(diff));
+ v_sse[1] = vmlal_s16(v_sse[1], vget_high_s16(diff), vget_high_s16(diff));
+
+ ref += 8;
+ src += 8;
+ width -= 8;
+ } while (width != 0);
+
+ // Dynamic range [0, 65280], 16 (unsigned) bits.
+ const uint32_t mean_abs = abs(horizontal_add_s16x8(v_mean));
+ const int32_t sse = horizontal_add_s32x4(vaddq_s32(v_sse[0], v_sse[1]));
+
+ // (mean_abs * mean_abs): dynamic range 32 (unsigned) bits.
+ return sse - ((mean_abs * mean_abs) >> (bwl + 2));
}
void aom_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
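
The rewritten aom_vector_var_neon computes var = sse - mean^2 / n over n = 4 << bwl differences; since n = 2^(bwl + 2), the division reduces to the (bwl + 2) shift seen in the return statement. A scalar sketch under that reading (wider accumulators used for clarity):

    #include <stdint.h>

    static int vector_var_ref(const int16_t *ref, const int16_t *src,
                              int bwl) {
      const int n = 4 << bwl;  /* 16..128 elements for bwl in [2, 5]. */
      int64_t mean = 0, sse = 0;
      for (int i = 0; i < n; ++i) {
        const int diff = ref[i] - src[i];  /* Range [-510, 510]. */
        mean += diff;
        sse += (int64_t)diff * diff;
      }
      /* |mean| <= 510 * 128 = 65280, so mean_abs^2 fits in 32 unsigned bits,
       * matching the dynamic-range comments in the NEON version. */
      const uint32_t mean_abs = (uint32_t)(mean < 0 ? -mean : mean);
      return (int)(sse - ((mean_abs * mean_abs) >> (bwl + 2)));
    }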
diff --git a/aom_dsp/arm/avg_pred_neon.c b/aom_dsp/arm/avg_pred_neon.c
index 04e0904cb..b17f7fca7 100644
--- a/aom_dsp/arm/avg_pred_neon.c
+++ b/aom_dsp/arm/avg_pred_neon.c
@@ -13,6 +13,9 @@
#include <assert.h>
#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/blend.h"
@@ -74,6 +77,75 @@ void aom_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width,
}
}
+void aom_dist_wtd_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+
+ if (width > 8) {
+ do {
+ const uint8_t *pred_ptr = pred;
+ const uint8_t *ref_ptr = ref;
+ uint8_t *comp_pred_ptr = comp_pred;
+ int w = width;
+
+ do {
+ const uint8x16_t p = vld1q_u8(pred_ptr);
+ const uint8x16_t r = vld1q_u8(ref_ptr);
+
+ const uint8x16_t wtd_avg =
+ dist_wtd_avg_u8x16(r, p, fwd_offset, bck_offset);
+
+ vst1q_u8(comp_pred_ptr, wtd_avg);
+
+ ref_ptr += 16;
+ pred_ptr += 16;
+ comp_pred_ptr += 16;
+ w -= 16;
+ } while (w != 0);
+
+ ref += ref_stride;
+ pred += width;
+ comp_pred += width;
+ } while (--height != 0);
+ } else if (width == 8) {
+ int h = height / 2;
+
+ do {
+ const uint8x16_t p = vld1q_u8(pred);
+ const uint8x16_t r = load_u8_8x2(ref, ref_stride);
+
+ const uint8x16_t wtd_avg =
+ dist_wtd_avg_u8x16(r, p, fwd_offset, bck_offset);
+
+ vst1q_u8(comp_pred, wtd_avg);
+
+ ref += 2 * ref_stride;
+ pred += 16;
+ comp_pred += 16;
+ } while (--h != 0);
+ } else {
+ int h = height / 2;
+ assert(width == 4);
+
+ do {
+ const uint8x8_t p = vld1_u8(pred);
+ const uint8x8_t r = load_unaligned_u8_4x2(ref, ref_stride);
+
+ const uint8x8_t wtd_avg = dist_wtd_avg_u8x8(r, p, vget_low_u8(fwd_offset),
+ vget_low_u8(bck_offset));
+
+ vst1_u8(comp_pred, wtd_avg);
+
+ ref += 2 * ref_stride;
+ pred += 8;
+ comp_pred += 8;
+ } while (--h != 0);
+ }
+}
+
void aom_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width,
int height, const uint8_t *ref, int ref_stride,
const uint8_t *mask, int mask_stride,
@@ -84,7 +156,6 @@ void aom_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width,
const int src_stride1 = invert_mask ? ref_stride : width;
if (width > 8) {
- const uint8x16_t max_alpha = vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA);
do {
const uint8_t *src0_ptr = src0;
const uint8_t *src1_ptr = src1;
@@ -97,19 +168,7 @@ void aom_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width,
const uint8x16_t s1 = vld1q_u8(src1_ptr);
const uint8x16_t m0 = vld1q_u8(mask_ptr);
- uint8x16_t m0_inv = vsubq_u8(max_alpha, m0);
- uint16x8_t blend_u16_lo = vmull_u8(vget_low_u8(s0), vget_low_u8(m0));
- uint16x8_t blend_u16_hi = vmull_u8(vget_high_u8(s0), vget_high_u8(m0));
- blend_u16_lo =
- vmlal_u8(blend_u16_lo, vget_low_u8(s1), vget_low_u8(m0_inv));
- blend_u16_hi =
- vmlal_u8(blend_u16_hi, vget_high_u8(s1), vget_high_u8(m0_inv));
-
- uint8x8_t blend_u8_lo =
- vrshrn_n_u16(blend_u16_lo, AOM_BLEND_A64_ROUND_BITS);
- uint8x8_t blend_u8_hi =
- vrshrn_n_u16(blend_u16_hi, AOM_BLEND_A64_ROUND_BITS);
- uint8x16_t blend_u8 = vcombine_u8(blend_u8_lo, blend_u8_hi);
+ uint8x16_t blend_u8 = alpha_blend_a64_u8x16(m0, s0, s1);
vst1q_u8(comp_pred_ptr, blend_u8);
@@ -126,17 +185,12 @@ void aom_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width,
comp_pred += width;
} while (--height != 0);
} else if (width == 8) {
- const uint8x8_t max_alpha = vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA);
-
do {
const uint8x8_t s0 = vld1_u8(src0);
const uint8x8_t s1 = vld1_u8(src1);
const uint8x8_t m0 = vld1_u8(mask);
- uint8x8_t m0_inv = vsub_u8(max_alpha, m0);
- uint16x8_t blend_u16 = vmull_u8(s0, m0);
- blend_u16 = vmlal_u8(blend_u16, s1, m0_inv);
- uint8x8_t blend_u8 = vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+ uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, s0, s1);
vst1_u8(comp_pred, blend_u8);
@@ -146,7 +200,6 @@ void aom_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width,
comp_pred += 8;
} while (--height != 0);
} else {
- const uint8x8_t max_alpha = vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA);
int h = height / 2;
assert(width == 4);
@@ -155,10 +208,7 @@ void aom_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width,
const uint8x8_t s1 = load_unaligned_u8(src1, src_stride1);
const uint8x8_t m0 = load_unaligned_u8(mask, mask_stride);
- uint8x8_t m0_inv = vsub_u8(max_alpha, m0);
- uint16x8_t blend_u16 = vmull_u8(s0, m0);
- blend_u16 = vmlal_u8(blend_u16, s1, m0_inv);
- uint8x8_t blend_u8 = vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+ uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, s0, s1);
vst1_u8(comp_pred, blend_u8);
diff --git a/aom_dsp/arm/blend_a64_mask_neon.c b/aom_dsp/arm/blend_a64_mask_neon.c
index c3ee0b759..7b1b66a6a 100644
--- a/aom_dsp/arm/blend_a64_mask_neon.c
+++ b/aom_dsp/arm/blend_a64_mask_neon.c
@@ -12,117 +12,34 @@
#include <arm_neon.h>
#include <assert.h>
+#include "config/aom_dsp_rtcd.h"
+
#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/blend.h"
+#include "aom_dsp/arm/blend_neon.h"
#include "aom_dsp/arm/mem_neon.h"
-#include "aom_ports/mem.h"
-#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/blend.h"
-static INLINE void blend8x1(int16x8_t mask, int16x8_t src_0, int16x8_t src_1,
- const int16x8_t v_maxval, int16x8_t *res) {
- int32x4_t im_res_low, im_res_high;
- const int16x8_t max_minus_mask = vsubq_s16(v_maxval, mask);
+uint8x8_t alpha_blend_a64_d16_u16x8(uint16x8_t m, uint16x8_t a, uint16x8_t b,
+ uint16x8_t round_offset) {
+ const uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);
- im_res_low = vmull_s16(vget_low_s16(mask), vget_low_s16(src_0));
- im_res_low =
- vmlal_s16(im_res_low, vget_low_s16(max_minus_mask), vget_low_s16(src_1));
+ uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(m), vget_low_u16(a));
+ uint32x4_t blend_u32_hi = vmull_u16(vget_high_u16(m), vget_high_u16(a));
- im_res_high = vmull_s16(vget_high_s16(mask), vget_high_s16(src_0));
- im_res_high = vmlal_s16(im_res_high, vget_high_s16(max_minus_mask),
- vget_high_s16(src_1));
+ blend_u32_lo = vmlal_u16(blend_u32_lo, vget_low_u16(m_inv), vget_low_u16(b));
+ blend_u32_hi =
+ vmlal_u16(blend_u32_hi, vget_high_u16(m_inv), vget_high_u16(b));
- *res = vcombine_s16(vshrn_n_s32(im_res_low, AOM_BLEND_A64_ROUND_BITS),
- vshrn_n_s32(im_res_high, AOM_BLEND_A64_ROUND_BITS));
-}
+ uint16x4_t blend_u16_lo = vshrn_n_u32(blend_u32_lo, AOM_BLEND_A64_ROUND_BITS);
+ uint16x4_t blend_u16_hi = vshrn_n_u32(blend_u32_hi, AOM_BLEND_A64_ROUND_BITS);
-static INLINE void blend_8x4(uint8_t *dst, uint32_t dst_stride,
- const CONV_BUF_TYPE *src0, uint32_t src0_stride,
- const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- int16x8_t mask0, int16x8_t mask1, int16x8_t mask2,
- int16x8_t mask3, const int16x8_t v_maxval,
- const uint16x8_t vec_round_offset,
- const int16x8_t vec_round_bits) {
- int16x8_t src0_0, src0_1, src0_2, src0_3;
- int16x8_t src1_0, src1_1, src1_2, src1_3;
- int16x8_t im_res_0, im_res_1, im_res_2, im_res_3;
-
- load_s16_8x4((int16_t *)src0, (int32_t)src0_stride, &src0_0, &src0_1, &src0_2,
- &src0_3);
- load_s16_8x4((int16_t *)src1, (int32_t)src1_stride, &src1_0, &src1_1, &src1_2,
- &src1_3);
-
- blend8x1(mask0, src0_0, src1_0, v_maxval, &im_res_0);
- blend8x1(mask1, src0_1, src1_1, v_maxval, &im_res_1);
- blend8x1(mask2, src0_2, src1_2, v_maxval, &im_res_2);
- blend8x1(mask3, src0_3, src1_3, v_maxval, &im_res_3);
-
- uint16x8_t im_res1_0 =
- vqsubq_u16(vreinterpretq_u16_s16(im_res_0), vec_round_offset);
- uint16x8_t im_res1_1 =
- vqsubq_u16(vreinterpretq_u16_s16(im_res_1), vec_round_offset);
- uint16x8_t im_res1_2 =
- vqsubq_u16(vreinterpretq_u16_s16(im_res_2), vec_round_offset);
- uint16x8_t im_res1_3 =
- vqsubq_u16(vreinterpretq_u16_s16(im_res_3), vec_round_offset);
-
- im_res_0 = vshlq_s16(vreinterpretq_s16_u16(im_res1_0), vec_round_bits);
- im_res_1 = vshlq_s16(vreinterpretq_s16_u16(im_res1_1), vec_round_bits);
- im_res_2 = vshlq_s16(vreinterpretq_s16_u16(im_res1_2), vec_round_bits);
- im_res_3 = vshlq_s16(vreinterpretq_s16_u16(im_res1_3), vec_round_bits);
-
- vst1_u8((dst + 0 * dst_stride), vqmovun_s16(im_res_0));
- vst1_u8((dst + 1 * dst_stride), vqmovun_s16(im_res_1));
- vst1_u8((dst + 2 * dst_stride), vqmovun_s16(im_res_2));
- vst1_u8((dst + 3 * dst_stride), vqmovun_s16(im_res_3));
-}
+ uint16x8_t res = vcombine_u16(blend_u16_lo, blend_u16_hi);
-static INLINE void blend_4x4(uint8_t *dst, uint32_t dst_stride,
- const CONV_BUF_TYPE *src0, uint32_t src0_stride,
- const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- int16x4_t mask0, int16x4_t mask1, int16x4_t mask2,
- int16x4_t mask3, const int16x8_t v_maxval,
- const uint16x8_t vec_round_offset,
- const int16x8_t vec_round_bits) {
- int16x8_t src0_0, src0_1;
- int16x8_t src1_0, src1_1;
- uint16x8_t tu0 = vdupq_n_u16(0);
- uint16x8_t tu1 = vdupq_n_u16(0);
- uint16x8_t tu2 = vdupq_n_u16(0);
- uint16x8_t tu3 = vdupq_n_u16(0);
- int16x8_t mask0_1, mask2_3;
- int16x8_t res0, res1;
-
- load_unaligned_u16_4x4(src0, src0_stride, &tu0, &tu1);
- load_unaligned_u16_4x4(src1, src1_stride, &tu2, &tu3);
-
- src0_0 = vreinterpretq_s16_u16(tu0);
- src0_1 = vreinterpretq_s16_u16(tu1);
-
- src1_0 = vreinterpretq_s16_u16(tu2);
- src1_1 = vreinterpretq_s16_u16(tu3);
-
- mask0_1 = vcombine_s16(mask0, mask1);
- mask2_3 = vcombine_s16(mask2, mask3);
-
- blend8x1(mask0_1, src0_0, src1_0, v_maxval, &res0);
- blend8x1(mask2_3, src0_1, src1_1, v_maxval, &res1);
-
- uint16x8_t im_res_0 =
- vqsubq_u16(vreinterpretq_u16_s16(res0), vec_round_offset);
- uint16x8_t im_res_1 =
- vqsubq_u16(vreinterpretq_u16_s16(res1), vec_round_offset);
-
- src0_0 = vshlq_s16(vreinterpretq_s16_u16(im_res_0), vec_round_bits);
- src0_1 = vshlq_s16(vreinterpretq_s16_u16(im_res_1), vec_round_bits);
-
- uint8x8_t res_0 = vqmovun_s16(src0_0);
- uint8x8_t res_1 = vqmovun_s16(src0_1);
-
- store_unaligned_u8_4x1(dst + 0 * dst_stride, res_0, 0);
- store_unaligned_u8_4x1(dst + 1 * dst_stride, res_0, 1);
- store_unaligned_u8_4x1(dst + 2 * dst_stride, res_1, 0);
- store_unaligned_u8_4x1(dst + 3 * dst_stride, res_1, 1);
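+  // Subtract the compound-prediction offset, then round and narrow by the
+  // remaining round_bits, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS.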
+ res = vqsubq_u16(res, round_offset);
+
+ return vqrshrn_n_u16(res,
+ 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);
}
void aom_lowbd_blend_a64_d16_mask_neon(
@@ -130,19 +47,13 @@ void aom_lowbd_blend_a64_d16_mask_neon(
uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
ConvolveParams *conv_params) {
- int i = 0;
+ (void)conv_params;
+
const int bd = 8;
- int w_tmp = w;
- const uint8_t *mask_tmp = mask;
- const CONV_BUF_TYPE *src0_tmp = src0;
- const CONV_BUF_TYPE *src1_tmp = src1;
- uint8_t *dst_tmp = dst;
-
- const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1));
- const int round_bits =
- 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
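+  // With bd == 8, FILTER_BITS == 7, ROUND0_BITS == 3 and
+  // COMPOUND_ROUND1_BITS == 7 this gives round_offset == (1 << 12) + (1 << 11).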
+ const uint16x8_t offset_vec = vdupq_n_u16(round_offset);
assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
@@ -152,294 +63,430 @@ void aom_lowbd_blend_a64_d16_mask_neon(
assert(IS_POWER_OF_TWO(h));
assert(IS_POWER_OF_TWO(w));
- uint8x8_t s0 = vdup_n_u8(0);
- uint8x8_t s1 = vdup_n_u8(0);
- uint8x8_t s2 = vdup_n_u8(0);
- uint8x8_t s3 = vdup_n_u8(0);
- uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
- int16x8_t mask0, mask1, mask2, mask3;
- int16x8_t mask4, mask5, mask6, mask7;
- int32x4_t m0_32, m1_32, m2_32, m3_32;
- int32x4_t m4_32, m5_32, m6_32, m7_32;
- uint8x8_t mask0_l, mask1_l, mask2_l, mask3_l;
- uint8x8_t mask4_l, mask5_l, mask6_l, mask7_l;
- int16x4_t mask0_low, mask1_low, mask2_low, mask3_low;
- const uint16x4_t vec_zero = vdup_n_u16(0);
- const uint16_t offset = round_offset - (1 << (round_bits - 1));
- const int16x8_t v_maxval = vdupq_n_s16(AOM_BLEND_A64_MAX_ALPHA);
- const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
- const uint16x8_t vec_offset = vdupq_n_u16(offset);
-
if (subw == 0 && subh == 0) {
- if (w_tmp > 7) {
+ if (w >= 8) {
do {
- w_tmp = w;
+ int i = 0;
do {
- load_u8_8x4(mask_tmp, mask_stride, &s0, &s1, &s2, &s3);
-
- mask0 = vmovl_s8(vreinterpret_s8_u8(s0));
- mask1 = vmovl_s8(vreinterpret_s8_u8(s1));
- mask2 = vmovl_s8(vreinterpret_s8_u8(s2));
- mask3 = vmovl_s8(vreinterpret_s8_u8(s3));
-
- blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0, mask1, mask2, mask3, v_maxval,
- vec_offset, vec_round_bits);
-
- w_tmp -= 8;
- mask_tmp += 8;
- dst_tmp += 8;
- src0_tmp += 8;
- src1_tmp += 8;
- } while (w_tmp > 7);
- i += 4;
- mask_tmp += (4 * mask_stride) - w;
- dst_tmp += (4 * dst_stride) - w;
- src0_tmp += (4 * src0_stride) - w;
- src1_tmp += (4 * src1_stride) - w;
- } while (i < h);
+ uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec);
+
+ vst1_u8(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
} else {
do {
- load_unaligned_u8_4x4(mask_tmp, mask_stride, &s0, &s1);
-
- mask0 = vreinterpretq_s16_u16(vmovl_u8(s0));
- mask1 = vreinterpretq_s16_u16(vmovl_u8(s1));
-
- mask0_low = vget_low_s16(mask0);
- mask1_low = vget_high_s16(mask0);
- mask2_low = vget_low_s16(mask1);
- mask3_low = vget_high_s16(mask1);
-
- blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
- v_maxval, vec_offset, vec_round_bits);
-
- i += 4;
- mask_tmp += (4 * mask_stride);
- dst_tmp += (4 * dst_stride);
- src0_tmp += (4 * src0_stride);
- src1_tmp += (4 * src1_stride);
- } while (i < h);
+ uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride));
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec);
+
+ store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
}
} else if (subw == 1 && subh == 1) {
- if (w_tmp > 7) {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + 2 * i);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + 2 * i);
+ uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 2 * i + 8);
+ uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 2 * i + 8);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg =
+ vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+
+ uint8x8_t blend =
+ alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ vst1_u8(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
+ uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if (subw == 1 && subh == 0) {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 2 * i);
+ uint8x8_t m1 = vld1_u8(mask + 2 * i + 8);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+ uint8x8_t blend =
+ alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ vst1_u8(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1));
+ uint8x8_t blend =
+ alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ vst1_u8(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0_2 =
+ load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
+ uint8x8_t m1_3 =
+ load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ }
+}
+
+void aom_blend_a64_mask_neon(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w,
+ int h, int subw, int subh) {
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if ((subw | subh) == 0) {
+ if (w > 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x16_t m0 = vld1q_u8(mask + i);
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+ i += 16;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t m0 = vld1_u8(mask);
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+ vst1_u8(dst, blend);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
do {
- w_tmp = w;
+ uint8x8_t m0 = load_unaligned_u8_4x2(mask, mask_stride);
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+ store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if ((subw & subh) == 1) {
+ if (w > 8) {
+ do {
+ int i = 0;
do {
- load_u8_16x8(mask_tmp, mask_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
- &t7);
-
- mask0 =
- vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t0), vget_low_u8(t1)));
- mask1 =
- vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t2), vget_low_u8(t3)));
- mask2 =
- vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t4), vget_low_u8(t5)));
- mask3 =
- vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t6), vget_low_u8(t7)));
-
- mask4 = vreinterpretq_s16_u16(
- vaddl_u8(vget_high_u8(t0), vget_high_u8(t1)));
- mask5 = vreinterpretq_s16_u16(
- vaddl_u8(vget_high_u8(t2), vget_high_u8(t3)));
- mask6 = vreinterpretq_s16_u16(
- vaddl_u8(vget_high_u8(t4), vget_high_u8(t5)));
- mask7 = vreinterpretq_s16_u16(
- vaddl_u8(vget_high_u8(t6), vget_high_u8(t7)));
-
- m0_32 = vpaddlq_s16(mask0);
- m1_32 = vpaddlq_s16(mask1);
- m2_32 = vpaddlq_s16(mask2);
- m3_32 = vpaddlq_s16(mask3);
-
- m4_32 = vpaddlq_s16(mask4);
- m5_32 = vpaddlq_s16(mask5);
- m6_32 = vpaddlq_s16(mask6);
- m7_32 = vpaddlq_s16(mask7);
-
- mask0 =
- vcombine_s16(vqrshrn_n_s32(m0_32, 2), vqrshrn_n_s32(m4_32, 2));
- mask1 =
- vcombine_s16(vqrshrn_n_s32(m1_32, 2), vqrshrn_n_s32(m5_32, 2));
- mask2 =
- vcombine_s16(vqrshrn_n_s32(m2_32, 2), vqrshrn_n_s32(m6_32, 2));
- mask3 =
- vcombine_s16(vqrshrn_n_s32(m3_32, 2), vqrshrn_n_s32(m7_32, 2));
-
- blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0, mask1, mask2, mask3, v_maxval,
- vec_offset, vec_round_bits);
-
- w_tmp -= 8;
- mask_tmp += 16;
- dst_tmp += 8;
- src0_tmp += 8;
- src1_tmp += 8;
- } while (w_tmp > 7);
- i += 4;
- mask_tmp += (8 * mask_stride) - (2 * w);
- dst_tmp += (4 * dst_stride) - w;
- src0_tmp += (4 * src0_stride) - w;
- src1_tmp += (4 * src1_stride) - w;
- } while (i < h);
+ uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + 2 * i);
+ uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + 2 * i);
+ uint8x16_t m2 = vld1q_u8(mask + 0 * mask_stride + 2 * i + 16);
+ uint8x16_t m3 = vld1q_u8(mask + 1 * mask_stride + 2 * i + 16);
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t m_avg = avg_blend_pairwise_u8x16_4(m0, m1, m2, m3);
+ uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+
+ i += 16;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 8);
+ uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 8);
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ vst1_u8(dst, blend);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
} else {
do {
- load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
- &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);
-
- mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
- mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
- mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
- mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));
-
- m0_32 = vpaddlq_s16(mask0);
- m1_32 = vpaddlq_s16(mask1);
- m2_32 = vpaddlq_s16(mask2);
- m3_32 = vpaddlq_s16(mask3);
-
- mask0_low = vqrshrn_n_s32(m0_32, 2);
- mask1_low = vqrshrn_n_s32(m1_32, 2);
- mask2_low = vqrshrn_n_s32(m2_32, 2);
- mask3_low = vqrshrn_n_s32(m3_32, 2);
-
- blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
- v_maxval, vec_offset, vec_round_bits);
-
- i += 4;
- mask_tmp += (8 * mask_stride);
- dst_tmp += (4 * dst_stride);
- src0_tmp += (4 * src0_stride);
- src1_tmp += (4 * src1_stride);
- } while (i < h);
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
+ uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
}
} else if (subw == 1 && subh == 0) {
- if (w_tmp > 7) {
+ if (w > 8) {
do {
- w_tmp = w;
+ int i = 0;
+
do {
- load_u8_16x4(mask_tmp, mask_stride, &t0, &t1, &t2, &t3);
-
- mask0 = vreinterpretq_s16_u16(vcombine_u16(
- vpaddl_u8(vget_low_u8(t0)), vpaddl_u8(vget_high_u8(t0))));
- mask1 = vreinterpretq_s16_u16(vcombine_u16(
- vpaddl_u8(vget_low_u8(t1)), vpaddl_u8(vget_high_u8(t1))));
- mask2 = vreinterpretq_s16_u16(vcombine_u16(
- vpaddl_u8(vget_low_u8(t2)), vpaddl_u8(vget_high_u8(t2))));
- mask3 = vreinterpretq_s16_u16(vcombine_u16(
- vpaddl_u8(vget_low_u8(t3)), vpaddl_u8(vget_high_u8(t3))));
-
- mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
- mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
- mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
- mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));
-
- blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0, mask1, mask2, mask3, v_maxval,
- vec_offset, vec_round_bits);
- w_tmp -= 8;
- mask_tmp += 16;
- dst_tmp += 8;
- src0_tmp += 8;
- src1_tmp += 8;
- } while (w_tmp > 7);
- i += 4;
- mask_tmp += (4 * mask_stride) - (2 * w);
- dst_tmp += (4 * dst_stride) - w;
- src0_tmp += (4 * src0_stride) - w;
- src1_tmp += (4 * src1_stride) - w;
- } while (i < h);
+ uint8x16_t m0 = vld1q_u8(mask + 2 * i);
+ uint8x16_t m1 = vld1q_u8(mask + 2 * i + 16);
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t m_avg = avg_blend_pairwise_u8x16(m0, m1);
+ uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+
+ i += 16;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t m0 = vld1_u8(mask);
+ uint8x8_t m1 = vld1_u8(mask + 8);
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ vst1_u8(dst, blend);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
} else {
do {
- load_u8_8x4(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
- &mask3_l);
-
- mask0 =
- vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask0_l), vec_zero));
- mask1 =
- vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask1_l), vec_zero));
- mask2 =
- vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask2_l), vec_zero));
- mask3 =
- vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask3_l), vec_zero));
-
- mask0_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask0, 1)));
- mask1_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask1, 1)));
- mask2_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask2, 1)));
- mask3_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask3, 1)));
-
- blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
- v_maxval, vec_offset, vec_round_bits);
-
- i += 4;
- mask_tmp += (4 * mask_stride);
- dst_tmp += (4 * dst_stride);
- src0_tmp += (4 * src0_stride);
- src1_tmp += (4 * src1_stride);
- } while (i < h);
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
}
} else {
- if (w_tmp > 7) {
+ if (w > 8) {
do {
- w_tmp = w;
+ int i = 0;
do {
- load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
- &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);
-
- mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
- mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
- mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
- mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));
-
- mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
- mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
- mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
- mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));
-
- blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0, mask1, mask2, mask3, v_maxval,
- vec_offset, vec_round_bits);
-
- w_tmp -= 8;
- mask_tmp += 8;
- dst_tmp += 8;
- src0_tmp += 8;
- src1_tmp += 8;
- } while (w_tmp > 7);
- i += 4;
- mask_tmp += (8 * mask_stride) - w;
- dst_tmp += (4 * dst_stride) - w;
- src0_tmp += (4 * src0_stride) - w;
- src1_tmp += (4 * src1_stride) - w;
- } while (i < h);
+ uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + i);
+ uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + i);
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t m_avg = avg_blend_u8x16(m0, m1);
+ uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+
+ i += 16;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t m_avg = avg_blend_u8x8(m0, m1);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ vst1_u8(dst, blend);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
} else {
do {
- load_unaligned_u8_4x4(mask_tmp, 2 * mask_stride, &s0, &s1);
- load_unaligned_u8_4x4(mask_tmp + mask_stride, 2 * mask_stride, &s2,
- &s3);
-
- mask0 = vreinterpretq_s16_u16(vaddl_u8(s0, s2));
- mask1 = vreinterpretq_s16_u16(vaddl_u8(s1, s3));
-
- mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
- mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
-
- mask0_low = vget_low_s16(mask0);
- mask1_low = vget_high_s16(mask0);
- mask2_low = vget_low_s16(mask1);
- mask3_low = vget_high_s16(mask1);
-
- blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
- v_maxval, vec_offset, vec_round_bits);
-
- i += 4;
- mask_tmp += (8 * mask_stride);
- dst_tmp += (4 * dst_stride);
- src0_tmp += (4 * src0_stride);
- src1_tmp += (4 * src1_stride);
- } while (i < h);
+ uint8x8_t m0_2 =
+ load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
+ uint8x8_t m1_3 =
+ load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t m_avg = avg_blend_u8x8(m0_2, m1_3);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
}
}
}
diff --git a/aom_dsp/arm/blend_neon.h b/aom_dsp/arm/blend_neon.h
new file mode 100644
index 000000000..c8a03224e
--- /dev/null
+++ b/aom_dsp/arm/blend_neon.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_BLEND_NEON_H_
+#define AOM_AOM_DSP_ARM_BLEND_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/blend.h"
+
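+// Each helper below computes the AV1 alpha blend
+//   AOM_BLEND_A64(m, a, b) = (m * a + (64 - m) * b + 32) >> 6
+// where 64 is AOM_BLEND_A64_MAX_ALPHA and 6 is AOM_BLEND_A64_ROUND_BITS.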
+static INLINE uint8x16_t alpha_blend_a64_u8x16(uint8x16_t m, uint8x16_t a,
+ uint8x16_t b) {
+ const uint8x16_t m_inv = vsubq_u8(vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA), m);
+
+ uint16x8_t blend_u16_lo = vmull_u8(vget_low_u8(m), vget_low_u8(a));
+ uint16x8_t blend_u16_hi = vmull_u8(vget_high_u8(m), vget_high_u8(a));
+
+ blend_u16_lo = vmlal_u8(blend_u16_lo, vget_low_u8(m_inv), vget_low_u8(b));
+ blend_u16_hi = vmlal_u8(blend_u16_hi, vget_high_u8(m_inv), vget_high_u8(b));
+
+ uint8x8_t blend_u8_lo = vrshrn_n_u16(blend_u16_lo, AOM_BLEND_A64_ROUND_BITS);
+ uint8x8_t blend_u8_hi = vrshrn_n_u16(blend_u16_hi, AOM_BLEND_A64_ROUND_BITS);
+
+ return vcombine_u8(blend_u8_lo, blend_u8_hi);
+}
+
+static INLINE uint8x8_t alpha_blend_a64_u8x8(uint8x8_t m, uint8x8_t a,
+ uint8x8_t b) {
+ const uint8x8_t m_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m);
+
+ uint16x8_t blend_u16 = vmull_u8(m, a);
+
+ blend_u16 = vmlal_u8(blend_u16, m_inv, b);
+
+ return vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE uint16x8_t alpha_blend_a64_u16x8(uint16x8_t m, uint16x8_t a,
+ uint16x8_t b) {
+ uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);
+
+ uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(a), vget_low_u16(m));
+ uint32x4_t blend_u32_hi = vmull_u16(vget_high_u16(a), vget_high_u16(m));
+
+ blend_u32_lo = vmlal_u16(blend_u32_lo, vget_low_u16(b), vget_low_u16(m_inv));
+ blend_u32_hi =
+ vmlal_u16(blend_u32_hi, vget_high_u16(b), vget_high_u16(m_inv));
+
+ uint16x4_t blend_u16_lo =
+ vrshrn_n_u32(blend_u32_lo, AOM_BLEND_A64_ROUND_BITS);
+ uint16x4_t blend_u16_hi =
+ vrshrn_n_u32(blend_u32_hi, AOM_BLEND_A64_ROUND_BITS);
+
+ return vcombine_u16(blend_u16_lo, blend_u16_hi);
+}
+
+static INLINE uint16x4_t alpha_blend_a64_u16x4(uint16x4_t m, uint16x4_t a,
+ uint16x4_t b) {
+ const uint16x4_t m_inv = vsub_u16(vdup_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);
+
+  uint32x4_t blend_u32 = vmull_u16(m, a);
+
+  blend_u32 = vmlal_u16(blend_u32, m_inv, b);
+
+  return vrshrn_n_u32(blend_u32, AOM_BLEND_A64_ROUND_BITS);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static INLINE uint8x8_t avg_blend_u8x8(uint8x8_t a, uint8x8_t b) {
+ return vrhadd_u8(a, b);
+}
+
+static INLINE uint8x16_t avg_blend_u8x16(uint8x16_t a, uint8x16_t b) {
+ return vrhaddq_u8(a, b);
+}
+
+static INLINE uint8x8_t avg_blend_pairwise_u8x8(uint8x8_t a, uint8x8_t b) {
+ return vrshr_n_u8(vpadd_u8(a, b), 1);
+}
+
+static INLINE uint8x16_t avg_blend_pairwise_u8x16(uint8x16_t a, uint8x16_t b) {
+#if AOM_ARCH_AARCH64
+ return vrshrq_n_u8(vpaddq_u8(a, b), 1);
+#else
+ uint8x8_t sum_pairwise_a = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+ uint8x8_t sum_pairwise_b = vpadd_u8(vget_low_u8(b), vget_high_u8(b));
+ return vrshrq_n_u8(vcombine_u8(sum_pairwise_a, sum_pairwise_b), 1);
+#endif // AOM_ARCH_AARCH64
+}
+
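+// The saturating add below is safe: the mask values are at most 64, so each
+// pairwise sum is at most 128 and their sum at most 256; the only clipped
+// value, 256 -> 255, still rounds to 64 after the shift by 2.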
+static INLINE uint8x8_t avg_blend_pairwise_u8x8_4(uint8x8_t a, uint8x8_t b,
+ uint8x8_t c, uint8x8_t d) {
+ uint8x8_t a_c = vpadd_u8(a, c);
+ uint8x8_t b_d = vpadd_u8(b, d);
+ return vrshr_n_u8(vqadd_u8(a_c, b_d), 2);
+}
+
+static INLINE uint8x16_t avg_blend_pairwise_u8x16_4(uint8x16_t a, uint8x16_t b,
+ uint8x16_t c,
+ uint8x16_t d) {
+#if AOM_ARCH_AARCH64
+ uint8x16_t a_c = vpaddq_u8(a, c);
+ uint8x16_t b_d = vpaddq_u8(b, d);
+ return vrshrq_n_u8(vqaddq_u8(a_c, b_d), 2);
+#else
+ uint8x8_t sum_pairwise_a = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+ uint8x8_t sum_pairwise_b = vpadd_u8(vget_low_u8(b), vget_high_u8(b));
+ uint8x8_t sum_pairwise_c = vpadd_u8(vget_low_u8(c), vget_high_u8(c));
+ uint8x8_t sum_pairwise_d = vpadd_u8(vget_low_u8(d), vget_high_u8(d));
+ uint8x16_t a_c = vcombine_u8(sum_pairwise_a, sum_pairwise_c);
+ uint8x16_t b_d = vcombine_u8(sum_pairwise_b, sum_pairwise_d);
+ return vrshrq_n_u8(vqaddq_u8(a_c, b_d), 2);
+#endif // AOM_ARCH_AARCH64
+}
+
+#endif // AOM_AOM_DSP_ARM_BLEND_NEON_H_
diff --git a/aom_dsp/arm/blk_sse_sum_neon.c b/aom_dsp/arm/blk_sse_sum_neon.c
new file mode 100644
index 000000000..f2ada93e9
--- /dev/null
+++ b/aom_dsp/arm/blk_sse_sum_neon.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void get_blk_sse_sum_4xh_neon(const int16_t *data, int stride,
+ int bh, int *x_sum,
+ int64_t *x2_sum) {
+ int i = bh;
+ int32x4_t sum = vdupq_n_s32(0);
+ int32x4_t sse = vdupq_n_s32(0);
+
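+  // As in the 8xh case below, the input is 12-bit wide, and with bh at most
+  // 32 each int32 lane accumulates at most 32 squared elements, well under
+  // the 127 bound, so sse cannot overflow.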
+ do {
+ int16x8_t d = vcombine_s16(vld1_s16(data), vld1_s16(data + stride));
+
+ sum = vpadalq_s16(sum, d);
+
+ sse = vmlal_s16(sse, vget_low_s16(d), vget_low_s16(d));
+ sse = vmlal_s16(sse, vget_high_s16(d), vget_high_s16(d));
+
+ data += 2 * stride;
+ i -= 2;
+ } while (i != 0);
+
+ *x_sum = horizontal_add_s32x4(sum);
+ *x2_sum = horizontal_long_add_s32x4(sse);
+}
+
+static INLINE void get_blk_sse_sum_8xh_neon(const int16_t *data, int stride,
+ int bh, int *x_sum,
+ int64_t *x2_sum) {
+ int i = bh;
+ int32x4_t sum = vdupq_n_s32(0);
+ int32x4_t sse = vdupq_n_s32(0);
+
+  // Input is 12-bit wide, so we can add up to 127 squared elements in a signed
+  // 32-bit element. Since we're accumulating into an int32x4_t and the maximum
+  // value for bh is 32, we don't have to worry about sse overflowing.
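+  // With bw == 8 each int32 lane accumulates two squared elements per row, so
+  // bh == 32 contributes at most 64 per lane, well within that bound.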
+
+ do {
+ int16x8_t d = vld1q_s16(data);
+
+ sum = vpadalq_s16(sum, d);
+
+ sse = vmlal_s16(sse, vget_low_s16(d), vget_low_s16(d));
+ sse = vmlal_s16(sse, vget_high_s16(d), vget_high_s16(d));
+
+ data += stride;
+ } while (--i != 0);
+
+ *x_sum = horizontal_add_s32x4(sum);
+ *x2_sum = horizontal_long_add_s32x4(sse);
+}
+
+static INLINE void get_blk_sse_sum_large_neon(const int16_t *data, int stride,
+ int bw, int bh, int *x_sum,
+ int64_t *x2_sum) {
+ int32x4_t sum = vdupq_n_s32(0);
+ int64x2_t sse = vdupq_n_s64(0);
+
+  // Input is 12-bit wide, so we can add up to 127 squared elements in a signed
+  // 32-bit element. Since we're accumulating into an int32x4_t vector that
+  // means we can process up to (127 * 4) / bw rows before we need to widen to
+  // 64 bits.
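+  // For example, with bw == 16 each lane accumulates bw / 4 == 4 squared
+  // elements per row, so (127 * 4) / 16 == 31 rows fit before widening.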
+
+ int i_limit = (127 * 4) / bw;
+ int i_tmp = bh > i_limit ? i_limit : bh;
+
+ int i = 0;
+ do {
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+ do {
+ int j = bw;
+ const int16_t *data_ptr = data;
+ do {
+ int16x8_t d = vld1q_s16(data_ptr);
+
+ sum = vpadalq_s16(sum, d);
+
+ sse_s32 = vmlal_s16(sse_s32, vget_low_s16(d), vget_low_s16(d));
+ sse_s32 = vmlal_s16(sse_s32, vget_high_s16(d), vget_high_s16(d));
+
+ data_ptr += 8;
+ j -= 8;
+ } while (j != 0);
+
+ data += stride;
+ i++;
+ } while (i < i_tmp && i < bh);
+
+ sse = vpadalq_s32(sse, sse_s32);
+ i_tmp += i_limit;
+ } while (i < bh);
+
+ *x_sum = horizontal_add_s32x4(sum);
+ *x2_sum = horizontal_add_s64x2(sse);
+}
+
+void aom_get_blk_sse_sum_neon(const int16_t *data, int stride, int bw, int bh,
+ int *x_sum, int64_t *x2_sum) {
+ if (bw == 4) {
+ get_blk_sse_sum_4xh_neon(data, stride, bh, x_sum, x2_sum);
+ } else if (bw == 8) {
+ get_blk_sse_sum_8xh_neon(data, stride, bh, x_sum, x2_sum);
+ } else {
+ assert(bw % 8 == 0);
+ get_blk_sse_sum_large_neon(data, stride, bw, bh, x_sum, x2_sum);
+ }
+}
diff --git a/aom_dsp/arm/dist_wtd_avg_neon.h b/aom_dsp/arm/dist_wtd_avg_neon.h
new file mode 100644
index 000000000..19c9b04c5
--- /dev/null
+++ b/aom_dsp/arm/dist_wtd_avg_neon.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
+#define AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/enums.h"
+
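+// Each helper below computes the distance-weighted average
+//   (a * wta + b * wtb + (1 << (DIST_PRECISION_BITS - 1))) >> DIST_PRECISION_BITS
+// where the weights are expected to sum to 1 << DIST_PRECISION_BITS.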
+static INLINE uint8x8_t dist_wtd_avg_u8x8(uint8x8_t a, uint8x8_t b,
+ uint8x8_t wta, uint8x8_t wtb) {
+ uint16x8_t wtd_sum = vmull_u8(a, wta);
+
+ wtd_sum = vmlal_u8(wtd_sum, b, wtb);
+
+ return vrshrn_n_u16(wtd_sum, DIST_PRECISION_BITS);
+}
+
+static INLINE uint16x4_t dist_wtd_avg_u16x4(uint16x4_t a, uint16x4_t b,
+ uint16x4_t wta, uint16x4_t wtb) {
+ uint32x4_t wtd_sum = vmull_u16(a, wta);
+
+ wtd_sum = vmlal_u16(wtd_sum, b, wtb);
+
+ return vrshrn_n_u32(wtd_sum, DIST_PRECISION_BITS);
+}
+
+static INLINE uint8x16_t dist_wtd_avg_u8x16(uint8x16_t a, uint8x16_t b,
+ uint8x16_t wta, uint8x16_t wtb) {
+ uint16x8_t wtd_sum_lo = vmull_u8(vget_low_u8(a), vget_low_u8(wta));
+ uint16x8_t wtd_sum_hi = vmull_u8(vget_high_u8(a), vget_high_u8(wta));
+
+ wtd_sum_lo = vmlal_u8(wtd_sum_lo, vget_low_u8(b), vget_low_u8(wtb));
+ wtd_sum_hi = vmlal_u8(wtd_sum_hi, vget_high_u8(b), vget_high_u8(wtb));
+
+ uint8x8_t wtd_avg_lo = vrshrn_n_u16(wtd_sum_lo, DIST_PRECISION_BITS);
+ uint8x8_t wtd_avg_hi = vrshrn_n_u16(wtd_sum_hi, DIST_PRECISION_BITS);
+
+ return vcombine_u8(wtd_avg_lo, wtd_avg_hi);
+}
+
+static INLINE uint16x8_t dist_wtd_avg_u16x8(uint16x8_t a, uint16x8_t b,
+ uint16x8_t wta, uint16x8_t wtb) {
+ uint32x4_t wtd_sum_lo = vmull_u16(vget_low_u16(a), vget_low_u16(wta));
+ uint32x4_t wtd_sum_hi = vmull_u16(vget_high_u16(a), vget_high_u16(wta));
+
+ wtd_sum_lo = vmlal_u16(wtd_sum_lo, vget_low_u16(b), vget_low_u16(wtb));
+ wtd_sum_hi = vmlal_u16(wtd_sum_hi, vget_high_u16(b), vget_high_u16(wtb));
+
+ uint16x4_t wtd_avg_lo = vrshrn_n_u32(wtd_sum_lo, DIST_PRECISION_BITS);
+ uint16x4_t wtd_avg_hi = vrshrn_n_u32(wtd_sum_hi, DIST_PRECISION_BITS);
+
+ return vcombine_u16(wtd_avg_lo, wtd_avg_hi);
+}
+
+#endif // AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
diff --git a/aom_dsp/arm/fwd_txfm_neon.c b/aom_dsp/arm/fwd_txfm_neon.c
index a7d66b3dc..fb4cda723 100644
--- a/aom_dsp/arm/fwd_txfm_neon.c
+++ b/aom_dsp/arm/fwd_txfm_neon.c
@@ -48,8 +48,8 @@ static void aom_fdct4x4_helper(const int16_t *input, int stride,
// Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
- const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
- const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
+ const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, (int32_t)cospi_16_64);
+ const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, (int32_t)cospi_16_64);
// fdct_round_shift
int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
@@ -57,11 +57,13 @@ static void aom_fdct4x4_helper(const int16_t *input, int stride,
// s_3 * cospi_8_64 + s_2 * cospi_24_64
// s_3 * cospi_24_64 - s_2 * cospi_8_64
- const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
- const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
+ const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, (int32_t)cospi_8_64);
+ const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, (int32_t)cospi_24_64);
- const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
- const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
+ const int32x4_t temp3 =
+ vmlal_n_s16(s_3_cospi_8_64, s_2, (int32_t)cospi_24_64);
+ const int32x4_t temp4 =
+ vmlsl_n_s16(s_3_cospi_24_64, s_2, (int32_t)cospi_8_64);
// fdct_round_shift
int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
@@ -69,7 +71,7 @@ static void aom_fdct4x4_helper(const int16_t *input, int stride,
// Only transpose the first pass
if (i == 0) {
- transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
+ transpose_elems_inplace_s16_4x4(&out_0, &out_1, &out_2, &out_3);
}
*input_0 = out_0;
diff --git a/aom_dsp/arm/hadamard_neon.c b/aom_dsp/arm/hadamard_neon.c
index 82ce0cdf0..d0f59227d 100644
--- a/aom_dsp/arm/hadamard_neon.c
+++ b/aom_dsp/arm/hadamard_neon.c
@@ -37,7 +37,7 @@ void aom_hadamard_4x4_neon(const int16_t *src_diff, ptrdiff_t src_stride,
hadamard_4x4_one_pass(&a0, &a1, &a2, &a3);
- transpose_s16_4x4d(&a0, &a1, &a2, &a3);
+ transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3);
hadamard_4x4_one_pass(&a0, &a1, &a2, &a3);
@@ -91,7 +91,7 @@ void aom_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
- transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+ transpose_elems_inplace_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
@@ -120,7 +120,7 @@ void aom_hadamard_lp_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
- transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+ transpose_elems_inplace_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
@@ -196,56 +196,90 @@ void aom_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
/* Bottom right. */
aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
+ // Each iteration of the loop operates on entire rows (16 samples each)
+ // because we need to swap the second and third quarters of every row in the
+ // output to match AVX2 output (i.e., aom_hadamard_16x16_avx2). See the for
+ // loop at the end of aom_hadamard_16x16_c.
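+  // The swap is realized by the store order below: vectors loaded from
+  // coeff + 4 are written back to coeff + 8 and vice versa.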
for (int i = 0; i < 64; i += 16) {
- const int16x8_t a00 = load_tran_low_to_s16q(coeff + 0);
- const int16x8_t a01 = load_tran_low_to_s16q(coeff + 64);
- const int16x8_t a02 = load_tran_low_to_s16q(coeff + 128);
- const int16x8_t a03 = load_tran_low_to_s16q(coeff + 192);
-
- const int16x8_t b00 = vhaddq_s16(a00, a01);
- const int16x8_t b01 = vhsubq_s16(a00, a01);
- const int16x8_t b02 = vhaddq_s16(a02, a03);
- const int16x8_t b03 = vhsubq_s16(a02, a03);
-
- const int16x8_t c00 = vaddq_s16(b00, b02);
- const int16x8_t c01 = vaddq_s16(b01, b03);
- const int16x8_t c02 = vsubq_s16(b00, b02);
- const int16x8_t c03 = vsubq_s16(b01, b03);
-
- const int16x8_t a10 = load_tran_low_to_s16q(coeff + 8 + 0);
- const int16x8_t a11 = load_tran_low_to_s16q(coeff + 8 + 64);
- const int16x8_t a12 = load_tran_low_to_s16q(coeff + 8 + 128);
- const int16x8_t a13 = load_tran_low_to_s16q(coeff + 8 + 192);
-
- const int16x8_t b10 = vhaddq_s16(a10, a11);
- const int16x8_t b11 = vhsubq_s16(a10, a11);
- const int16x8_t b12 = vhaddq_s16(a12, a13);
- const int16x8_t b13 = vhsubq_s16(a12, a13);
-
- const int16x8_t c10 = vaddq_s16(b10, b12);
- const int16x8_t c11 = vaddq_s16(b11, b13);
- const int16x8_t c12 = vsubq_s16(b10, b12);
- const int16x8_t c13 = vsubq_s16(b11, b13);
-
- store_s16_to_tran_low(coeff + 0 + 0, vget_low_s16(c00));
- store_s16_to_tran_low(coeff + 0 + 4, vget_low_s16(c10));
- store_s16_to_tran_low(coeff + 0 + 8, vget_high_s16(c00));
- store_s16_to_tran_low(coeff + 0 + 12, vget_high_s16(c10));
-
- store_s16_to_tran_low(coeff + 64 + 0, vget_low_s16(c01));
- store_s16_to_tran_low(coeff + 64 + 4, vget_low_s16(c11));
- store_s16_to_tran_low(coeff + 64 + 8, vget_high_s16(c01));
- store_s16_to_tran_low(coeff + 64 + 12, vget_high_s16(c11));
-
- store_s16_to_tran_low(coeff + 128 + 0, vget_low_s16(c02));
- store_s16_to_tran_low(coeff + 128 + 4, vget_low_s16(c12));
- store_s16_to_tran_low(coeff + 128 + 8, vget_high_s16(c02));
- store_s16_to_tran_low(coeff + 128 + 12, vget_high_s16(c12));
-
- store_s16_to_tran_low(coeff + 192 + 0, vget_low_s16(c03));
- store_s16_to_tran_low(coeff + 192 + 4, vget_low_s16(c13));
- store_s16_to_tran_low(coeff + 192 + 8, vget_high_s16(c03));
- store_s16_to_tran_low(coeff + 192 + 12, vget_high_s16(c13));
+ const int32x4_t a00 = vld1q_s32(coeff + 0);
+ const int32x4_t a01 = vld1q_s32(coeff + 64);
+ const int32x4_t a02 = vld1q_s32(coeff + 128);
+ const int32x4_t a03 = vld1q_s32(coeff + 192);
+
+ const int32x4_t b00 = vhaddq_s32(a00, a01);
+ const int32x4_t b01 = vhsubq_s32(a00, a01);
+ const int32x4_t b02 = vhaddq_s32(a02, a03);
+ const int32x4_t b03 = vhsubq_s32(a02, a03);
+
+ const int32x4_t c00 = vaddq_s32(b00, b02);
+ const int32x4_t c01 = vaddq_s32(b01, b03);
+ const int32x4_t c02 = vsubq_s32(b00, b02);
+ const int32x4_t c03 = vsubq_s32(b01, b03);
+
+ const int32x4_t a10 = vld1q_s32(coeff + 4 + 0);
+ const int32x4_t a11 = vld1q_s32(coeff + 4 + 64);
+ const int32x4_t a12 = vld1q_s32(coeff + 4 + 128);
+ const int32x4_t a13 = vld1q_s32(coeff + 4 + 192);
+
+ const int32x4_t b10 = vhaddq_s32(a10, a11);
+ const int32x4_t b11 = vhsubq_s32(a10, a11);
+ const int32x4_t b12 = vhaddq_s32(a12, a13);
+ const int32x4_t b13 = vhsubq_s32(a12, a13);
+
+ const int32x4_t c10 = vaddq_s32(b10, b12);
+ const int32x4_t c11 = vaddq_s32(b11, b13);
+ const int32x4_t c12 = vsubq_s32(b10, b12);
+ const int32x4_t c13 = vsubq_s32(b11, b13);
+
+ const int32x4_t a20 = vld1q_s32(coeff + 8 + 0);
+ const int32x4_t a21 = vld1q_s32(coeff + 8 + 64);
+ const int32x4_t a22 = vld1q_s32(coeff + 8 + 128);
+ const int32x4_t a23 = vld1q_s32(coeff + 8 + 192);
+
+ const int32x4_t b20 = vhaddq_s32(a20, a21);
+ const int32x4_t b21 = vhsubq_s32(a20, a21);
+ const int32x4_t b22 = vhaddq_s32(a22, a23);
+ const int32x4_t b23 = vhsubq_s32(a22, a23);
+
+ const int32x4_t c20 = vaddq_s32(b20, b22);
+ const int32x4_t c21 = vaddq_s32(b21, b23);
+ const int32x4_t c22 = vsubq_s32(b20, b22);
+ const int32x4_t c23 = vsubq_s32(b21, b23);
+
+ const int32x4_t a30 = vld1q_s32(coeff + 12 + 0);
+ const int32x4_t a31 = vld1q_s32(coeff + 12 + 64);
+ const int32x4_t a32 = vld1q_s32(coeff + 12 + 128);
+ const int32x4_t a33 = vld1q_s32(coeff + 12 + 192);
+
+ const int32x4_t b30 = vhaddq_s32(a30, a31);
+ const int32x4_t b31 = vhsubq_s32(a30, a31);
+ const int32x4_t b32 = vhaddq_s32(a32, a33);
+ const int32x4_t b33 = vhsubq_s32(a32, a33);
+
+ const int32x4_t c30 = vaddq_s32(b30, b32);
+ const int32x4_t c31 = vaddq_s32(b31, b33);
+ const int32x4_t c32 = vsubq_s32(b30, b32);
+ const int32x4_t c33 = vsubq_s32(b31, b33);
+
+ vst1q_s32(coeff + 0 + 0, c00);
+ vst1q_s32(coeff + 0 + 4, c20);
+ vst1q_s32(coeff + 0 + 8, c10);
+ vst1q_s32(coeff + 0 + 12, c30);
+
+ vst1q_s32(coeff + 64 + 0, c01);
+ vst1q_s32(coeff + 64 + 4, c21);
+ vst1q_s32(coeff + 64 + 8, c11);
+ vst1q_s32(coeff + 64 + 12, c31);
+
+ vst1q_s32(coeff + 128 + 0, c02);
+ vst1q_s32(coeff + 128 + 4, c22);
+ vst1q_s32(coeff + 128 + 8, c12);
+ vst1q_s32(coeff + 128 + 12, c32);
+
+ vst1q_s32(coeff + 192 + 0, c03);
+ vst1q_s32(coeff + 192 + 4, c23);
+ vst1q_s32(coeff + 192 + 8, c13);
+ vst1q_s32(coeff + 192 + 12, c33);
coeff += 16;
}
diff --git a/aom_dsp/arm/highbd_avg_pred_neon.c b/aom_dsp/arm/highbd_avg_pred_neon.c
new file mode 100644
index 000000000..531309b02
--- /dev/null
+++ b/aom_dsp/arm/highbd_avg_pred_neon.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+void aom_highbd_comp_avg_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride) {
+ const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+ int i = height;
+ if (width > 8) {
+ do {
+ int j = 0;
+ do {
+ const uint16x8_t p = vld1q_u16(pred + j);
+ const uint16x8_t r = vld1q_u16(ref + j);
+
+ uint16x8_t avg = vrhaddq_u16(p, r);
+ vst1q_u16(comp_pred + j, avg);
+
+ j += 8;
+ } while (j < width);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--i != 0);
+ } else if (width == 8) {
+ do {
+ const uint16x8_t p = vld1q_u16(pred);
+ const uint16x8_t r = vld1q_u16(ref);
+
+ uint16x8_t avg = vrhaddq_u16(p, r);
+ vst1q_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--i != 0);
+ } else {
+ assert(width == 4);
+ do {
+ const uint16x4_t p = vld1_u16(pred);
+ const uint16x4_t r = vld1_u16(ref);
+
+ uint16x4_t avg = vrhadd_u16(p, r);
+ vst1_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--i != 0);
+ }
+}
+
+void aom_highbd_comp_mask_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+ const uint16_t *src0 = invert_mask ? pred : ref;
+ const uint16_t *src1 = invert_mask ? ref : pred;
+ const int src_stride0 = invert_mask ? width : ref_stride;
+ const int src_stride1 = invert_mask ? ref_stride : width;
+
+ if (width >= 8) {
+ do {
+ int j = 0;
+
+ do {
+ const uint16x8_t s0 = vld1q_u16(src0 + j);
+ const uint16x8_t s1 = vld1q_u16(src1 + j);
+ const uint16x8_t m0 = vmovl_u8(vld1_u8(mask + j));
+
+ uint16x8_t blend_u16 = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ vst1q_u16(comp_pred + j, blend_u16);
+
+ j += 8;
+ } while (j < width);
+
+ src0 += src_stride0;
+ src1 += src_stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ } while (--height != 0);
+ } else {
+ assert(width == 4);
+
+ do {
+ const uint16x4_t s0 = vld1_u16(src0);
+ const uint16x4_t s1 = vld1_u16(src1);
+ const uint16x4_t m0 = vget_low_u16(vmovl_u8(load_unaligned_u8_4x1(mask)));
+
+ uint16x4_t blend_u16 = alpha_blend_a64_u16x4(m0, s0, s1);
+
+ vst1_u16(comp_pred, blend_u16);
+
+ src0 += src_stride0;
+ src1 += src_stride1;
+ mask += mask_stride;
+ comp_pred += 4;
+ } while (--height != 0);
+ }
+}
+
+void aom_highbd_dist_wtd_comp_avg_pred_neon(
+ uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
+ const uint8_t *ref8, int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint16x8_t fwd_offset_u16 = vdupq_n_u16(jcp_param->fwd_offset);
+ const uint16x8_t bck_offset_u16 = vdupq_n_u16(jcp_param->bck_offset);
+ const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+ if (width > 8) {
+ do {
+ int j = 0;
+ do {
+ const uint16x8_t p = vld1q_u16(pred + j);
+ const uint16x8_t r = vld1q_u16(ref + j);
+
+ const uint16x8_t avg =
+ dist_wtd_avg_u16x8(r, p, fwd_offset_u16, bck_offset_u16);
+
+ vst1q_u16(comp_pred + j, avg);
+
+ j += 8;
+ } while (j < width);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--height != 0);
+ } else if (width == 8) {
+ do {
+ const uint16x8_t p = vld1q_u16(pred);
+ const uint16x8_t r = vld1q_u16(ref);
+
+ const uint16x8_t avg =
+ dist_wtd_avg_u16x8(r, p, fwd_offset_u16, bck_offset_u16);
+
+ vst1q_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--height != 0);
+ } else {
+ assert(width == 4);
+ do {
+ const uint16x4_t p = vld1_u16(pred);
+ const uint16x4_t r = vld1_u16(ref);
+
+ const uint16x4_t avg = dist_wtd_avg_u16x4(
+ r, p, vget_low_u16(fwd_offset_u16), vget_low_u16(bck_offset_u16));
+
+ vst1_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--height != 0);
+ }
+}
diff --git a/aom_dsp/arm/highbd_blend_a64_hmask_neon.c b/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
new file mode 100644
index 000000000..bdd2177c4
--- /dev/null
+++ b/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+void aom_highbd_blend_a64_hmask_neon(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8,
+ uint32_t src0_stride,
+ const uint8_t *src1_8,
+ uint32_t src1_stride, const uint8_t *mask,
+ int w, int h, int bd) {
+ (void)bd;
+
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 4) {
+ const uint16x8_t m0 = vmovl_u8(load_unaligned_dup_u8_4x2(mask));
+ do {
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 2 && h >= 8) {
+ const uint16x4_t m0 =
+ vget_low_u16(vmovl_u8(load_unaligned_dup_u8_2x4(mask)));
+ do {
+ uint16x4_t s0 = load_unaligned_u16_2x2(src0, src0_stride);
+ uint16x4_t s1 = load_unaligned_u16_2x2(src1, src1_stride);
+
+ uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1);
+
+ store_unaligned_u16_2x2(dst, dst_stride, blend);
+
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else {
+ aom_highbd_blend_a64_hmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, w, h, bd);
+ }
+}
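+
+// A scalar model of the alpha_blend_a64_u16x4/8 helpers used above
+// (illustrative sketch, relying on the AOM_BLEND_A64_MAX_ALPHA == 64 and
+// AOM_BLEND_A64_ROUND_BITS == 6 constants from aom_dsp/blend.h):
+static uint16_t alpha_blend_a64_scalar(uint8_t m, uint16_t s0, uint16_t s1) {
+ const uint32_t v =
+ (uint32_t)m * s0 + (uint32_t)(AOM_BLEND_A64_MAX_ALPHA - m) * s1;
+ return (uint16_t)((v + (1 << (AOM_BLEND_A64_ROUND_BITS - 1))) >>
+ AOM_BLEND_A64_ROUND_BITS);
+}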
diff --git a/aom_dsp/arm/highbd_blend_a64_mask_neon.c b/aom_dsp/arm/highbd_blend_a64_mask_neon.c
new file mode 100644
index 000000000..36d763a5c
--- /dev/null
+++ b/aom_dsp/arm/highbd_blend_a64_mask_neon.c
@@ -0,0 +1,473 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+#define HBD_BLEND_A64_D16_MASK(bd, round0_bits) \
+ static INLINE uint16x8_t alpha_##bd##_blend_a64_d16_u16x8( \
+ uint16x8_t m, uint16x8_t a, uint16x8_t b, int32x4_t round_offset) { \
+ const uint16x8_t m_inv = \
+ vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m); \
+ \
+ uint32x4_t blend_u32_lo = vmlal_u16(vreinterpretq_u32_s32(round_offset), \
+ vget_low_u16(m), vget_low_u16(a)); \
+ uint32x4_t blend_u32_hi = vmlal_u16(vreinterpretq_u32_s32(round_offset), \
+ vget_high_u16(m), vget_high_u16(a)); \
+ \
+ blend_u32_lo = \
+ vmlal_u16(blend_u32_lo, vget_low_u16(m_inv), vget_low_u16(b)); \
+ blend_u32_hi = \
+ vmlal_u16(blend_u32_hi, vget_high_u16(m_inv), vget_high_u16(b)); \
+ \
+ uint16x4_t blend_u16_lo = \
+ vqrshrun_n_s32(vreinterpretq_s32_u32(blend_u32_lo), \
+ AOM_BLEND_A64_ROUND_BITS + 2 * FILTER_BITS - \
+ round0_bits - COMPOUND_ROUND1_BITS); \
+ uint16x4_t blend_u16_hi = \
+ vqrshrun_n_s32(vreinterpretq_s32_u32(blend_u32_hi), \
+ AOM_BLEND_A64_ROUND_BITS + 2 * FILTER_BITS - \
+ round0_bits - COMPOUND_ROUND1_BITS); \
+ \
+ uint16x8_t blend_u16 = vcombine_u16(blend_u16_lo, blend_u16_hi); \
+ blend_u16 = vminq_u16(blend_u16, vdupq_n_u16((1 << bd) - 1)); \
+ \
+ return blend_u16; \
+ } \
+ \
+ static INLINE void highbd_##bd##_blend_a64_d16_mask_neon( \
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, \
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, \
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, \
+ int subh) { \
+ const int offset_bits = bd + 2 * FILTER_BITS - round0_bits; \
+ int32_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + \
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); \
+ int32x4_t offset = \
+ vdupq_n_s32(-(round_offset << AOM_BLEND_A64_ROUND_BITS)); \
+ \
+ if ((subw | subh) == 0) { \
+ if (w >= 8) { \
+ do { \
+ int i = 0; \
+ do { \
+ uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i)); \
+ uint16x8_t s0 = vld1q_u16(src0 + i); \
+ uint16x8_t s1 = vld1q_u16(src1 + i); \
+ \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset); \
+ \
+ vst1q_u16(dst + i, blend); \
+ i += 8; \
+ } while (i < w); \
+ \
+ mask += mask_stride; \
+ src0 += src0_stride; \
+ src1 += src1_stride; \
+ dst += dst_stride; \
+ } while (--h != 0); \
+ } else { \
+ do { \
+ uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride)); \
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \
+ \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset); \
+ \
+ store_unaligned_u16_4x2(dst, dst_stride, blend); \
+ \
+ mask += 2 * mask_stride; \
+ src0 += 2 * src0_stride; \
+ src1 += 2 * src1_stride; \
+ dst += 2 * dst_stride; \
+ h -= 2; \
+ } while (h != 0); \
+ } \
+ } else if ((subw & subh) == 1) { \
+ if (w >= 8) { \
+ do { \
+ int i = 0; \
+ do { \
+ uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + 2 * i); \
+ uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + 2 * i); \
+ uint16x8_t s0 = vld1q_u16(src0 + i); \
+ uint16x8_t s1 = vld1q_u16(src1 + i); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4( \
+ vget_low_u8(m0), vget_low_u8(m1), vget_high_u8(m0), \
+ vget_high_u8(m1))); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ vst1q_u16(dst + i, blend); \
+ i += 8; \
+ } while (i < w); \
+ \
+ mask += 2 * mask_stride; \
+ src0 += src0_stride; \
+ src1 += src1_stride; \
+ dst += dst_stride; \
+ } while (--h != 0); \
+ } else { \
+ do { \
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); \
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); \
+ uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride); \
+ uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride); \
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \
+ \
+ uint16x8_t m_avg = \
+ vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ store_unaligned_u16_4x2(dst, dst_stride, blend); \
+ \
+ mask += 4 * mask_stride; \
+ src0 += 2 * src0_stride; \
+ src1 += 2 * src1_stride; \
+ dst += 2 * dst_stride; \
+ h -= 2; \
+ } while (h != 0); \
+ } \
+ } else if (subw == 1 && subh == 0) { \
+ if (w >= 8) { \
+ do { \
+ int i = 0; \
+ do { \
+ uint8x8_t m0 = vld1_u8(mask + 2 * i); \
+ uint8x8_t m1 = vld1_u8(mask + 2 * i + 8); \
+ uint16x8_t s0 = vld1q_u16(src0 + i); \
+ uint16x8_t s1 = vld1q_u16(src1 + i); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ vst1q_u16(dst + i, blend); \
+ i += 8; \
+ } while (i < w); \
+ \
+ mask += mask_stride; \
+ src0 += src0_stride; \
+ src1 += src1_stride; \
+ dst += dst_stride; \
+ } while (--h != 0); \
+ } else { \
+ do { \
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); \
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); \
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ store_unaligned_u16_4x2(dst, dst_stride, blend); \
+ \
+ mask += 2 * mask_stride; \
+ src0 += 2 * src0_stride; \
+ src1 += 2 * src1_stride; \
+ dst += 2 * dst_stride; \
+ h -= 2; \
+ } while (h != 0); \
+ } \
+ } else { \
+ if (w >= 8) { \
+ do { \
+ int i = 0; \
+ do { \
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i); \
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i); \
+ uint16x8_t s0 = vld1q_u16(src0 + i); \
+ uint16x8_t s1 = vld1q_u16(src1 + i); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ vst1q_u16(dst + i, blend); \
+ i += 8; \
+ } while (i < w); \
+ \
+ mask += 2 * mask_stride; \
+ src0 += src0_stride; \
+ src1 += src1_stride; \
+ dst += dst_stride; \
+ } while (--h != 0); \
+ } else { \
+ do { \
+ uint8x8_t m0_2 = \
+ load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride); \
+ uint8x8_t m1_3 = \
+ load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride); \
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ store_unaligned_u16_4x2(dst, dst_stride, blend); \
+ \
+ mask += 4 * mask_stride; \
+ src0 += 2 * src0_stride; \
+ src1 += 2 * src1_stride; \
+ dst += 2 * dst_stride; \
+ h -= 2; \
+ } while (h != 0); \
+ } \
+ } \
+ }
+
+// 12-bit depth.
+HBD_BLEND_A64_D16_MASK(12, (ROUND0_BITS + 2))
+// 10-bit depth.
+HBD_BLEND_A64_D16_MASK(10, ROUND0_BITS)
+// 8-bit depth.
+HBD_BLEND_A64_D16_MASK(8, ROUND0_BITS)
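+
+// Worked example of the constant narrowing shift used in the macro above,
+// assuming the usual av1 constants FILTER_BITS == 7,
+// COMPOUND_ROUND1_BITS == 7, ROUND0_BITS == 3 and
+// AOM_BLEND_A64_ROUND_BITS == 6:
+//   bd ==  8/10: 6 + 2 * 7 - 3 - 7 = 10-bit shift
+//   bd == 12:    6 + 2 * 7 - (3 + 2) - 7 = 8-bit shift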
+
+void aom_highbd_blend_a64_d16_mask_neon(
+ uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params, const int bd) {
+ (void)conv_params;
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ if (bd == 12) {
+ highbd_12_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ subw, subh);
+ } else if (bd == 10) {
+ highbd_10_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ subw, subh);
+ } else {
+ highbd_8_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h, subw,
+ subh);
+ }
+}
+
+void aom_highbd_blend_a64_mask_neon(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int w, int h, int subw, int subh, int bd) {
+ (void)bd;
+
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if ((subw | subh) == 0) {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride));
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if ((subw & subh) == 1) {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + 2 * i);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + 2 * i);
+ uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 2 * i + 8);
+ uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 2 * i + 8);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg =
+ vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+
+ i += 8;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
+ uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if (subw == 1 && subh == 0) {
+ if (w >= 8) {
+ do {
+ int i = 0;
+
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 2 * i);
+ uint8x8_t m1 = vld1_u8(mask + 2 * i + 8);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+
+ i += 8;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+
+ i += 8;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0_2 =
+ load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
+ uint8x8_t m1_3 =
+ load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ }
+}
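+
+// When subw == 1 && subh == 1 the mask is supplied at twice the block
+// resolution, so each blended pixel uses the rounded average of a 2x2 block
+// of mask values. A scalar sketch mirroring the C reference behaviour
+// (illustrative only):
+static uint8_t avg_mask_2x2_scalar(const uint8_t *mask, int mask_stride,
+ int col) {
+ const int sum = mask[2 * col] + mask[2 * col + 1] +
+ mask[mask_stride + 2 * col] +
+ mask[mask_stride + 2 * col + 1];
+ return (uint8_t)((sum + 2) >> 2);  // ROUND_POWER_OF_TWO(sum, 2)
+}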
diff --git a/aom_dsp/arm/highbd_blend_a64_vmask_neon.c b/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
new file mode 100644
index 000000000..ea3d655a9
--- /dev/null
+++ b/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+void aom_highbd_blend_a64_vmask_neon(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8,
+ uint32_t src0_stride,
+ const uint8_t *src1_8,
+ uint32_t src1_stride, const uint8_t *mask,
+ int w, int h, int bd) {
+ (void)bd;
+
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if (w >= 8) {
+ do {
+ uint16x8_t m = vmovl_u8(vdup_n_u8(mask[0]));
+ int i = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += 1;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 4) {
+ do {
+ uint16x4_t m1 = vdup_n_u16((uint16_t)mask[0]);
+ uint16x4_t m2 = vdup_n_u16((uint16_t)mask[1]);
+ uint16x8_t m = vcombine_u16(m1, m2);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m, s0, s1);
+
+ store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+ mask += 2;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 2 && h >= 8) {
+ do {
+ uint16x4_t m0 = vdup_n_u16(0);
+ m0 = vld1_lane_u16((const uint16_t *)mask, m0, 0);
+ uint8x8_t m0_zip =
+ vzip_u8(vreinterpret_u8_u16(m0), vreinterpret_u8_u16(m0)).val[0];
+ m0 = vget_low_u16(vmovl_u8(m0_zip));
+ uint16x4_t s0 = load_unaligned_u16_2x2(src0, src0_stride);
+ uint16x4_t s1 = load_unaligned_u16_2x2(src1, src1_stride);
+
+ uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1);
+
+ store_unaligned_u16_2x2(dst, dst_stride, blend);
+
+ mask += 2;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else {
+ aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, w, h, bd);
+ }
+}
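+
+// Scalar model of the vertical-mask blend above: every pixel in row r is
+// blended with the single mask value mask[r] (illustrative, assuming the
+// AOM_BLEND_A64 definition from aom_dsp/blend.h):
+//   dst[r][c] = ROUND_POWER_OF_TWO(mask[r] * src0[r][c] +
+//                                  (64 - mask[r]) * src1[r][c], 6)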
diff --git a/aom_dsp/arm/highbd_convolve8_neon.c b/aom_dsp/arm/highbd_convolve8_neon.c
new file mode 100644
index 000000000..e25438c9b
--- /dev/null
+++ b/aom_dsp/arm/highbd_convolve8_neon.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+static INLINE int32x4_t highbd_convolve8_4_s32(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+ int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 0);
+ sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
+
+ return sum;
+}
+
+static INLINE uint16x4_t highbd_convolve8_4_s32_s16(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
+ int32x4_t sum =
+ highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+
+ return vqrshrun_n_s32(sum, FILTER_BITS);
+}
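+
+// Scalar model of the two helpers above (illustrative sketch): an 8-tap
+// filter followed by a rounding shift and saturating narrow, assuming
+// FILTER_BITS == 7. Clamping to the bitdepth range happens later, in the
+// callers, via vminq_u16.
+static uint16_t convolve8_scalar(const int16_t *s, const int16_t *filter) {
+ int32_t sum = 0;
+ for (int k = 0; k < 8; ++k) sum += s[k] * filter[k];
+ sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
+ return (uint16_t)(sum < 0 ? 0 : (sum > 65535 ? 65535 : sum));
+}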
+
+static INLINE int32x4_t highbd_convolve8_horiz4_s32(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
+ const int16x8_t s2 = vextq_s16(s0, s1, 1);
+ const int16x8_t s3 = vextq_s16(s0, s1, 2);
+ const int16x8_t s4 = vextq_s16(s0, s1, 3);
+ const int16x4_t s0_lo = vget_low_s16(s0);
+ const int16x4_t s1_lo = vget_low_s16(s2);
+ const int16x4_t s2_lo = vget_low_s16(s3);
+ const int16x4_t s3_lo = vget_low_s16(s4);
+ const int16x4_t s4_lo = vget_high_s16(s0);
+ const int16x4_t s5_lo = vget_high_s16(s2);
+ const int16x4_t s6_lo = vget_high_s16(s3);
+ const int16x4_t s7_lo = vget_high_s16(s4);
+
+ return highbd_convolve8_4_s32(s0_lo, s1_lo, s2_lo, s3_lo, s4_lo, s5_lo, s6_lo,
+ s7_lo, x_filter_0_7);
+}
+
+static INLINE uint16x4_t highbd_convolve8_horiz4_s32_s16(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
+ int32x4_t sum = highbd_convolve8_horiz4_s32(s0, s1, x_filter_0_7);
+
+ return vqrshrun_n_s32(sum, FILTER_BITS);
+}
+
+static INLINE void highbd_convolve8_8_s32(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
+ int32x4_t *sum0, int32x4_t *sum1) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+ *sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s1), y_filter_lo, 1);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s2), y_filter_lo, 2);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s3), y_filter_lo, 3);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s4), y_filter_hi, 0);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s5), y_filter_hi, 1);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s6), y_filter_hi, 2);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s7), y_filter_hi, 3);
+
+ *sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s1), y_filter_lo, 1);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s2), y_filter_lo, 2);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s3), y_filter_lo, 3);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s4), y_filter_hi, 0);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s5), y_filter_hi, 1);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s6), y_filter_hi, 2);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s7), y_filter_hi, 3);
+}
+
+static INLINE void highbd_convolve8_horiz8_s32(const int16x8_t s0,
+ const int16x8_t s0_hi,
+ const int16x8_t x_filter_0_7,
+ int32x4_t *sum0,
+ int32x4_t *sum1) {
+ const int16x8_t s1 = vextq_s16(s0, s0_hi, 1);
+ const int16x8_t s2 = vextq_s16(s0, s0_hi, 2);
+ const int16x8_t s3 = vextq_s16(s0, s0_hi, 3);
+ const int16x8_t s4 = vextq_s16(s0, s0_hi, 4);
+ const int16x8_t s5 = vextq_s16(s0, s0_hi, 5);
+ const int16x8_t s6 = vextq_s16(s0, s0_hi, 6);
+ const int16x8_t s7 = vextq_s16(s0, s0_hi, 7);
+
+ highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_0_7, sum0,
+ sum1);
+}
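+
+// Note on the horizontal helpers: vextq_s16 synthesises the shifted input
+// windows s + 1 .. s + 7 from two adjacent vector loads, so all eight
+// filter taps are fed from just two loads per block instead of eight
+// overlapping ones.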
+
+static INLINE uint16x8_t highbd_convolve8_horiz8_s32_s16(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
+ int32x4_t sum0, sum1;
+ highbd_convolve8_horiz8_s32(s0, s1, x_filter_0_7, &sum0, &sum1);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+ vqrshrun_n_s32(sum1, FILTER_BITS));
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_s32_s16(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter) {
+ int32x4_t sum0;
+ int32x4_t sum1;
+ highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, &sum0,
+ &sum1);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+ vqrshrun_n_s32(sum1, FILTER_BITS));
+}
+
+static void highbd_convolve_horiz_neon(const uint16_t *src_ptr,
+ ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride,
+ const int16_t *x_filter_ptr,
+ int x_step_q4, int w, int h, int bd) {
+ assert(w >= 4 && h >= 4);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0, s1, s2, s3;
+ load_s16_8x2(s, src_stride, &s0, &s2);
+ load_s16_8x2(s + 8, src_stride, &s1, &s3);
+
+ uint16x4_t d0 = highbd_convolve8_horiz4_s32_s16(s0, s1, x_filter);
+ uint16x4_t d1 = highbd_convolve8_horiz4_s32_s16(s2, s3, x_filter);
+
+ uint16x8_t d01 = vcombine_u16(d0, d1);
+ d01 = vminq_u16(d01, max);
+
+ vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
+ vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
+
+ s += 2 * src_stride;
+ d += 2 * dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else {
+ int height = h;
+
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+ int x_q4 = 0;
+
+ const int16_t *src_x = &s[x_q4 >> SUBPEL_BITS];
+ int16x8_t s0, s2, s4, s6;
+ load_s16_8x4(src_x, src_stride, &s0, &s2, &s4, &s6);
+ src_x += 8;
+
+ do {
+ int16x8_t s1, s3, s5, s7;
+ load_s16_8x4(src_x, src_stride, &s1, &s3, &s5, &s7);
+
+ uint16x8_t d0 = highbd_convolve8_horiz8_s32_s16(s0, s1, x_filter);
+ uint16x8_t d1 = highbd_convolve8_horiz8_s32_s16(s2, s3, x_filter);
+ uint16x8_t d2 = highbd_convolve8_horiz8_s32_s16(s4, s5, x_filter);
+ uint16x8_t d3 = highbd_convolve8_horiz8_s32_s16(s6, s7, x_filter);
+
+ d0 = vminq_u16(d0, max);
+ d1 = vminq_u16(d1, max);
+ d2 = vminq_u16(d2, max);
+ d3 = vminq_u16(d3, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s1;
+ s2 = s3;
+ s4 = s5;
+ s6 = s7;
+ src_x += 8;
+ d += 8;
+ width -= 8;
+ x_q4 += 8 * x_step_q4;
+ } while (width > 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ }
+}
+
+void aom_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ if (x_step_q4 != 16) {
+ aom_highbd_convolve8_horiz_c(src8, src_stride, dst8, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h, bd);
+ } else {
+ (void)filter_y;
+ (void)y_step_q4;
+
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+ src -= SUBPEL_TAPS / 2 - 1;
+ highbd_convolve_horiz_neon(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, w, h, bd);
+ }
+}
+
+static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
+ ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride,
+ const int16_t *y_filter_ptr, int w, int h,
+ int bd) {
+ assert(w >= 4 && h >= 4);
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 =
+ highbd_convolve8_4_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ uint16x4_t d1 =
+ highbd_convolve8_4_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+ uint16x4_t d2 =
+ highbd_convolve8_4_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+ uint16x4_t d3 =
+ highbd_convolve8_4_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+
+ uint16x8_t d01 = vcombine_u16(d0, d1);
+ uint16x8_t d23 = vcombine_u16(d2, d3);
+
+ d01 = vminq_u16(d01, max);
+ d23 = vminq_u16(d23, max);
+
+ vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
+ vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
+ vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
+ vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ do {
+ int height = h;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 = highbd_convolve8_8_s32_s16(s0, s1, s2, s3, s4, s5, s6,
+ s7, y_filter);
+ uint16x8_t d1 = highbd_convolve8_8_s32_s16(s1, s2, s3, s4, s5, s6, s7,
+ s8, y_filter);
+ uint16x8_t d2 = highbd_convolve8_8_s32_s16(s2, s3, s4, s5, s6, s7, s8,
+ s9, y_filter);
+ uint16x8_t d3 = highbd_convolve8_8_s32_s16(s3, s4, s5, s6, s7, s8, s9,
+ s10, y_filter);
+
+ d0 = vminq_u16(d0, max);
+ d1 = vminq_u16(d1, max);
+ d2 = vminq_u16(d2, max);
+ d3 = vminq_u16(d3, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+}
+
+void aom_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ if (y_step_q4 != 16) {
+ aom_highbd_convolve8_vert_c(src8, src_stride, dst8, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h, bd);
+ } else {
+ (void)filter_x;
+ (void)x_step_q4;
+
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride;
+ highbd_convolve_vert_neon(src, src_stride, dst, dst_stride, filter_y, w, h,
+ bd);
+ }
+}
diff --git a/aom_dsp/arm/highbd_hadamard_neon.c b/aom_dsp/arm/highbd_hadamard_neon.c
index aad20468b..d28617c67 100644
--- a/aom_dsp/arm/highbd_hadamard_neon.c
+++ b/aom_dsp/arm/highbd_hadamard_neon.c
@@ -109,7 +109,7 @@ void aom_highbd_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
// For the first pass we can stay in 16-bit elements (4095*8 = 32760).
hadamard_highbd_col8_first_pass(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
- transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
// For the second pass we need to widen to 32-bit elements, so we're
// processing 4 columns at a time.
diff --git a/aom_dsp/arm/highbd_intrapred_neon.c b/aom_dsp/arm/highbd_intrapred_neon.c
index 63f53c37d..366ca3f04 100644
--- a/aom_dsp/arm/highbd_intrapred_neon.c
+++ b/aom_dsp/arm/highbd_intrapred_neon.c
@@ -15,6 +15,7 @@
#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
+#include "aom_dsp/arm/sum_neon.h"
#include "aom_dsp/intrapred_common.h"
// -----------------------------------------------------------------------------
@@ -191,7 +192,7 @@ static INLINE int highbd_dc_predictor_rect(int bw, int bh, int sum, int shift1,
uint16x8_t sum_above = highbd_dc_load_partial_sum_##w(above); \
uint16x8_t sum_left = highbd_dc_load_partial_sum_##h(left); \
uint16x8_t sum_vec = vaddq_u16(sum_left, sum_above); \
- int sum = horizontal_add_and_broadcast_long_u16x8(sum_vec)[0]; \
+ int sum = horizontal_add_u16x8(sum_vec); \
int dc0 = highbd_dc_predictor_rect((w), (h), sum, (shift), (mult)); \
highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u16(dc0)); \
}
diff --git a/aom_dsp/arm/highbd_loopfilter_neon.c b/aom_dsp/arm/highbd_loopfilter_neon.c
index 2b5128ea8..77727b766 100644
--- a/aom_dsp/arm/highbd_loopfilter_neon.c
+++ b/aom_dsp/arm/highbd_loopfilter_neon.c
@@ -298,7 +298,7 @@ void aom_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch,
uint16x4_t src[4] = { vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
vld1_u16(dst_q1) };
- transpose_u16_4x4(src);
+ transpose_array_inplace_u16_4x4(src);
// Adjust thresholds to bitdepth.
const int outer_thresh = *blimit << (bd - 8);
@@ -344,7 +344,7 @@ void aom_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch,
vget_high_u16(p0q0_output),
vget_high_u16(p1q1_output),
};
- transpose_u16_4x4(output);
+ transpose_array_inplace_u16_4x4(output);
vst1_u16(dst_p1, output[0]);
vst1_u16(dst_p0, output[1]);
@@ -386,7 +386,7 @@ static INLINE void filter6(const uint16x8_t p2q2, const uint16x8_t p1q1,
// p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
// ^^^^^^ ^^^^^^
// Should dual issue with the left shift.
- const uint16x8_t q0p0 = transpose64_u16q(p0q0);
+ const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0);
sum = vaddq_u16(sum, outer_sum);
@@ -401,7 +401,7 @@ static INLINE void filter6(const uint16x8_t p2q2, const uint16x8_t p1q1,
// p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
// ^^^^^^^^
sum = vsubq_u16(sum, p2q2_double);
- const uint16x8_t q1p1 = transpose64_u16q(p1q1);
+ const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1));
*p0q0_output = vrshrq_n_u16(sum, 3);
@@ -505,7 +505,7 @@ void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch,
// and src_raw[3] after transpose.
uint16x8_t src_raw[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1),
vld1q_u16(dst_2), vld1q_u16(dst_3) };
- transpose_u16_4x8q(src_raw);
+ transpose_array_inplace_u16_4x8(src_raw);
// p2, p1, p0, q0, q1, q2
const uint16x4_t src[6] = {
vget_low_u16(src_raw[0]), vget_low_u16(src_raw[1]),
@@ -574,7 +574,7 @@ void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch,
vget_high_u16(p0q0_output),
vget_high_u16(p1q1_output),
};
- transpose_u16_4x4(output);
+ transpose_array_inplace_u16_4x4(output);
// dst_n starts at p2, so adjust to p1.
vst1_u16(dst_0 + 1, output[0]);
@@ -626,7 +626,7 @@ static INLINE void filter8(const uint16x8_t p3q3, const uint16x8_t p2q2,
// p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
// ^^^^^^
- const uint16x8_t q0p0 = transpose64_u16q(p0q0);
+ const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
sum = vaddq_u16(sum, q0p0);
*p2q2_output = vrshrq_n_u16(sum, 3);
@@ -635,7 +635,7 @@ static INLINE void filter8(const uint16x8_t p3q3, const uint16x8_t p2q2,
// p1 = p2 - p3 - p2 + p1 + q1
// q1 = q2 - q3 - q2 + q0 + p1
sum = vsubq_u16(sum, p23q23);
- const uint16x8_t q1p1 = transpose64_u16q(p1q1);
+ const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1));
*p1q1_output = vrshrq_n_u16(sum, 3);
@@ -644,7 +644,7 @@ static INLINE void filter8(const uint16x8_t p3q3, const uint16x8_t p2q2,
// p0 = p1 - p3 - p1 + p0 + q2
// q0 = q1 - q3 - q1 + q0 + p2
sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1));
- const uint16x8_t q2p2 = transpose64_u16q(p2q2);
+ const uint16x8_t q2p2 = vextq_u16(p2q2, p2q2, 4);
sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2));
*p0q0_output = vrshrq_n_u16(sum, 3);
@@ -827,7 +827,7 @@ void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch,
uint16x8_t output[4] = { p0q0_output, p1q1_output, p2q2_output, p3q3 };
// After transpose, |output| will contain rows of the form:
// p0 p1 p2 p3 q0 q1 q2 q3
- transpose_u16_4x8q(output);
+ transpose_array_inplace_u16_4x8(output);
// Reverse p values to produce original order:
// p3 p2 p1 p0 q0 q1 q2 q3
@@ -883,7 +883,7 @@ static INLINE void filter14(
// ^^
// q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
// ^^
- const uint16x8_t q0p0 = transpose64_u16q(p0q0);
+ const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
sum = vaddq_u16(sum, q0p0);
*p5q5_output = vrshrq_n_u16(sum, 4);
@@ -892,7 +892,7 @@ static INLINE void filter14(
// p4 = p5 - (2 * p6) + p3 + q1
// q4 = q5 - (2 * q6) + q3 + p1
sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1));
- const uint16x8_t q1p1 = transpose64_u16q(p1q1);
+ const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum);
*p4q4_output = vrshrq_n_u16(sum, 4);
@@ -901,7 +901,7 @@ static INLINE void filter14(
// p3 = p4 - p6 - p5 + p2 + q2
// q3 = q4 - q6 - q5 + q2 + p2
sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5));
- const uint16x8_t q2p2 = transpose64_u16q(p2q2);
+ const uint16x8_t q2p2 = vextq_u16(p2q2, p2q2, 4);
sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum);
*p3q3_output = vrshrq_n_u16(sum, 4);
@@ -910,7 +910,7 @@ static INLINE void filter14(
// p2 = p3 - p6 - p4 + p1 + q3
// q2 = q3 - q6 - q4 + q1 + p3
sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4));
- const uint16x8_t q3p3 = transpose64_u16q(p3q3);
+ const uint16x8_t q3p3 = vextq_u16(p3q3, p3q3, 4);
sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum);
*p2q2_output = vrshrq_n_u16(sum, 4);
@@ -919,7 +919,7 @@ static INLINE void filter14(
// p1 = p2 - p6 - p3 + p0 + q4
// q1 = q2 - q6 - q3 + q0 + p4
sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3));
- const uint16x8_t q4p4 = transpose64_u16q(p4q4);
+ const uint16x8_t q4p4 = vextq_u16(p4q4, p4q4, 4);
sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum);
*p1q1_output = vrshrq_n_u16(sum, 4);
@@ -928,7 +928,7 @@ static INLINE void filter14(
// p0 = p1 - p6 - p2 + q0 + q5
// q0 = q1 - q6 - q2 + p0 + p5
sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2));
- const uint16x8_t q5p5 = transpose64_u16q(p5q5);
+ const uint16x8_t q5p5 = vextq_u16(p5q5, p5q5, 4);
sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum);
*p0q0_output = vrshrq_n_u16(sum, 4);
@@ -1118,14 +1118,14 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
uint16x8_t src_p[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
vld1q_u16(dst_3) };
// p7 will be the low half of src_p[0]. Not used until the end.
- transpose_u16_4x8q(src_p);
+ transpose_array_inplace_u16_4x8(src_p);
// Low halves: q0 q1 q2 q3
// High halves: q4 q5 q6 q7
uint16x8_t src_q[4] = { vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8),
vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8) };
// q7 will be the high half of src_q[3]. Not used until the end.
- transpose_u16_4x8q(src_q);
+ transpose_array_inplace_u16_4x8(src_q);
// Adjust thresholds to bitdepth.
const int outer_thresh = *blimit << (bd - 8);
@@ -1238,10 +1238,10 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
const uint16x8x2_t p4p0_q0q4 = permute_acdb64(p4q4_output, p0q0_output);
uint16x8_t output_p[4] = { p7p3_q3q7.val[0], p6p2_q2q6.val[0],
p5p1_q1q5.val[0], p4p0_q0q4.val[0] };
- transpose_u16_4x8q(output_p);
+ transpose_array_inplace_u16_4x8(output_p);
uint16x8_t output_q[4] = { p4p0_q0q4.val[1], p5p1_q1q5.val[1],
p6p2_q2q6.val[1], p7p3_q3q7.val[1] };
- transpose_u16_4x8q(output_q);
+ transpose_array_inplace_u16_4x8(output_q);
// Reverse p values to produce original order:
// p3 p2 p1 p0 q0 q1 q2 q3
diff --git a/aom_dsp/arm/highbd_masked_sad_neon.c b/aom_dsp/arm/highbd_masked_sad_neon.c
new file mode 100644
index 000000000..9262d818e
--- /dev/null
+++ b/aom_dsp/arm/highbd_masked_sad_neon.c
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/blend.h"
+
+static INLINE uint16x8_t masked_sad_8x1_neon(uint16x8_t sad,
+ const uint16_t *src,
+ const uint16_t *a,
+ const uint16_t *b,
+ const uint8_t *m) {
+ const uint16x8_t s0 = vld1q_u16(src);
+ const uint16x8_t a0 = vld1q_u16(a);
+ const uint16x8_t b0 = vld1q_u16(b);
+ const uint16x8_t m0 = vmovl_u8(vld1_u8(m));
+
+ uint16x8_t blend_u16 = alpha_blend_a64_u16x8(m0, a0, b0);
+
+ return vaddq_u16(sad, vabdq_u16(blend_u16, s0));
+}
+
+static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
+ const uint16_t *src,
+ const uint16_t *a,
+ const uint16_t *b,
+ const uint8_t *m) {
+ sad = masked_sad_8x1_neon(sad, src, a, b, m);
+ return masked_sad_8x1_neon(sad, &src[8], &a[8], &b[8], &m[8]);
+}
+
+static INLINE uint16x8_t masked_sad_32x1_neon(uint16x8_t sad,
+ const uint16_t *src,
+ const uint16_t *a,
+ const uint16_t *b,
+ const uint8_t *m) {
+ sad = masked_sad_16x1_neon(sad, src, a, b, m);
+ return masked_sad_16x1_neon(sad, &src[16], &a[16], &b[16], &m[16]);
+}
+
+static INLINE unsigned int masked_sad_128xh_large_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint32x4_t sad_u32[] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ do {
+ uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ for (int h = 0; h < 4; ++h) {
+ sad[0] = masked_sad_32x1_neon(sad[0], src, a, b, m);
+ sad[1] = masked_sad_32x1_neon(sad[1], &src[32], &a[32], &b[32], &m[32]);
+ sad[2] = masked_sad_32x1_neon(sad[2], &src[64], &a[64], &b[64], &m[64]);
+ sad[3] = masked_sad_32x1_neon(sad[3], &src[96], &a[96], &b[96], &m[96]);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ sad_u32[0] = vpadalq_u16(sad_u32[0], sad[0]);
+ sad_u32[1] = vpadalq_u16(sad_u32[1], sad[1]);
+ sad_u32[2] = vpadalq_u16(sad_u32[2], sad[2]);
+ sad_u32[3] = vpadalq_u16(sad_u32[3], sad[3]);
+ height -= 4;
+ } while (height != 0);
+
+ sad_u32[0] = vaddq_u32(sad_u32[0], sad_u32[1]);
+ sad_u32[2] = vaddq_u32(sad_u32[2], sad_u32[3]);
+ sad_u32[0] = vaddq_u32(sad_u32[0], sad_u32[2]);
+
+ return horizontal_add_u32x4(sad_u32[0]);
+}
+
+static INLINE unsigned int masked_sad_64xh_large_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint32x4_t sad_u32[] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ do {
+ uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+ for (int h = 0; h < 4; ++h) {
+ sad[0] = masked_sad_32x1_neon(sad[0], src, a, b, m);
+ sad[1] = masked_sad_32x1_neon(sad[1], &src[32], &a[32], &b[32], &m[32]);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ sad_u32[0] = vpadalq_u16(sad_u32[0], sad[0]);
+ sad_u32[1] = vpadalq_u16(sad_u32[1], sad[1]);
+ height -= 4;
+ } while (height != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sad_u32[0], sad_u32[1]));
+}
+
+static INLINE unsigned int masked_sad_32xh_large_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint32x4_t sad_u32 = vdupq_n_u32(0);
+
+ do {
+ uint16x8_t sad = vdupq_n_u16(0);
+ for (int h = 0; h < 4; ++h) {
+ sad = masked_sad_32x1_neon(sad, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ sad_u32 = vpadalq_u16(sad_u32, sad);
+ height -= 4;
+ } while (height != 0);
+
+ return horizontal_add_u32x4(sad_u32);
+}
+
+static INLINE unsigned int masked_sad_16xh_large_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint32x4_t sad_u32 = vdupq_n_u32(0);
+
+ do {
+ uint16x8_t sad_u16 = vdupq_n_u16(0);
+
+ for (int h = 0; h < 8; ++h) {
+ sad_u16 = masked_sad_16x1_neon(sad_u16, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ sad_u32 = vpadalq_u16(sad_u32, sad_u16);
+ height -= 8;
+ } while (height != 0);
+
+ return horizontal_add_u32x4(sad_u32);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE unsigned int masked_sad_8xh_large_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint32x4_t sad_u32 = vdupq_n_u32(0);
+
+ do {
+ uint16x8_t sad_u16 = vdupq_n_u16(0);
+
+ for (int h = 0; h < 16; ++h) {
+ sad_u16 = masked_sad_8x1_neon(sad_u16, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ sad_u32 = vpadalq_u16(sad_u32, sad_u16);
+ height -= 16;
+ } while (height != 0);
+
+ return horizontal_add_u32x4(sad_u32);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE unsigned int masked_sad_16xh_small_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ // For 12-bit data, we can only accumulate up to 128 elements into the
+ // uint16x8_t SAD accumulator, so we can only process up to 8 rows
+ // before widening the accumulation into 32-bit elements.
+ assert(height <= 8);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint16x8_t sad = vdupq_n_u16(0);
+
+ do {
+ sad = masked_sad_16x1_neon(sad, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ } while (--height != 0);
+
+ return horizontal_add_u16x8(sad);
+}
+
+static INLINE unsigned int masked_sad_8xh_small_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ // For 12-bit data, we can only accumulate up to 128 elements into the
+ // uint16x8_t SAD accumulator, so we can only process up to 16 rows
+ // before widening the accumulation into 32-bit elements.
+ assert(height <= 16);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint16x8_t sad = vdupq_n_u16(0);
+
+ do {
+ sad = masked_sad_8x1_neon(sad, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ } while (--height != 0);
+
+ return horizontal_add_u16x8(sad);
+}
+
+static INLINE unsigned int masked_sad_4xh_small_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ // For 12-bit data, we can only accumulate up to 64 elements into the
+ // uint16x4_t SAD accumulator, so we can only process up to 16 rows
+ // before widening the accumulation into 32-bit elements.
+ assert(height <= 16);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+ uint16x4_t sad = vdup_n_u16(0);
+ do {
+ uint16x4_t m0 = vget_low_u16(vmovl_u8(load_unaligned_u8_4x1(m)));
+ uint16x4_t a0 = load_unaligned_u16_4x1(a);
+ uint16x4_t b0 = load_unaligned_u16_4x1(b);
+ uint16x4_t s0 = load_unaligned_u16_4x1(src);
+
+ uint16x4_t blend_u16 = alpha_blend_a64_u16x4(m0, a0, b0);
+
+ sad = vadd_u16(sad, vabd_u16(blend_u16, s0));
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ } while (--height != 0);
+
+ return horizontal_add_u16x4(sad);
+}
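+
+// Accumulator capacity behind the "small" kernels above: a 16-bit lane can
+// hold at most 65535 / 4095 = 16 absolute differences of 12-bit data, which
+// is 8 rows of width 16, 16 rows of width 8, or 16 rows of width 4 before a
+// widening step into 32-bit lanes would be required.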
+
+#define HIGHBD_MASKED_SAD_WXH_SMALL_NEON(w, h) \
+ unsigned int aom_highbd_masked_sad##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return masked_sad_##w##xh_small_neon(src, src_stride, ref, ref_stride, \
+ second_pred, w, msk, msk_stride, \
+ h); \
+ else \
+ return masked_sad_##w##xh_small_neon(src, src_stride, second_pred, w, \
+ ref, ref_stride, msk, msk_stride, \
+ h); \
+ }
+
+#define HIGHBD_MASKED_SAD_WXH_LARGE_NEON(w, h) \
+ unsigned int aom_highbd_masked_sad##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return masked_sad_##w##xh_large_neon(src, src_stride, ref, ref_stride, \
+ second_pred, w, msk, msk_stride, \
+ h); \
+ else \
+ return masked_sad_##w##xh_large_neon(src, src_stride, second_pred, w, \
+ ref, ref_stride, msk, msk_stride, \
+ h); \
+ }
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 4)
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 8)
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 4)
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 8)
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 16)
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(16, 8)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 16)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 32)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 16)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 32)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 64)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 32)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 64)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 128)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(128, 64)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 16)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(8, 32)
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(16, 4)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 64)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 8)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_obmc_sad_neon.c b/aom_dsp/arm/highbd_obmc_sad_neon.c
new file mode 100644
index 000000000..28699e6f4
--- /dev/null
+++ b/aom_dsp/arm/highbd_obmc_sad_neon.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void highbd_obmc_sad_8x1_s16_neon(uint16x8_t ref,
+ const int32_t *mask,
+ const int32_t *wsrc,
+ uint32x4_t *sum) {
+ int16x8_t ref_s16 = vreinterpretq_s16_u16(ref);
+
+ int32x4_t wsrc_lo = vld1q_s32(wsrc);
+ int32x4_t wsrc_hi = vld1q_s32(wsrc + 4);
+
+ int32x4_t mask_lo = vld1q_s32(mask);
+ int32x4_t mask_hi = vld1q_s32(mask + 4);
+
+ int16x8_t mask_s16 = vcombine_s16(vmovn_s32(mask_lo), vmovn_s32(mask_hi));
+
+ int32x4_t pre_lo = vmull_s16(vget_low_s16(ref_s16), vget_low_s16(mask_s16));
+ int32x4_t pre_hi = vmull_s16(vget_high_s16(ref_s16), vget_high_s16(mask_s16));
+
+ uint32x4_t abs_lo = vreinterpretq_u32_s32(vabdq_s32(wsrc_lo, pre_lo));
+ uint32x4_t abs_hi = vreinterpretq_u32_s32(vabdq_s32(wsrc_hi, pre_hi));
+
+ *sum = vrsraq_n_u32(*sum, abs_lo, 12);
+ *sum = vrsraq_n_u32(*sum, abs_hi, 12);
+}
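+
+// A scalar sketch of the row kernel above, mirroring the C reference
+// behaviour (illustrative only): vrsraq_n_u32 performs the rounding shift
+// by 12 and the accumulation in a single instruction.
+static unsigned int obmc_sad_scalar(const uint16_t *pre, const int32_t *wsrc,
+ const int32_t *mask, int n) {
+ unsigned int sad = 0;
+ for (int i = 0; i < n; ++i) {
+ const int32_t diff = wsrc[i] - pre[i] * mask[i];
+ const int32_t abs_diff = diff < 0 ? -diff : diff;
+ sad += (unsigned int)((abs_diff + (1 << 11)) >> 12);
+ }
+ return sad;
+}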
+
+static INLINE unsigned int highbd_obmc_sad_4xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int height) {
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int h = height / 2;
+ do {
+ uint16x8_t r = load_unaligned_u16_4x2(ref_ptr, ref_stride);
+
+ highbd_obmc_sad_8x1_s16_neon(r, mask, wsrc, &sum);
+
+ ref_ptr += 2 * ref_stride;
+ wsrc += 8;
+ mask += 8;
+ } while (--h != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+static INLINE unsigned int highbd_obmc_sad_8xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int height) {
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ do {
+ uint16x8_t r = vld1q_u16(ref_ptr);
+
+ highbd_obmc_sad_8x1_s16_neon(r, mask, wsrc, &sum);
+
+ ref_ptr += ref_stride;
+ wsrc += 8;
+ mask += 8;
+ } while (--height != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+static INLINE unsigned int highbd_obmc_sad_large_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int width, int height) {
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ do {
+ int i = 0;
+ do {
+ uint16x8_t r0 = vld1q_u16(ref_ptr + i);
+ highbd_obmc_sad_8x1_s16_neon(r0, mask, wsrc, &sum[0]);
+
+ uint16x8_t r1 = vld1q_u16(ref_ptr + i + 8);
+ highbd_obmc_sad_8x1_s16_neon(r1, mask + 8, wsrc + 8, &sum[1]);
+
+ wsrc += 16;
+ mask += 16;
+ i += 16;
+ } while (i < width);
+
+ ref_ptr += ref_stride;
+ } while (--height != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int highbd_obmc_sad_16xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int h) {
+ return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 16, h);
+}
+
+static INLINE unsigned int highbd_obmc_sad_32xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int height) {
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+
+ do {
+ uint16x8_t r0 = vld1q_u16(ref_ptr);
+ uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
+ uint16x8_t r2 = vld1q_u16(ref_ptr + 16);
+ uint16x8_t r3 = vld1q_u16(ref_ptr + 24);
+
+ highbd_obmc_sad_8x1_s16_neon(r0, mask, wsrc, &sum[0]);
+ highbd_obmc_sad_8x1_s16_neon(r1, mask + 8, wsrc + 8, &sum[1]);
+ highbd_obmc_sad_8x1_s16_neon(r2, mask + 16, wsrc + 16, &sum[2]);
+ highbd_obmc_sad_8x1_s16_neon(r3, mask + 24, wsrc + 24, &sum[3]);
+
+ wsrc += 32;
+ mask += 32;
+ ref_ptr += ref_stride;
+ } while (--height != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[2]));
+}
+
+static INLINE unsigned int highbd_obmc_sad_64xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int h) {
+ return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 64, h);
+}
+
+static INLINE unsigned int highbd_obmc_sad_128xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int h) {
+ return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 128, h);
+}
+
+#define HIGHBD_OBMC_SAD_WXH_NEON(w, h) \
+ unsigned int aom_highbd_obmc_sad##w##x##h##_neon( \
+ const uint8_t *ref, int ref_stride, const int32_t *wsrc, \
+ const int32_t *mask) { \
+ return highbd_obmc_sad_##w##xh_neon(ref, ref_stride, wsrc, mask, h); \
+ }
+
+HIGHBD_OBMC_SAD_WXH_NEON(4, 4)
+HIGHBD_OBMC_SAD_WXH_NEON(4, 8)
+
+HIGHBD_OBMC_SAD_WXH_NEON(8, 4)
+HIGHBD_OBMC_SAD_WXH_NEON(8, 8)
+HIGHBD_OBMC_SAD_WXH_NEON(8, 16)
+
+HIGHBD_OBMC_SAD_WXH_NEON(16, 8)
+HIGHBD_OBMC_SAD_WXH_NEON(16, 16)
+HIGHBD_OBMC_SAD_WXH_NEON(16, 32)
+
+HIGHBD_OBMC_SAD_WXH_NEON(32, 16)
+HIGHBD_OBMC_SAD_WXH_NEON(32, 32)
+HIGHBD_OBMC_SAD_WXH_NEON(32, 64)
+
+HIGHBD_OBMC_SAD_WXH_NEON(64, 32)
+HIGHBD_OBMC_SAD_WXH_NEON(64, 64)
+HIGHBD_OBMC_SAD_WXH_NEON(64, 128)
+
+HIGHBD_OBMC_SAD_WXH_NEON(128, 64)
+HIGHBD_OBMC_SAD_WXH_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_OBMC_SAD_WXH_NEON(4, 16)
+
+HIGHBD_OBMC_SAD_WXH_NEON(8, 32)
+
+HIGHBD_OBMC_SAD_WXH_NEON(16, 4)
+HIGHBD_OBMC_SAD_WXH_NEON(16, 64)
+
+HIGHBD_OBMC_SAD_WXH_NEON(32, 8)
+
+HIGHBD_OBMC_SAD_WXH_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_obmc_variance_neon.c b/aom_dsp/arm/highbd_obmc_variance_neon.c
new file mode 100644
index 000000000..d59224619
--- /dev/null
+++ b/aom_dsp/arm/highbd_obmc_variance_neon.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void highbd_obmc_variance_8x1_s16_neon(uint16x8_t pre,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ uint32x4_t *sse,
+ int32x4_t *sum) {
+ int16x8_t pre_s16 = vreinterpretq_s16_u16(pre);
+ int32x4_t wsrc_lo = vld1q_s32(&wsrc[0]);
+ int32x4_t wsrc_hi = vld1q_s32(&wsrc[4]);
+
+ int32x4_t mask_lo = vld1q_s32(&mask[0]);
+ int32x4_t mask_hi = vld1q_s32(&mask[4]);
+
+ int16x8_t mask_s16 = vcombine_s16(vmovn_s32(mask_lo), vmovn_s32(mask_hi));
+
+ int32x4_t diff_lo = vmull_s16(vget_low_s16(pre_s16), vget_low_s16(mask_s16));
+ int32x4_t diff_hi =
+ vmull_s16(vget_high_s16(pre_s16), vget_high_s16(mask_s16));
+
+ diff_lo = vsubq_s32(wsrc_lo, diff_lo);
+ diff_hi = vsubq_s32(wsrc_hi, diff_hi);
+
+  // ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds to nearest with ties away
+  // from zero; vrshrq_n_s32, however, rounds to nearest with ties rounded up.
+  // The two only disagree for values lying exactly on a rounding breakpoint,
+  // so we can add -1 to all negative numbers to move the breakpoint one value
+  // across and into the correct rounding region.
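+  // For example, with a shift of 12, vrshrq_n_s32 maps -2048 to 0, whereas
+  // ROUND_POWER_OF_TWO_SIGNED(-2048, 12) = -1; after the adjustment below the
+  // input becomes -2049, which vrshrq_n_s32 correctly maps to -1.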
+ diff_lo = vsraq_n_s32(diff_lo, diff_lo, 31);
+ diff_hi = vsraq_n_s32(diff_hi, diff_hi, 31);
+ int32x4_t round_lo = vrshrq_n_s32(diff_lo, 12);
+ int32x4_t round_hi = vrshrq_n_s32(diff_hi, 12);
+
+ *sum = vaddq_s32(*sum, round_lo);
+ *sum = vaddq_s32(*sum, round_hi);
+ *sse = vmlaq_u32(*sse, vreinterpretq_u32_s32(round_lo),
+ vreinterpretq_u32_s32(round_lo));
+ *sse = vmlaq_u32(*sse, vreinterpretq_u32_s32(round_hi),
+ vreinterpretq_u32_s32(round_hi));
+}
+
+// For 12-bit data, we can accumulate only up to 256 squared values in each
+// unsigned 32-bit lane (4095*4095*256 = 4292870400) before we have to widen
+// the accumulation to 64-bit elements. Blocks of size 32x64, 64x32, 64x64,
+// 64x128, 128x64 and 128x128 are therefore processed in a different helper
+// function.
+static INLINE void highbd_obmc_variance_xlarge_neon(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int width, int h, int h_limit, uint64_t *sse,
+ int64_t *sum) {
+ uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre);
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ uint64x2_t sse_u64 = vdupq_n_u64(0);
+
+ // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit
+ // accumulator overflows. After hitting this limit we accumulate into 64-bit
+ // elements.
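+  // Each 32-bit lane receives width / 8 squared values per row, so for the
+  // 12-bit callers below h_limit = 256 / (width / 8) = 2048 / width.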
+ int h_tmp = h > h_limit ? h_limit : h;
+
+ do {
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+ int j = 0;
+
+ do {
+ int i = 0;
+
+ do {
+ uint16x8_t pre0 = vld1q_u16(pre_ptr + i);
+ highbd_obmc_variance_8x1_s16_neon(pre0, wsrc, mask, &sse_u32[0],
+ &sum_s32);
+
+ uint16x8_t pre1 = vld1q_u16(pre_ptr + i + 8);
+ highbd_obmc_variance_8x1_s16_neon(pre1, wsrc + 8, mask + 8, &sse_u32[1],
+ &sum_s32);
+
+ i += 16;
+ wsrc += 16;
+ mask += 16;
+ } while (i < width);
+
+ pre_ptr += pre_stride;
+ j++;
+ } while (j < h_tmp);
+
+ sse_u64 = vpadalq_u32(sse_u64, sse_u32[0]);
+ sse_u64 = vpadalq_u32(sse_u64, sse_u32[1]);
+ h -= h_tmp;
+ } while (h != 0);
+
+ *sse = horizontal_add_u64x2(sse_u64);
+ *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_obmc_variance_xlarge_neon_128xh(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 128, h, 16, sse,
+ sum);
+}
+
+static INLINE void highbd_obmc_variance_xlarge_neon_64xh(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 64, h, 32, sse,
+ sum);
+}
+
+static INLINE void highbd_obmc_variance_xlarge_neon_32xh(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 32, h, 64, sse,
+ sum);
+}
+
+static INLINE void highbd_obmc_variance_large_neon(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int width, int h, uint64_t *sse, int64_t *sum) {
+ uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+
+ do {
+ int i = 0;
+ do {
+ uint16x8_t pre0 = vld1q_u16(pre_ptr + i);
+ highbd_obmc_variance_8x1_s16_neon(pre0, wsrc, mask, &sse_u32, &sum_s32);
+
+ uint16x8_t pre1 = vld1q_u16(pre_ptr + i + 8);
+ highbd_obmc_variance_8x1_s16_neon(pre1, wsrc + 8, mask + 8, &sse_u32,
+ &sum_s32);
+
+ i += 16;
+ wsrc += 16;
+ mask += 16;
+ } while (i < width);
+
+ pre_ptr += pre_stride;
+ } while (--h != 0);
+
+ *sse = horizontal_long_add_u32x4(sse_u32);
+ *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_obmc_variance_neon_128xh(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 128, h, sse,
+ sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_64xh(const uint8_t *pre,
+ int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 64, h, sse, sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_32xh(const uint8_t *pre,
+ int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 32, h, sse, sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_16xh(const uint8_t *pre,
+ int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 16, h, sse, sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_8xh(const uint8_t *pre8,
+ int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ uint64_t *sse, int64_t *sum) {
+ uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+
+ do {
+ uint16x8_t pre_u16 = vld1q_u16(pre);
+
+ highbd_obmc_variance_8x1_s16_neon(pre_u16, wsrc, mask, &sse_u32, &sum_s32);
+
+ pre += pre_stride;
+ wsrc += 8;
+ mask += 8;
+ } while (--h != 0);
+
+ *sse = horizontal_long_add_u32x4(sse_u32);
+ *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_obmc_variance_neon_4xh(const uint8_t *pre8,
+ int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ uint64_t *sse, int64_t *sum) {
+ assert(h % 2 == 0);
+ uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+
+ do {
+ uint16x8_t pre_u16 = load_unaligned_u16_4x2(pre, pre_stride);
+
+ highbd_obmc_variance_8x1_s16_neon(pre_u16, wsrc, mask, &sse_u32, &sum_s32);
+
+ pre += 2 * pre_stride;
+ wsrc += 8;
+ mask += 8;
+ h -= 2;
+ } while (h != 0);
+
+ *sse = horizontal_long_add_u32x4(sse_u32);
+ *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
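+// The *_obmc_variance_cast helpers scale the sum and SSE back down to 8-bit
+// precision: sum / 2^(bd-8) and SSE / 2^(2*(bd-8)) for bit depth bd, with
+// rounding.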
+static INLINE void highbd_8_obmc_variance_cast(int64_t sum64, uint64_t sse64,
+ int *sum, unsigned int *sse) {
+ *sum = (int)sum64;
+ *sse = (unsigned int)sse64;
+}
+
+static INLINE void highbd_10_obmc_variance_cast(int64_t sum64, uint64_t sse64,
+ int *sum, unsigned int *sse) {
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
+}
+
+static INLINE void highbd_12_obmc_variance_cast(int64_t sum64, uint64_t sse64,
+ int *sum, unsigned int *sse) {
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
+}
+
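+// The macros below compute variance as SSE - sum * sum / (w * h), which is
+// (w * h) times the per-pixel variance.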
+#define HIGHBD_OBMC_VARIANCE_WXH_NEON(w, h, bitdepth) \
+ unsigned int aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t sum64; \
+ uint64_t sse64; \
+ highbd_obmc_variance_neon_##w##xh(pre, pre_stride, wsrc, mask, h, &sse64, \
+ &sum64); \
+ highbd_##bitdepth##_obmc_variance_cast(sum64, sse64, &sum, sse); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h)); \
+ }
+
+#define HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(w, h, bitdepth) \
+ unsigned int aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t sum64; \
+ uint64_t sse64; \
+ highbd_obmc_variance_xlarge_neon_##w##xh(pre, pre_stride, wsrc, mask, h, \
+ &sse64, &sum64); \
+ highbd_##bitdepth##_obmc_variance_cast(sum64, sse64, &sum, sse); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h)); \
+ }
+
+// 8-bit
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 64, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 32, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 64, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 128, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 64, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 128, 8)
+
+// 10-bit
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 64, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 32, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 64, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 128, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 64, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 128, 10)
+
+// 12-bit
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(32, 64, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 32, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 64, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 128, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(128, 64, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(128, 128, 12)
diff --git a/aom_dsp/arm/highbd_quantize_neon.c b/aom_dsp/arm/highbd_quantize_neon.c
index 77a7aaca3..6149c9f13 100644
--- a/aom_dsp/arm/highbd_quantize_neon.c
+++ b/aom_dsp/arm/highbd_quantize_neon.c
@@ -11,14 +11,11 @@
#include <arm_neon.h>
#include <assert.h>
+#include <string.h>
#include "config/aom_config.h"
#include "aom_dsp/quantize.h"
-#include "aom_dsp/arm/mem_neon.h"
-
-#include "av1/common/quant_common.h"
-#include "av1/encoder/av1_quantize.h"
static INLINE uint32_t sum_abs_coeff(const uint32x4_t a) {
#if AOM_ARCH_AARCH64
@@ -83,6 +80,7 @@ static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan,
return vmaxq_s16(v_eobmax, v_nz_iscan);
}
+#if !CONFIG_REALTIME_ONLY
static INLINE void get_min_max_lane_eob(const int16_t *iscan,
int16x8_t *v_eobmin,
int16x8_t *v_eobmax, uint16x8_t v_mask,
@@ -91,13 +89,14 @@ static INLINE void get_min_max_lane_eob(const int16_t *iscan,
const int16x8_t v_nz_iscan_max = vbslq_s16(v_mask, v_iscan, vdupq_n_s16(-1));
#if SKIP_EOB_FACTOR_ADJUST
const int16x8_t v_nz_iscan_min =
- vbslq_s16(v_mask, v_iscan, vdupq_n_s16(n_coeffs));
+ vbslq_s16(v_mask, v_iscan, vdupq_n_s16((int16_t)n_coeffs));
*v_eobmin = vminq_s16(*v_eobmin, v_nz_iscan_min);
#else
(void)v_eobmin;
#endif
*v_eobmax = vmaxq_s16(*v_eobmax, v_nz_iscan_max);
}
+#endif // !CONFIG_REALTIME_ONLY
static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
#if AOM_ARCH_AARCH64
@@ -117,6 +116,7 @@ static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
#endif
}
+#if SKIP_EOB_FACTOR_ADJUST && !CONFIG_REALTIME_ONLY
static INLINE uint16_t get_min_eob(int16x8_t v_eobmin) {
#if AOM_ARCH_AARCH64
return (uint16_t)vminvq_s16(v_eobmin);
@@ -134,6 +134,7 @@ static INLINE uint16_t get_min_eob(int16x8_t v_eobmin) {
return (uint16_t)vget_lane_s16(v_eobmin_final, 0);
#endif
}
+#endif // SKIP_EOB_FACTOR_ADJUST && !CONFIG_REALTIME_ONLY
static void highbd_quantize_b_neon(
const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
@@ -298,7 +299,7 @@ static void highbd_quantize_b_adaptive_neon(
int32x4_t v_zbin_s32 = vmovl_s16(v_zbin);
uint16x4_t v_mask_lo, v_mask_hi;
int16x8_t v_eobmax = vdupq_n_s16(-1);
- int16x8_t v_eobmin = vdupq_n_s16(n_coeffs);
+ int16x8_t v_eobmin = vdupq_n_s16((int16_t)n_coeffs);
assert(n_coeffs > 8);
// Pre-scan pass
diff --git a/aom_dsp/arm/highbd_sad_neon.c b/aom_dsp/arm/highbd_sad_neon.c
index 919eb551e..d51f639de 100644
--- a/aom_dsp/arm/highbd_sad_neon.c
+++ b/aom_dsp/arm/highbd_sad_neon.c
@@ -61,6 +61,7 @@ static INLINE uint32_t highbd_sad8xh_small_neon(const uint8_t *src_ptr,
return horizontal_add_u16x8(sum);
}
+#if !CONFIG_REALTIME_ONLY
static INLINE uint32_t highbd_sad8xh_large_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
@@ -82,6 +83,7 @@ static INLINE uint32_t highbd_sad8xh_large_neon(const uint8_t *src_ptr,
return horizontal_add_u32x4(sum_u32);
}
+#endif // !CONFIG_REALTIME_ONLY
static INLINE uint32_t highbd_sad16xh_large_neon(const uint8_t *src_ptr,
int src_stride,
@@ -283,3 +285,225 @@ HBD_SAD_SKIP_WXH_LARGE_NEON(32, 8)
HBD_SAD_SKIP_WXH_LARGE_NEON(64, 16)
#endif // !CONFIG_REALTIME_ONLY
+
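+// The *_avg SAD variants below compute the SAD between the source block and
+// the rounding average (vrhadd) of the reference block and a second, compound
+// prediction block.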
+static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x4_t s = vld1_u16(src16_ptr);
+ uint16x4_t r = vld1_u16(ref16_ptr);
+ uint16x4_t p = vld1_u16(pred16_ptr);
+
+ uint16x4_t avg = vrhadd_u16(r, p);
+ sum = vabal_u16(sum, s, avg);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += 4;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr);
+ uint16x8_t r = vld1q_u16(ref16_ptr);
+ uint16x8_t p = vld1q_u16(pred16_ptr);
+
+ uint16x8_t avg = vrhaddq_u16(r, p);
+ uint16x8_t diff = vabdq_u16(s, avg);
+ sum = vpadalq_u16(sum, diff);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += 8;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ uint16x8_t s0, s1, r0, r1, p0, p1;
+ uint16x8_t avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u16(src16_ptr);
+ r0 = vld1q_u16(ref16_ptr);
+ p0 = vld1q_u16(pred16_ptr);
+ avg0 = vrhaddq_u16(r0, p0);
+ diff0 = vabdq_u16(s0, avg0);
+ sum[0] = vpadalq_u16(sum[0], diff0);
+
+ s1 = vld1q_u16(src16_ptr + 8);
+ r1 = vld1q_u16(ref16_ptr + 8);
+ p1 = vld1q_u16(pred16_ptr + 8);
+ avg1 = vrhaddq_u16(r1, p1);
+ diff1 = vabdq_u16(s1, avg1);
+ sum[1] = vpadalq_u16(sum[1], diff1);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += 16;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
+ uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;
+
+ s0 = vld1q_u16(src16_ptr + j);
+ r0 = vld1q_u16(ref16_ptr + j);
+ p0 = vld1q_u16(pred16_ptr + j);
+ avg0 = vrhaddq_u16(r0, p0);
+ diff0 = vabdq_u16(s0, avg0);
+ sum[0] = vpadalq_u16(sum[0], diff0);
+
+ s1 = vld1q_u16(src16_ptr + j + 8);
+ r1 = vld1q_u16(ref16_ptr + j + 8);
+ p1 = vld1q_u16(pred16_ptr + j + 8);
+ avg1 = vrhaddq_u16(r1, p1);
+ diff1 = vabdq_u16(s1, avg1);
+ sum[1] = vpadalq_u16(sum[1], diff1);
+
+ s2 = vld1q_u16(src16_ptr + j + 16);
+ r2 = vld1q_u16(ref16_ptr + j + 16);
+ p2 = vld1q_u16(pred16_ptr + j + 16);
+ avg2 = vrhaddq_u16(r2, p2);
+ diff2 = vabdq_u16(s2, avg2);
+ sum[2] = vpadalq_u16(sum[2], diff2);
+
+ s3 = vld1q_u16(src16_ptr + j + 24);
+ r3 = vld1q_u16(ref16_ptr + j + 24);
+ p3 = vld1q_u16(pred16_ptr + j + 24);
+ avg3 = vrhaddq_u16(r3, p3);
+ diff3 = vabdq_u16(s3, avg3);
+ sum[3] = vpadalq_u16(sum[3], diff3);
+
+ j += 32;
+ } while (j < w);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += w;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+ sum[0] = vaddq_u32(sum[0], sum[2]);
+
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int highbd_sad128xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128,
+ h, second_pred);
+}
+
+static INLINE unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
+ second_pred);
+}
+
+static INLINE unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
+ second_pred);
+}
+
+#define HBD_SAD_WXH_AVG_NEON(w, h) \
+ uint32_t aom_highbd_sad##w##x##h##_avg_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
+ second_pred); \
+ }
+
+HBD_SAD_WXH_AVG_NEON(4, 4)
+HBD_SAD_WXH_AVG_NEON(4, 8)
+
+HBD_SAD_WXH_AVG_NEON(8, 4)
+HBD_SAD_WXH_AVG_NEON(8, 8)
+HBD_SAD_WXH_AVG_NEON(8, 16)
+
+HBD_SAD_WXH_AVG_NEON(16, 8)
+HBD_SAD_WXH_AVG_NEON(16, 16)
+HBD_SAD_WXH_AVG_NEON(16, 32)
+
+HBD_SAD_WXH_AVG_NEON(32, 16)
+HBD_SAD_WXH_AVG_NEON(32, 32)
+HBD_SAD_WXH_AVG_NEON(32, 64)
+
+HBD_SAD_WXH_AVG_NEON(64, 32)
+HBD_SAD_WXH_AVG_NEON(64, 64)
+HBD_SAD_WXH_AVG_NEON(64, 128)
+
+HBD_SAD_WXH_AVG_NEON(128, 64)
+HBD_SAD_WXH_AVG_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_WXH_AVG_NEON(4, 16)
+
+HBD_SAD_WXH_AVG_NEON(8, 32)
+
+HBD_SAD_WXH_AVG_NEON(16, 4)
+HBD_SAD_WXH_AVG_NEON(16, 64)
+
+HBD_SAD_WXH_AVG_NEON(32, 8)
+
+HBD_SAD_WXH_AVG_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_sad4d_neon.c b/aom_dsp/arm/highbd_sadxd_neon.c
index f2fda3646..85ca6732a 100644
--- a/aom_dsp/arm/highbd_sad4d_neon.c
+++ b/aom_dsp/arm/highbd_sadxd_neon.c
@@ -90,6 +90,7 @@ static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref,
*sad_sum = vpadalq_u16(*sad_sum, abs_diff);
}
+#if !CONFIG_REALTIME_ONLY
static INLINE void highbd_sad8xhx4d_large_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *const ref_ptr[4],
@@ -116,6 +117,7 @@ static INLINE void highbd_sad8xhx4d_large_neon(const uint8_t *src_ptr,
vst1q_u32(res, horizontal_add_4d_u32x4(sum));
}
+#endif // !CONFIG_REALTIME_ONLY
static INLINE void highbd_sad16xhx4d_large_neon(const uint8_t *src_ptr,
int src_stride,
@@ -358,3 +360,258 @@ HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 8)
HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 16)
#endif // !CONFIG_REALTIME_ONLY
+
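+// The x3d variants below compute SADs against three candidate reference
+// blocks at once, sharing each source load across the three accumulators.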
+static INLINE void highbd_sad4xhx3d_small_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+ uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
+ uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
+ uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
+ uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);
+
+ sum[0] = vabal_u16(sum[0], s, r0);
+ sum[1] = vabal_u16(sum[1], s, r1);
+ sum[2] = vabal_u16(sum[2], s, r2);
+
+ } while (++i < h);
+
+ res[0] = horizontal_add_u32x4(sum[0]);
+ res[1] = horizontal_add_u32x4(sum[1]);
+ res[2] = horizontal_add_u32x4(sum[2]);
+}
+
+static INLINE void highbd_sad8xhx3d_small_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+ uint16x8_t sum[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ int i = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+
+ sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride));
+ sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride));
+ sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride));
+
+ } while (++i < h);
+
+ res[0] = horizontal_add_u32x4(vpaddlq_u16(sum[0]));
+ res[1] = horizontal_add_u32x4(vpaddlq_u16(sum[1]));
+ res[2] = horizontal_add_u32x4(vpaddlq_u16(sum[2]));
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE void highbd_sad8xhx3d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+ uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+ uint16x8_t r0 = vld1q_u16(ref16_ptr0 + i * ref_stride);
+ uint16x8_t r1 = vld1q_u16(ref16_ptr1 + i * ref_stride);
+ uint16x8_t r2 = vld1q_u16(ref16_ptr2 + i * ref_stride);
+
+ sad8_neon(s, r0, &sum[0]);
+ sad8_neon(s, r1, &sum[1]);
+ sad8_neon(s, r2, &sum[2]);
+
+ } while (++i < h);
+
+ res[0] = horizontal_add_u32x4(sum[0]);
+ res[1] = horizontal_add_u32x4(sum[1]);
+ res[2] = horizontal_add_u32x4(sum[2]);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE void highbd_sad16xhx3d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+ uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+ uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride);
+ sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);
+
+ uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
+ sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);
+
+ } while (++i < h);
+
+ res[0] = horizontal_add_u32x4(vaddq_u32(sum_lo[0], sum_hi[0]));
+ res[1] = horizontal_add_u32x4(vaddq_u32(sum_lo[1], sum_hi[1]));
+ res[2] = horizontal_add_u32x4(vaddq_u32(sum_lo[2], sum_hi[2]));
+}
+
+static INLINE void highbd_sadwxhx3d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int w, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+ uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+ uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+ uint32x4_t sum[3];
+
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride + j);
+ sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);
+
+ uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
+ sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);
+
+ uint16x8_t s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
+ sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
+ &sum_lo[0]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
+ &sum_lo[1]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
+ &sum_lo[2]);
+
+ uint16x8_t s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
+ sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
+ &sum_hi[0]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
+ &sum_hi[1]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
+ &sum_hi[2]);
+
+ j += 32;
+ } while (j < w);
+
+ } while (++i < h);
+
+ sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+ sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+ sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+
+ res[0] = horizontal_add_u32x4(sum[0]);
+ res[1] = horizontal_add_u32x4(sum[1]);
+ res[2] = horizontal_add_u32x4(sum[2]);
+}
+
+static INLINE void highbd_sad128xhx3d_large_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4], int h) {
+ highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
+ 128, h);
+}
+
+static INLINE void highbd_sad64xhx3d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64,
+ h);
+}
+
+static INLINE void highbd_sad32xhx3d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32,
+ h);
+}
+
+#define HBD_SAD_WXH_3D_SMALL_NEON(w, h) \
+ void aom_highbd_sad##w##x##h##x3d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx3d_small_neon(src, src_stride, ref_array, ref_stride, \
+ sad_array, (h)); \
+ }
+
+#define HBD_SAD_WXH_3D_LARGE_NEON(w, h) \
+ void aom_highbd_sad##w##x##h##x3d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx3d_large_neon(src, src_stride, ref_array, ref_stride, \
+ sad_array, (h)); \
+ }
+
+HBD_SAD_WXH_3D_SMALL_NEON(4, 4)
+HBD_SAD_WXH_3D_SMALL_NEON(4, 8)
+
+HBD_SAD_WXH_3D_SMALL_NEON(8, 4)
+HBD_SAD_WXH_3D_SMALL_NEON(8, 8)
+HBD_SAD_WXH_3D_SMALL_NEON(8, 16)
+
+HBD_SAD_WXH_3D_LARGE_NEON(16, 8)
+HBD_SAD_WXH_3D_LARGE_NEON(16, 16)
+HBD_SAD_WXH_3D_LARGE_NEON(16, 32)
+
+HBD_SAD_WXH_3D_LARGE_NEON(32, 16)
+HBD_SAD_WXH_3D_LARGE_NEON(32, 32)
+HBD_SAD_WXH_3D_LARGE_NEON(32, 64)
+
+HBD_SAD_WXH_3D_LARGE_NEON(64, 32)
+HBD_SAD_WXH_3D_LARGE_NEON(64, 64)
+HBD_SAD_WXH_3D_LARGE_NEON(64, 128)
+
+HBD_SAD_WXH_3D_LARGE_NEON(128, 64)
+HBD_SAD_WXH_3D_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_WXH_3D_SMALL_NEON(4, 16)
+
+HBD_SAD_WXH_3D_LARGE_NEON(8, 32)
+
+HBD_SAD_WXH_3D_LARGE_NEON(16, 4)
+HBD_SAD_WXH_3D_LARGE_NEON(16, 64)
+
+HBD_SAD_WXH_3D_LARGE_NEON(32, 8)
+
+HBD_SAD_WXH_3D_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_sse_neon.c b/aom_dsp/arm/highbd_sse_neon.c
new file mode 100644
index 000000000..184e9f9be
--- /dev/null
+++ b/aom_dsp/arm/highbd_sse_neon.c
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/arm/sum_neon.h"
+
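+// The *_init variant below starts a fresh pair of accumulators with vmull,
+// while the plain variant accumulates into existing ones with vmlal; peeling
+// the first row this way avoids zero-initializing the accumulators.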
+static INLINE void highbd_sse_8x1_init_neon(const uint16_t *src,
+ const uint16_t *ref,
+ uint32x4_t *sse_acc0,
+ uint32x4_t *sse_acc1) {
+ uint16x8_t s = vld1q_u16(src);
+ uint16x8_t r = vld1q_u16(ref);
+
+ uint16x8_t abs_diff = vabdq_u16(s, r);
+ uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+ uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+ *sse_acc0 = vmull_u16(abs_diff_lo, abs_diff_lo);
+ *sse_acc1 = vmull_u16(abs_diff_hi, abs_diff_hi);
+}
+
+static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref,
+ uint32x4_t *sse_acc0,
+ uint32x4_t *sse_acc1) {
+ uint16x8_t s = vld1q_u16(src);
+ uint16x8_t r = vld1q_u16(ref);
+
+ uint16x8_t abs_diff = vabdq_u16(s, r);
+ uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+ uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+ *sse_acc0 = vmlal_u16(*sse_acc0, abs_diff_lo, abs_diff_lo);
+ *sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi);
+}
+
+static INLINE int64_t highbd_sse_128xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[16];
+ highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_init_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]);
+ highbd_sse_8x1_init_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]);
+ highbd_sse_8x1_init_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]);
+ highbd_sse_8x1_init_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]);
+ highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]);
+ highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]);
+ highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]);
+ highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]);
+ highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]);
+ highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]);
+ highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]);
+ highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]);
+ highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]);
+ highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]);
+ highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4_x16(sse);
+}
+
+static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[8];
+ highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4_x8(sse);
+}
+
+static INLINE int64_t highbd_sse_32xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[8];
+ highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4_x8(sse);
+}
+
+static INLINE int64_t highbd_sse_16xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[4];
+ highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4_x4(sse);
+}
+
+static INLINE int64_t highbd_sse_8xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2];
+ highbd_sse_8x1_init_neon(src, ref, &sse[0], &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ highbd_sse_8x1_neon(src, ref, &sse[0], &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4_x2(sse);
+}
+
+static INLINE int64_t highbd_sse_4xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ // Peel the first loop iteration.
+ uint16x4_t s = vld1_u16(src);
+ uint16x4_t r = vld1_u16(ref);
+
+ uint16x4_t abs_diff = vabd_u16(s, r);
+ uint32x4_t sse = vmull_u16(abs_diff, abs_diff);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ s = vld1_u16(src);
+ r = vld1_u16(ref);
+
+ abs_diff = vabd_u16(s, r);
+ sse = vmlal_u16(sse, abs_diff, abs_diff);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4(sse);
+}
+
+static INLINE int64_t highbd_sse_wxh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int width, int height) {
+ // { 0, 1, 2, 3, 4, 5, 6, 7 }
+ uint16x8_t k01234567 = vmovl_u8(vcreate_u8(0x0706050403020100));
+ uint16x8_t remainder_mask = vcltq_u16(k01234567, vdupq_n_u16(width & 7));
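+  // remainder_mask keeps only the in-range lanes of the final, partial vector
+  // in each row; e.g. for width == 12 the last inner-loop iteration has
+  // w == 4, so only lanes 0-3 contribute.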
+ uint64_t sse = 0;
+
+ do {
+ int w = width;
+ int offset = 0;
+
+ do {
+ uint16x8_t s = vld1q_u16(src + offset);
+ uint16x8_t r = vld1q_u16(ref + offset);
+
+ if (w < 8) {
+ // Mask out-of-range elements.
+ s = vandq_u16(s, remainder_mask);
+ r = vandq_u16(r, remainder_mask);
+ }
+
+ uint16x8_t abs_diff = vabdq_u16(s, r);
+ uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+ uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+ uint32x4_t sse_u32 = vmull_u16(abs_diff_lo, abs_diff_lo);
+ sse_u32 = vmlal_u16(sse_u32, abs_diff_hi, abs_diff_hi);
+
+ sse += horizontal_long_add_u32x4(sse_u32);
+
+ offset += 8;
+ w -= 8;
+ } while (w > 0);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--height != 0);
+
+ return sse;
+}
+
+int64_t aom_highbd_sse_neon(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride, int width,
+ int height) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+ switch (width) {
+ case 4:
+ return highbd_sse_4xh_neon(src, src_stride, ref, ref_stride, height);
+ case 8:
+ return highbd_sse_8xh_neon(src, src_stride, ref, ref_stride, height);
+ case 16:
+ return highbd_sse_16xh_neon(src, src_stride, ref, ref_stride, height);
+ case 32:
+ return highbd_sse_32xh_neon(src, src_stride, ref, ref_stride, height);
+ case 64:
+ return highbd_sse_64xh_neon(src, src_stride, ref, ref_stride, height);
+ case 128:
+ return highbd_sse_128xh_neon(src, src_stride, ref, ref_stride, height);
+ default:
+ return highbd_sse_wxh_neon(src, src_stride, ref, ref_stride, width,
+ height);
+ }
+}
diff --git a/aom_dsp/arm/highbd_subpel_variance_neon.c b/aom_dsp/arm/highbd_subpel_variance_neon.c
new file mode 100644
index 000000000..bdbbf7058
--- /dev/null
+++ b/aom_dsp/arm/highbd_subpel_variance_neon.c
@@ -0,0 +1,1497 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/variance.h"
+
+// The bilinear filters look like this:
+//
+// {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+// { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }}
+//
+// We can factor out the highest common multiple, such that the sum of both
+// weights will be 8 instead of 128. The benefits of this are two-fold:
+//
+// 1) We can infer the filter values from the filter_offset parameter in the
+// bilinear filter functions below - we don't have to actually load the values
+// from memory:
+// f0 = 8 - filter_offset
+// f1 = filter_offset
+//
+// 2) Scaling the pixel values by 8 instead of 128 enables us to operate on
+// 16-bit data types at all times, rather than widening out to 32-bit and
+// requiring double the number of data processing instructions. (12-bit * 8 =
+// 15-bit.)
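+//
+// For example, with filter_offset == 2 the filter taps are f0 = 6 and f1 = 2,
+// and each output pixel is computed as (6 * s0 + 2 * s1 + 4) >> 3.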
+
+// Process a block exactly 4 wide and any height.
+static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+ const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
+ uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
+
+ uint16x4_t blend = vmul_u16(s0, f0);
+ blend = vmla_u16(blend, s1, f1);
+ blend = vrshr_n_u16(blend, 3);
+
+ vst1_u16(dst_ptr, blend);
+
+ src_ptr += src_stride;
+ dst_ptr += 4;
+ } while (--i != 0);
+}
+
+// Process a block which is a multiple of 8 and any height.
+static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr,
+ uint16_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_width, int dst_height,
+ int filter_offset) {
+ const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+ const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+
+ uint16x8_t blend = vmulq_u16(s0, f0);
+ blend = vmlaq_u16(blend, s1, f1);
+ blend = vrshrq_n_u16(blend, 3);
+
+ vst1q_u16(dst_ptr + j, blend);
+
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 8, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 16, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 32, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 64, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w128(const uint16_t *src_ptr,
+ uint16_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 128, dst_height, filter_offset);
+}
+
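+// When filter_offset == 4 (the half-pel case) both taps equal 4, so the
+// bilinear filter (4 * s0 + 4 * s1 + 4) >> 3 reduces to the rounding average
+// (s0 + s1 + 1) >> 1, i.e. a single vrhaddq_u16.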
+static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_width,
+ int dst_height) {
+ int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t avg = vrhaddq_u16(s0, s1);
+ vst1q_u16(dst_ptr + j, avg);
+
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
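+// The generic path filters horizontally into tmp0 and then vertically into
+// tmp1. The first pass produces h + 1 rows because the vertical pass reads
+// one row beyond the block for its second filter tap.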
+#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ \
+ return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
+ w, ref, ref_stride, sse); \
+ }
+
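+// For larger blocks we specialize on the filter offsets: an offset of 0 needs
+// no filtering, an offset of 4 reduces to the rounding average above, and
+// only the remaining offsets take the general bilinear path.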
+#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse) { \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
+ h); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \
+ src_stride, h, yoffset); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \
+ xoffset); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// Combine bilinear filter with aom_highbd_comp_avg_pred for blocks having
+// width 4.
+static void highbd_avg_pred_var_filter_block2d_bil_w4(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+ const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
+ uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
+ uint16x4_t p = vld1_u16(second_pred);
+
+ uint16x4_t blend = vmul_u16(s0, f0);
+ blend = vmla_u16(blend, s1, f1);
+ blend = vrshr_n_u16(blend, 3);
+
+ vst1_u16(dst_ptr, vrhadd_u16(blend, p));
+
+ src_ptr += src_stride;
+ dst_ptr += 4;
+ second_pred += 4;
+ } while (--i != 0);
+}
+
+// Combine bilinear filter with aom_highbd_comp_avg_pred for large blocks.
+static void highbd_avg_pred_var_filter_block2d_bil_large(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, int filter_offset,
+ const uint16_t *second_pred) {
+ const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+ const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t p = vld1q_u16(second_pred);
+
+ uint16x8_t blend = vmulq_u16(s0, f0);
+ blend = vmlaq_u16(blend, s1, f1);
+ blend = vrshrq_n_u16(blend, 3);
+
+ vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p));
+
+ j += 8;
+ second_pred += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w8(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 8, dst_height,
+ filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w16(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 16, dst_height,
+ filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w32(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 32, dst_height,
+ filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w64(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 64, dst_height,
+ filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w128(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 128, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine averaging subpel filter with aom_highbd_comp_avg_pred.
+static void highbd_avg_pred_var_filter_block2d_avg(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, const uint16_t *second_pred) {
+ int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t avg = vrhaddq_u16(s0, s1);
+
+ uint16x8_t p = vld1q_u16(second_pred);
+ avg = vrhaddq_u16(avg, p);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ j += 8;
+ second_pred += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
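+
+// Note: the "avg" variant corresponds to a filter_offset of 4 (half-pel),
+// where the 3-bit bilinear blend (s0 * 4 + s1 * 4 + 4) >> 3 reduces to the
+// rounding average (s0 + s1 + 1) >> 1, i.e. a single vrhaddq_u16.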
+
+// Implementation of aom_highbd_comp_avg_pred for blocks having width >= 16.
+static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
+ int src_stride, int dst_width, int dst_height,
+ const uint16_t *second_pred) {
+ int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr + j);
+ uint16x8_t p = vld1q_u16(second_pred);
+
+ uint16x8_t avg = vrhaddq_u16(s, p);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ j += 8;
+ second_pred += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
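+// Each generated function runs the 3-bit bilinear filter twice: a horizontal
+// pass producing (h + 1) rows in tmp0, then a vertical pass fused with the
+// compound average into tmp1, before taking the variance against the
+// reference. The extra row in tmp0 feeds the vertical filter taps.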
+#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ uint32_t aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ }
+
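+// Specialized variant for large blocks: offsets of 0 (full-pel) and 4
+// (half-pel) skip the general bilinear filter in favor of no filtering or a
+// two-tap rounding average, so only misaligned offsets pay for the full
+// multiply-accumulate path.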
+#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ if (xoffset == 0) { \
+ uint16_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ src_ptr, tmp, source_stride, source_stride, w, h, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } else { \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ src_ptr, tmp, source_stride, source_stride, h, yoffset, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ src_ptr, tmp0, source_stride, 1, w, h, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
+ (h + 1)); \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
+ (h + 1)); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ src_ptr, tmp0, source_stride, 1, h, xoffset, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
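+// Masked sub-pixel variance: bilinear-filter the source as above, blend the
+// result with second_pred under the per-pixel mask using
+// aom_highbd_comp_mask_pred_neon, then take the variance of the blend
+// against the reference.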
+#define HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * (h + 1)]; \
+ uint16_t tmp2[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, w, \
+ h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ }
+
+#define HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ if (xoffset == 0) { \
+ uint16_t tmp0[w * h]; \
+ if (yoffset == 0) { \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp0), second_pred, \
+ w, h, src, src_stride, msk, msk_stride, \
+ invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, src_stride, \
+ w, h); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, \
+ src_stride, h, yoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ uint16_t tmp2[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ uint16_t tmp2[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ if (yoffset == 0) { \
+ uint16_t tmp0[w * h]; \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \
+ xoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t tmp2[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * (h + 1)]; \
+ uint16_t tmp2[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#if !CONFIG_REALTIME_ONLY
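+// OBMC sub-pixel variance: bilinear-filter the predictor, then compute the
+// overlapped-block variance against the weighted source (wsrc) under mask.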
+#define HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre); \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h + 1, \
+ xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ }
+
+#define SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre); \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ pre, pre_stride, wsrc, mask, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_avg(pre_ptr, tmp, pre_stride, pre_stride, w, \
+ h); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse); \
+ } else { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp, pre_stride, \
+ pre_stride, h, yoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h, \
+ xoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, \
+ h + 1, xoffset); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, \
+ h + 1, xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
+
+// 10-bit
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
+
+// 12-bit
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
+#endif // !CONFIG_REALTIME_ONLY
+
+static void highbd_dist_wtd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
+ int src_stride, int dst_width,
+ int dst_height,
+ const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // We only specialize on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+ const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
+ const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr + j);
+ uint16x8_t p = vld1q_u16(second_pred);
+
+ uint16x8_t avg = dist_wtd_avg_u16x8(s, p, fwd_offset, bck_offset);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ second_pred += 8;
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
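+
+// dist_wtd_avg_u16x8/_u16x4 (dist_wtd_avg_neon.h) form the weighted compound
+// average; conceptually, per pixel:
+//   avg = (s * fwd_offset + p * bck_offset + rnd) >> DIST_PRECISION_BITS
+// with round-to-nearest, matching aom_highbd_dist_wtd_comp_avg_pred.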
+
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_avg(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // We only specialize on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+ const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
+ const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t p = vld1q_u16(second_pred);
+ uint16x8_t avg = vrhaddq_u16(s0, s1);
+ avg = dist_wtd_avg_u16x8(avg, p, fwd_offset, bck_offset);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ second_pred += 8;
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w4(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint16x4_t fwd_offset = vdup_n_u16(jcp_param->fwd_offset);
+ const uint16x4_t bck_offset = vdup_n_u16(jcp_param->bck_offset);
+ const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+ const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
+ uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
+ uint16x4_t p = vld1_u16(second_pred);
+
+ uint16x4_t blend = vmul_u16(s0, f0);
+ blend = vmla_u16(blend, s1, f1);
+ blend = vrshr_n_u16(blend, 3);
+
+ uint16x4_t avg = dist_wtd_avg_u16x4(blend, p, fwd_offset, bck_offset);
+
+ vst1_u16(dst_ptr, avg);
+
+ src_ptr += src_stride;
+ dst_ptr += 4;
+ second_pred += 4;
+ } while (--i != 0);
+}
+
+// Combine bilinear filter with aom_highbd_dist_wtd_comp_avg_pred for large
+// blocks.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, int filter_offset,
+ const uint16_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
+ const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
+ const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+ const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t p = vld1q_u16(second_pred);
+
+ uint16x8_t blend = vmulq_u16(s0, f0);
+ blend = vmlaq_u16(blend, s1, f1);
+ blend = vrshrq_n_u16(blend, 3);
+
+ uint16x8_t avg = dist_wtd_avg_u16x8(blend, p, fwd_offset, bck_offset);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ second_pred += 8;
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
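+// Combine bilinear filter with aom_highbd_dist_wtd_comp_avg_pred for blocks
+// having width 8.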
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w8(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 8, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_highbd_dist_wtd_comp_avg_pred for blocks
+// having width 16.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w16(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 16, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_highbd_dist_wtd_comp_avg_pred for blocks
+// having width 32.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w32(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 32, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_highbd_dist_wtd_comp_avg_pred for blocks
+// having width 64.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w64(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 64, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_highbd_dist_wtd_comp_avg_pred for blocks
+// having width 128.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w128(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 128, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
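+// As in the plain compound variants above, the generated functions fuse the
+// second (vertical) filter pass with the distance-weighted average to avoid
+// an extra pass over memory.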
+#define HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *second = CONVERT_TO_SHORTPTR(second_pred); \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
+ xoffset); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ }
+
+#define SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *second = CONVERT_TO_SHORTPTR(second_pred); \
+ if (xoffset == 0) { \
+ uint16_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ highbd_dist_wtd_avg_pred(src, tmp, source_stride, w, h, second, \
+ jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_avg( \
+ src, tmp, source_stride, source_stride, w, h, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse); \
+ } else { \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp, source_stride, source_stride, h, yoffset, second, \
+ jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_avg( \
+ src, tmp0, source_stride, 1, w, h, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref_ptr, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h + 1); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, \
+ h, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h + 1); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp0, source_stride, 1, h, xoffset, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref_ptr, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
+ xoffset); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, \
+ h, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
+ xoffset); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_variance_neon.c b/aom_dsp/arm/highbd_variance_neon.c
index 948f2f788..e54fc1805 100644
--- a/aom_dsp/arm/highbd_variance_neon.c
+++ b/aom_dsp/arm/highbd_variance_neon.c
@@ -15,10 +15,10 @@
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
-#include "aom_dsp/variance.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/variance.h"
// Process a block of width 4 two rows at a time.
static INLINE void highbd_variance_4xh_neon(const uint16_t *src_ptr,
@@ -412,67 +412,6 @@ static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr,
return *sse;
}
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr,
- int src_stride,
- const uint16_t *ref_ptr,
- int ref_stride, int h,
- unsigned int *sse) {
- uint32x4_t sse_u32 = vdupq_n_u32(0);
-
- int i = h / 2;
- do {
- uint16x8_t s0 = vld1q_u16(src_ptr);
- src_ptr += src_stride;
- uint16x8_t s1 = vld1q_u16(src_ptr);
- src_ptr += src_stride;
- uint16x8_t r0 = vld1q_u16(ref_ptr);
- ref_ptr += ref_stride;
- uint16x8_t r1 = vld1q_u16(ref_ptr);
- ref_ptr += ref_stride;
-
- uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
- uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
-
- uint8x16_t diff = vabdq_u8(s, r);
- sse_u32 = vdotq_u32(sse_u32, diff, diff);
- } while (--i != 0);
-
- *sse = horizontal_add_u32x4(sse_u32);
- return *sse;
-}
-
-static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr,
- int src_stride,
- const uint16_t *ref_ptr,
- int ref_stride, int h,
- unsigned int *sse) {
- uint32x4_t sse_u32 = vdupq_n_u32(0);
-
- int i = h;
- do {
- uint16x8_t s0 = vld1q_u16(src_ptr);
- uint16x8_t s1 = vld1q_u16(src_ptr + 8);
- uint16x8_t r0 = vld1q_u16(ref_ptr);
- uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
-
- uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
- uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
-
- uint8x16_t diff = vabdq_u8(s, r);
- sse_u32 = vdotq_u32(sse_u32, diff, diff);
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- } while (--i != 0);
-
- *sse = horizontal_add_u32x4(sse_u32);
- return *sse;
-}
-
-#else // !defined(__ARM_FEATURE_DOTPROD)
-
static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr,
int src_stride,
const uint16_t *ref_ptr,
@@ -491,8 +430,6 @@ static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr,
sse);
}
-#endif // defined(__ARM_FEATURE_DOTPROD)
-
#define HIGHBD_MSE_WXH_NEON(w, h) \
uint32_t aom_highbd_8_mse##w##x##h##_neon( \
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
@@ -529,3 +466,55 @@ HIGHBD_MSE_WXH_NEON(8, 16)
HIGHBD_MSE_WXH_NEON(8, 8)
#undef HIGHBD_MSE_WXH_NEON
+
+static INLINE uint64x2_t mse_accumulate_u16_8x2(uint64x2_t sum, uint16x8_t s0,
+ uint16x8_t s1, uint16x8_t d0,
+ uint16x8_t d1) {
+ uint16x8_t e0 = vabdq_u16(s0, d0);
+ uint16x8_t e1 = vabdq_u16(s1, d1);
+
+ uint32x4_t mse = vmull_u16(vget_low_u16(e0), vget_low_u16(e0));
+ mse = vmlal_u16(mse, vget_high_u16(e0), vget_high_u16(e0));
+ mse = vmlal_u16(mse, vget_low_u16(e1), vget_low_u16(e1));
+ mse = vmlal_u16(mse, vget_high_u16(e1), vget_high_u16(e1));
+
+ return vpadalq_u32(sum, mse);
+}
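+
+// Overflow note: absolute differences fit in 16 bits, their squares are
+// accumulated in 32 bits (at most four 24-bit squares per lane here), and
+// vpadalq_u32 folds the total into 64-bit lanes so tall blocks cannot
+// overflow.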
+
+uint64_t aom_mse_wxh_16bit_highbd_neon(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int w,
+ int h) {
+ assert((w == 8 || w == 4) && (h == 8 || h == 4));
+
+ uint64x2_t sum = vdupq_n_u64(0);
+
+ if (w == 8) {
+ do {
+ uint16x8_t d0 = vld1q_u16(dst + 0 * dstride);
+ uint16x8_t d1 = vld1q_u16(dst + 1 * dstride);
+ uint16x8_t s0 = vld1q_u16(src + 0 * sstride);
+ uint16x8_t s1 = vld1q_u16(src + 1 * sstride);
+
+ sum = mse_accumulate_u16_8x2(sum, s0, s1, d0, d1);
+
+ dst += 2 * dstride;
+ src += 2 * sstride;
+ h -= 2;
+ } while (h != 0);
+ } else { // w == 4
+ do {
+ uint16x8_t d0 = load_unaligned_u16_4x2(dst + 0 * dstride, dstride);
+ uint16x8_t d1 = load_unaligned_u16_4x2(dst + 2 * dstride, dstride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src + 0 * sstride, sstride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src + 2 * sstride, sstride);
+
+ sum = mse_accumulate_u16_8x2(sum, s0, s1, d0, d1);
+
+ dst += 4 * dstride;
+ src += 4 * sstride;
+ h -= 4;
+ } while (h != 0);
+ }
+
+ return horizontal_add_u64x2(sum);
+}
diff --git a/aom_dsp/arm/highbd_variance_neon_dotprod.c b/aom_dsp/arm/highbd_variance_neon_dotprod.c
new file mode 100644
index 000000000..d56ae9757
--- /dev/null
+++ b/aom_dsp/arm/highbd_variance_neon_dotprod.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
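+// These aom_highbd_8_mse* kernels operate on 8-bit content carried in 16-bit
+// buffers, so vmovn_u16 narrows losslessly to u8 and vdotq_u32 (UDOT) can
+// accumulate four 4-element squared-difference dot products per instruction.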
+static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h / 2;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+ src_ptr += src_stride;
+ uint16x8_t s1 = vld1q_u16(src_ptr);
+ src_ptr += src_stride;
+ uint16x8_t r0 = vld1q_u16(ref_ptr);
+ ref_ptr += ref_stride;
+ uint16x8_t r1 = vld1q_u16(ref_ptr);
+ ref_ptr += ref_stride;
+
+ uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+ uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+ uint8x16_t diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, diff, diff);
+ } while (--i != 0);
+
+ *sse = horizontal_add_u32x4(sse_u32);
+ return *sse;
+}
+
+static INLINE uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+ uint16x8_t s1 = vld1q_u16(src_ptr + 8);
+ uint16x8_t r0 = vld1q_u16(ref_ptr);
+ uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
+
+ uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+ uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+ uint8x16_t diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, diff, diff);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sse = horizontal_add_u32x4(sse_u32);
+ return *sse;
+}
+
+#define HIGHBD_MSE_WXH_NEON_DOTPROD(w, h) \
+ uint32_t aom_highbd_8_mse##w##x##h##_neon_dotprod( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_mse8_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, \
+ sse); \
+ return *sse; \
+ }
+
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 8)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 8)
+
+#undef HIGHBD_MSE_WXH_NEON_DOTPROD
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index 21613789b..41f070e1d 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -24,7 +24,7 @@
// DC 4x4
static INLINE uint16x8_t dc_load_sum_4(const uint8_t *in) {
- const uint8x8_t a = load_u8_4x1_lane0(in);
+ const uint8x8_t a = load_u8_4x1(in);
const uint16x4_t p0 = vpaddl_u8(a);
const uint16x4_t p1 = vpadd_u16(p0, p0);
return vcombine_u16(p1, vdup_n_u16(0));
@@ -354,7 +354,7 @@ static INLINE int calculate_dc_from_sum(int bw, int bh, uint32_t sum,
void aom_dc_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- uint8x8_t a = load_u8_4x1_lane0(above);
+ uint8x8_t a = load_u8_4x1(above);
uint8x8_t l = vld1_u8(left);
uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
uint32_t dc = calculate_dc_from_sum(4, 8, sum, 2, DC_MULTIPLIER_1X2);
@@ -364,7 +364,7 @@ void aom_dc_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
void aom_dc_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
uint8x8_t a = vld1_u8(above);
- uint8x8_t l = load_u8_4x1_lane0(left);
+ uint8x8_t l = load_u8_4x1(left);
uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
uint32_t dc = calculate_dc_from_sum(8, 4, sum, 2, DC_MULTIPLIER_1X2);
dc_store_8xh(dst, stride, 4, vdup_n_u8(dc));
@@ -372,7 +372,7 @@ void aom_dc_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
void aom_dc_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- uint8x8_t a = load_u8_4x1_lane0(above);
+ uint8x8_t a = load_u8_4x1(above);
uint8x16_t l = vld1q_u8(left);
uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
uint32_t sum = horizontal_add_u16x8(sum_al);
@@ -383,7 +383,7 @@ void aom_dc_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
void aom_dc_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
uint8x16_t a = vld1q_u8(above);
- uint8x8_t l = load_u8_4x1_lane0(left);
+ uint8x8_t l = load_u8_4x1(left);
uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
uint32_t sum = horizontal_add_u16x8(sum_al);
uint32_t dc = calculate_dc_from_sum(16, 4, sum, 2, DC_MULTIPLIER_1X4);
@@ -620,7 +620,7 @@ static INLINE void v_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)left;
- v_store_4xh(dst, stride, 4, load_u8_4x1_lane0(above));
+ v_store_4xh(dst, stride, 4, load_u8_4x1(above));
}
void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
@@ -646,13 +646,13 @@ void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
void aom_v_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)left;
- v_store_4xh(dst, stride, 8, load_u8_4x1_lane0(above));
+ v_store_4xh(dst, stride, 8, load_u8_4x1(above));
}
void aom_v_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)left;
- v_store_4xh(dst, stride, 16, load_u8_4x1_lane0(above));
+ v_store_4xh(dst, stride, 16, load_u8_4x1(above));
}
void aom_v_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
@@ -856,7 +856,7 @@ static INLINE void h_store_64x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- const uint8x8_t d0 = load_u8_4x1_lane0(left);
+ const uint8x8_t d0 = load_u8_4x1(left);
(void)above;
store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0), 0);
store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1), 0);
@@ -907,7 +907,7 @@ void aom_h_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
void aom_h_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- const uint8x8_t d0 = load_u8_4x1_lane0(left);
+ const uint8x8_t d0 = load_u8_4x1(left);
(void)above;
vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
@@ -936,7 +936,7 @@ void aom_h_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
void aom_h_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- const uint8x8_t d0 = load_u8_4x1_lane0(left);
+ const uint8x8_t d0 = load_u8_4x1(left);
(void)above;
vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
@@ -1594,8 +1594,10 @@ static void dr_prediction_z2_Nx4_neon(int N, uint8_t *dst, ptrdiff_t stride,
base_y_c64 = vbic_s16(base_y_c64, vreinterpret_s16_u16(mask64));
#if AOM_ARCH_AARCH64
- uint8x8_t left_idx0 = vreinterpret_u8_s16(base_y_c64 + 2); // [0, 16]
- uint8x8_t left_idx1 = vreinterpret_u8_s16(base_y_c64 + 3); // [1, 17]
+ uint8x8_t left_idx0 =
+ vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(2))); // [0, 16]
+ uint8x8_t left_idx1 =
+ vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(3))); // [1, 17]
uint8x8_t a0_y = vtrn1_u8(vqtbl2_u8(left_vals, left_idx0), v_zero_u8);
uint8x8_t a1_y = vtrn1_u8(vqtbl2_u8(left_vals, left_idx1), v_zero_u8);
@@ -1777,8 +1779,10 @@ static void dr_prediction_z2_Nx8_neon(int N, uint8_t *dst, ptrdiff_t stride,
base_y_c128 = vbicq_s16(base_y_c128, vreinterpretq_s16_u16(mask128));
#if AOM_ARCH_AARCH64
- uint8x16_t left_idx0 = vreinterpretq_u8_s16(base_y_c128 + 2); // [0, 33]
- uint8x16_t left_idx1 = vreinterpretq_u8_s16(base_y_c128 + 3); // [1, 34]
+ uint8x16_t left_idx0 = vreinterpretq_u8_s16(
+ vaddq_s16(base_y_c128, vdupq_n_s16(2))); // [0, 33]
+ uint8x16_t left_idx1 = vreinterpretq_u8_s16(
+ vaddq_s16(base_y_c128, vdupq_n_s16(3))); // [1, 34]
uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
uint8x16_t a01_x = vqtbl3q_u8(left_vals, left_idx01);
@@ -2025,8 +2029,10 @@ static void dr_prediction_z2_HxW_neon(int H, int W, uint8_t *dst,
#if AOM_ARCH_AARCH64
// Values in left_idx{0,1} range from 0 through 63 inclusive.
- uint8x16_t left_idx0 = vreinterpretq_u8_s16(base_y_c256.val[0] + 1);
- uint8x16_t left_idx1 = vreinterpretq_u8_s16(base_y_c256.val[1] + 1);
+ uint8x16_t left_idx0 = vreinterpretq_u8_s16(
+ vaddq_s16(base_y_c256.val[0], vdupq_n_s16(1)));
+ uint8x16_t left_idx1 = vreinterpretq_u8_s16(
+ vaddq_s16(base_y_c256.val[1], vdupq_n_s16(1)));
uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
@@ -3168,12 +3174,10 @@ static void smooth_4xh_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t bottom_left = left_column[height - 1];
const uint8_t *const weights_y = smooth_weights + height - 4;
- uint8x8_t UNINITIALIZED_IS_SAFE(top_v);
- load_u8_4x1(top_row, &top_v, 0);
+ uint8x8_t top_v = load_u8_4x1(top_row);
const uint8x8_t top_right_v = vdup_n_u8(top_right);
const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
- uint8x8_t UNINITIALIZED_IS_SAFE(weights_x_v);
- load_u8_4x1(smooth_weights, &weights_x_v, 0);
+ uint8x8_t weights_x_v = load_u8_4x1(smooth_weights);
const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
@@ -3403,9 +3407,9 @@ SMOOTH_NXM_WIDE(64, 64)
const uint8_t bottom_left = left_column[height - 1]; \
const uint8_t *const weights_y = smooth_weights + height - 4; \
\
- uint8x8_t UNINITIALIZED_IS_SAFE(top_v); \
+ uint8x8_t top_v; \
if ((W) == 4) { \
- load_u8_4x1(top_row, &top_v, 0); \
+ top_v = load_u8_4x1(top_row); \
} else { /* width == 8 */ \
top_v = vld1_u8(top_row); \
} \
@@ -3717,9 +3721,9 @@ static INLINE void paeth_4or8_x_h_neon(uint8_t *dest, ptrdiff_t stride,
int width, int height) {
const uint8x8_t top_left = vdup_n_u8(top_row[-1]);
const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
- uint8x8_t UNINITIALIZED_IS_SAFE(top);
+ uint8x8_t top;
if (width == 4) {
- load_u8_4x1(top_row, &top, 0);
+ top = load_u8_4x1(top_row);
} else { // width == 8
top = vld1_u8(top_row);
}
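The edits above all follow one pattern: the old load_u8_4x1 macro lane-inserted into an existing vector, so callers had to pre-declare the destination and wrap it in UNINITIALIZED_IS_SAFE to silence uninitialized-variable warnings; the replacement helper builds and returns the vector in one expression with the upper half zeroed. A sketch of the new shape (function name hypothetical):

#include <arm_neon.h>
#include "aom_dsp/arm/mem_neon.h"

// Lanes 0..3 = row[0..3], lanes 4..7 = 0; no separate declaration or
// UNINITIALIZED_IS_SAFE annotation needed.
static inline uint8x8_t first_four_pixels(const uint8_t *row) {
  return load_u8_4x1(row);
}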
diff --git a/aom_dsp/arm/loopfilter_neon.c b/aom_dsp/arm/loopfilter_neon.c
index 8fc7ccb10..0e683a784 100644
--- a/aom_dsp/arm/loopfilter_neon.c
+++ b/aom_dsp/arm/loopfilter_neon.c
@@ -634,13 +634,13 @@ void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
p6p2 = vget_low_u8(row1);
p5p1 = vget_low_u8(row2);
p4p0 = vget_low_u8(row3);
- transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
+ transpose_elems_inplace_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
q0q4 = vget_high_u8(row0);
q1q5 = vget_high_u8(row1);
q2q6 = vget_high_u8(row2);
q3qy = vget_high_u8(row3);
- transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
+ transpose_elems_inplace_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
pq_rev = vrev64_u32(vreinterpret_u32_u8(q3qy));
pxqx_p3q3 = vtrn_u32(vreinterpret_u32_u8(pxp3), pq_rev);
@@ -679,13 +679,13 @@ void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
q1q5 = vreinterpret_u8_u32(p5q5_p1q1.val[1]);
q2q6 = vreinterpret_u8_u32(p6q6_p2q2.val[1]);
q3qy = vreinterpret_u8_u32(pxqx_p3q3.val[1]);
- transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
+ transpose_elems_inplace_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
pxp3 = vreinterpret_u8_u32(pxqx_p3q3.val[0]);
p6p2 = vreinterpret_u8_u32(p6q6_p2q2.val[0]);
p5p1 = vreinterpret_u8_u32(p5q5_p1q1.val[0]);
p4p0 = vreinterpret_u8_u32(p4q4_p0q0.val[0]);
- transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
+ transpose_elems_inplace_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
row0 = vcombine_u8(pxp3, q0q4);
row1 = vcombine_u8(p6p2, q1q5);
@@ -725,7 +725,7 @@ void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
// row3: p3 p2 p1 p0 | q0 q1 q2 q3
load_u8_8x4(src - 4, stride, &p3q0, &p2q1, &p1q2, &p0q3);
- transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+ transpose_elems_inplace_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q3));
p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q0), pq_rev);
@@ -750,7 +750,7 @@ void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
p3q0 = vreinterpret_u8_u32(p3q3_p0q0.val[0]);
- transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+ transpose_elems_inplace_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3);
}
@@ -784,7 +784,7 @@ void aom_lpf_vertical_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
// row3: px p2 p1 p0 | q0 q1 q2 qy
load_u8_8x4(src - 4, stride, &pxq0, &p2q1, &p1q2, &p0qy);
- transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
+ transpose_elems_inplace_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
pq_rev = vrev64_u32(vreinterpret_u32_u8(p0qy));
pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxq0), pq_rev);
@@ -809,7 +809,7 @@ void aom_lpf_vertical_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
pxq0 = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
- transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
+ transpose_elems_inplace_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy);
}
@@ -834,7 +834,7 @@ void aom_lpf_vertical_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
uint32x2x2_t p1q0_p0q1, p1q1_p0q0, p1p0_q1q0;
uint32x2_t pq_rev;
- uint8x8_t UNINITIALIZED_IS_SAFE(p1p0), UNINITIALIZED_IS_SAFE(q0q1);
+ uint8x8_t p1p0, q0q1;
uint8x8_t p0q0, p1q1;
// row0: p1 p0 | q0 q1
@@ -843,7 +843,7 @@ void aom_lpf_vertical_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
// row3: p1 p0 | q0 q1
load_unaligned_u8_4x4(src - 2, stride, &p1p0, &q0q1);
- transpose_u8_4x4(&p1p0, &q0q1);
+ transpose_elems_inplace_u8_4x4(&p1p0, &q0q1);
p1q0_p0q1 = vtrn_u32(vreinterpret_u32_u8(p1p0), vreinterpret_u32_u8(q0q1));
@@ -860,7 +860,7 @@ void aom_lpf_vertical_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
p1p0 = vreinterpret_u8_u32(p1p0_q1q0.val[0]);
q0q1 = vreinterpret_u8_u32(vrev64_u32(p1p0_q1q0.val[1]));
- transpose_u8_4x4(&p1p0, &q0q1);
+ transpose_elems_inplace_u8_4x4(&p1p0, &q0q1);
store_unaligned_u8_4x1(src - 2, p1p0, 0);
store_unaligned_u8_4x1((src - 2) + 1 * stride, q0q1, 0);
@@ -886,25 +886,13 @@ void aom_lpf_vertical_4_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit,
void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
- uint8x8_t UNINITIALIZED_IS_SAFE(p0q0), UNINITIALIZED_IS_SAFE(p1q1),
- UNINITIALIZED_IS_SAFE(p2q2), UNINITIALIZED_IS_SAFE(p3q3),
- UNINITIALIZED_IS_SAFE(p4q4), UNINITIALIZED_IS_SAFE(p5q5),
- UNINITIALIZED_IS_SAFE(p6q6);
-
- load_u8_4x1(src - 7 * stride, &p6q6, 0);
- load_u8_4x1(src - 6 * stride, &p5q5, 0);
- load_u8_4x1(src - 5 * stride, &p4q4, 0);
- load_u8_4x1(src - 4 * stride, &p3q3, 0);
- load_u8_4x1(src - 3 * stride, &p2q2, 0);
- load_u8_4x1(src - 2 * stride, &p1q1, 0);
- load_u8_4x1(src - 1 * stride, &p0q0, 0);
- load_u8_4x1(src + 0 * stride, &p0q0, 1);
- load_u8_4x1(src + 1 * stride, &p1q1, 1);
- load_u8_4x1(src + 2 * stride, &p2q2, 1);
- load_u8_4x1(src + 3 * stride, &p3q3, 1);
- load_u8_4x1(src + 4 * stride, &p4q4, 1);
- load_u8_4x1(src + 5 * stride, &p5q5, 1);
- load_u8_4x1(src + 6 * stride, &p6q6, 1);
+ uint8x8_t p6q6 = load_u8_4x2(src - 7 * stride, 13 * stride);
+ uint8x8_t p5q5 = load_u8_4x2(src - 6 * stride, 11 * stride);
+ uint8x8_t p4q4 = load_u8_4x2(src - 5 * stride, 9 * stride);
+ uint8x8_t p3q3 = load_u8_4x2(src - 4 * stride, 7 * stride);
+ uint8x8_t p2q2 = load_u8_4x2(src - 3 * stride, 5 * stride);
+ uint8x8_t p1q1 = load_u8_4x2(src - 2 * stride, 3 * stride);
+ uint8x8_t p0q0 = load_u8_4x2(src - 1 * stride, 1 * stride);
lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit,
*thresh);
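The second argument of load_u8_4x2 is the offset from the first 4-byte group to the second, so pairing pN (at src - (N + 1) * stride) with qN (at src + N * stride) gives a gap of (2N + 1) * stride, which is where the 13, 11, 9, 7, 5, 3, 1 multipliers above come from. A sketch expressing that rule (helper name hypothetical):

#include <arm_neon.h>
#include "aom_dsp/arm/mem_neon.h"

// Load the pN row into the low half and the qN row into the high half.
static inline uint8x8_t load_pnqn(const uint8_t *src, int stride, int n) {
  return load_u8_4x2(src - (n + 1) * stride, (2 * n + 1) * stride);
}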
@@ -1036,12 +1024,8 @@ void aom_lpf_horizontal_6_quad_neon(uint8_t *s, int pitch,
void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
- uint8x8_t UNINITIALIZED_IS_SAFE(p0q0), UNINITIALIZED_IS_SAFE(p1q1);
-
- load_u8_4x1(src - 2 * stride, &p1q1, 0);
- load_u8_4x1(src - 1 * stride, &p0q0, 0);
- load_u8_4x1(src + 0 * stride, &p0q0, 1);
- load_u8_4x1(src + 1 * stride, &p1q1, 1);
+ uint8x8_t p1q1 = load_u8_4x2(src - 2 * stride, 3 * stride);
+ uint8x8_t p0q0 = load_u8_4x2(src - 1 * stride, 1 * stride);
lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh);
diff --git a/aom_dsp/arm/masked_sad4d_neon.c b/aom_dsp/arm/masked_sad4d_neon.c
index 98daeda66..8f65b805e 100644
--- a/aom_dsp/arm/masked_sad4d_neon.c
+++ b/aom_dsp/arm/masked_sad4d_neon.c
@@ -516,19 +516,18 @@ static INLINE void masked_sad4xhx4d_neon(const uint8_t *src, int src_stride,
vst1q_u32(res, horizontal_add_4d_u16x8(sum));
}
-#define MASKED_SAD4D_WXH_NEON(w, h) \
- void aom_masked_sad##w##x##h##x4d_neon( \
- const uint8_t *src, int src_stride, const uint8_t *ref[4], \
- int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
- int msk_stride, int invert_mask, uint32_t res[4]) { \
- if (invert_mask) { \
- return masked_inv_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, \
- second_pred, msk, msk_stride, res, \
- h); \
- } else { \
- return masked_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, \
- second_pred, msk, msk_stride, res, h); \
- } \
+#define MASKED_SAD4D_WXH_NEON(w, h) \
+ void aom_masked_sad##w##x##h##x4d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref[4], \
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
+ int msk_stride, int invert_mask, uint32_t res[4]) { \
+ if (invert_mask) { \
+ masked_inv_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, \
+ second_pred, msk, msk_stride, res, h); \
+ } else { \
+ masked_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, second_pred, \
+ msk, msk_stride, res, h); \
+ } \
}
MASKED_SAD4D_WXH_NEON(4, 8)
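The macro body above also drops the return keywords: masked_inv_sad##w##xhx4d_neon and masked_sad##w##xhx4d_neon return void, and in standard C a return statement with an expression may not appear in a function returning void (C17 6.8.6.4), even when the expression itself has void type. Plain calls are the conforming form; minimal illustration:

static void helper(void) {}

static void caller(void) {
  helper();  // not 'return helper();'
}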
diff --git a/aom_dsp/arm/masked_sad_neon.c b/aom_dsp/arm/masked_sad_neon.c
index 340df05e5..9d263105e 100644
--- a/aom_dsp/arm/masked_sad_neon.c
+++ b/aom_dsp/arm/masked_sad_neon.c
@@ -15,9 +15,10 @@
#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
#include "aom_dsp/blend.h"
-#include "mem_neon.h"
-#include "sum_neon.h"
static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
const uint8_t *src,
@@ -29,15 +30,7 @@ static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
uint8x16_t b0 = vld1q_u8(b);
uint8x16_t s0 = vld1q_u8(src);
- uint8x16_t m0_inv = vsubq_u8(vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0);
- uint16x8_t blend_u16_lo = vmull_u8(vget_low_u8(m0), vget_low_u8(a0));
- uint16x8_t blend_u16_hi = vmull_u8(vget_high_u8(m0), vget_high_u8(a0));
- blend_u16_lo = vmlal_u8(blend_u16_lo, vget_low_u8(m0_inv), vget_low_u8(b0));
- blend_u16_hi = vmlal_u8(blend_u16_hi, vget_high_u8(m0_inv), vget_high_u8(b0));
-
- uint8x8_t blend_u8_lo = vrshrn_n_u16(blend_u16_lo, AOM_BLEND_A64_ROUND_BITS);
- uint8x8_t blend_u8_hi = vrshrn_n_u16(blend_u16_hi, AOM_BLEND_A64_ROUND_BITS);
- uint8x16_t blend_u8 = vcombine_u8(blend_u8_lo, blend_u8_hi);
+ uint8x16_t blend_u8 = alpha_blend_a64_u8x16(m0, a0, b0);
return vpadalq_u8(sad, vabdq_u8(blend_u8, s0));
}
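alpha_blend_a64_u8x16 (and the u8x8 variant used below) factors out the open-coded blend that was here: a 6-bit alpha mix of the two predictors. A scalar model, using AOM_BLEND_A64_MAX_ALPHA == 64 and AOM_BLEND_A64_ROUND_BITS == 6 from aom_dsp/blend.h; the +32 is the rounding term that vrshrn_n_u16 applies:

#include <stdint.h>

// Per-pixel A64 blend: m in [0, 64] weights a against b.
static inline uint8_t blend_a64_scalar(uint8_t m, uint8_t a, uint8_t b) {
  return (uint8_t)((m * a + (64 - m) * b + 32) >> 6);
}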
@@ -164,10 +157,7 @@ static INLINE unsigned masked_sad_8xh_neon(const uint8_t *src, int src_stride,
uint8x8_t b0 = vld1_u8(b);
uint8x8_t s0 = vld1_u8(src);
- uint8x8_t m0_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0);
- uint16x8_t blend_u16 = vmull_u8(m0, a0);
- blend_u16 = vmlal_u8(blend_u16, m0_inv, b0);
- uint8x8_t blend_u8 = vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+ uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, a0, b0);
sad = vpadal_u8(sad, vabd_u8(blend_u8, s0));
@@ -199,10 +189,7 @@ static INLINE unsigned masked_sad_4xh_neon(const uint8_t *src, int src_stride,
uint8x8_t b0 = load_unaligned_u8(b, b_stride);
uint8x8_t s0 = load_unaligned_u8(src, src_stride);
- uint8x8_t m0_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0);
- uint16x8_t blend_u16 = vmull_u8(m0, a0);
- blend_u16 = vmlal_u8(blend_u16, m0_inv, b0);
- uint8x8_t blend_u8 = vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+ uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, a0, b0);
sad = vpadal_u8(sad, vabd_u8(blend_u8, s0));
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 16d44c527..d1ac648d1 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -43,6 +43,11 @@ static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
return res;
}
+static INLINE uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) {
+ uint16x8x2_t res = { { vld1q_u16(ptr + 0), vld1q_u16(ptr + 8) } };
+ return res;
+}
+
static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
@@ -85,18 +90,31 @@ static INLINE uint8x16_t load_u8_8x2(const uint8_t *s, ptrdiff_t p) {
return vcombine_u8(vld1_u8(s), vld1_u8(s + p));
}
-/* These intrinsics require immediate values, so we must use #defines
- to enforce that. */
-#define load_u8_4x1(s, s0, lane) \
- do { \
- *(s0) = vreinterpret_u8_u32( \
- vld1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(*(s0)), lane)); \
- } while (0)
-
// Load four bytes into the low half of a uint8x8_t, zero the upper half.
-static INLINE uint8x8_t load_u8_4x1_lane0(const uint8_t *p) {
+static INLINE uint8x8_t load_u8_4x1(const uint8_t *p) {
+ uint8x8_t ret = vdup_n_u8(0);
+ ret = vreinterpret_u8_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
+ return ret;
+}
+
+static INLINE uint8x8_t load_u8_4x2(const uint8_t *p, int stride) {
uint8x8_t ret = vdup_n_u8(0);
- load_u8_4x1(p, &ret, 0);
+ ret = vreinterpret_u8_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
+ p += stride;
+ ret = vreinterpret_u8_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 1));
+ return ret;
+}
+
+static INLINE uint16x4_t load_u16_2x2(const uint16_t *p, int stride) {
+ uint16x4_t ret = vdup_n_u16(0);
+ ret = vreinterpret_u16_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 0));
+ p += stride;
+ ret = vreinterpret_u16_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 1));
return ret;
}
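The old #define was needed because vld1_lane_u32 takes its lane as an immediate; with the lane numbers fixed at 0 and 1 inside these inline functions, plain functions suffice and callers get a value instead of an in/out parameter. Hypothetical usage of the two helpers:

#include <arm_neon.h>
#include "aom_dsp/arm/mem_neon.h"

static inline void demo(const uint8_t *buf) {
  uint8x8_t a = load_u8_4x1(buf);      // low half = buf[0..3], high half = 0
  uint8x8_t b = load_u8_4x2(buf, 16);  // low = buf[0..3], high = buf[16..19]
  (void)a;
  (void)b;
}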
@@ -214,6 +232,38 @@ static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
s += p;
}
+static INLINE void load_s16_4x12(const int16_t *s, ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3,
+ int16x4_t *const s4, int16x4_t *const s5,
+ int16x4_t *const s6, int16x4_t *const s7,
+ int16x4_t *const s8, int16x4_t *const s9,
+ int16x4_t *const s10, int16x4_t *const s11) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+ s += p;
+ *s5 = vld1_s16(s);
+ s += p;
+ *s6 = vld1_s16(s);
+ s += p;
+ *s7 = vld1_s16(s);
+ s += p;
+ *s8 = vld1_s16(s);
+ s += p;
+ *s9 = vld1_s16(s);
+ s += p;
+ *s10 = vld1_s16(s);
+ s += p;
+ *s11 = vld1_s16(s);
+}
+
static INLINE void load_s16_4x11(const int16_t *s, ptrdiff_t p,
int16x4_t *const s0, int16x4_t *const s1,
int16x4_t *const s2, int16x4_t *const s3,
@@ -316,6 +366,23 @@ static INLINE void load_s16_4x7(const int16_t *s, ptrdiff_t p,
*s6 = vld1_s16(s);
}
+static INLINE void load_s16_4x6(const int16_t *s, ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3,
+ int16x4_t *const s4, int16x4_t *const s5) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+ s += p;
+ *s5 = vld1_s16(s);
+}
+
static INLINE void load_s16_4x5(const int16_t *s, ptrdiff_t p,
int16x4_t *const s0, int16x4_t *const s1,
int16x4_t *const s2, int16x4_t *const s3,
@@ -592,6 +659,33 @@ static INLINE void load_u8_8x11(const uint8_t *s, ptrdiff_t p,
*s10 = vld1_u8(s);
}
+static INLINE void load_s16_8x10(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4, int16x8_t *const s5,
+ int16x8_t *const s6, int16x8_t *const s7,
+ int16x8_t *const s8, int16x8_t *const s9) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+ s += p;
+ *s7 = vld1q_s16(s);
+ s += p;
+ *s8 = vld1q_s16(s);
+ s += p;
+ *s9 = vld1q_s16(s);
+}
+
static INLINE void load_s16_8x11(const int16_t *s, ptrdiff_t p,
int16x8_t *const s0, int16x8_t *const s1,
int16x8_t *const s2, int16x8_t *const s3,
@@ -622,6 +716,38 @@ static INLINE void load_s16_8x11(const int16_t *s, ptrdiff_t p,
*s10 = vld1q_s16(s);
}
+static INLINE void load_s16_8x12(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4, int16x8_t *const s5,
+ int16x8_t *const s6, int16x8_t *const s7,
+ int16x8_t *const s8, int16x8_t *const s9,
+ int16x8_t *const s10, int16x8_t *const s11) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+ s += p;
+ *s7 = vld1q_s16(s);
+ s += p;
+ *s8 = vld1q_s16(s);
+ s += p;
+ *s9 = vld1q_s16(s);
+ s += p;
+ *s10 = vld1q_s16(s);
+ s += p;
+ *s11 = vld1q_s16(s);
+}
+
static INLINE void load_u16_8x11(const uint16_t *s, ptrdiff_t p,
uint16x8_t *const s0, uint16x8_t *const s1,
uint16x8_t *const s2, uint16x8_t *const s3,
@@ -714,6 +840,23 @@ static INLINE void load_s16_8x7(const int16_t *s, ptrdiff_t p,
*s6 = vld1q_s16(s);
}
+static INLINE void load_s16_8x6(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4, int16x8_t *const s5) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+}
+
static INLINE void load_s16_8x5(const int16_t *s, ptrdiff_t p,
int16x8_t *const s0, int16x8_t *const s1,
int16x8_t *const s2, int16x8_t *const s3,
@@ -793,6 +936,24 @@ static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
return vreinterpret_u8_u32(a_u32);
}
+static INLINE uint8x8_t load_unaligned_dup_u8_4x2(const uint8_t *buf) {
+ uint32_t a;
+ uint32x2_t a_u32;
+
+ memcpy(&a, buf, 4);
+ a_u32 = vdup_n_u32(a);
+ return vreinterpret_u8_u32(a_u32);
+}
+
+static INLINE uint8x8_t load_unaligned_dup_u8_2x4(const uint8_t *buf) {
+ uint16_t a;
+  uint16x4_t a_u16;
+
+  memcpy(&a, buf, 2);
+  a_u16 = vdup_n_u16(a);
+  return vreinterpret_u8_u16(a_u16);
+}
+
static INLINE uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf, int stride) {
uint32_t a;
uint32x2_t a_u32;
@@ -844,6 +1005,20 @@ static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
memcpy(dst, &a, 2); \
} while (0)
+#define store_unaligned_u16_2x1(dst, src, lane) \
+ do { \
+ uint32_t a; \
+ a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \
+ memcpy(dst, &a, 4); \
+ } while (0)
+
+#define store_unaligned_u16_4x1(dst, src, lane) \
+ do { \
+ uint64_t a; \
+ a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \
+ memcpy(dst, &a, 8); \
+ } while (0)
+
static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
uint8x16_t *const s0, uint8x16_t *const s1,
uint8x16_t *const s2, uint8x16_t *const s3,
@@ -917,6 +1092,27 @@ static INLINE void load_u16_16x4(const uint16_t *s, ptrdiff_t p,
*s7 = vld1q_u16(s + 8);
}
+static INLINE uint16x4_t load_unaligned_u16_2x2(const uint16_t *buf,
+ int stride) {
+ uint32_t a;
+ uint32x2_t a_u32;
+
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vdup_n_u32(a);
+ memcpy(&a, buf, 4);
+ a_u32 = vset_lane_u32(a, a_u32, 1);
+ return vreinterpret_u16_u32(a_u32);
+}
+
+static INLINE uint16x4_t load_unaligned_u16_4x1(const uint16_t *buf) {
+ uint64_t a;
+ uint64x1_t a_u64 = vdup_n_u64(0);
+ memcpy(&a, buf, 8);
+ a_u64 = vset_lane_u64(a, a_u64, 0);
+ return vreinterpret_u16_u64(a_u64);
+}
+
static INLINE uint16x8_t load_unaligned_u16_4x2(const uint16_t *buf,
uint32_t stride) {
uint64_t a;
@@ -1004,4 +1200,32 @@ static INLINE void store_s16_to_tran_low(tran_low_t *buf, const int16x4_t a) {
vst1q_s32(buf, v0);
}
+static INLINE void store_unaligned_u8_2x2(uint8_t *dst, uint32_t dst_stride,
+ uint8x8_t src) {
+ store_unaligned_u8_2x1(dst, src, 0);
+ dst += dst_stride;
+ store_unaligned_u8_2x1(dst, src, 1);
+}
+
+static INLINE void store_unaligned_u8_4x2(uint8_t *dst, uint32_t dst_stride,
+ uint8x8_t src) {
+ store_unaligned_u8_4x1(dst, src, 0);
+ dst += dst_stride;
+ store_unaligned_u8_4x1(dst, src, 1);
+}
+
+static INLINE void store_unaligned_u16_2x2(uint16_t *dst, uint32_t dst_stride,
+ uint16x4_t src) {
+ store_unaligned_u16_2x1(dst, src, 0);
+ dst += dst_stride;
+ store_unaligned_u16_2x1(dst, src, 1);
+}
+
+static INLINE void store_unaligned_u16_4x2(uint16_t *dst, uint32_t dst_stride,
+ uint16x8_t src) {
+ store_unaligned_u16_4x1(dst, src, 0);
+ dst += dst_stride;
+ store_unaligned_u16_4x1(dst, src, 1);
+}
+
#endif // AOM_AOM_DSP_ARM_MEM_NEON_H_
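A note on why the unaligned helpers in this header go through memcpy rather than casting the pointer: dereferencing a uint8_t buffer as uint32_t would violate alignment and strict-aliasing rules, whereas a fixed-size memcpy is well-defined C and compilers lower it to a single unaligned load or store on AArch64. Minimal sketch:

#include <stdint.h>
#include <string.h>

static inline uint32_t read_u32_unaligned(const uint8_t *p) {
  uint32_t v;
  memcpy(&v, p, sizeof(v));  // compiles to one LDR on AArch64
  return v;
}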
diff --git a/aom_dsp/arm/sad_neon.c b/aom_dsp/arm/sad_neon.c
index 60efef8b0..46a166633 100644
--- a/aom_dsp/arm/sad_neon.c
+++ b/aom_dsp/arm/sad_neon.c
@@ -15,93 +15,10 @@
#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE unsigned int sadwxh_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- int w, int h) {
- // Only two accumulators are required for optimal instruction throughput of
- // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
- uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
- int i = h;
- do {
- int j = 0;
- do {
- uint8x16_t s0, s1, r0, r1, diff0, diff1;
-
- s0 = vld1q_u8(src_ptr + j);
- r0 = vld1q_u8(ref_ptr + j);
- diff0 = vabdq_u8(s0, r0);
- sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-
- s1 = vld1q_u8(src_ptr + j + 16);
- r1 = vld1q_u8(ref_ptr + j + 16);
- diff1 = vabdq_u8(s1, r1);
- sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
-
- j += 32;
- } while (j < w);
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- } while (--i != 0);
-
- return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
-}
-
-static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- int h) {
- return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128, h);
-}
-
-static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- int h) {
- return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
-}
-
-static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- int h) {
- return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
-}
-
-static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- int h) {
- uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
- int i = h / 2;
- do {
- uint8x16_t s0, s1, r0, r1, diff0, diff1;
-
- s0 = vld1q_u8(src_ptr);
- r0 = vld1q_u8(ref_ptr);
- diff0 = vabdq_u8(s0, r0);
- sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
-
- s1 = vld1q_u8(src_ptr);
- r1 = vld1q_u8(ref_ptr);
- diff1 = vabdq_u8(s1, r1);
- sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- } while (--i != 0);
-
- return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
-}
-
-#else // !defined(__ARM_FEATURE_DOTPROD)
-
static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
int h) {
@@ -220,28 +137,25 @@ static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
int h) {
- uint32x4_t sum = vdupq_n_u32(0);
+ uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
int i = h;
do {
uint8x16_t s0 = vld1q_u8(src_ptr);
uint8x16_t r0 = vld1q_u8(ref_ptr);
uint8x16_t diff0 = vabdq_u8(s0, r0);
- uint16x8_t sum0 = vpaddlq_u8(diff0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
uint8x16_t s1 = vld1q_u8(src_ptr + 16);
uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
uint8x16_t diff1 = vabdq_u8(s1, r1);
- uint16x8_t sum1 = vpaddlq_u8(diff1);
-
- sum = vpadalq_u16(sum, sum0);
- sum = vpadalq_u16(sum, sum1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
src_ptr += src_stride;
ref_ptr += ref_stride;
} while (--i != 0);
- return horizontal_add_u32x4(sum);
+ return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
}
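The switch to uint16x8_t accumulators above is safe by a simple bound: each UADALP (vpadalq_u8) step adds at most 2 * 255 = 510 to a 16-bit lane, and the tallest 32-wide block handled by this file is 32x64, so a lane never exceeds 64 * 510 = 32640. Expressed as a compile-time check (placement hypothetical):

// One u16 accumulator per 16-byte column cannot overflow for h <= 64.
_Static_assert(64 * 2 * 255 <= 65535, "u16 SAD accumulator bound");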
static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
@@ -264,8 +178,6 @@ static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
return horizontal_add_u16x8(sum);
}
-#endif // defined(__ARM_FEATURE_DOTPROD)
-
static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
int h) {
@@ -384,114 +296,6 @@ SAD_SKIP_WXH_NEON(64, 16)
#undef SAD_SKIP_WXH_NEON
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE unsigned int sadwxh_avg_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride, int w, int h,
- const uint8_t *second_pred) {
- // Only two accumulators are required for optimal instruction throughput of
- // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
- uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
- int i = h;
- do {
- int j = 0;
- do {
- uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
-
- s0 = vld1q_u8(src_ptr + j);
- r0 = vld1q_u8(ref_ptr + j);
- p0 = vld1q_u8(second_pred);
- avg0 = vrhaddq_u8(r0, p0);
- diff0 = vabdq_u8(s0, avg0);
- sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-
- s1 = vld1q_u8(src_ptr + j + 16);
- r1 = vld1q_u8(ref_ptr + j + 16);
- p1 = vld1q_u8(second_pred + 16);
- avg1 = vrhaddq_u8(r1, p1);
- diff1 = vabdq_u8(s1, avg1);
- sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
-
- j += 32;
- second_pred += 32;
- } while (j < w);
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- } while (--i != 0);
-
- return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
-}
-
-static INLINE unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride, int h,
- const uint8_t *second_pred) {
- return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128, h,
- second_pred);
-}
-
-static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride, int h,
- const uint8_t *second_pred) {
- return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
- second_pred);
-}
-
-static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride, int h,
- const uint8_t *second_pred) {
- return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
- second_pred);
-}
-
-static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride, int h,
- const uint8_t *second_pred) {
- uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
- int i = h / 2;
- do {
- uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
-
- s0 = vld1q_u8(src_ptr);
- r0 = vld1q_u8(ref_ptr);
- p0 = vld1q_u8(second_pred);
- avg0 = vrhaddq_u8(r0, p0);
- diff0 = vabdq_u8(s0, avg0);
- sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- second_pred += 16;
-
- s1 = vld1q_u8(src_ptr);
- r1 = vld1q_u8(ref_ptr);
- p1 = vld1q_u8(second_pred);
- avg1 = vrhaddq_u8(r1, p1);
- diff1 = vabdq_u8(s1, avg1);
- sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- second_pred += 16;
- } while (--i != 0);
-
- return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
-}
-
-#else // !defined(__ARM_FEATURE_DOTPROD)
-
static INLINE unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
@@ -644,7 +448,7 @@ static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
const uint8_t *ref_ptr,
int ref_stride, int h,
const uint8_t *second_pred) {
- uint32x4_t sum = vdupq_n_u32(0);
+ uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
int i = h;
do {
@@ -653,24 +457,21 @@ static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
uint8x16_t p0 = vld1q_u8(second_pred);
uint8x16_t avg0 = vrhaddq_u8(r0, p0);
uint8x16_t diff0 = vabdq_u8(s0, avg0);
- uint16x8_t sum0 = vpaddlq_u8(diff0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
uint8x16_t s1 = vld1q_u8(src_ptr + 16);
uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
uint8x16_t p1 = vld1q_u8(second_pred + 16);
uint8x16_t avg1 = vrhaddq_u8(r1, p1);
uint8x16_t diff1 = vabdq_u8(s1, avg1);
- uint16x8_t sum1 = vpaddlq_u8(diff1);
-
- sum = vpadalq_u16(sum, sum0);
- sum = vpadalq_u16(sum, sum1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
src_ptr += src_stride;
ref_ptr += ref_stride;
second_pred += 32;
} while (--i != 0);
- return horizontal_add_u32x4(sum);
+ return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
}
static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
@@ -698,8 +499,6 @@ static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
return horizontal_add_u16x8(sum);
}
-#endif // defined(__ARM_FEATURE_DOTPROD)
-
static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
@@ -788,3 +587,287 @@ SAD_WXH_AVG_NEON(64, 16)
#endif // !CONFIG_REALTIME_ONLY
#undef SAD_WXH_AVG_NEON
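The dist_wtd_* additions below use dist_wtd_avg_u8x8/16 from the new dist_wtd_avg_neon.h to form AV1's distance-weighted compound average before taking the absolute difference. A scalar model, assuming DIST_PRECISION_BITS == 4 so the two offsets always sum to 16 and vrshrn_n_u16(x, 4) contributes the +8 rounding:

#include <stdint.h>

static inline uint8_t dist_wtd_avg_scalar(uint8_t p0, uint8_t p1, uint8_t w0,
                                          uint8_t w1) {
  return (uint8_t)((p0 * w0 + p1 * w1 + 8) >> 4);  // w0 + w1 == 16
}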
+
+static INLINE unsigned int dist_wtd_sad128xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  // We use 8 accumulators to prevent overflow for large values of 'h' and to
+  // enable optimal UADALP instruction throughput on CPUs that have either 2
+  // or 4 Neon pipes.
+ uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+ uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+ uint8x16_t p2 = vld1q_u8(second_pred + 32);
+ uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+ uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+ uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+ uint8x16_t p3 = vld1q_u8(second_pred + 48);
+ uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+ uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ uint8x16_t s4 = vld1q_u8(src_ptr + 64);
+ uint8x16_t r4 = vld1q_u8(ref_ptr + 64);
+ uint8x16_t p4 = vld1q_u8(second_pred + 64);
+ uint8x16_t wtd_avg4 = dist_wtd_avg_u8x16(p4, r4, bck_offset, fwd_offset);
+ uint8x16_t diff4 = vabdq_u8(s4, wtd_avg4);
+ sum[4] = vpadalq_u8(sum[4], diff4);
+
+ uint8x16_t s5 = vld1q_u8(src_ptr + 80);
+ uint8x16_t r5 = vld1q_u8(ref_ptr + 80);
+ uint8x16_t p5 = vld1q_u8(second_pred + 80);
+ uint8x16_t wtd_avg5 = dist_wtd_avg_u8x16(p5, r5, bck_offset, fwd_offset);
+ uint8x16_t diff5 = vabdq_u8(s5, wtd_avg5);
+ sum[5] = vpadalq_u8(sum[5], diff5);
+
+ uint8x16_t s6 = vld1q_u8(src_ptr + 96);
+ uint8x16_t r6 = vld1q_u8(ref_ptr + 96);
+ uint8x16_t p6 = vld1q_u8(second_pred + 96);
+ uint8x16_t wtd_avg6 = dist_wtd_avg_u8x16(p6, r6, bck_offset, fwd_offset);
+ uint8x16_t diff6 = vabdq_u8(s6, wtd_avg6);
+ sum[6] = vpadalq_u8(sum[6], diff6);
+
+ uint8x16_t s7 = vld1q_u8(src_ptr + 112);
+ uint8x16_t r7 = vld1q_u8(ref_ptr + 112);
+ uint8x16_t p7 = vld1q_u8(second_pred + 112);
+ uint8x16_t wtd_avg7 = dist_wtd_avg_u8x16(p7, r7, bck_offset, fwd_offset);
+ uint8x16_t diff7 = vabdq_u8(s7, wtd_avg7);
+ sum[7] = vpadalq_u8(sum[7], diff7);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 128;
+ } while (--h != 0);
+
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[4]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[5]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[6]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[7]);
+
+ return horizontal_add_u32x4(sum_u32);
+}
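The 8-accumulator bound in the function above works out as follows: with one uint16x8_t per 16-byte column, a lane gains at most 2 * 255 = 510 per row, and 128 rows * 510 = 65280 still fits in 16 bits; a single accumulator shared across the 128-pixel row would gain up to 8 * 510 = 4080 per row and overflow after 16 rows. As a compile-time check (placement hypothetical):

_Static_assert(128 * 2 * 255 <= 65535, "per-column u16 accumulators fit");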
+
+static INLINE unsigned int dist_wtd_sad64xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+ uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+ uint8x16_t p2 = vld1q_u8(second_pred + 32);
+ uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+ uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+ uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+ uint8x16_t p3 = vld1q_u8(second_pred + 48);
+ uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+ uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 64;
+ } while (--h != 0);
+
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+
+ return horizontal_add_u32x4(sum_u32);
+}
+
+static INLINE unsigned int dist_wtd_sad32xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 32;
+ } while (--h != 0);
+
+ return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
+}
+
+static INLINE unsigned int dist_wtd_sad16xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr);
+ uint8x16_t r = vld1q_u8(ref_ptr);
+ uint8x16_t p = vld1q_u8(second_pred);
+
+ uint8x16_t wtd_avg = dist_wtd_avg_u8x16(p, r, bck_offset, fwd_offset);
+ uint8x16_t diff = vabdq_u8(s, wtd_avg);
+ sum = vpadalq_u8(sum, diff);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ } while (--h != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+static INLINE unsigned int dist_wtd_sad8xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+ const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t r = vld1_u8(ref_ptr);
+ uint8x8_t p = vld1_u8(second_pred);
+
+ uint8x8_t wtd_avg = dist_wtd_avg_u8x8(p, r, bck_offset, fwd_offset);
+ sum = vabal_u8(sum, s, wtd_avg);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 8;
+ } while (--h != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+static INLINE unsigned int dist_wtd_sad4xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+ const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h / 2;
+ do {
+ uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+ uint8x8_t p = vld1_u8(second_pred);
+
+ uint8x8_t wtd_avg = dist_wtd_avg_u8x8(p, r, bck_offset, fwd_offset);
+ sum = vabal_u8(sum, s, wtd_avg);
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ second_pred += 8;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+#define DIST_WTD_SAD_WXH_AVG_NEON(w, h) \
+ unsigned int aom_dist_wtd_sad##w##x##h##_avg_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ return dist_wtd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
+ second_pred, jcp_param); \
+ }
+
+DIST_WTD_SAD_WXH_AVG_NEON(4, 4)
+DIST_WTD_SAD_WXH_AVG_NEON(4, 8)
+
+DIST_WTD_SAD_WXH_AVG_NEON(8, 4)
+DIST_WTD_SAD_WXH_AVG_NEON(8, 8)
+DIST_WTD_SAD_WXH_AVG_NEON(8, 16)
+
+DIST_WTD_SAD_WXH_AVG_NEON(16, 8)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 16)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 32)
+
+DIST_WTD_SAD_WXH_AVG_NEON(32, 16)
+DIST_WTD_SAD_WXH_AVG_NEON(32, 32)
+DIST_WTD_SAD_WXH_AVG_NEON(32, 64)
+
+DIST_WTD_SAD_WXH_AVG_NEON(64, 32)
+DIST_WTD_SAD_WXH_AVG_NEON(64, 64)
+DIST_WTD_SAD_WXH_AVG_NEON(64, 128)
+
+DIST_WTD_SAD_WXH_AVG_NEON(128, 64)
+DIST_WTD_SAD_WXH_AVG_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+DIST_WTD_SAD_WXH_AVG_NEON(4, 16)
+DIST_WTD_SAD_WXH_AVG_NEON(8, 32)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 4)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 64)
+DIST_WTD_SAD_WXH_AVG_NEON(32, 8)
+DIST_WTD_SAD_WXH_AVG_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef DIST_WTD_SAD_WXH_AVG_NEON
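The dotprod kernels that used to sit in this file behind #if defined(__ARM_FEATURE_DOTPROD) move to the new sad_neon_dotprod.c below, so the build can compile that translation unit with the Arm dotprod extension enabled and select it at run time through the rtcd dispatch tables. The central trick is unchanged: dotting the absolute differences against a vector of ones folds 16 bytes into four 32-bit lanes per instruction. Sketch (helper name hypothetical; requires a dotprod-enabled target):

#include <arm_neon.h>

static inline uint32x4_t sad_udot_step(uint32x4_t acc, uint8x16_t s,
                                       uint8x16_t r) {
  // UDOT of |s - r| with 1s: per-lane sum of four absolute differences.
  return vdotq_u32(acc, vabdq_u8(s, r), vdupq_n_u8(1));
}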
diff --git a/aom_dsp/arm/sad_neon_dotprod.c b/aom_dsp/arm/sad_neon_dotprod.c
new file mode 100644
index 000000000..5504c6838
--- /dev/null
+++ b/aom_dsp/arm/sad_neon_dotprod.c
@@ -0,0 +1,530 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h) {
+ // Only two accumulators are required for optimal instruction throughput of
+ // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr + j);
+ r0 = vld1q_u8(ref_ptr + j);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ s1 = vld1q_u8(src_ptr + j + 16);
+ r1 = vld1q_u8(ref_ptr + j + 16);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ j += 32;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad128xh_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128, h);
+}
+
+static INLINE unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
+}
+
+static INLINE unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
+}
+
+static INLINE unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+
+ s1 = vld1q_u8(src_ptr);
+ r1 = vld1q_u8(ref_ptr);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#define SAD_WXH_NEON_DOTPROD(w, h) \
+ unsigned int aom_sad##w##x##h##_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return sad##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, (h)); \
+ }
+
+SAD_WXH_NEON_DOTPROD(16, 8)
+SAD_WXH_NEON_DOTPROD(16, 16)
+SAD_WXH_NEON_DOTPROD(16, 32)
+
+SAD_WXH_NEON_DOTPROD(32, 16)
+SAD_WXH_NEON_DOTPROD(32, 32)
+SAD_WXH_NEON_DOTPROD(32, 64)
+
+SAD_WXH_NEON_DOTPROD(64, 32)
+SAD_WXH_NEON_DOTPROD(64, 64)
+SAD_WXH_NEON_DOTPROD(64, 128)
+
+SAD_WXH_NEON_DOTPROD(128, 64)
+SAD_WXH_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_NEON_DOTPROD(16, 4)
+SAD_WXH_NEON_DOTPROD(16, 64)
+SAD_WXH_NEON_DOTPROD(32, 8)
+SAD_WXH_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_NEON_DOTPROD
+
+#define SAD_SKIP_WXH_NEON_DOTPROD(w, h) \
+ unsigned int aom_sad_skip_##w##x##h##_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * sad##w##xh_neon_dotprod(src, 2 * src_stride, ref, \
+ 2 * ref_stride, (h) / 2); \
+ }
+
+SAD_SKIP_WXH_NEON_DOTPROD(16, 8)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 16)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 32)
+
+SAD_SKIP_WXH_NEON_DOTPROD(32, 16)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 32)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 64)
+
+SAD_SKIP_WXH_NEON_DOTPROD(64, 32)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 64)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 128)
+
+SAD_SKIP_WXH_NEON_DOTPROD(128, 64)
+SAD_SKIP_WXH_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_WXH_NEON_DOTPROD(16, 4)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 64)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 8)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_SKIP_WXH_NEON_DOTPROD
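The skip wrappers above approximate a full-height SAD from half the rows: both strides are doubled, h is halved, and the partial sum is scaled by 2, i.e. sad_skip over a WxH block equals 2 * SAD of rows 0, 2, ..., H - 2. This halves memory traffic during coarse motion search at some accuracy cost.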
+
+static INLINE unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h,
+ const uint8_t *second_pred) {
+ // Only two accumulators are required for optimal instruction throughput of
+ // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr + j);
+ r0 = vld1q_u8(ref_ptr + j);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ s1 = vld1q_u8(src_ptr + j + 16);
+ r1 = vld1q_u8(ref_ptr + j + 16);
+ p1 = vld1q_u8(second_pred + 16);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ j += 32;
+ second_pred += 32;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad128xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128,
+ h, second_pred);
+}
+
+static INLINE unsigned int sad64xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ h, second_pred);
+}
+
+static INLINE unsigned int sad32xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32,
+ h, second_pred);
+}
+
+static INLINE unsigned int sad16xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+
+ s1 = vld1q_u8(src_ptr);
+ r1 = vld1q_u8(ref_ptr);
+ p1 = vld1q_u8(second_pred);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#define SAD_WXH_AVG_NEON_DOTPROD(w, h) \
+ unsigned int aom_sad##w##x##h##_avg_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \
+ second_pred); \
+ }
+
+SAD_WXH_AVG_NEON_DOTPROD(16, 8)
+SAD_WXH_AVG_NEON_DOTPROD(16, 16)
+SAD_WXH_AVG_NEON_DOTPROD(16, 32)
+
+SAD_WXH_AVG_NEON_DOTPROD(32, 16)
+SAD_WXH_AVG_NEON_DOTPROD(32, 32)
+SAD_WXH_AVG_NEON_DOTPROD(32, 64)
+
+SAD_WXH_AVG_NEON_DOTPROD(64, 32)
+SAD_WXH_AVG_NEON_DOTPROD(64, 64)
+SAD_WXH_AVG_NEON_DOTPROD(64, 128)
+
+SAD_WXH_AVG_NEON_DOTPROD(128, 64)
+SAD_WXH_AVG_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_AVG_NEON_DOTPROD(16, 4)
+SAD_WXH_AVG_NEON_DOTPROD(16, 64)
+SAD_WXH_AVG_NEON_DOTPROD(32, 8)
+SAD_WXH_AVG_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_AVG_NEON_DOTPROD
+
+static INLINE unsigned int dist_wtd_sad128xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  // We use 8 accumulators to minimize accumulation and loop-carried
+  // dependencies and improve instruction throughput.
+ uint32x4_t sum[8] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+ uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+ uint8x16_t p2 = vld1q_u8(second_pred + 32);
+ uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+ uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+ sum[2] = vdotq_u32(sum[2], diff2, vdupq_n_u8(1));
+
+ uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+ uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+ uint8x16_t p3 = vld1q_u8(second_pred + 48);
+ uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+ uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+ sum[3] = vdotq_u32(sum[3], diff3, vdupq_n_u8(1));
+
+ uint8x16_t s4 = vld1q_u8(src_ptr + 64);
+ uint8x16_t r4 = vld1q_u8(ref_ptr + 64);
+ uint8x16_t p4 = vld1q_u8(second_pred + 64);
+ uint8x16_t wtd_avg4 = dist_wtd_avg_u8x16(p4, r4, bck_offset, fwd_offset);
+ uint8x16_t diff4 = vabdq_u8(s4, wtd_avg4);
+ sum[4] = vdotq_u32(sum[4], diff4, vdupq_n_u8(1));
+
+ uint8x16_t s5 = vld1q_u8(src_ptr + 80);
+ uint8x16_t r5 = vld1q_u8(ref_ptr + 80);
+ uint8x16_t p5 = vld1q_u8(second_pred + 80);
+ uint8x16_t wtd_avg5 = dist_wtd_avg_u8x16(p5, r5, bck_offset, fwd_offset);
+ uint8x16_t diff5 = vabdq_u8(s5, wtd_avg5);
+ sum[5] = vdotq_u32(sum[5], diff5, vdupq_n_u8(1));
+
+ uint8x16_t s6 = vld1q_u8(src_ptr + 96);
+ uint8x16_t r6 = vld1q_u8(ref_ptr + 96);
+ uint8x16_t p6 = vld1q_u8(second_pred + 96);
+ uint8x16_t wtd_avg6 = dist_wtd_avg_u8x16(p6, r6, bck_offset, fwd_offset);
+ uint8x16_t diff6 = vabdq_u8(s6, wtd_avg6);
+ sum[6] = vdotq_u32(sum[6], diff6, vdupq_n_u8(1));
+
+ uint8x16_t s7 = vld1q_u8(src_ptr + 112);
+ uint8x16_t r7 = vld1q_u8(ref_ptr + 112);
+ uint8x16_t p7 = vld1q_u8(second_pred + 112);
+ uint8x16_t wtd_avg7 = dist_wtd_avg_u8x16(p7, r7, bck_offset, fwd_offset);
+ uint8x16_t diff7 = vabdq_u8(s7, wtd_avg7);
+ sum[7] = vdotq_u32(sum[7], diff7, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 128;
+ } while (--h != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+ sum[4] = vaddq_u32(sum[4], sum[5]);
+ sum[6] = vaddq_u32(sum[6], sum[7]);
+ sum[0] = vaddq_u32(sum[0], sum[2]);
+ sum[4] = vaddq_u32(sum[4], sum[6]);
+ sum[0] = vaddq_u32(sum[0], sum[4]);
+ return horizontal_add_u32x4(sum[0]);
+}
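Unlike the widening UADALP version earlier in the patch, which folds its eight partial sums with a linear vpadalq_u16 chain, the reduction above is a balanced tree: three dependent vector adds on the critical path instead of seven. Generic shape of that tail (helper name hypothetical):

#include <arm_neon.h>

static inline uint32x4_t reduce8_tree(uint32x4_t s[8]) {
  s[0] = vaddq_u32(s[0], s[1]);
  s[2] = vaddq_u32(s[2], s[3]);
  s[4] = vaddq_u32(s[4], s[5]);
  s[6] = vaddq_u32(s[6], s[7]);
  s[0] = vaddq_u32(s[0], s[2]);
  s[4] = vaddq_u32(s[4], s[6]);
  return vaddq_u32(s[0], s[4]);
}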
+
+static INLINE unsigned int dist_wtd_sad64xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+ uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+ uint8x16_t p2 = vld1q_u8(second_pred + 32);
+ uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+ uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+ sum[2] = vdotq_u32(sum[2], diff2, vdupq_n_u8(1));
+
+ uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+ uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+ uint8x16_t p3 = vld1q_u8(second_pred + 48);
+ uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+ uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+ sum[3] = vdotq_u32(sum[3], diff3, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 64;
+ } while (--h != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+ sum[0] = vaddq_u32(sum[0], sum[2]);
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int dist_wtd_sad32xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 32;
+ } while (--h != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int dist_wtd_sad16xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+
+ uint8x16_t s1 = vld1q_u8(src_ptr);
+ uint8x16_t r1 = vld1q_u8(ref_ptr);
+ uint8x16_t p1 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ return horizontal_add_u32x4(sum[0]);
+}
+
+#define DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(w, h) \
+ unsigned int aom_dist_wtd_sad##w##x##h##_avg_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ return dist_wtd_sad##w##xh_avg_neon_dotprod( \
+ src, src_stride, ref, ref_stride, (h), second_pred, jcp_param); \
+ }
+
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 8)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 16)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 32)
+
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 16)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 32)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 64)
+
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 32)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 64)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 128)
+
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(128, 64)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 4)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 64)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 8)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD
diff --git a/aom_dsp/arm/sadxd_neon.c b/aom_dsp/arm/sadxd_neon.c
index 81803b185..e89e1c5a7 100644
--- a/aom_dsp/arm/sadxd_neon.c
+++ b/aom_dsp/arm/sadxd_neon.c
@@ -18,90 +18,6 @@
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
- uint32x4_t *const sad_sum) {
- uint8x16_t abs_diff = vabdq_u8(src, ref);
- *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1));
-}
-
-static INLINE void sadwxhx3d_large_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4],
- int ref_stride, uint32_t res[4], int w,
- int h) {
- uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
- uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
-
- int ref_offset = 0;
- int i = h;
- do {
- int j = 0;
- do {
- const uint8x16_t s0 = vld1q_u8(src + j);
- sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
- sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
- sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
-
- const uint8x16_t s1 = vld1q_u8(src + j + 16);
- sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
- sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
- sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
-
- j += 32;
- } while (j < w);
-
- src += src_stride;
- ref_offset += ref_stride;
- } while (--i != 0);
-
- res[0] = horizontal_add_u32x4(vaddq_u32(sum_lo[0], sum_hi[0]));
- res[1] = horizontal_add_u32x4(vaddq_u32(sum_lo[1], sum_hi[1]));
- res[2] = horizontal_add_u32x4(vaddq_u32(sum_lo[2], sum_hi[2]));
-}
-
-static INLINE void sad128xhx3d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4], int h) {
- sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 128, h);
-}
-
-static INLINE void sad64xhx3d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4], int h) {
- sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 64, h);
-}
-
-static INLINE void sad32xhx3d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4], int h) {
- sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 32, h);
-}
-
-static INLINE void sad16xhx3d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4], int h) {
- uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
-
- int ref_offset = 0;
- int i = h;
- do {
- const uint8x16_t s = vld1q_u8(src);
- sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]);
- sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]);
- sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]);
-
- src += src_stride;
- ref_offset += ref_stride;
- } while (--i != 0);
-
- res[0] = horizontal_add_u32x4(sum[0]);
- res[1] = horizontal_add_u32x4(sum[1]);
- res[2] = horizontal_add_u32x4(sum[2]);
-}
-
-#else // !(defined(__ARM_FEATURE_DOTPROD))
-
static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
uint16x8_t *const sad_sum) {
uint8x16_t abs_diff = vabdq_u8(src, ref);
@@ -218,8 +134,6 @@ static INLINE void sad16xhx3d_neon(const uint8_t *src, int src_stride,
res[2] = horizontal_add_u16x8(sum[2]);
}
-#endif // defined(__ARM_FEATURE_DOTPROD)
-
static INLINE void sad8xhx3d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[3], int ref_stride,
uint32_t res[3], int h) {
@@ -325,92 +239,6 @@ SAD_WXH_3D_NEON(64, 16)
#undef SAD_WXH_3D_NEON
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE void sadwxhx4d_large_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4],
- int ref_stride, uint32_t res[4], int w,
- int h) {
- uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
- vdupq_n_u32(0) };
- uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
- vdupq_n_u32(0) };
- uint32x4_t sum[4];
-
- int ref_offset = 0;
- int i = h;
- do {
- int j = 0;
- do {
- const uint8x16_t s0 = vld1q_u8(src + j);
- sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
- sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
- sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
- sad16_neon(s0, vld1q_u8(ref[3] + ref_offset + j), &sum_lo[3]);
-
- const uint8x16_t s1 = vld1q_u8(src + j + 16);
- sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
- sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
- sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
- sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + j + 16), &sum_hi[3]);
-
- j += 32;
- } while (j < w);
-
- src += src_stride;
- ref_offset += ref_stride;
- } while (--i != 0);
-
- sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
- sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
- sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
- sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
-
- vst1q_u32(res, horizontal_add_4d_u32x4(sum));
-}
-
-static INLINE void sad128xhx4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4], int h) {
- sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 128, h);
-}
-
-static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4], int h) {
- sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 64, h);
-}
-
-static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4], int h) {
- sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 32, h);
-}
-
-static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4], int h) {
- uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
- vdupq_n_u32(0) };
-
- int ref_offset = 0;
- int i = h;
- do {
- const uint8x16_t s = vld1q_u8(src);
- sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]);
- sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]);
- sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]);
- sad16_neon(s, vld1q_u8(ref[3] + ref_offset), &sum[3]);
-
- src += src_stride;
- ref_offset += ref_stride;
- } while (--i != 0);
-
- vst1q_u32(res, horizontal_add_4d_u32x4(sum));
-}
-
-#else // !(defined(__ARM_FEATURE_DOTPROD))
-
static INLINE void sadwxhx4d_large_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4],
int ref_stride, uint32_t res[4], int w,
@@ -534,8 +362,6 @@ static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
}
-#endif // defined(__ARM_FEATURE_DOTPROD)
-
static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
diff --git a/aom_dsp/arm/sadxd_neon_dotprod.c b/aom_dsp/arm/sadxd_neon_dotprod.c
new file mode 100644
index 000000000..3d11d1cb9
--- /dev/null
+++ b/aom_dsp/arm/sadxd_neon_dotprod.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
+ uint32x4_t *const sad_sum) {
+ uint8x16_t abs_diff = vabdq_u8(src, ref);
+ *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1));
+}
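sad16_neon relies on the Armv8.4-A UDOT instruction: dotting the absolute differences against a vector of ones sums each group of four adjacent byte differences directly into a 32-bit accumulator lane, with no 16-bit widening chain. A scalar model of one call, for clarity:

#include <stdint.h>
#include <stdlib.h>

// Scalar model of sad16_neon(): each of the four 32-bit lanes accumulates
// the sum of four adjacent absolute differences.
static void sad16_model(const uint8_t src[16], const uint8_t ref[16],
                        uint32_t sad_sum[4]) {
  for (int lane = 0; lane < 4; ++lane) {
    for (int k = 0; k < 4; ++k) {
      const int i = 4 * lane + k;
      sad_sum[lane] += (uint32_t)abs(src[i] - ref[i]);
    }
  }
}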
+
+static INLINE void sadwxhx3d_large_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int w, int h) {
+ uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+ uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ const uint8x16_t s0 = vld1q_u8(src + j);
+ sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
+
+ const uint8x16_t s1 = vld1q_u8(src + j + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
+
+ j += 32;
+ } while (j < w);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ res[0] = horizontal_add_u32x4(vaddq_u32(sum_lo[0], sum_hi[0]));
+ res[1] = horizontal_add_u32x4(vaddq_u32(sum_lo[1], sum_hi[1]));
+ res[2] = horizontal_add_u32x4(vaddq_u32(sum_lo[2], sum_hi[2]));
+}
+
+static INLINE void sad128xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 128, h);
+}
+
+static INLINE void sad64xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 64, h);
+}
+
+static INLINE void sad32xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 32, h);
+}
+
+static INLINE void sad16xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ const uint8x16_t s = vld1q_u8(src);
+ sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]);
+ sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]);
+ sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ res[0] = horizontal_add_u32x4(sum[0]);
+ res[1] = horizontal_add_u32x4(sum[1]);
+ res[2] = horizontal_add_u32x4(sum[2]);
+}
+
+#define SAD_WXH_3D_NEON_DOTPROD(w, h) \
+ void aom_sad##w##x##h##x3d_neon_dotprod(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], \
+ int ref_stride, uint32_t res[4]) { \
+ sad##w##xhx3d_neon_dotprod(src, src_stride, ref, ref_stride, res, (h)); \
+ }
+
+SAD_WXH_3D_NEON_DOTPROD(16, 8)
+SAD_WXH_3D_NEON_DOTPROD(16, 16)
+SAD_WXH_3D_NEON_DOTPROD(16, 32)
+
+SAD_WXH_3D_NEON_DOTPROD(32, 16)
+SAD_WXH_3D_NEON_DOTPROD(32, 32)
+SAD_WXH_3D_NEON_DOTPROD(32, 64)
+
+SAD_WXH_3D_NEON_DOTPROD(64, 32)
+SAD_WXH_3D_NEON_DOTPROD(64, 64)
+SAD_WXH_3D_NEON_DOTPROD(64, 128)
+
+SAD_WXH_3D_NEON_DOTPROD(128, 64)
+SAD_WXH_3D_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_3D_NEON_DOTPROD(16, 4)
+SAD_WXH_3D_NEON_DOTPROD(16, 64)
+SAD_WXH_3D_NEON_DOTPROD(32, 8)
+SAD_WXH_3D_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_3D_NEON_DOTPROD
+
+static INLINE void sadwxhx4d_large_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int w, int h) {
+ uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum[4];
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ const uint8x16_t s0 = vld1q_u8(src + j);
+ sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + ref_offset + j), &sum_lo[3]);
+
+ const uint8x16_t s1 = vld1q_u8(src + j + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + j + 16), &sum_hi[3]);
+
+ j += 32;
+ } while (j < w);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+ sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+ sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+ sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void sad128xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 128, h);
+}
+
+static INLINE void sad64xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 64, h);
+}
+
+static INLINE void sad32xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 32, h);
+}
+
+static INLINE void sad16xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ const uint8x16_t s = vld1q_u8(src);
+ sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]);
+ sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]);
+ sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]);
+ sad16_neon(s, vld1q_u8(ref[3] + ref_offset), &sum[3]);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
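The x4d kernels score four candidate reference blocks against a single source block in one pass, so every source load is shared by four SAD accumulations. A scalar statement of the contract implemented by the wrappers below:

#include <stdint.h>
#include <stdlib.h>

static void sad_wxh_x4d_model(const uint8_t *src, int src_stride,
                              const uint8_t *const ref[4], int ref_stride,
                              int w, int h, uint32_t res[4]) {
  for (int r = 0; r < 4; ++r) res[r] = 0;
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      const uint8_t s = src[y * src_stride + x];
      for (int r = 0; r < 4; ++r) {
        res[r] += (uint32_t)abs(s - ref[r][y * ref_stride + x]);
      }
    }
  }
}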
+
+#define SAD_WXH_4D_NEON_DOTPROD(w, h) \
+ void aom_sad##w##x##h##x4d_neon_dotprod(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], \
+ int ref_stride, uint32_t res[4]) { \
+ sad##w##xhx4d_neon_dotprod(src, src_stride, ref, ref_stride, res, (h)); \
+ }
+
+SAD_WXH_4D_NEON_DOTPROD(16, 8)
+SAD_WXH_4D_NEON_DOTPROD(16, 16)
+SAD_WXH_4D_NEON_DOTPROD(16, 32)
+
+SAD_WXH_4D_NEON_DOTPROD(32, 16)
+SAD_WXH_4D_NEON_DOTPROD(32, 32)
+SAD_WXH_4D_NEON_DOTPROD(32, 64)
+
+SAD_WXH_4D_NEON_DOTPROD(64, 32)
+SAD_WXH_4D_NEON_DOTPROD(64, 64)
+SAD_WXH_4D_NEON_DOTPROD(64, 128)
+
+SAD_WXH_4D_NEON_DOTPROD(128, 64)
+SAD_WXH_4D_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_4D_NEON_DOTPROD(16, 4)
+SAD_WXH_4D_NEON_DOTPROD(16, 64)
+SAD_WXH_4D_NEON_DOTPROD(32, 8)
+SAD_WXH_4D_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_4D_NEON_DOTPROD
+
+#define SAD_SKIP_WXH_4D_NEON_DOTPROD(w, h) \
+ void aom_sad_skip_##w##x##h##x4d_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4], \
+ int ref_stride, uint32_t res[4]) { \
+ sad##w##xhx4d_neon_dotprod(src, 2 * src_stride, ref, 2 * ref_stride, res, \
+ ((h) >> 1)); \
+ res[0] <<= 1; \
+ res[1] <<= 1; \
+ res[2] <<= 1; \
+ res[3] <<= 1; \
+ }
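The skip variants trade accuracy for speed during motion search: both strides are doubled and the height halved, so only even rows contribute, and the partial sums are then doubled to approximate the full-block SADs. In terms of the scalar model sketched above:

// Skip-SAD approximation: SAD over even rows only, scaled by two.
static void sad_skip_wxh_x4d_model(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[4], int ref_stride,
                                   int w, int h, uint32_t res[4]) {
  sad_wxh_x4d_model(src, 2 * src_stride, ref, 2 * ref_stride, w, h / 2, res);
  for (int r = 0; r < 4; ++r) res[r] <<= 1;
}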
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 8)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 16)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 32)
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 16)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 32)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 64)
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 32)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 64)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 128)
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(128, 64)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 4)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 64)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 8)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_SKIP_WXH_4D_NEON_DOTPROD
diff --git a/aom_dsp/arm/sse_neon.c b/aom_dsp/arm/sse_neon.c
index d1d3d93ed..ec8f0ee18 100644
--- a/aom_dsp/arm/sse_neon.c
+++ b/aom_dsp/arm/sse_neon.c
@@ -11,119 +11,8 @@
#include <arm_neon.h>
#include "config/aom_dsp_rtcd.h"
-#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
-#include "aom_dsp/arm/transpose_neon.h"
-
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE void sse_16x1_neon(const uint8_t *src, const uint8_t *ref,
- uint32x4_t *sse) {
- uint8x16_t s = vld1q_u8(src);
- uint8x16_t r = vld1q_u8(ref);
-
- uint8x16_t abs_diff = vabdq_u8(s, r);
-
- *sse = vdotq_u32(*sse, abs_diff, abs_diff);
-}
-
-static INLINE void sse_8x1_neon(const uint8_t *src, const uint8_t *ref,
- uint32x2_t *sse) {
- uint8x8_t s = vld1_u8(src);
- uint8x8_t r = vld1_u8(ref);
-
- uint8x8_t abs_diff = vabd_u8(s, r);
-
- *sse = vdot_u32(*sse, abs_diff, abs_diff);
-}
-
-static INLINE void sse_4x2_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- uint32x2_t *sse) {
- uint8x8_t s = load_unaligned_u8(src, src_stride);
- uint8x8_t r = load_unaligned_u8(ref, ref_stride);
-
- uint8x8_t abs_diff = vabd_u8(s, r);
-
- *sse = vdot_u32(*sse, abs_diff, abs_diff);
-}
-
-static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- int height) {
- uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) };
-
- int i = height;
- do {
- sse_8x1_neon(src, ref, &sse[0]);
- src += src_stride;
- ref += ref_stride;
- sse_8x1_neon(src, ref, &sse[1]);
- src += src_stride;
- ref += ref_stride;
- i -= 2;
- } while (i != 0);
-
- return horizontal_add_u32x4(vcombine_u32(sse[0], sse[1]));
-}
-
-static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- int height) {
- uint32x2_t sse = vdup_n_u32(0);
-
- int i = height;
- do {
- sse_4x2_neon(src, src_stride, ref, ref_stride, &sse);
-
- src += 2 * src_stride;
- ref += 2 * ref_stride;
- i -= 2;
- } while (i != 0);
-
- return horizontal_add_u32x2(sse);
-}
-
-static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- int width, int height) {
- uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) };
-
- if ((width & 0x07) && ((width & 0x07) < 5)) {
- int i = height;
- do {
- int j = 0;
- do {
- sse_8x1_neon(src + j, ref + j, &sse[0]);
- sse_8x1_neon(src + j + src_stride, ref + j + ref_stride, &sse[1]);
- j += 8;
- } while (j + 4 < width);
-
- sse_4x2_neon(src + j, src_stride, ref + j, ref_stride, &sse[0]);
- src += 2 * src_stride;
- ref += 2 * ref_stride;
- i -= 2;
- } while (i != 0);
- } else {
- int i = height;
- do {
- int j = 0;
- do {
- sse_8x1_neon(src + j, ref + j, &sse[0]);
- sse_8x1_neon(src + j + src_stride, ref + j + ref_stride, &sse[1]);
- j += 8;
- } while (j < width);
-
- src += 2 * src_stride;
- ref += 2 * ref_stride;
- i -= 2;
- } while (i != 0);
- }
- return horizontal_add_u32x4(vcombine_u32(sse[0], sse[1]));
-}
-
-#else // !defined(__ARM_FEATURE_DOTPROD)
static INLINE void sse_16x1_neon(const uint8_t *src, const uint8_t *ref,
uint32x4_t *sse) {
@@ -159,39 +48,6 @@ static INLINE void sse_4x2_neon(const uint8_t *src, int src_stride,
*sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff));
}
-static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- int height) {
- uint32x4_t sse = vdupq_n_u32(0);
-
- int i = height;
- do {
- sse_8x1_neon(src, ref, &sse);
-
- src += src_stride;
- ref += ref_stride;
- } while (--i != 0);
-
- return horizontal_add_u32x4(sse);
-}
-
-static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- int height) {
- uint32x4_t sse = vdupq_n_u32(0);
-
- int i = height;
- do {
- sse_4x2_neon(src, src_stride, ref, ref_stride, &sse);
-
- src += 2 * src_stride;
- ref += 2 * ref_stride;
- i -= 2;
- } while (i != 0);
-
- return horizontal_add_u32x4(sse);
-}
-
static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
int width, int height) {
@@ -228,8 +84,6 @@ static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride,
return horizontal_add_u32x4(sse);
}
-#endif // defined(__ARM_FEATURE_DOTPROD)
-
static INLINE uint32_t sse_128xh_neon(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
int height) {
@@ -308,281 +162,49 @@ static INLINE uint32_t sse_16xh_neon(const uint8_t *src, int src_stride,
return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
}
-int64_t aom_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref,
- int ref_stride, int width, int height) {
- switch (width) {
- case 4: return sse_4xh_neon(src, src_stride, ref, ref_stride, height);
- case 8: return sse_8xh_neon(src, src_stride, ref, ref_stride, height);
- case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height);
- case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height);
- case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height);
- case 128: return sse_128xh_neon(src, src_stride, ref, ref_stride, height);
- default:
- return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height);
- }
-}
+static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse = vdupq_n_u32(0);
-#if CONFIG_AV1_HIGHBITDEPTH
-static INLINE uint32_t highbd_sse_W8x1_neon(uint16x8_t q2, uint16x8_t q3) {
- uint32_t sse;
- const uint32_t sse1 = 0;
- const uint32x4_t q1 = vld1q_dup_u32(&sse1);
+ int i = height;
+ do {
+ sse_8x1_neon(src, ref, &sse);
- uint16x8_t q4 = vabdq_u16(q2, q3); // diff = abs(a[x] - b[x])
- uint16x4_t d0 = vget_low_u16(q4);
- uint16x4_t d1 = vget_high_u16(q4);
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
- uint32x4_t q6 = vmlal_u16(q1, d0, d0);
- uint32x4_t q7 = vmlal_u16(q1, d1, d1);
+ return horizontal_add_u32x4(sse);
+}
- uint32x2_t d4 = vadd_u32(vget_low_u32(q6), vget_high_u32(q6));
- uint32x2_t d5 = vadd_u32(vget_low_u32(q7), vget_high_u32(q7));
+static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse = vdupq_n_u32(0);
- uint32x2_t d6 = vadd_u32(d4, d5);
+ int i = height;
+ do {
+ sse_4x2_neon(src, src_stride, ref, ref_stride, &sse);
- sse = vget_lane_u32(d6, 0);
- sse += vget_lane_u32(d6, 1);
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
- return sse;
+ return horizontal_add_u32x4(sse);
}
-int64_t aom_highbd_sse_neon(const uint8_t *a8, int a_stride, const uint8_t *b8,
- int b_stride, int width, int height) {
- static const uint16_t k01234567[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
- const uint16x8_t q0 = vld1q_u16(k01234567);
- int64_t sse = 0;
- uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- uint16_t *b = CONVERT_TO_SHORTPTR(b8);
- int x, y;
- int addinc;
- uint16x4_t d0, d1, d2, d3;
- uint16_t dx;
- uint16x8_t q2, q3, q4, q5;
-
+int64_t aom_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref,
+ int ref_stride, int width, int height) {
switch (width) {
- case 4:
- for (y = 0; y < height; y += 2) {
- d0 = vld1_u16(a); // load 4 data
- a += a_stride;
- d1 = vld1_u16(a);
- a += a_stride;
-
- d2 = vld1_u16(b);
- b += b_stride;
- d3 = vld1_u16(b);
- b += b_stride;
- q2 = vcombine_u16(d0, d1); // make a 8 data vector
- q3 = vcombine_u16(d2, d3);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
- }
- break;
- case 8:
- for (y = 0; y < height; y++) {
- q2 = vld1q_u16(a);
- q3 = vld1q_u16(b);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- a += a_stride;
- b += b_stride;
- }
- break;
- case 16:
- for (y = 0; y < height; y++) {
- q2 = vld1q_u16(a);
- q3 = vld1q_u16(b);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 8);
- q3 = vld1q_u16(b + 8);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- a += a_stride;
- b += b_stride;
- }
- break;
- case 32:
- for (y = 0; y < height; y++) {
- q2 = vld1q_u16(a);
- q3 = vld1q_u16(b);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 8);
- q3 = vld1q_u16(b + 8);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 16);
- q3 = vld1q_u16(b + 16);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 24);
- q3 = vld1q_u16(b + 24);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- a += a_stride;
- b += b_stride;
- }
- break;
- case 64:
- for (y = 0; y < height; y++) {
- q2 = vld1q_u16(a);
- q3 = vld1q_u16(b);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 8);
- q3 = vld1q_u16(b + 8);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 16);
- q3 = vld1q_u16(b + 16);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 24);
- q3 = vld1q_u16(b + 24);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 32);
- q3 = vld1q_u16(b + 32);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 40);
- q3 = vld1q_u16(b + 40);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 48);
- q3 = vld1q_u16(b + 48);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 56);
- q3 = vld1q_u16(b + 56);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- a += a_stride;
- b += b_stride;
- }
- break;
- case 128:
- for (y = 0; y < height; y++) {
- q2 = vld1q_u16(a);
- q3 = vld1q_u16(b);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 8);
- q3 = vld1q_u16(b + 8);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 16);
- q3 = vld1q_u16(b + 16);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 24);
- q3 = vld1q_u16(b + 24);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 32);
- q3 = vld1q_u16(b + 32);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 40);
- q3 = vld1q_u16(b + 40);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 48);
- q3 = vld1q_u16(b + 48);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 56);
- q3 = vld1q_u16(b + 56);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 64);
- q3 = vld1q_u16(b + 64);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 72);
- q3 = vld1q_u16(b + 72);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 80);
- q3 = vld1q_u16(b + 80);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 88);
- q3 = vld1q_u16(b + 88);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 96);
- q3 = vld1q_u16(b + 96);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 104);
- q3 = vld1q_u16(b + 104);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 112);
- q3 = vld1q_u16(b + 112);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
-
- q2 = vld1q_u16(a + 120);
- q3 = vld1q_u16(b + 120);
-
- sse += highbd_sse_W8x1_neon(q2, q3);
- a += a_stride;
- b += b_stride;
- }
- break;
+ case 4: return sse_4xh_neon(src, src_stride, ref, ref_stride, height);
+ case 8: return sse_8xh_neon(src, src_stride, ref, ref_stride, height);
+ case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height);
+ case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height);
+ case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height);
+ case 128: return sse_128xh_neon(src, src_stride, ref, ref_stride, height);
default:
-
- for (y = 0; y < height; y++) {
- x = width;
- while (x > 0) {
- addinc = width - x;
- q2 = vld1q_u16(a + addinc);
- q3 = vld1q_u16(b + addinc);
- if (x < 8) {
- dx = x;
- q4 = vld1q_dup_u16(&dx);
- q5 = vcltq_u16(q0, q4);
- q2 = vandq_u16(q2, q5);
- q3 = vandq_u16(q3, q5);
- }
- sse += highbd_sse_W8x1_neon(q2, q3);
- x -= 8;
- }
- a += a_stride;
- b += b_stride;
- }
+ return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height);
}
- return (int64_t)sse;
}
-#endif
diff --git a/aom_dsp/arm/sse_neon_dotprod.c b/aom_dsp/arm/sse_neon_dotprod.c
new file mode 100644
index 000000000..979049780
--- /dev/null
+++ b/aom_dsp/arm/sse_neon_dotprod.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void sse_16x1_neon_dotprod(const uint8_t *src, const uint8_t *ref,
+ uint32x4_t *sse) {
+ uint8x16_t s = vld1q_u8(src);
+ uint8x16_t r = vld1q_u8(ref);
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+
+ *sse = vdotq_u32(*sse, abs_diff, abs_diff);
+}
+
+static INLINE void sse_8x1_neon_dotprod(const uint8_t *src, const uint8_t *ref,
+ uint32x2_t *sse) {
+ uint8x8_t s = vld1_u8(src);
+ uint8x8_t r = vld1_u8(ref);
+
+ uint8x8_t abs_diff = vabd_u8(s, r);
+
+ *sse = vdot_u32(*sse, abs_diff, abs_diff);
+}
+
+static INLINE void sse_4x2_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ uint32x2_t *sse) {
+ uint8x8_t s = load_unaligned_u8(src, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref, ref_stride);
+
+ uint8x8_t abs_diff = vabd_u8(s, r);
+
+ *sse = vdot_u32(*sse, abs_diff, abs_diff);
+}
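Because |src - ref| fits in eight bits, dotting the absolute-difference vector against itself makes UDOT accumulate d[i] * d[i] for four adjacent bytes into each 32-bit lane: a sum of squared errors with no 16-bit intermediate. A scalar model of sse_16x1_neon_dotprod:

#include <stdint.h>
#include <stdlib.h>

static void sse16_model(const uint8_t src[16], const uint8_t ref[16],
                        uint32_t sse[4]) {
  for (int lane = 0; lane < 4; ++lane) {
    for (int k = 0; k < 4; ++k) {
      const int i = 4 * lane + k;
      const int d = abs(src[i] - ref[i]);
      sse[lane] += (uint32_t)(d * d);
    }
  }
}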
+
+static INLINE uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int width, int height) {
+ uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) };
+
+ if ((width & 0x07) && ((width & 0x07) < 5)) {
+ int i = height;
+ do {
+ int j = 0;
+ do {
+ sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]);
+ sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride,
+ &sse[1]);
+ j += 8;
+ } while (j + 4 < width);
+
+ sse_4x2_neon_dotprod(src + j, src_stride, ref + j, ref_stride, &sse[0]);
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+ } else {
+ int i = height;
+ do {
+ int j = 0;
+ do {
+ sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]);
+ sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride,
+ &sse[1]);
+ j += 8;
+ } while (j < width);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+ }
+ return horizontal_add_u32x4(vcombine_u32(sse[0], sse[1]));
+}
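The two branches above split widths by their remainder modulo 8: a remainder of 1 to 4 finishes each pair of rows with a single sse_4x2 tail, while other widths are covered entirely by 8-wide kernels. Both paths process two rows per iteration, and as written they assume the width is a multiple of 4. A worked trace for one case:

// Worked example for width == 12 (12 & 0x07 == 4, so the first branch runs):
//   j == 0: sse_8x1 covers columns 0..7 of both rows; j becomes 8, and
//           8 + 4 < 12 is false, so the inner loop exits.
//   tail  : sse_4x2 covers the remaining columns 8..11 of both rows.
// A multiple-of-8 width such as 24 takes the second branch and uses three
// 8-wide steps per row pair with no tail.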
+
+static INLINE uint32_t sse_128xh_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride, int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon_dotprod(src, ref, &sse[0]);
+ sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]);
+ sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]);
+ sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]);
+ sse_16x1_neon_dotprod(src + 64, ref + 64, &sse[0]);
+ sse_16x1_neon_dotprod(src + 80, ref + 80, &sse[1]);
+ sse_16x1_neon_dotprod(src + 96, ref + 96, &sse[0]);
+ sse_16x1_neon_dotprod(src + 112, ref + 112, &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon_dotprod(src, ref, &sse[0]);
+ sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]);
+ sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]);
+ sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_32xh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon_dotprod(src, ref, &sse[0]);
+ sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_16xh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon_dotprod(src, ref, &sse[0]);
+ src += src_stride;
+ ref += ref_stride;
+ sse_16x1_neon_dotprod(src, ref, &sse[1]);
+ src += src_stride;
+ ref += ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_8xh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_8x1_neon_dotprod(src, ref, &sse[0]);
+ src += src_stride;
+ ref += ref_stride;
+ sse_8x1_neon_dotprod(src, ref, &sse[1]);
+ src += src_stride;
+ ref += ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ return horizontal_add_u32x4(vcombine_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_4xh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x2_t sse = vdup_n_u32(0);
+
+ int i = height;
+ do {
+ sse_4x2_neon_dotprod(src, src_stride, ref, ref_stride, &sse);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ return horizontal_add_u32x2(sse);
+}
+
+int64_t aom_sse_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int width,
+ int height) {
+ switch (width) {
+ case 4:
+ return sse_4xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+ case 8:
+ return sse_8xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+ case 16:
+ return sse_16xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+ case 32:
+ return sse_32xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+ case 64:
+ return sse_64xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+ case 128:
+ return sse_128xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+ default:
+ return sse_wxh_neon_dotprod(src, src_stride, ref, ref_stride, width,
+ height);
+ }
+}
diff --git a/aom_dsp/arm/subpel_variance_neon.c b/aom_dsp/arm/subpel_variance_neon.c
index 9599ae06d..2e6e73885 100644
--- a/aom_dsp/arm/subpel_variance_neon.c
+++ b/aom_dsp/arm/subpel_variance_neon.c
@@ -18,6 +18,7 @@
#include "aom/aom_integer.h"
#include "aom_dsp/variance.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
#include "aom_dsp/arm/mem_neon.h"
static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
@@ -154,59 +155,58 @@ static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
}
-#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
- unsigned int aom_sub_pixel_variance##w##x##h##_neon( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *ref, int ref_stride, unsigned int *sse) { \
- if (xoffset == 0) { \
- if (yoffset == 0) { \
- return aom_variance##w##x##h##_neon(src, src_stride, ref, ref_stride, \
- sse); \
- } else if (yoffset == 4) { \
- uint8_t tmp[w * h]; \
- var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \
- return aom_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
- } else { \
- uint8_t tmp[w * h]; \
- var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \
- yoffset); \
- return aom_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
- } \
- } else if (xoffset == 4) { \
- uint8_t tmp0[w * (h + padding)]; \
- if (yoffset == 0) { \
- var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \
- return aom_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
- } else if (yoffset == 4) { \
- uint8_t tmp1[w * (h + padding)]; \
- var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
- var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
- return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
- } else { \
- uint8_t tmp1[w * (h + padding)]; \
- var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
- var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
- return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
- } \
- } else { \
- uint8_t tmp0[w * (h + padding)]; \
- if (yoffset == 0) { \
- var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \
- return aom_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
- } else if (yoffset == 4) { \
- uint8_t tmp1[w * h]; \
- var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
- xoffset); \
- var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
- return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
- } else { \
- uint8_t tmp1[w * h]; \
- var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
- xoffset); \
- var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
- return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
- } \
- } \
+#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse) { \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return aom_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \
+ yoffset); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } \
}
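The specialization above exploits the shape of the 8-phase bilinear filter, whose two taps are (8 - offset, offset): offset 0 degenerates to a copy of the input and offset 4 to an unweighted rounded average, so those cases dispatch to cheaper kernels than the general 2-tap blend. The identity, stated as scalar C:

#include <stdint.h>

// General 2-tap bilinear phase used by var_filter_block2d_bil_*.
static uint8_t bil_phase(uint8_t s0, uint8_t s1, int offset) {
  return (uint8_t)((s0 * (8 - offset) + s1 * offset + 4) >> 3);
}
// offset == 0: result == s0                  (plain copy)
// offset == 4: result == (s0 + s1 + 1) >> 1  (rounded pairwise average)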
SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
@@ -279,6 +279,36 @@ static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr,
} while (i != 0);
}
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 4.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w4(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+ const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+ uint8x8_t p = vld1_u8(second_pred);
+ uint16x8_t blend = vmull_u8(s0, f0);
+ blend = vmlal_u8(blend, s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+ uint8x8_t avg = dist_wtd_avg_u8x8(blend_u8, p, fwd_offset, bck_offset);
+
+ vst1_u8(dst_ptr, avg);
+
+ src_ptr += 2 * src_stride;
+ dst_ptr += 2 * 4;
+ second_pred += 2 * 4;
+ i -= 2;
+ } while (i != 0);
+}
+
// Combine bilinear filter with aom_comp_avg_pred for blocks having width 8.
static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr,
uint8_t *dst_ptr, int src_stride,
@@ -307,6 +337,35 @@ static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr,
} while (--i > 0);
}
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 8.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w8(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+ const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+ uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+ uint8x8_t p = vld1_u8(second_pred);
+ uint16x8_t blend = vmull_u8(s0, f0);
+ blend = vmlal_u8(blend, s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+ uint8x8_t avg = dist_wtd_avg_u8x8(blend_u8, p, fwd_offset, bck_offset);
+
+ vst1_u8(dst_ptr, avg);
+
+ src_ptr += src_stride;
+ dst_ptr += 8;
+ second_pred += 8;
+ } while (--i > 0);
+}
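Each dist_wtd_* filter here fuses the 2-tap bilinear blend with the distance-weighted compound average, producing the block that aom_variance later consumes. A per-pixel scalar model, again assuming the offsets sum to 16 and the compound rounding shift is 4:

#include <stdint.h>

static uint8_t dist_wtd_bil_pixel(uint8_t s0, uint8_t s1, int filter_offset,
                                  uint8_t pred, int fwd_offset,
                                  int bck_offset) {
  // Bilinear tap, rounded to nearest (matches vrshrn_n_u16(blend, 3)).
  const int blend = (s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3;
  // Distance-weighted average with the second prediction.
  return (uint8_t)((blend * fwd_offset + pred * bck_offset + 8) >> 4);
}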
+
// Combine bilinear filter with aom_comp_avg_pred for large blocks.
static void avg_pred_var_filter_block2d_bil_large(
const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
@@ -342,6 +401,43 @@ static void avg_pred_var_filter_block2d_bil_large(
} while (--i != 0);
}
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for large blocks.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, int filter_offset,
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
+ blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
+ uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
+ blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
+ uint8x16_t blend_u8 =
+ vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+
+ uint8x16_t p = vld1q_u8(second_pred);
+ uint8x16_t avg = dist_wtd_avg_u8x16(blend_u8, p, fwd_offset, bck_offset);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
// Combine bilinear filter with aom_comp_avg_pred for blocks having width 16.
static void avg_pred_var_filter_block2d_bil_w16(
const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
@@ -378,6 +474,46 @@ static void avg_pred_var_filter_block2d_bil_w128(
filter_offset, second_pred);
}
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 16.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w16(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 16, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 32.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w32(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 32, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 64.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w64(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 64, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 128.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w128(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 128, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
// Combine averaging subpel filter with aom_comp_avg_pred.
static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr,
uint8_t *dst_ptr, int src_stride,
@@ -409,6 +545,37 @@ static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr,
} while (--i != 0);
}
+// Combine averaging subpel filter with aom_dist_wtd_comp_avg_pred.
+static void dist_wtd_avg_pred_var_filter_block2d_avg(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // We only specialise on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint8x16_t p = vld1q_u8(second_pred);
+ uint8x16_t avg = vrhaddq_u8(s0, s1);
+ avg = dist_wtd_avg_u8x16(avg, p, fwd_offset, bck_offset);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
// Implementation of aom_comp_avg_pred for blocks having width >= 16.
static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
int dst_width, int dst_height,
@@ -436,6 +603,36 @@ static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
} while (--i != 0);
}
+// Implementation of aom_dist_wtd_comp_avg_pred for blocks having width >= 16.
+static void dist_wtd_avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int dst_width, int dst_height,
+ const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // We only specialise on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr + j);
+ uint8x16_t p = vld1q_u8(second_pred);
+
+ uint8x16_t avg = dist_wtd_avg_u8x16(s, p, fwd_offset, bck_offset);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon( \
const uint8_t *src, int source_stride, int xoffset, int yoffset, \
@@ -459,53 +656,53 @@ static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
uint8_t tmp[w * h]; \
if (yoffset == 0) { \
avg_pred(src, tmp, source_stride, w, h, second_pred); \
- return aom_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \
source_stride, w, h, second_pred); \
- return aom_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
} else { \
avg_pred_var_filter_block2d_bil_w##w( \
src, tmp, source_stride, source_stride, h, yoffset, second_pred); \
- return aom_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
} \
} else if (xoffset == 4) { \
uint8_t tmp0[w * (h + padding)]; \
if (yoffset == 0) { \
avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h, \
second_pred); \
- return aom_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint8_t tmp1[w * (h + padding)]; \
var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
- return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
} else { \
uint8_t tmp1[w * (h + padding)]; \
var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
second_pred); \
- return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
} \
} else { \
uint8_t tmp0[w * (h + padding)]; \
if (yoffset == 0) { \
avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \
xoffset, second_pred); \
- return aom_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint8_t tmp1[w * h]; \
var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
(h + padding), xoffset); \
avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
- return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
} else { \
uint8_t tmp1[w * h]; \
var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
(h + padding), xoffset); \
avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
second_pred); \
- return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
} \
} \
}
@@ -550,6 +747,125 @@ SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 16, 1)
#undef SUBPEL_AVG_VARIANCE_WXH_NEON
#undef SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON
+#define DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \
+ xoffset); \
+ dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ }
+
+#define SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ if (xoffset == 0) { \
+ uint8_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ dist_wtd_avg_pred(src, tmp, source_stride, w, h, second_pred, \
+ jcp_param); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ dist_wtd_avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \
+ source_stride, w, h, \
+ second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } else { \
+ dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp, source_stride, source_stride, h, yoffset, second_pred, \
+ jcp_param); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ dist_wtd_avg_pred_var_filter_block2d_avg( \
+ src, tmp0, source_stride, 1, w, h, second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, \
+ second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp0, source_stride, 1, h, xoffset, second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, \
+ second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 128, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 64, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 128, 1)
+
+#if !CONFIG_REALTIME_ONLY
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 16, 2)
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 1)
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 4, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 64, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 8, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 16, 1)
+
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON
+#undef SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON
+
#if !CONFIG_REALTIME_ONLY
#define OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
@@ -665,7 +981,7 @@ SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, msk_stride, \
invert_mask); \
- return aom_variance##w##x##h##_neon(tmp2, w, ref, ref_stride, sse); \
+ return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \
}
#define SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
@@ -679,20 +995,20 @@ SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
if (yoffset == 0) { \
aom_comp_mask_pred_neon(tmp0, second_pred, w, h, src, src_stride, msk, \
msk_stride, invert_mask); \
- return aom_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint8_t tmp1[w * h]; \
var_filter_block2d_avg(src, tmp0, src_stride, src_stride, w, h); \
aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \
msk_stride, invert_mask); \
- return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
} else { \
uint8_t tmp1[w * h]; \
var_filter_block2d_bil_w##w(src, tmp0, src_stride, src_stride, h, \
yoffset); \
aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \
msk_stride, invert_mask); \
- return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
} \
} else if (xoffset == 4) { \
uint8_t tmp0[w * (h + padding)]; \
@@ -701,7 +1017,7 @@ SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \
aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \
msk_stride, invert_mask); \
- return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint8_t tmp1[w * h]; \
uint8_t tmp2[w * h]; \
@@ -709,7 +1025,7 @@ SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \
msk_stride, invert_mask); \
- return aom_variance##w##x##h##_neon(tmp2, w, ref, ref_stride, sse); \
+ return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \
} else { \
uint8_t tmp1[w * h]; \
uint8_t tmp2[w * h]; \
@@ -717,7 +1033,7 @@ SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \
msk_stride, invert_mask); \
- return aom_variance##w##x##h##_neon(tmp2, w, ref, ref_stride, sse); \
+ return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \
} \
} else { \
if (yoffset == 0) { \
@@ -726,7 +1042,7 @@ SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \
aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \
msk_stride, invert_mask); \
- return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint8_t tmp0[w * (h + padding)]; \
uint8_t tmp1[w * h]; \
@@ -736,7 +1052,7 @@ SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \
msk_stride, invert_mask); \
- return aom_variance##w##x##h##_neon(tmp2, w, ref, ref_stride, sse); \
+ return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \
} else { \
uint8_t tmp0[w * (h + padding)]; \
uint8_t tmp1[w * (h + padding)]; \
@@ -746,7 +1062,7 @@ SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \
msk_stride, invert_mask); \
- return aom_variance##w##x##h##_neon(tmp2, w, ref, ref_stride, sse); \
+ return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \
} \
} \
}
diff --git a/aom_dsp/arm/sum_neon.h b/aom_dsp/arm/sum_neon.h
index ff68c1296..b5a8b9706 100644
--- a/aom_dsp/arm/sum_neon.h
+++ b/aom_dsp/arm/sum_neon.h
@@ -8,6 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#ifndef AOM_AOM_DSP_ARM_SUM_NEON_H_
+#define AOM_AOM_DSP_ARM_SUM_NEON_H_
+
#include "config/aom_dsp_rtcd.h"
#include "config/aom_config.h"
@@ -62,7 +65,16 @@ static INLINE uint64_t horizontal_long_add_u32x4(const uint32x4_t a) {
#endif
}
-static INLINE unsigned int horizontal_add_u32x4(const uint32x4_t a) {
+static INLINE int64_t horizontal_long_add_s32x4(const int32x4_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddlvq_s32(a);
+#else
+ const int64x2_t b = vpaddlq_s32(a);
+ return vgetq_lane_s64(b, 0) + vgetq_lane_s64(b, 1);
+#endif
+}
+
+static INLINE uint32_t horizontal_add_u32x4(const uint32x4_t a) {
#if AOM_ARCH_AARCH64
return vaddvq_u32(a);
#else
@@ -88,6 +100,21 @@ static INLINE uint32x4_t horizontal_add_4d_u32x4(const uint32x4_t sum[4]) {
#endif
}
+static INLINE int32x4_t horizontal_add_4d_s32x4(const int32x4_t sum[4]) {
+#if AOM_ARCH_AARCH64
+ int32x4_t res01 = vpaddq_s32(sum[0], sum[1]);
+ int32x4_t res23 = vpaddq_s32(sum[2], sum[3]);
+ return vpaddq_s32(res01, res23);
+#else
+ int32x4_t res = vdupq_n_s32(0);
+ res = vsetq_lane_s32(horizontal_add_s32x4(sum[0]), res, 0);
+ res = vsetq_lane_s32(horizontal_add_s32x4(sum[1]), res, 1);
+ res = vsetq_lane_s32(horizontal_add_s32x4(sum[2]), res, 2);
+ res = vsetq_lane_s32(horizontal_add_s32x4(sum[3]), res, 3);
+ return res;
+#endif
+}
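
horizontal_add_4d_s32x4 packs four independent horizontal sums into one vector: lane i of the result is the total of sum[i]. On AArch64 the pairwise tree does this in three vpaddq_s32 instructions. A scalar reference (illustrative only):

    #include <stdint.h>

    // Scalar reference for horizontal_add_4d_s32x4: lane i of the result
    // is the horizontal sum of input vector i.
    static void add_4d_s32_model(const int32_t sum[4][4], int32_t out[4]) {
      for (int i = 0; i < 4; ++i) {
        out[i] = sum[i][0] + sum[i][1] + sum[i][2] + sum[i][3];
      }
    }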
+
static INLINE uint32_t horizontal_long_add_u16x8(const uint16x8_t vec_lo,
const uint16x8_t vec_hi) {
#if AOM_ARCH_AARCH64
@@ -186,3 +213,72 @@ static INLINE uint32_t horizontal_add_u16x4(const uint16x4_t a) {
return vget_lane_u32(vreinterpret_u32_u64(c), 0);
#endif
}
+
+static INLINE int32x4_t horizontal_add_2d_s32(int32x4_t a, int32x4_t b) {
+#if AOM_ARCH_AARCH64
+ return vpaddq_s32(a, b);
+#else
+ const int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
+ const int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
+ return vcombine_s32(a0, b0);
+#endif
+}
+
+static INLINE int32x2_t add_pairwise_s32x4(int32x4_t a) {
+#if AOM_ARCH_AARCH64
+ return vget_low_s32(vpaddq_s32(a, a));
+#else
+ return vpadd_s32(vget_low_s32(a), vget_high_s32(a));
+#endif
+}
+
+static INLINE uint64_t horizontal_long_add_u32x4_x2(const uint32x4_t a[2]) {
+ return horizontal_long_add_u32x4(a[0]) + horizontal_long_add_u32x4(a[1]);
+}
+
+static INLINE uint64_t horizontal_long_add_u32x4_x4(const uint32x4_t a[4]) {
+ uint64x2_t sum = vpaddlq_u32(a[0]);
+ sum = vpadalq_u32(sum, a[1]);
+ sum = vpadalq_u32(sum, a[2]);
+ sum = vpadalq_u32(sum, a[3]);
+
+ return horizontal_add_u64x2(sum);
+}
+
+static INLINE uint64_t horizontal_long_add_u32x4_x8(const uint32x4_t a[8]) {
+ uint64x2_t sum[2];
+ sum[0] = vpaddlq_u32(a[0]);
+ sum[1] = vpaddlq_u32(a[1]);
+ sum[0] = vpadalq_u32(sum[0], a[2]);
+ sum[1] = vpadalq_u32(sum[1], a[3]);
+ sum[0] = vpadalq_u32(sum[0], a[4]);
+ sum[1] = vpadalq_u32(sum[1], a[5]);
+ sum[0] = vpadalq_u32(sum[0], a[6]);
+ sum[1] = vpadalq_u32(sum[1], a[7]);
+
+ return horizontal_add_u64x2(vaddq_u64(sum[0], sum[1]));
+}
+
+static INLINE uint64_t horizontal_long_add_u32x4_x16(const uint32x4_t a[16]) {
+ uint64x2_t sum[2];
+ sum[0] = vpaddlq_u32(a[0]);
+ sum[1] = vpaddlq_u32(a[1]);
+ sum[0] = vpadalq_u32(sum[0], a[2]);
+ sum[1] = vpadalq_u32(sum[1], a[3]);
+ sum[0] = vpadalq_u32(sum[0], a[4]);
+ sum[1] = vpadalq_u32(sum[1], a[5]);
+ sum[0] = vpadalq_u32(sum[0], a[6]);
+ sum[1] = vpadalq_u32(sum[1], a[7]);
+ sum[0] = vpadalq_u32(sum[0], a[8]);
+ sum[1] = vpadalq_u32(sum[1], a[9]);
+ sum[0] = vpadalq_u32(sum[0], a[10]);
+ sum[1] = vpadalq_u32(sum[1], a[11]);
+ sum[0] = vpadalq_u32(sum[0], a[12]);
+ sum[1] = vpadalq_u32(sum[1], a[13]);
+ sum[0] = vpadalq_u32(sum[0], a[14]);
+ sum[1] = vpadalq_u32(sum[1], a[15]);
+
+ return horizontal_add_u64x2(vaddq_u64(sum[0], sum[1]));
+}
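
The _x4/_x8/_x16 reductions widen with vpaddlq_u32 and fold further vectors in with vpadalq_u32; the _x8 and _x16 variants keep two uint64x2_t accumulators so consecutive accumulate instructions form independent dependency chains. All of them compute the same value as this scalar reference (illustrative only):

    #include <stdint.h>

    // Scalar reference for the horizontal_long_add_u32x4_xN family: widen
    // every 32-bit lane to 64 bits and sum across all n vectors.
    static uint64_t long_add_u32_model(const uint32_t lanes[][4], int n) {
      uint64_t sum = 0;
      for (int v = 0; v < n; ++v) {
        for (int l = 0; l < 4; ++l) sum += (uint64_t)lanes[v][l];
      }
      return sum;
    }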
+
+#endif // AOM_AOM_DSP_ARM_SUM_NEON_H_
diff --git a/aom_dsp/arm/sum_squares_neon.c b/aom_dsp/arm/sum_squares_neon.c
index 626cf2170..424b2b444 100644
--- a/aom_dsp/arm/sum_squares_neon.c
+++ b/aom_dsp/arm/sum_squares_neon.c
@@ -287,130 +287,6 @@ uint64_t aom_sum_squares_i16_neon(const int16_t *src, uint32_t n) {
return aom_sum_squares_i16_c(src, n);
}
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE uint64_t aom_var_2d_u8_4xh_neon(uint8_t *src, int src_stride,
- int width, int height) {
- uint64_t sum = 0;
- uint64_t sse = 0;
- uint32x2_t sum_u32 = vdup_n_u32(0);
- uint32x2_t sse_u32 = vdup_n_u32(0);
-
- int h = height / 2;
- do {
- int w = width;
- uint8_t *src_ptr = src;
- do {
- uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
-
- sum_u32 = vdot_u32(sum_u32, s0, vdup_n_u8(1));
-
- sse_u32 = vdot_u32(sse_u32, s0, s0);
-
- src_ptr += 8;
- w -= 8;
- } while (w >= 8);
-
- // Process remaining columns in the row using C.
- while (w > 0) {
- int idx = width - w;
- const uint8_t v = src[idx];
- sum += v;
- sse += v * v;
- w--;
- }
-
- src += 2 * src_stride;
- } while (--h != 0);
-
- sum += horizontal_long_add_u32x2(sum_u32);
- sse += horizontal_long_add_u32x2(sse_u32);
-
- return sse - sum * sum / (width * height);
-}
-
-static INLINE uint64_t aom_var_2d_u8_8xh_neon(uint8_t *src, int src_stride,
- int width, int height) {
- uint64_t sum = 0;
- uint64_t sse = 0;
- uint32x2_t sum_u32 = vdup_n_u32(0);
- uint32x2_t sse_u32 = vdup_n_u32(0);
-
- int h = height;
- do {
- int w = width;
- uint8_t *src_ptr = src;
- do {
- uint8x8_t s0 = vld1_u8(src_ptr);
-
- sum_u32 = vdot_u32(sum_u32, s0, vdup_n_u8(1));
-
- sse_u32 = vdot_u32(sse_u32, s0, s0);
-
- src_ptr += 8;
- w -= 8;
- } while (w >= 8);
-
- // Process remaining columns in the row using C.
- while (w > 0) {
- int idx = width - w;
- const uint8_t v = src[idx];
- sum += v;
- sse += v * v;
- w--;
- }
-
- src += src_stride;
- } while (--h != 0);
-
- sum += horizontal_long_add_u32x2(sum_u32);
- sse += horizontal_long_add_u32x2(sse_u32);
-
- return sse - sum * sum / (width * height);
-}
-
-static INLINE uint64_t aom_var_2d_u8_16xh_neon(uint8_t *src, int src_stride,
- int width, int height) {
- uint64_t sum = 0;
- uint64_t sse = 0;
- uint32x4_t sum_u32 = vdupq_n_u32(0);
- uint32x4_t sse_u32 = vdupq_n_u32(0);
-
- int h = height;
- do {
- int w = width;
- uint8_t *src_ptr = src;
- do {
- uint8x16_t s0 = vld1q_u8(src_ptr);
-
- sum_u32 = vdotq_u32(sum_u32, s0, vdupq_n_u8(1));
-
- sse_u32 = vdotq_u32(sse_u32, s0, s0);
-
- src_ptr += 16;
- w -= 16;
- } while (w >= 16);
-
- // Process remaining columns in the row using C.
- while (w > 0) {
- int idx = width - w;
- const uint8_t v = src[idx];
- sum += v;
- sse += v * v;
- w--;
- }
-
- src += src_stride;
- } while (--h != 0);
-
- sum += horizontal_long_add_u32x4(sum_u32);
- sse += horizontal_long_add_u32x4(sse_u32);
-
- return sse - sum * sum / (width * height);
-}
-
-#else // !defined(__ARM_FEATURE_DOTPROD)
-
static INLINE uint64_t aom_var_2d_u8_4xh_neon(uint8_t *src, int src_stride,
int width, int height) {
uint64_t sum = 0;
@@ -584,8 +460,6 @@ static INLINE uint64_t aom_var_2d_u8_16xh_neon(uint8_t *src, int src_stride,
return sse - sum * sum / (width * height);
}
-#endif // defined(__ARM_FEATURE_DOTPROD)
-
uint64_t aom_var_2d_u8_neon(uint8_t *src, int src_stride, int width,
int height) {
if (width >= 16) {
diff --git a/aom_dsp/arm/sum_squares_neon_dotprod.c b/aom_dsp/arm/sum_squares_neon_dotprod.c
new file mode 100644
index 000000000..44462a693
--- /dev/null
+++ b/aom_dsp/arm/sum_squares_neon_dotprod.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE uint64_t aom_var_2d_u8_4xh_neon_dotprod(uint8_t *src,
+ int src_stride, int width,
+ int height) {
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x2_t sum_u32 = vdup_n_u32(0);
+ uint32x2_t sse_u32 = vdup_n_u32(0);
+
+ int h = height / 2;
+ do {
+ int w = width;
+ uint8_t *src_ptr = src;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+
+ sum_u32 = vdot_u32(sum_u32, s0, vdup_n_u8(1));
+
+ sse_u32 = vdot_u32(sse_u32, s0, s0);
+
+ src_ptr += 8;
+ w -= 8;
+ } while (w >= 8);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint8_t v = src[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src += 2 * src_stride;
+ } while (--h != 0);
+
+ sum += horizontal_long_add_u32x2(sum_u32);
+ sse += horizontal_long_add_u32x2(sse_u32);
+
+ return sse - sum * sum / (width * height);
+}
+
+static INLINE uint64_t aom_var_2d_u8_8xh_neon_dotprod(uint8_t *src,
+ int src_stride, int width,
+ int height) {
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x2_t sum_u32 = vdup_n_u32(0);
+ uint32x2_t sse_u32 = vdup_n_u32(0);
+
+ int h = height;
+ do {
+ int w = width;
+ uint8_t *src_ptr = src;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+
+ sum_u32 = vdot_u32(sum_u32, s0, vdup_n_u8(1));
+
+ sse_u32 = vdot_u32(sse_u32, s0, s0);
+
+ src_ptr += 8;
+ w -= 8;
+ } while (w >= 8);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint8_t v = src[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src += src_stride;
+ } while (--h != 0);
+
+ sum += horizontal_long_add_u32x2(sum_u32);
+ sse += horizontal_long_add_u32x2(sse_u32);
+
+ return sse - sum * sum / (width * height);
+}
+
+static INLINE uint64_t aom_var_2d_u8_16xh_neon_dotprod(uint8_t *src,
+ int src_stride,
+ int width, int height) {
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x4_t sum_u32 = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int h = height;
+ do {
+ int w = width;
+ uint8_t *src_ptr = src;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+
+ sum_u32 = vdotq_u32(sum_u32, s0, vdupq_n_u8(1));
+
+ sse_u32 = vdotq_u32(sse_u32, s0, s0);
+
+ src_ptr += 16;
+ w -= 16;
+ } while (w >= 16);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint8_t v = src[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src += src_stride;
+ } while (--h != 0);
+
+ sum += horizontal_long_add_u32x4(sum_u32);
+ sse += horizontal_long_add_u32x4(sse_u32);
+
+ return sse - sum * sum / (width * height);
+}
+
+uint64_t aom_var_2d_u8_neon_dotprod(uint8_t *src, int src_stride, int width,
+ int height) {
+ if (width >= 16) {
+ return aom_var_2d_u8_16xh_neon_dotprod(src, src_stride, width, height);
+ }
+ if (width >= 8) {
+ return aom_var_2d_u8_8xh_neon_dotprod(src, src_stride, width, height);
+ }
+ if (width >= 4 && height % 2 == 0) {
+ return aom_var_2d_u8_4xh_neon_dotprod(src, src_stride, width, height);
+ }
+ return aom_var_2d_u8_c(src, src_stride, width, height);
+}
diff --git a/aom_dsp/arm/transpose_neon.h b/aom_dsp/arm/transpose_neon.h
index 8218140f5..b215f6aeb 100644
--- a/aom_dsp/arm/transpose_neon.h
+++ b/aom_dsp/arm/transpose_neon.h
@@ -13,16 +13,14 @@
#include <arm_neon.h>
+#include "aom/aom_integer.h" // For AOM_FORCE_INLINE.
#include "config/aom_config.h"
-// Swap high and low halves.
-static INLINE uint16x8_t transpose64_u16q(const uint16x8_t a) {
- return vextq_u16(a, a, 4);
-}
-
-static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
- uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
- uint8x8_t *a6, uint8x8_t *a7) {
+static INLINE void transpose_elems_inplace_u8_8x8(uint8x8_t *a0, uint8x8_t *a1,
+ uint8x8_t *a2, uint8x8_t *a3,
+ uint8x8_t *a4, uint8x8_t *a5,
+ uint8x8_t *a6,
+ uint8x8_t *a7) {
// Swap 8 bit elements. Goes from:
// a0: 00 01 02 03 04 05 06 07
// a1: 10 11 12 13 14 15 16 17
@@ -74,8 +72,9 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
*a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}
-static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
- uint8x8_t *a3) {
+static INLINE void transpose_elems_inplace_u8_8x4(uint8x8_t *a0, uint8x8_t *a1,
+ uint8x8_t *a2,
+ uint8x8_t *a3) {
// Swap 8 bit elements. Goes from:
// a0: 00 01 02 03 04 05 06 07
// a1: 10 11 12 13 14 15 16 17
@@ -107,7 +106,8 @@ static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
*a3 = vreinterpret_u8_u16(c1.val[1]);
}
-static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
+static INLINE void transpose_elems_inplace_u8_4x4(uint8x8_t *a0,
+ uint8x8_t *a1) {
// Swap 16 bit elements. Goes from:
// a0: 00 01 02 03 10 11 12 13
// a1: 20 21 22 23 30 31 32 33
@@ -136,10 +136,12 @@ static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
*a1 = d0.val[1];
}
-static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
- uint8x8_t *a3, const uint8x8_t a4,
- const uint8x8_t a5, const uint8x8_t a6,
- const uint8x8_t a7) {
+static INLINE void transpose_elems_u8_4x8(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x8_t a4, uint8x8_t a5,
+ uint8x8_t a6, uint8x8_t a7,
+ uint8x8_t *o0, uint8x8_t *o1,
+ uint8x8_t *o2, uint8x8_t *o3) {
// Swap 32 bit elements. Goes from:
// a0: 00 01 02 03 XX XX XX XX
// a1: 10 11 12 13 XX XX XX XX
@@ -156,13 +158,13 @@ static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
// b3.val[0]: 30 31 32 33 70 71 72 73
const uint32x2x2_t b0 =
- vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
+ vtrn_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));
const uint32x2x2_t b1 =
- vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
+ vtrn_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));
const uint32x2x2_t b2 =
- vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
+ vtrn_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));
const uint32x2x2_t b3 =
- vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));
+ vtrn_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));
// Swap 16 bit elements resulting in:
// c0.val[0]: 00 01 20 21 40 41 60 61
@@ -186,23 +188,19 @@ static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
const uint8x8x2_t d1 =
vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));
- *a0 = d0.val[0];
- *a1 = d0.val[1];
- *a2 = d1.val[0];
- *a3 = d1.val[1];
+ *o0 = d0.val[0];
+ *o1 = d0.val[1];
+ *o2 = d1.val[0];
+ *o3 = d1.val[1];
}
-// Input:
-// 00 01 02 03
-// 10 11 12 13
-// 20 21 22 23
-// 30 31 32 33
-// Output:
-// 00 10 20 30
-// 01 11 21 31
-// 02 12 22 32
-// 03 13 23 33
-static INLINE void transpose_u16_4x4(uint16x4_t a[4]) {
+static INLINE void transpose_array_inplace_u16_4x4(uint16x4_t a[4]) {
+ // Input:
+ // 00 01 02 03
+ // 10 11 12 13
+ // 20 21 22 23
+ // 30 31 32 33
+
// b:
// 00 10 02 12
// 01 11 03 13
@@ -221,23 +219,25 @@ static INLINE void transpose_u16_4x4(uint16x4_t a[4]) {
// 03 13 23 33
const uint32x2x2_t e =
vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
+
+ // Output:
+ // 00 10 20 30
+ // 01 11 21 31
+ // 02 12 22 32
+ // 03 13 23 33
a[0] = vreinterpret_u16_u32(d.val[0]);
a[1] = vreinterpret_u16_u32(e.val[0]);
a[2] = vreinterpret_u16_u32(d.val[1]);
a[3] = vreinterpret_u16_u32(e.val[1]);
}
-// 4x8 Input:
-// a[0]: 00 01 02 03 04 05 06 07
-// a[1]: 10 11 12 13 14 15 16 17
-// a[2]: 20 21 22 23 24 25 26 27
-// a[3]: 30 31 32 33 34 35 36 37
-// 8x4 Output:
-// a[0]: 00 10 20 30 04 14 24 34
-// a[1]: 01 11 21 31 05 15 25 35
-// a[2]: 02 12 22 32 06 16 26 36
-// a[3]: 03 13 23 33 07 17 27 37
-static INLINE void transpose_u16_4x8q(uint16x8_t a[4]) {
+static INLINE void transpose_array_inplace_u16_4x8(uint16x8_t a[4]) {
+ // 4x8 Input:
+ // a[0]: 00 01 02 03 04 05 06 07
+ // a[1]: 10 11 12 13 14 15 16 17
+ // a[2]: 20 21 22 23 24 25 26 27
+ // a[3]: 30 31 32 33 34 35 36 37
+
// b0.val[0]: 00 10 02 12 04 14 06 16
// b0.val[1]: 01 11 03 13 05 15 07 17
// b1.val[0]: 20 30 22 32 24 34 26 36
@@ -254,6 +254,11 @@ static INLINE void transpose_u16_4x8q(uint16x8_t a[4]) {
const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
vreinterpretq_u32_u16(b1.val[1]));
+ // 8x4 Output:
+ // a[0]: 00 10 20 30 04 14 24 34
+ // a[1]: 01 11 21 31 05 15 25 35
+ // a[2]: 02 12 22 32 06 16 26 36
+ // a[3]: 03 13 23 33 07 17 27 37
a[0] = vreinterpretq_u16_u32(c0.val[0]);
a[1] = vreinterpretq_u16_u32(c1.val[0]);
a[2] = vreinterpretq_u16_u32(c0.val[1]);
@@ -345,12 +350,11 @@ static INLINE void loop_filter_transpose_u16_4x8q(uint16x8_t a[4]) {
a[3] = d0.val[0]; // p3q3
}
-static INLINE void transpose_u16_4x8(uint16x4_t *a0, uint16x4_t *a1,
- uint16x4_t *a2, uint16x4_t *a3,
- uint16x4_t *a4, uint16x4_t *a5,
- uint16x4_t *a6, uint16x4_t *a7,
- uint16x8_t *o0, uint16x8_t *o1,
- uint16x8_t *o2, uint16x8_t *o3) {
+static INLINE void transpose_elems_u16_4x8(
+ const uint16x4_t a0, const uint16x4_t a1, const uint16x4_t a2,
+ const uint16x4_t a3, const uint16x4_t a4, const uint16x4_t a5,
+ const uint16x4_t a6, const uint16x4_t a7, uint16x8_t *o0, uint16x8_t *o1,
+ uint16x8_t *o2, uint16x8_t *o3) {
// Combine rows. Goes from:
// a0: 00 01 02 03
// a1: 10 11 12 13
@@ -366,10 +370,10 @@ static INLINE void transpose_u16_4x8(uint16x4_t *a0, uint16x4_t *a1,
// b2: 20 21 22 23 60 61 62 63
// b3: 30 31 32 33 70 71 72 73
- const uint16x8_t b0 = vcombine_u16(*a0, *a4);
- const uint16x8_t b1 = vcombine_u16(*a1, *a5);
- const uint16x8_t b2 = vcombine_u16(*a2, *a6);
- const uint16x8_t b3 = vcombine_u16(*a3, *a7);
+ const uint16x8_t b0 = vcombine_u16(a0, a4);
+ const uint16x8_t b1 = vcombine_u16(a1, a5);
+ const uint16x8_t b2 = vcombine_u16(a2, a6);
+ const uint16x8_t b3 = vcombine_u16(a3, a7);
// Swap 16 bit elements resulting in:
// c0.val[0]: 00 10 02 12 40 50 42 52
@@ -397,12 +401,11 @@ static INLINE void transpose_u16_4x8(uint16x4_t *a0, uint16x4_t *a1,
*o3 = vreinterpretq_u16_u32(d1.val[1]);
}
-static INLINE void transpose_s16_4x8(int16x4_t *a0, int16x4_t *a1,
- int16x4_t *a2, int16x4_t *a3,
- int16x4_t *a4, int16x4_t *a5,
- int16x4_t *a6, int16x4_t *a7,
- int16x8_t *o0, int16x8_t *o1,
- int16x8_t *o2, int16x8_t *o3) {
+static INLINE void transpose_elems_s16_4x8(
+ const int16x4_t a0, const int16x4_t a1, const int16x4_t a2,
+ const int16x4_t a3, const int16x4_t a4, const int16x4_t a5,
+ const int16x4_t a6, const int16x4_t a7, int16x8_t *o0, int16x8_t *o1,
+ int16x8_t *o2, int16x8_t *o3) {
// Combine rows. Goes from:
// a0: 00 01 02 03
// a1: 10 11 12 13
@@ -418,10 +421,10 @@ static INLINE void transpose_s16_4x8(int16x4_t *a0, int16x4_t *a1,
// b2: 20 21 22 23 60 61 62 63
// b3: 30 31 32 33 70 71 72 73
- const int16x8_t b0 = vcombine_s16(*a0, *a4);
- const int16x8_t b1 = vcombine_s16(*a1, *a5);
- const int16x8_t b2 = vcombine_s16(*a2, *a6);
- const int16x8_t b3 = vcombine_s16(*a3, *a7);
+ const int16x8_t b0 = vcombine_s16(a0, a4);
+ const int16x8_t b1 = vcombine_s16(a1, a5);
+ const int16x8_t b2 = vcombine_s16(a2, a6);
+ const int16x8_t b3 = vcombine_s16(a3, a7);
// Swap 16 bit elements resulting in:
// c0.val[0]: 00 10 02 12 40 50 42 52
@@ -449,10 +452,9 @@ static INLINE void transpose_s16_4x8(int16x4_t *a0, int16x4_t *a1,
*o3 = vreinterpretq_s16_s32(d1.val[1]);
}
-static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
- uint16x8_t *a2, uint16x8_t *a3,
- uint16x8_t *a4, uint16x8_t *a5,
- uint16x8_t *a6, uint16x8_t *a7) {
+static INLINE void transpose_elems_inplace_u16_8x8(
+ uint16x8_t *a0, uint16x8_t *a1, uint16x8_t *a2, uint16x8_t *a3,
+ uint16x8_t *a4, uint16x8_t *a5, uint16x8_t *a6, uint16x8_t *a7) {
// Swap 16 bit elements. Goes from:
// a0: 00 01 02 03 04 05 06 07
// a1: 10 11 12 13 14 15 16 17
@@ -537,10 +539,11 @@ static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
return b0;
}
-static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
- int16x8_t *a2, int16x8_t *a3,
- int16x8_t *a4, int16x8_t *a5,
- int16x8_t *a6, int16x8_t *a7) {
+static INLINE void transpose_elems_inplace_s16_8x8(int16x8_t *a0, int16x8_t *a1,
+ int16x8_t *a2, int16x8_t *a3,
+ int16x8_t *a4, int16x8_t *a5,
+ int16x8_t *a6,
+ int16x8_t *a7) {
// Swap 16 bit elements. Goes from:
// a0: 00 01 02 03 04 05 06 07
// a1: 10 11 12 13 14 15 16 17
@@ -609,7 +612,8 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
*a7 = d3.val[1];
}
-static INLINE void transpose_s16_8x8q(int16x8_t *a, int16x8_t *out) {
+static INLINE void transpose_arrays_s16_8x8(const int16x8_t *a,
+ int16x8_t *out) {
// Swap 16 bit elements. Goes from:
// a0: 00 01 02 03 04 05 06 07
// a1: 10 11 12 13 14 15 16 17
@@ -678,8 +682,10 @@ static INLINE void transpose_s16_8x8q(int16x8_t *a, int16x8_t *out) {
out[7] = d3.val[1];
}
-static INLINE void transpose_u16_4x4d(uint16x4_t *a0, uint16x4_t *a1,
- uint16x4_t *a2, uint16x4_t *a3) {
+static INLINE void transpose_elems_inplace_u16_4x4(uint16x4_t *a0,
+ uint16x4_t *a1,
+ uint16x4_t *a2,
+ uint16x4_t *a3) {
// Swap 16 bit elements. Goes from:
// a0: 00 01 02 03
// a1: 10 11 12 13
@@ -711,8 +717,9 @@ static INLINE void transpose_u16_4x4d(uint16x4_t *a0, uint16x4_t *a1,
*a3 = vreinterpret_u16_u32(c1.val[1]);
}
-static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
- int16x4_t *a2, int16x4_t *a3) {
+static INLINE void transpose_elems_inplace_s16_4x4(int16x4_t *a0, int16x4_t *a1,
+ int16x4_t *a2,
+ int16x4_t *a3) {
// Swap 16 bit elements. Goes from:
// a0: 00 01 02 03
// a1: 10 11 12 13
@@ -758,8 +765,12 @@ static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
return b0;
}
-static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
- int32x4_t *a2, int32x4_t *a3) {
+static INLINE void transpose_elems_s32_4x4(const int32x4_t a0,
+ const int32x4_t a1,
+ const int32x4_t a2,
+ const int32x4_t a3, int32x4_t *o0,
+ int32x4_t *o1, int32x4_t *o2,
+ int32x4_t *o3) {
// Swap 32 bit elements. Goes from:
// a0: 00 01 02 03
// a1: 10 11 12 13
@@ -771,8 +782,8 @@ static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
// b1.val[0]: 20 30 22 32
// b1.val[1]: 21 31 23 33
- const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
- const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);
+ const int32x4x2_t b0 = vtrnq_s32(a0, a1);
+ const int32x4x2_t b1 = vtrnq_s32(a2, a3);
// Swap 64 bit elements resulting in:
// c0.val[0]: 00 10 20 30
@@ -783,10 +794,267 @@ static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
- *a0 = c0.val[0];
- *a1 = c1.val[0];
- *a2 = c0.val[1];
- *a3 = c1.val[1];
+ *o0 = c0.val[0];
+ *o1 = c1.val[0];
+ *o2 = c0.val[1];
+ *o3 = c1.val[1];
+}
+
+static INLINE void transpose_elems_inplace_s32_4x4(int32x4_t *a0, int32x4_t *a1,
+ int32x4_t *a2,
+ int32x4_t *a3) {
+ transpose_elems_s32_4x4(*a0, *a1, *a2, *a3, a0, a1, a2, a3);
+}
+
+static INLINE void transpose_arrays_s32_4x4(const int32x4_t *in,
+ int32x4_t *out) {
+ transpose_elems_s32_4x4(in[0], in[1], in[2], in[3], &out[0], &out[1], &out[2],
+ &out[3]);
+}
+
+static AOM_FORCE_INLINE void transpose_arrays_s32_4nx4n(const int32x4_t *in,
+ int32x4_t *out,
+ const int width,
+ const int height) {
+ const int h = height >> 2;
+ const int w = width >> 2;
+ for (int j = 0; j < w; j++) {
+ for (int i = 0; i < h; i++) {
+ transpose_arrays_s32_4x4(in + j * height + i * 4,
+ out + i * width + j * 4);
+ }
+ }
+}
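
transpose_arrays_s32_4nx4n walks the matrix in 4x4 vector tiles: the tile at column strip j, tile row i of the input lands at strip i, row j of the output, with each strip holding height (resp. width) consecutive vectors. An element-level scalar model of the same tile walk (illustrative only):

    #include <stdint.h>

    // Element-level model of transpose_arrays_s32_4nx4n. 'in' holds
    // width/4 strips of 'height' four-lane rows; 'out' holds height/4
    // strips of 'width' four-lane rows.
    static void transpose_4nx4n_model(const int32_t (*in)[4],
                                      int32_t (*out)[4], int width,
                                      int height) {
      for (int j = 0; j < width / 4; ++j) {
        for (int i = 0; i < height / 4; ++i) {
          for (int r = 0; r < 4; ++r) {
            for (int c = 0; c < 4; ++c) {
              out[i * width + j * 4 + r][c] = in[j * height + i * 4 + c][r];
            }
          }
        }
      }
    }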
+
+#define TRANSPOSE_ARRAYS_S32_WXH_NEON(w, h) \
+ static AOM_FORCE_INLINE void transpose_arrays_s32_##w##x##h( \
+ const int32x4_t *in, int32x4_t *out) { \
+ transpose_arrays_s32_4nx4n(in, out, w, h); \
+ }
+
+TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 8)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 4)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 8)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 32)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 8)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 32)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 64)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 8)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 32)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 64)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 32)
+
+#undef TRANSPOSE_ARRAYS_S32_WXH_NEON
+
+static INLINE int64x2_t aom_vtrn1q_s64(int64x2_t a, int64x2_t b) {
+#if AOM_ARCH_AARCH64
+ return vtrn1q_s64(a, b);
+#else
+ return vcombine_s64(vget_low_s64(a), vget_low_s64(b));
+#endif
+}
+
+static INLINE int64x2_t aom_vtrn2q_s64(int64x2_t a, int64x2_t b) {
+#if AOM_ARCH_AARCH64
+ return vtrn2q_s64(a, b);
+#else
+ return vcombine_s64(vget_high_s64(a), vget_high_s64(b));
+#endif
+}
+
+static INLINE void transpose_elems_s32_4x8(int32x4_t a0, int32x4_t a1,
+ int32x4_t a2, int32x4_t a3,
+ int32x4_t a4, int32x4_t a5,
+ int32x4_t a6, int32x4_t a7,
+ int32x4x2_t *o0, int32x4x2_t *o1,
+ int32x4x2_t *o2, int32x4x2_t *o3) {
+ // Perform a 4 x 8 matrix transpose by building on top of the existing 4 x 4
+ // matrix transpose implementation:
+ // [ A ]^T => [ A^T B^T ]
+ // [ B ]
+
+ transpose_elems_inplace_s32_4x4(&a0, &a1, &a2, &a3); // A^T
+ transpose_elems_inplace_s32_4x4(&a4, &a5, &a6, &a7); // B^T
+
+ o0->val[0] = a0;
+ o1->val[0] = a1;
+ o2->val[0] = a2;
+ o3->val[0] = a3;
+
+ o0->val[1] = a4;
+ o1->val[1] = a5;
+ o2->val[1] = a6;
+ o3->val[1] = a7;
+}
+
+static INLINE void transpose_elems_inplace_s32_8x8(
+ int32x4x2_t *a0, int32x4x2_t *a1, int32x4x2_t *a2, int32x4x2_t *a3,
+ int32x4x2_t *a4, int32x4x2_t *a5, int32x4x2_t *a6, int32x4x2_t *a7) {
+ // Perform an 8 x 8 matrix transpose by building on top of the existing 4 x 4
+ // matrix transpose implementation:
+ // [ A B ]^T => [ A^T C^T ]
+ // [ C D ] [ B^T D^T ]
+
+ int32x4_t q0_v1 = a0->val[0];
+ int32x4_t q0_v2 = a1->val[0];
+ int32x4_t q0_v3 = a2->val[0];
+ int32x4_t q0_v4 = a3->val[0];
+
+ int32x4_t q1_v1 = a0->val[1];
+ int32x4_t q1_v2 = a1->val[1];
+ int32x4_t q1_v3 = a2->val[1];
+ int32x4_t q1_v4 = a3->val[1];
+
+ int32x4_t q2_v1 = a4->val[0];
+ int32x4_t q2_v2 = a5->val[0];
+ int32x4_t q2_v3 = a6->val[0];
+ int32x4_t q2_v4 = a7->val[0];
+
+ int32x4_t q3_v1 = a4->val[1];
+ int32x4_t q3_v2 = a5->val[1];
+ int32x4_t q3_v3 = a6->val[1];
+ int32x4_t q3_v4 = a7->val[1];
+
+ transpose_elems_inplace_s32_4x4(&q0_v1, &q0_v2, &q0_v3, &q0_v4); // A^T
+ transpose_elems_inplace_s32_4x4(&q1_v1, &q1_v2, &q1_v3, &q1_v4); // B^T
+ transpose_elems_inplace_s32_4x4(&q2_v1, &q2_v2, &q2_v3, &q2_v4); // C^T
+ transpose_elems_inplace_s32_4x4(&q3_v1, &q3_v2, &q3_v3, &q3_v4); // D^T
+
+ a0->val[0] = q0_v1;
+ a1->val[0] = q0_v2;
+ a2->val[0] = q0_v3;
+ a3->val[0] = q0_v4;
+
+ a0->val[1] = q2_v1;
+ a1->val[1] = q2_v2;
+ a2->val[1] = q2_v3;
+ a3->val[1] = q2_v4;
+
+ a4->val[0] = q1_v1;
+ a5->val[0] = q1_v2;
+ a6->val[0] = q1_v3;
+ a7->val[0] = q1_v4;
+
+ a4->val[1] = q3_v1;
+ a5->val[1] = q3_v2;
+ a6->val[1] = q3_v3;
+ a7->val[1] = q3_v4;
+}
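
Both block transposes above reduce to the identity [A B; C D]^T = [A^T C^T; B^T D^T]: each quadrant is transposed by the 4x4 primitive and the off-diagonal quadrants swap positions (the 4x8 case is the degenerate one-column version). A worked scalar model of the quadrant swap (illustrative only):

    #include <stdint.h>

    // Quadrant-swap model of transpose_elems_inplace_s32_8x8: transpose
    // each 4x4 quadrant in place, then exchange quadrants B and C.
    static void transpose8x8_model(int32_t m[8][8]) {
      for (int qr = 0; qr < 2; ++qr) {
        for (int qc = 0; qc < 2; ++qc) {  // transpose each quadrant
          for (int r = 0; r < 4; ++r) {
            for (int c = r + 1; c < 4; ++c) {
              int32_t t = m[qr * 4 + r][qc * 4 + c];
              m[qr * 4 + r][qc * 4 + c] = m[qr * 4 + c][qc * 4 + r];
              m[qr * 4 + c][qc * 4 + r] = t;
            }
          }
        }
      }
      for (int r = 0; r < 4; ++r) {  // swap the off-diagonal quadrants
        for (int c = 0; c < 4; ++c) {
          int32_t t = m[r][4 + c];
          m[r][4 + c] = m[4 + r][c];
          m[4 + r][c] = t;
        }
      }
    }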
+
+static INLINE void transpose_arrays_s16_4x4(const int16x4_t *const in,
+ int16x4_t *const out) {
+ int16x4_t a0 = in[0];
+ int16x4_t a1 = in[1];
+ int16x4_t a2 = in[2];
+ int16x4_t a3 = in[3];
+
+ transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3);
+
+ out[0] = a0;
+ out[1] = a1;
+ out[2] = a2;
+ out[3] = a3;
+}
+
+static INLINE void transpose_arrays_s16_4x8(const int16x4_t *const in,
+ int16x8_t *const out) {
+#if AOM_ARCH_AARCH64
+ const int16x8_t a0 = vzip1q_s16(vcombine_s16(in[0], vdup_n_s16(0)),
+ vcombine_s16(in[1], vdup_n_s16(0)));
+ const int16x8_t a1 = vzip1q_s16(vcombine_s16(in[2], vdup_n_s16(0)),
+ vcombine_s16(in[3], vdup_n_s16(0)));
+ const int16x8_t a2 = vzip1q_s16(vcombine_s16(in[4], vdup_n_s16(0)),
+ vcombine_s16(in[5], vdup_n_s16(0)));
+ const int16x8_t a3 = vzip1q_s16(vcombine_s16(in[6], vdup_n_s16(0)),
+ vcombine_s16(in[7], vdup_n_s16(0)));
+#else
+ int16x4x2_t temp;
+ temp = vzip_s16(in[0], in[1]);
+ const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
+ temp = vzip_s16(in[2], in[3]);
+ const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
+ temp = vzip_s16(in[4], in[5]);
+ const int16x8_t a2 = vcombine_s16(temp.val[0], temp.val[1]);
+ temp = vzip_s16(in[6], in[7]);
+ const int16x8_t a3 = vcombine_s16(temp.val[0], temp.val[1]);
+#endif
+
+ const int32x4x2_t b02 =
+ vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
+ const int32x4x2_t b13 =
+ vzipq_s32(vreinterpretq_s32_s16(a2), vreinterpretq_s32_s16(a3));
+
+#if AOM_ARCH_AARCH64
+ out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[0]),
+ vreinterpretq_s64_s32(b13.val[0])));
+ out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[0]),
+ vreinterpretq_s64_s32(b13.val[0])));
+ out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[1]),
+ vreinterpretq_s64_s32(b13.val[1])));
+ out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[1]),
+ vreinterpretq_s64_s32(b13.val[1])));
+#else
+ out[0] = vreinterpretq_s16_s32(
+ vextq_s32(vextq_s32(b02.val[0], b02.val[0], 2), b13.val[0], 2));
+ out[2] = vreinterpretq_s16_s32(
+ vextq_s32(vextq_s32(b02.val[1], b02.val[1], 2), b13.val[1], 2));
+ out[1] = vreinterpretq_s16_s32(
+ vextq_s32(b02.val[0], vextq_s32(b13.val[0], b13.val[0], 2), 2));
+ out[3] = vreinterpretq_s16_s32(
+ vextq_s32(b02.val[1], vextq_s32(b13.val[1], b13.val[1], 2), 2));
+#endif
+}
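
On AArch64 the 64-bit zips interleave vector halves directly; the Armv7 fallback builds the same selections out of vextq_s32 lane rotations: rotating one operand by two lanes before (or after) the extract picks out {low64(a), low64(b)} and {high64(a), high64(b)}. As standalone helpers (a sketch, assuming only vextq_s32):

    #include <arm_neon.h>

    // Armv7-compatible equivalents of the 64-bit zips used above.
    static inline int32x4_t zip1_s64_as_s32(int32x4_t a, int32x4_t b) {
      // {a[0], a[1], b[0], b[1]} == vzip1q_s64 on AArch64.
      return vextq_s32(vextq_s32(a, a, 2), b, 2);
    }

    static inline int32x4_t zip2_s64_as_s32(int32x4_t a, int32x4_t b) {
      // {a[2], a[3], b[2], b[3]} == vzip2q_s64 on AArch64.
      return vextq_s32(a, vextq_s32(b, b, 2), 2);
    }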
+
+static INLINE void transpose_arrays_s16_8x4(const int16x8_t *const in,
+ int16x4_t *const out) {
+ // Swap 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+
+ const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]);
+ const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[0]),
+ vreinterpretq_u32_s16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[1]),
+ vreinterpretq_u32_s16(b1.val[1]));
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ // out[4]: 04 14 24 34
+ // out[5]: 05 15 25 35
+ // out[6]: 06 16 26 36
+ // out[7]: 07 17 27 37
+
+ out[0] = vget_low_s16(vreinterpretq_s16_u32(c0.val[0]));
+ out[1] = vget_low_s16(vreinterpretq_s16_u32(c1.val[0]));
+ out[2] = vget_low_s16(vreinterpretq_s16_u32(c0.val[1]));
+ out[3] = vget_low_s16(vreinterpretq_s16_u32(c1.val[1]));
+ out[4] = vget_high_s16(vreinterpretq_s16_u32(c0.val[0]));
+ out[5] = vget_high_s16(vreinterpretq_s16_u32(c1.val[0]));
+ out[6] = vget_high_s16(vreinterpretq_s16_u32(c0.val[1]));
+ out[7] = vget_high_s16(vreinterpretq_s16_u32(c1.val[1]));
}
#endif // AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
diff --git a/aom_dsp/arm/variance_neon.c b/aom_dsp/arm/variance_neon.c
index 5e33996d2..9e4e8c0cf 100644
--- a/aom_dsp/arm/variance_neon.c
+++ b/aom_dsp/arm/variance_neon.c
@@ -11,153 +11,12 @@
#include <arm_neon.h>
-#include "config/aom_dsp_rtcd.h"
-#include "config/aom_config.h"
+#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
-#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
-
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE void variance_4xh_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride, int h,
- uint32_t *sse, int *sum) {
- uint32x4_t src_sum = vdupq_n_u32(0);
- uint32x4_t ref_sum = vdupq_n_u32(0);
- uint32x4_t sse_u32 = vdupq_n_u32(0);
-
- int i = h;
- do {
- uint8x16_t s = load_unaligned_u8q(src, src_stride);
- uint8x16_t r = load_unaligned_u8q(ref, ref_stride);
-
- src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
- ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
-
- uint8x16_t abs_diff = vabdq_u8(s, r);
- sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
-
- src += 4 * src_stride;
- ref += 4 * ref_stride;
- i -= 4;
- } while (i != 0);
-
- int32x4_t sum_diff =
- vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
- *sum = horizontal_add_s32x4(sum_diff);
- *sse = horizontal_add_u32x4(sse_u32);
-}
-
-static INLINE void variance_8xh_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride, int h,
- uint32_t *sse, int *sum) {
- uint32x4_t src_sum = vdupq_n_u32(0);
- uint32x4_t ref_sum = vdupq_n_u32(0);
- uint32x4_t sse_u32 = vdupq_n_u32(0);
-
- int i = h;
- do {
- uint8x16_t s = vcombine_u8(vld1_u8(src), vld1_u8(src + src_stride));
- uint8x16_t r = vcombine_u8(vld1_u8(ref), vld1_u8(ref + ref_stride));
-
- src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
- ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
-
- uint8x16_t abs_diff = vabdq_u8(s, r);
- sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
-
- src += 2 * src_stride;
- ref += 2 * ref_stride;
- i -= 2;
- } while (i != 0);
-
- int32x4_t sum_diff =
- vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
- *sum = horizontal_add_s32x4(sum_diff);
- *sse = horizontal_add_u32x4(sse_u32);
-}
-
-static INLINE void variance_16xh_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride, int h,
- uint32_t *sse, int *sum) {
- uint32x4_t src_sum = vdupq_n_u32(0);
- uint32x4_t ref_sum = vdupq_n_u32(0);
- uint32x4_t sse_u32 = vdupq_n_u32(0);
-
- int i = h;
- do {
- uint8x16_t s = vld1q_u8(src);
- uint8x16_t r = vld1q_u8(ref);
-
- src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
- ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
-
- uint8x16_t abs_diff = vabdq_u8(s, r);
- sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
-
- src += src_stride;
- ref += ref_stride;
- } while (--i != 0);
-
- int32x4_t sum_diff =
- vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
- *sum = horizontal_add_s32x4(sum_diff);
- *sse = horizontal_add_u32x4(sse_u32);
-}
-
-static INLINE void variance_large_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- int w, int h, uint32_t *sse, int *sum) {
- uint32x4_t src_sum = vdupq_n_u32(0);
- uint32x4_t ref_sum = vdupq_n_u32(0);
- uint32x4_t sse_u32 = vdupq_n_u32(0);
-
- int i = h;
- do {
- int j = 0;
- do {
- uint8x16_t s = vld1q_u8(src + j);
- uint8x16_t r = vld1q_u8(ref + j);
-
- src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
- ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
-
- uint8x16_t abs_diff = vabdq_u8(s, r);
- sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
-
- j += 16;
- } while (j < w);
-
- src += src_stride;
- ref += ref_stride;
- } while (--i != 0);
-
- int32x4_t sum_diff =
- vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
- *sum = horizontal_add_s32x4(sum_diff);
- *sse = horizontal_add_u32x4(sse_u32);
-}
-
-static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride, int h,
- uint32_t *sse, int *sum) {
- variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum);
-}
-
-static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride, int h,
- uint32_t *sse, int *sum) {
- variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum);
-}
-
-static INLINE void variance_128xh_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- int h, uint32_t *sse, int *sum) {
- variance_large_neon(src, src_stride, ref, ref_stride, 128, h, sse, sum);
-}
-
-#else // !defined(__ARM_FEATURE_DOTPROD)
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
static INLINE void variance_4xh_neon(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride, int h,
@@ -333,8 +192,6 @@ static INLINE void variance_128xh_neon(const uint8_t *src, int src_stride,
variance_large_neon(src, src_stride, ref, ref_stride, 128, h, 16, sse, sum);
}
-#endif // defined(__ARM_FEATURE_DOTPROD)
-
#define VARIANCE_WXH_NEON(w, h, shift) \
unsigned int aom_variance##w##x##h##_neon( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
@@ -382,7 +239,7 @@ void aom_get_var_sse_sum_8x8_quad_neon(const uint8_t *src, int src_stride,
uint32_t *sse8x8, int *sum8x8,
unsigned int *tot_sse, int *tot_sum,
uint32_t *var8x8) {
- // Loop over 4 8x8 blocks. Process one 8x32 block.
+ // Loop over four 8x8 blocks. Process one 8x32 block.
for (int k = 0; k < 4; k++) {
variance_8xh_neon(src + (k * 8), src_stride, ref + (k * 8), ref_stride, 8,
&sse8x8[k], &sum8x8[k]);
@@ -390,8 +247,9 @@ void aom_get_var_sse_sum_8x8_quad_neon(const uint8_t *src, int src_stride,
*tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
*tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];
- for (int i = 0; i < 4; i++)
+ for (int i = 0; i < 4; i++) {
var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6);
+ }
}
void aom_get_var_sse_sum_16x16_dual_neon(const uint8_t *src, int src_stride,
@@ -400,7 +258,7 @@ void aom_get_var_sse_sum_16x16_dual_neon(const uint8_t *src, int src_stride,
unsigned int *tot_sse, int *tot_sum,
uint32_t *var16x16) {
int sum16x16[2] = { 0 };
- // Loop over 2 16x16 blocks. Process one 16x32 block.
+ // Loop over two 16x16 blocks. Process one 16x32 block.
for (int k = 0; k < 2; k++) {
variance_16xh_neon(src + (k * 16), src_stride, ref + (k * 16), ref_stride,
16, &sse16x16[k], &sum16x16[k]);
@@ -408,65 +266,12 @@ void aom_get_var_sse_sum_16x16_dual_neon(const uint8_t *src, int src_stride,
*tot_sse += sse16x16[0] + sse16x16[1];
*tot_sum += sum16x16[0] + sum16x16[1];
- for (int i = 0; i < 2; i++)
+ for (int i = 0; i < 2; i++) {
var16x16[i] =
sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8);
+ }
}
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE unsigned int mse8xh_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse, int h) {
- uint32x4_t sse_u32 = vdupq_n_u32(0);
-
- int i = h;
- do {
- uint8x16_t s = vcombine_u8(vld1_u8(src), vld1_u8(src + src_stride));
- uint8x16_t r = vcombine_u8(vld1_u8(ref), vld1_u8(ref + ref_stride));
-
- uint8x16_t abs_diff = vabdq_u8(s, r);
-
- sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
-
- src += 2 * src_stride;
- ref += 2 * ref_stride;
- i -= 2;
- } while (i != 0);
-
- *sse = horizontal_add_u32x4(sse_u32);
- return horizontal_add_u32x4(sse_u32);
-}
-
-static INLINE unsigned int mse16xh_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse, int h) {
- uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
- int i = h;
- do {
- uint8x16_t s0 = vld1q_u8(src);
- uint8x16_t s1 = vld1q_u8(src + src_stride);
- uint8x16_t r0 = vld1q_u8(ref);
- uint8x16_t r1 = vld1q_u8(ref + ref_stride);
-
- uint8x16_t abs_diff0 = vabdq_u8(s0, r0);
- uint8x16_t abs_diff1 = vabdq_u8(s1, r1);
-
- sse_u32[0] = vdotq_u32(sse_u32[0], abs_diff0, abs_diff0);
- sse_u32[1] = vdotq_u32(sse_u32[1], abs_diff1, abs_diff1);
-
- src += 2 * src_stride;
- ref += 2 * ref_stride;
- i -= 2;
- } while (i != 0);
-
- *sse = horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
- return horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
-}
-
-#else // !defined(__ARM_FEATURE_DOTPROD)
-
static INLINE unsigned int mse8xh_neon(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int h) {
@@ -564,8 +369,6 @@ static INLINE unsigned int mse16xh_neon(const uint8_t *src, int src_stride,
return horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0]));
}
-#endif // defined(__ARM_FEATURE_DOTPROD)
-
#define MSE_WXH_NEON(w, h) \
unsigned int aom_mse##w##x##h##_neon(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
@@ -581,98 +384,87 @@ MSE_WXH_NEON(16, 16)
#undef MSE_WXH_NEON
-#define COMPUTE_MSE_16BIT(src_16x8, dst_16x8) \
- /* r7 r6 r5 r4 r3 r2 r1 r0 - 16 bit */ \
- const uint16x8_t diff = vabdq_u16(src_16x8, dst_16x8); \
- /*r3 r2 r1 r0 - 16 bit */ \
- const uint16x4_t res0_low_16x4 = vget_low_u16(diff); \
- /*r7 r6 r5 r4 - 16 bit */ \
- const uint16x4_t res0_high_16x4 = vget_high_u16(diff); \
- /* (r3*r3)= b3 (r2*r2)= b2 (r1*r1)= b1 (r0*r0)= b0 - 32 bit */ \
- const uint32x4_t res0_32x4 = vmull_u16(res0_low_16x4, res0_low_16x4); \
- /* (r7*r7)= b7 (r6*r6)= b6 (r5*r5)= b5 (r4*r4)= b4 - 32 bit*/ \
- /* b3+b7 b2+b6 b1+b5 b0+b4 - 32 bit*/ \
- const uint32x4_t res_32x4 = \
- vmlal_u16(res0_32x4, res0_high_16x4, res0_high_16x4); \
- \
- /*a1 a0 - 64 bit*/ \
- const uint64x2_t vl = vpaddlq_u32(res_32x4); \
- /*a1+a2= f1 a3+a0= f0*/ \
- square_result = vaddq_u64(square_result, vl);
-
-static AOM_INLINE uint64_t mse_4xh_16bit_neon(uint8_t *dst, int dstride,
- uint16_t *src, int sstride,
- int h) {
- uint64x2_t square_result = vdupq_n_u64(0);
- uint32_t d0, d1;
- int i = h;
- uint8_t *dst_ptr = dst;
- uint16_t *src_ptr = src;
- do {
- // d03 d02 d01 d00 - 8 bit
- memcpy(&d0, dst_ptr, 4);
- dst_ptr += dstride;
- // d13 d12 d11 d10 - 8 bit
- memcpy(&d1, dst_ptr, 4);
- dst_ptr += dstride;
- // duplication
- uint8x8_t tmp0_8x8 = vreinterpret_u8_u32(vdup_n_u32(d0));
- // d03 d02 d01 d00 - 16 bit
- const uint16x4_t dst0_16x4 = vget_low_u16(vmovl_u8(tmp0_8x8));
- // duplication
- tmp0_8x8 = vreinterpret_u8_u32(vdup_n_u32(d1));
- // d13 d12 d11 d10 - 16 bit
- const uint16x4_t dst1_16x4 = vget_low_u16(vmovl_u8(tmp0_8x8));
- // d13 d12 d11 d10 d03 d02 d01 d00 - 16 bit
- const uint16x8_t dst_16x8 = vcombine_u16(dst0_16x4, dst1_16x4);
-
- // b1r0 - s03 s02 s01 s00 - 16 bit
- const uint16x4_t src0_16x4 = vld1_u16(src_ptr);
- src_ptr += sstride;
- // b1r1 - s13 s12 s11 s10 - 16 bit
- const uint16x4_t src1_16x4 = vld1_u16(src_ptr);
- src_ptr += sstride;
- // s13 s12 s11 s10 s03 s02 s01 s00 - 16 bit
- const uint16x8_t src_16x8 = vcombine_u16(src0_16x4, src1_16x4);
-
- COMPUTE_MSE_16BIT(src_16x8, dst_16x8)
- i -= 2;
- } while (i != 0);
- uint64x1_t sum =
- vadd_u64(vget_high_u64(square_result), vget_low_u64(square_result));
- return vget_lane_u64(sum, 0);
+static INLINE uint64x2_t mse_accumulate_u16_u8_8x2(uint64x2_t sum,
+ uint16x8_t s0, uint16x8_t s1,
+ uint8x8_t d0, uint8x8_t d1) {
+ int16x8_t e0 = vreinterpretq_s16_u16(vsubw_u8(s0, d0));
+ int16x8_t e1 = vreinterpretq_s16_u16(vsubw_u8(s1, d1));
+
+ int32x4_t mse = vmull_s16(vget_low_s16(e0), vget_low_s16(e0));
+ mse = vmlal_s16(mse, vget_high_s16(e0), vget_high_s16(e0));
+ mse = vmlal_s16(mse, vget_low_s16(e1), vget_low_s16(e1));
+ mse = vmlal_s16(mse, vget_high_s16(e1), vget_high_s16(e1));
+
+ return vpadalq_u32(sum, vreinterpretq_u32_s32(mse));
}
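
mse_accumulate_u16_u8_8x2 widens the 8-bit reconstruction with vsubw_u8 so the per-pixel error is formed in 16 bits, squares and sums two rows' worth into a 32-bit vector, and only then folds into the 64-bit running total via vpadalq_u32. A scalar reference for one call (illustrative only; it assumes the 16-bit source stays close enough to the 8-bit range that the difference fits in int16, as the NEON path requires):

    #include <stdint.h>

    // Scalar reference for one mse_accumulate_u16_u8_8x2 call: two rows of
    // eight 16-bit source values against two rows of eight 8-bit values.
    static uint64_t mse_acc_model(uint64_t sum, const uint16_t s[16],
                                  const uint8_t d[16]) {
      for (int i = 0; i < 16; ++i) {
        int32_t e = (int32_t)s[i] - d[i];
        sum += (uint64_t)((int64_t)e * e);
      }
      return sum;
    }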
-static AOM_INLINE uint64_t mse_8xh_16bit_neon(uint8_t *dst, int dstride,
- uint16_t *src, int sstride,
- int h) {
- uint64x2_t square_result = vdupq_n_u64(0);
- int i = h;
- do {
- // d7 d6 d5 d4 d3 d2 d1 d0 - 8 bit
- const uint16x8_t dst_16x8 = vmovl_u8(vld1_u8(dst));
- // s7 s6 s5 s4 s3 s2 s1 s0 - 16 bit
- const uint16x8_t src_16x8 = vld1q_u16(src);
+static uint64x2_t mse_wxh_16bit(uint8_t *dst, int dstride, const uint16_t *src,
+ int sstride, int w, int h) {
+ assert((w == 8 || w == 4) && (h == 8 || h == 4));
- COMPUTE_MSE_16BIT(src_16x8, dst_16x8)
+ uint64x2_t sum = vdupq_n_u64(0);
- dst += dstride;
- src += sstride;
- } while (--i != 0);
- uint64x1_t sum =
- vadd_u64(vget_high_u64(square_result), vget_low_u64(square_result));
- return vget_lane_u64(sum, 0);
+ if (w == 8) {
+ do {
+ uint8x8_t d0 = vld1_u8(dst + 0 * dstride);
+ uint8x8_t d1 = vld1_u8(dst + 1 * dstride);
+ uint16x8_t s0 = vld1q_u16(src + 0 * sstride);
+ uint16x8_t s1 = vld1q_u16(src + 1 * sstride);
+
+ sum = mse_accumulate_u16_u8_8x2(sum, s0, s1, d0, d1);
+
+ dst += 2 * dstride;
+ src += 2 * sstride;
+ h -= 2;
+ } while (h != 0);
+ } else {
+ do {
+ uint8x8_t d0 = load_unaligned_u8_4x2(dst + 0 * dstride, dstride);
+ uint8x8_t d1 = load_unaligned_u8_4x2(dst + 2 * dstride, dstride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src + 0 * sstride, sstride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src + 2 * sstride, sstride);
+
+ sum = mse_accumulate_u16_u8_8x2(sum, s0, s1, d0, d1);
+
+ dst += 4 * dstride;
+ src += 4 * sstride;
+ h -= 4;
+ } while (h != 0);
+ }
+
+ return sum;
}
// Computes MSE for a given block size. This function is called only for the
// specific block sizes 8x8, 8x4, 4x8 and 4x4.
uint64_t aom_mse_wxh_16bit_neon(uint8_t *dst, int dstride, uint16_t *src,
int sstride, int w, int h) {
- assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
- "w=8/4 and h=8/4 must satisfy");
- switch (w) {
- case 4: return mse_4xh_16bit_neon(dst, dstride, src, sstride, h);
- case 8: return mse_8xh_16bit_neon(dst, dstride, src, sstride, h);
- default: assert(0 && "unsupported width"); return -1;
+ return horizontal_add_u64x2(mse_wxh_16bit(dst, dstride, src, sstride, w, h));
+}
+
+uint32_t aom_get_mb_ss_neon(const int16_t *a) {
+ int32x4_t sse[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+  for (int i = 0; i < 256; i += 8) {
+ int16x8_t a_s16 = vld1q_s16(a + i);
+
+ sse[0] = vmlal_s16(sse[0], vget_low_s16(a_s16), vget_low_s16(a_s16));
+ sse[1] = vmlal_s16(sse[1], vget_high_s16(a_s16), vget_high_s16(a_s16));
}
+
+ return horizontal_add_s32x4(vaddq_s32(sse[0], sse[1]));
+}
+
+uint64_t aom_mse_16xh_16bit_neon(uint8_t *dst, int dstride, uint16_t *src,
+ int w, int h) {
+ uint64x2_t sum = vdupq_n_u64(0);
+
+ int num_blks = 16 / w;
+ do {
+ sum = vaddq_u64(sum, mse_wxh_16bit(dst, dstride, src, w, w, h));
+ dst += w;
+ src += w * h;
+ } while (--num_blks != 0);
+
+ return horizontal_add_u64x2(sum);
}
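
aom_mse_16xh_16bit_neon treats the 16-wide destination as 16/w side-by-side wxh blocks: dst advances by w columns per block while the packed src advances by a whole w*h block. A call sketch (buffer shapes hypothetical; assumes the generated rtcd header declares the function):

    #include <stdint.h>
    #include "config/aom_dsp_rtcd.h"

    // Hypothetical usage: two packed 8x8 source blocks measured against
    // one 16-wide span of a 32-stride destination surface.
    static uint64_t example_mse_16x8(uint8_t *dst, uint16_t *packed_src) {
      return aom_mse_16xh_16bit_neon(dst, /*dstride=*/32, packed_src,
                                     /*w=*/8, /*h=*/8);
    }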
diff --git a/aom_dsp/arm/variance_neon_dotprod.c b/aom_dsp/arm/variance_neon_dotprod.c
new file mode 100644
index 000000000..9fb52e1df
--- /dev/null
+++ b/aom_dsp/arm/variance_neon_dotprod.c
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_ports/mem.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void variance_4xh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint8x16_t s = load_unaligned_u8q(src, src_stride);
+ uint8x16_t r = load_unaligned_u8q(ref, ref_stride);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src += 4 * src_stride;
+ ref += 4 * ref_stride;
+ i -= 4;
+ } while (i != 0);
+
+ int32x4_t sum_diff =
+ vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+ *sum = horizontal_add_s32x4(sum_diff);
+ *sse = horizontal_add_u32x4(sse_u32);
+}
+
+static INLINE void variance_8xh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint8x16_t s = vcombine_u8(vld1_u8(src), vld1_u8(src + src_stride));
+ uint8x16_t r = vcombine_u8(vld1_u8(ref), vld1_u8(ref + ref_stride));
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ int32x4_t sum_diff =
+ vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+ *sum = horizontal_add_s32x4(sum_diff);
+ *sse = horizontal_add_u32x4(sse_u32);
+}
+
+static INLINE void variance_16xh_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint8x16_t s = vld1q_u8(src);
+ uint8x16_t r = vld1q_u8(ref);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ int32x4_t sum_diff =
+ vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+ *sum = horizontal_add_s32x4(sum_diff);
+ *sse = horizontal_add_u32x4(sse_u32);
+}
+
+static INLINE void variance_large_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride, int w, int h,
+ uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(src + j);
+ uint8x16_t r = vld1q_u8(ref + j);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ j += 16;
+ } while (j < w);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ int32x4_t sum_diff =
+ vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+ *sum = horizontal_add_s32x4(sum_diff);
+ *sse = horizontal_add_u32x4(sse_u32);
+}
+
+static INLINE void variance_32xh_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 32, h, sse,
+ sum);
+}
+
+static INLINE void variance_64xh_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 64, h, sse,
+ sum);
+}
+
+static INLINE void variance_128xh_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 128, h, sse,
+ sum);
+}
+
+#define VARIANCE_WXH_NEON_DOTPROD(w, h, shift) \
+ unsigned int aom_variance##w##x##h##_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ variance_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, sse, \
+ &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
+ }
+
+VARIANCE_WXH_NEON_DOTPROD(4, 4, 4)
+VARIANCE_WXH_NEON_DOTPROD(4, 8, 5)
+VARIANCE_WXH_NEON_DOTPROD(4, 16, 6)
+
+VARIANCE_WXH_NEON_DOTPROD(8, 4, 5)
+VARIANCE_WXH_NEON_DOTPROD(8, 8, 6)
+VARIANCE_WXH_NEON_DOTPROD(8, 16, 7)
+VARIANCE_WXH_NEON_DOTPROD(8, 32, 8)
+
+VARIANCE_WXH_NEON_DOTPROD(16, 4, 6)
+VARIANCE_WXH_NEON_DOTPROD(16, 8, 7)
+VARIANCE_WXH_NEON_DOTPROD(16, 16, 8)
+VARIANCE_WXH_NEON_DOTPROD(16, 32, 9)
+VARIANCE_WXH_NEON_DOTPROD(16, 64, 10)
+
+VARIANCE_WXH_NEON_DOTPROD(32, 8, 8)
+VARIANCE_WXH_NEON_DOTPROD(32, 16, 9)
+VARIANCE_WXH_NEON_DOTPROD(32, 32, 10)
+VARIANCE_WXH_NEON_DOTPROD(32, 64, 11)
+
+VARIANCE_WXH_NEON_DOTPROD(64, 16, 10)
+VARIANCE_WXH_NEON_DOTPROD(64, 32, 11)
+VARIANCE_WXH_NEON_DOTPROD(64, 64, 12)
+VARIANCE_WXH_NEON_DOTPROD(64, 128, 13)
+
+VARIANCE_WXH_NEON_DOTPROD(128, 64, 13)
+VARIANCE_WXH_NEON_DOTPROD(128, 128, 14)
+
+#undef VARIANCE_WXH_NEON_DOTPROD
+
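In VARIANCE_WXH_NEON_DOTPROD the shift parameter is log2(w * h), so the returned value is the textbook var = sse - sum^2 / (w * h) with the division performed as a shift. For example, at 16x16 (shift 8), sum = 512 and sse = 70000 give 70000 - (512 * 512 >> 8) = 70000 - 1024 = 68976. A scalar check (illustrative only):

    #include <stdint.h>

    // Scalar check of the variance macro's formula for a 16x16 block.
    static uint32_t variance_16x16_model(uint32_t sse, int sum) {
      return sse - (uint32_t)(((int64_t)sum * sum) >> 8);  // 16 * 16 == 256
    }
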
+void aom_get_var_sse_sum_8x8_quad_neon_dotprod(
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+ uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var8x8) {
+ // Loop over four 8x8 blocks. Process one 8x32 block.
+ for (int k = 0; k < 4; k++) {
+ variance_8xh_neon_dotprod(src + (k * 8), src_stride, ref + (k * 8),
+ ref_stride, 8, &sse8x8[k], &sum8x8[k]);
+ }
+
+ *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
+ *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];
+ for (int i = 0; i < 4; i++) {
+ var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6);
+ }
+}
+
+void aom_get_var_sse_sum_16x16_dual_neon_dotprod(
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+ uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var16x16) {
+ int sum16x16[2] = { 0 };
+ // Loop over two 16x16 blocks. Process one 16x32 block.
+ for (int k = 0; k < 2; k++) {
+ variance_16xh_neon_dotprod(src + (k * 16), src_stride, ref + (k * 16),
+ ref_stride, 16, &sse16x16[k], &sum16x16[k]);
+ }
+
+ *tot_sse += sse16x16[0] + sse16x16[1];
+ *tot_sum += sum16x16[0] + sum16x16[1];
+ for (int i = 0; i < 2; i++) {
+ var16x16[i] =
+ sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8);
+ }
+}
+
+static INLINE unsigned int mse8xh_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride,
+ unsigned int *sse, int h) {
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint8x16_t s = vcombine_u8(vld1_u8(src), vld1_u8(src + src_stride));
+ uint8x16_t r = vcombine_u8(vld1_u8(ref), vld1_u8(ref + ref_stride));
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ *sse = horizontal_add_u32x4(sse_u32);
+ return *sse;
+}
+
+static INLINE unsigned int mse16xh_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride,
+ unsigned int *sse, int h) {
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ uint8x16_t s0 = vld1q_u8(src);
+ uint8x16_t s1 = vld1q_u8(src + src_stride);
+ uint8x16_t r0 = vld1q_u8(ref);
+ uint8x16_t r1 = vld1q_u8(ref + ref_stride);
+
+ uint8x16_t abs_diff0 = vabdq_u8(s0, r0);
+ uint8x16_t abs_diff1 = vabdq_u8(s1, r1);
+
+ sse_u32[0] = vdotq_u32(sse_u32[0], abs_diff0, abs_diff0);
+ sse_u32[1] = vdotq_u32(sse_u32[1], abs_diff1, abs_diff1);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ *sse = horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+ return *sse;
+}
+
+#define MSE_WXH_NEON_DOTPROD(w, h) \
+ unsigned int aom_mse##w##x##h##_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ return mse##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, sse, h); \
+ }
+
+MSE_WXH_NEON_DOTPROD(8, 8)
+MSE_WXH_NEON_DOTPROD(8, 16)
+
+MSE_WXH_NEON_DOTPROD(16, 8)
+MSE_WXH_NEON_DOTPROD(16, 16)
+
+#undef MSE_WXH_NEON_DOTPROD
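+
+// vdotq_u32(acc, a, b) adds, into each 32-bit lane of acc, the dot product
+// of four adjacent u8 elements of a and b. Since |s - r|^2 == (s - r)^2,
+// dotting abs_diff with itself accumulates the squared error. A scalar
+// sketch of what one 16-byte vector contributes, for illustration only:
+static INLINE uint32_t sse_u8x16_scalar(const uint8_t *s, const uint8_t *r) {
+ uint32_t sse = 0;
+ for (int j = 0; j < 16; j++) {
+ const int diff = s[j] - r[j];
+ sse += (uint32_t)(diff * diff);
+ }
+ return sse;
+}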
diff --git a/aom_dsp/avg.c b/aom_dsp/avg.c
index 7b36bf31b..893f9c2f6 100644
--- a/aom_dsp/avg.c
+++ b/aom_dsp/avg.c
@@ -504,14 +504,14 @@ void aom_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
}
#endif // CONFIG_AV1_HIGHBITDEPTH
-// coeff: 16 bits, dynamic range [-32640, 32640].
-// length: value range {16, 64, 256, 1024}.
+// coeff: 20 bits, dynamic range [-524287, 524287].
+// length: value range {16, 32, 64, 128, 256, 512, 1024}.
int aom_satd_c(const tran_low_t *coeff, int length) {
int i;
int satd = 0;
for (i = 0; i < length; ++i) satd += abs(coeff[i]);
- // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+ // satd: 30 bits, dynamic range [-524287 * 1024, 524287 * 1024]
return satd;
}
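+// Worked bound for the stated range: |satd| <= 524287 * 1024 = 536869888,
+// while a 30-bit signed value holds up to 2^29 - 1 = 536870911, so the
+// accumulator fits comfortably in an int.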
diff --git a/aom_dsp/entenc.h b/aom_dsp/entenc.h
index 467e47bf5..d26f027ed 100644
--- a/aom_dsp/entenc.h
+++ b/aom_dsp/entenc.h
@@ -13,7 +13,7 @@
#define AOM_AOM_DSP_ENTENC_H_
#include <stddef.h>
#include "aom_dsp/entcode.h"
-#include "aom_ports/bitops.h"
+#include "aom_util/endian_inl.h"
#ifdef __cplusplus
extern "C" {
@@ -87,13 +87,14 @@ static AOM_INLINE void propagate_carry_bwd(unsigned char *buf, uint32_t offs) {
} while (carry);
}
-// Reverse byte order and write data to buffer adding the carry-bit
+// Convert to big-endian byte order and write data to the buffer, adding the
+// carry bit
static AOM_INLINE void write_enc_data_to_out_buf(unsigned char *out,
uint32_t offs, uint64_t output,
uint64_t carry,
uint32_t *enc_offs,
uint8_t num_bytes_ready) {
- const uint64_t reg = get_byteswap64(output) >> ((8 - num_bytes_ready) << 3);
+ const uint64_t reg = HToBE64(output << ((8 - num_bytes_ready) << 3));
memcpy(&out[offs], &reg, 8);
// Propagate the carry backwards if it exists
if (carry) {
diff --git a/aom_dsp/fft.c b/aom_dsp/fft.c
index cad4a6563..a44dbf77b 100644
--- a/aom_dsp/fft.c
+++ b/aom_dsp/fft.c
@@ -11,6 +11,7 @@
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/fft_common.h"
+#include "config/aom_dsp_rtcd.h"
static INLINE void simple_transpose(const float *A, float *B, int n) {
for (int y = 0; y < n; y++) {
diff --git a/aom_dsp/fft_common.h b/aom_dsp/fft_common.h
index 5137331ae..3de1a045e 100644
--- a/aom_dsp/fft_common.h
+++ b/aom_dsp/fft_common.h
@@ -47,10 +47,16 @@ typedef void (*aom_fft_1d_func_t)(const float *input, float *output,
// Declare some of the forward non-vectorized transforms which are used in some
// of the vectorized implementations
+void aom_fft1d_2_float(const float *input, float *output, int stride);
void aom_fft1d_4_float(const float *input, float *output, int stride);
void aom_fft1d_8_float(const float *input, float *output, int stride);
void aom_fft1d_16_float(const float *input, float *output, int stride);
void aom_fft1d_32_float(const float *input, float *output, int stride);
+void aom_ifft1d_2_float(const float *input, float *output, int stride);
+void aom_ifft1d_4_float(const float *input, float *output, int stride);
+void aom_ifft1d_8_float(const float *input, float *output, int stride);
+void aom_ifft1d_16_float(const float *input, float *output, int stride);
+void aom_ifft1d_32_float(const float *input, float *output, int stride);
/*!\brief Function pointer for transposing a matrix of floats.
*
diff --git a/aom_dsp/flow_estimation/arm/disflow_neon.c b/aom_dsp/flow_estimation/arm/disflow_neon.c
new file mode 100644
index 000000000..f09136606
--- /dev/null
+++ b/aom_dsp/flow_estimation/arm/disflow_neon.c
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/flow_estimation/disflow.h"
+
+#include <arm_neon.h>
+#include <math.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void get_cubic_kernel_dbl(double x, double *kernel) {
+ // Check that the fractional position is in range.
+ //
+ // Note: x is calculated from (e.g.) `u_frac = u - floor(u)`.
+ // Mathematically, this implies that 0 <= x < 1. However, in practice it is
+ // possible to have x == 1 due to floating point rounding. This is fine,
+ // and we still interpolate correctly if we allow x = 1.
+ assert(0 <= x && x <= 1);
+
+ double x2 = x * x;
+ double x3 = x2 * x;
+ kernel[0] = -0.5 * x + x2 - 0.5 * x3;
+ kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3;
+ kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3;
+ kernel[3] = -0.5 * x2 + 0.5 * x3;
+}
+
+static INLINE void get_cubic_kernel_int(double x, int *kernel) {
+ double kernel_dbl[4];
+ get_cubic_kernel_dbl(x, kernel_dbl);
+
+ kernel[0] = (int)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS));
+ kernel[1] = (int)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS));
+ kernel[2] = (int)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS));
+ kernel[3] = (int)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS));
+}
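+
+// For example, at x = 0.5 the kernel above evaluates to
+// { -0.0625, 0.5625, 0.5625, -0.0625 }: the taps sum to 1.0 and the
+// positive taps sum to 1.125, the worst-case gain used in the overflow
+// analysis further down.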
+
+// Compare two DISFLOW_PATCH_SIZE x DISFLOW_PATCH_SIZE patches, one rooted at
+// position (x, y) in src and the other at (x + u, y + v) in ref, and write
+// the per-pixel difference between the warped reference and the source
+// into dt.
+static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref,
+ int width, int height, int stride, int x,
+ int y, double u, double v, int16_t *dt) {
+ // Split offset into integer and fractional parts, and compute cubic
+ // interpolation kernels
+ const int u_int = (int)floor(u);
+ const int v_int = (int)floor(v);
+ const double u_frac = u - floor(u);
+ const double v_frac = v - floor(v);
+
+ int h_kernel[4];
+ int v_kernel[4];
+ get_cubic_kernel_int(u_frac, h_kernel);
+ get_cubic_kernel_int(v_frac, v_kernel);
+
+ int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)];
+
+ // Clamp coordinates so that all pixels we fetch will remain within the
+ // allocated border region, but allow them to go far enough out that
+ // the border pixels' values do not change.
+ // Since we are calculating an 8x8 block, the bottom-right pixel
+ // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic
+ // interpolation has 4 taps, meaning that the output of pixel
+ // (x_w, y_w) depends on the pixels in the range
+ // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]).
+ //
+ // Thus the most extreme coordinates which will be fetched are
+ // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9).
+ const int x0 = clamp(x + u_int, -9, width);
+ const int y0 = clamp(y + v_int, -9, height);
+
+ // Horizontal convolution.
+ const uint8_t *ref_start = ref + (y0 - 1) * stride + (x0 - 1);
+ int16x4_t h_filter = vmovn_s32(vld1q_s32(h_kernel));
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE + 3; ++i) {
+ uint8x16_t r = vld1q_u8(ref_start + i * stride);
+ uint16x8_t r0 = vmovl_u8(vget_low_u8(r));
+ uint16x8_t r1 = vmovl_u8(vget_high_u8(r));
+
+ int16x8_t s0 = vreinterpretq_s16_u16(r0);
+ int16x8_t s1 = vreinterpretq_s16_u16(vextq_u16(r0, r1, 1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vextq_u16(r0, r1, 2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vextq_u16(r0, r1, 3));
+
+ int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(s0), h_filter, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s1), h_filter, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), h_filter, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), h_filter, 3);
+
+ int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(s0), h_filter, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s1), h_filter, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), h_filter, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), h_filter, 3);
+
+ // 6 is the maximum allowable number of extra bits which will avoid
+ // the intermediate values overflowing an int16_t. The most extreme
+ // intermediate value occurs when:
+ // * The input pixels are [0, 255, 255, 0]
+ // * u_frac = 0.5
+ // In this case, the un-scaled output is 255 * 1.125 = 286.875.
+ // As an integer with 6 fractional bits, that is 18360, which fits
+ // in an int16_t. But with 7 fractional bits it would be 36720,
+ // which is too large.
+
+ int16x8_t sum = vcombine_s16(vrshrn_n_s32(sum_lo, DISFLOW_INTERP_BITS - 6),
+ vrshrn_n_s32(sum_hi, DISFLOW_INTERP_BITS - 6));
+ vst1q_s16(tmp_ + i * DISFLOW_PATCH_SIZE, sum);
+ }
+
+ // Vertical convolution.
+ int16x4_t v_filter = vmovn_s32(vld1q_s32(v_kernel));
+ int16_t *tmp_start = tmp_ + DISFLOW_PATCH_SIZE;
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) {
+ int16x8_t t0 = vld1q_s16(tmp_start + (i - 1) * DISFLOW_PATCH_SIZE);
+ int16x8_t t1 = vld1q_s16(tmp_start + i * DISFLOW_PATCH_SIZE);
+ int16x8_t t2 = vld1q_s16(tmp_start + (i + 1) * DISFLOW_PATCH_SIZE);
+ int16x8_t t3 = vld1q_s16(tmp_start + (i + 2) * DISFLOW_PATCH_SIZE);
+
+ int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(t0), v_filter, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t1), v_filter, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t2), v_filter, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t3), v_filter, 3);
+
+ int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(t0), v_filter, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t1), v_filter, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t2), v_filter, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t3), v_filter, 3);
+
+ uint8x8_t s = vld1_u8(src + (i + y) * stride + x);
+ int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, 3));
+
+ // This time, we have to round off the 6 extra bits which were kept
+ // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits
+ // of precision to match the scale of the dx and dy arrays.
+ sum_lo = vrshrq_n_s32(sum_lo,
+ DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2);
+ sum_hi = vrshrq_n_s32(sum_hi,
+ DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2);
+ int32x4_t err_lo = vsubw_s16(sum_lo, vget_low_s16(s_s16));
+ int32x4_t err_hi = vsubw_s16(sum_hi, vget_high_s16(s_s16));
+ vst1q_s16(dt + i * DISFLOW_PATCH_SIZE,
+ vcombine_s16(vmovn_s32(err_lo), vmovn_s32(err_hi)));
+ }
+}
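+
+// Per pixel, the two passes above compute (scalar sketch):
+//
+// dt[i][j] = warped_ref(x + j + u, y + i + v) - (src[y + i][x + j] << 3)
+//
+// where warped_ref is the bicubic interpolation of ref, rounded to
+// DISFLOW_DERIV_SCALE_LOG2 = 3 fractional bits to match the scale of the
+// dx and dy arrays.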
+
+static INLINE void sobel_filter_x(const uint8_t *src, int src_stride,
+ int16_t *dst, int dst_stride) {
+ int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)];
+
+ // Horizontal filter, using kernel {1, 0, -1}.
+ const uint8_t *src_start = src - 1 * src_stride - 1;
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) {
+ uint8x16_t s = vld1q_u8(src_start + i * src_stride);
+ uint8x8_t s0 = vget_low_u8(s);
+ uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2));
+
+ // Given that the kernel is {1, 0, -1}, the convolution is a simple
+ // subtraction.
+ int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s0, s2));
+
+ vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, diff);
+ }
+
+ // Vertical filter, using kernel {1, 2, 1}.
+ // This kernel can be split into two 2-tap kernels of value {1, 1}.
+ // That way we need only 3 add operations to perform the convolution, one of
+ // which can be reused for the next line.
+ int16x8_t s0 = vld1q_s16(tmp);
+ int16x8_t s1 = vld1q_s16(tmp + DISFLOW_PATCH_SIZE);
+ int16x8_t sum01 = vaddq_s16(s0, s1);
+ for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
+ int16x8_t s2 = vld1q_s16(tmp + (i + 2) * DISFLOW_PATCH_SIZE);
+
+ int16x8_t sum12 = vaddq_s16(s1, s2);
+ int16x8_t sum = vaddq_s16(sum01, sum12);
+
+ vst1q_s16(dst + i * dst_stride, sum);
+
+ sum01 = sum12;
+ s1 = s2;
+ }
+}
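+
+// Taken together, the two passes above apply the separable 3x3 Sobel
+// x-kernel:
+//
+// { 1, 0, -1 } { 1 }
+// { 2, 0, -2 } = { 2 } * { 1, 0, -1 }
+// { 1, 0, -1 } { 1 }
+//
+// The {1, 1} split of {1, 2, 1} works because (a + b) + (b + c) equals
+// a + 2b + c.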
+
+static INLINE void sobel_filter_y(const uint8_t *src, int src_stride,
+ int16_t *dst, int dst_stride) {
+ int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)];
+
+ // Horizontal filter, using kernel {1, 2, 1}.
+ // This kernel can be split into two 2-tap kernels of value {1, 1}.
+ // That way we need only 3 add operations to perform the convolution.
+ const uint8_t *src_start = src - 1 * src_stride - 1;
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) {
+ uint8x16_t s = vld1q_u8(src_start + i * src_stride);
+ uint8x8_t s0 = vget_low_u8(s);
+ uint8x8_t s1 = vget_low_u8(vextq_u8(s, s, 1));
+ uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2));
+
+ uint16x8_t sum01 = vaddl_u8(s0, s1);
+ uint16x8_t sum12 = vaddl_u8(s1, s2);
+ uint16x8_t sum = vaddq_u16(sum01, sum12);
+
+ vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, vreinterpretq_s16_u16(sum));
+ }
+
+ // Vertical filter, using kernel {1, 0, -1}.
+ // Load the whole block at once to avoid redundant loads during convolution.
+ int16x8_t t[10];
+ load_s16_8x10(tmp, DISFLOW_PATCH_SIZE, &t[0], &t[1], &t[2], &t[3], &t[4],
+ &t[5], &t[6], &t[7], &t[8], &t[9]);
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
+ // Given that the kernel is {1, 0, -1}, the convolution is a simple
+ // subtraction.
+ int16x8_t diff = vsubq_s16(t[i], t[i + 2]);
+
+ vst1q_s16(dst + i * dst_stride, diff);
+ }
+}
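+
+// Likewise, the two passes above apply the 3x3 Sobel y-kernel, whose rows
+// are { 1, 2, 1 }, { 0, 0, 0 } and { -1, -2, -1 }.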
+
+// Computes the components of the system of equations used to solve for
+// a flow vector.
+//
+// The flow equations are a least-squares system, derived as follows:
+//
+// For each pixel in the patch, we calculate the current error `dt`,
+// and the x and y gradients `dx` and `dy` of the source patch.
+// This means that, to first order, the squared error for this pixel is
+//
+// (dt + u * dx + v * dy)^2
+//
+// where (u, v) are the incremental changes to the flow vector.
+//
+// We then want to find the values of u and v which minimize the sum
+// of the squared error across all pixels. Conveniently, this fits exactly
+// into the form of a least squares problem, with one equation
+//
+// u * dx + v * dy = -dt
+//
+// for each pixel.
+//
+// Summing across all pixels in a square window of size DISFLOW_PATCH_SIZE,
+// and absorbing the - sign elsewhere, this results in the least squares system
+//
+// M = |sum(dx * dx) sum(dx * dy)|
+// |sum(dx * dy) sum(dy * dy)|
+//
+// b = |sum(dx * dt)|
+// |sum(dy * dt)|
+static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride,
+ const int16_t *dy, int dy_stride,
+ double *M_inv) {
+ int32x4_t sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
+ int16x8_t x = vld1q_s16(dx + i * dx_stride);
+ int16x8_t y = vld1q_s16(dy + i * dy_stride);
+ sum[0] = vmlal_s16(sum[0], vget_low_s16(x), vget_low_s16(x));
+ sum[0] = vmlal_s16(sum[0], vget_high_s16(x), vget_high_s16(x));
+
+ sum[1] = vmlal_s16(sum[1], vget_low_s16(x), vget_low_s16(y));
+ sum[1] = vmlal_s16(sum[1], vget_high_s16(x), vget_high_s16(y));
+
+ sum[3] = vmlal_s16(sum[3], vget_low_s16(y), vget_low_s16(y));
+ sum[3] = vmlal_s16(sum[3], vget_high_s16(y), vget_high_s16(y));
+ }
+ sum[2] = sum[1];
+
+ int32x4_t res = horizontal_add_4d_s32x4(sum);
+
+ // Apply regularization
+ // We follow the standard regularization method of adding `k * I` before
+ // inverting. This ensures that the matrix will be invertible.
+ //
+ // Setting the regularization strength k to 1 seems to work well here, as
+ // typical values coming from the other equations are very large (1e5 to
+ // 1e6, with an upper limit of around 6e7, at the time of writing).
+ // It also preserves the property that all matrix values are whole numbers,
+ // which is convenient for integerized SIMD implementation.
+
+ double M0 = (double)vgetq_lane_s32(res, 0) + 1;
+ double M1 = (double)vgetq_lane_s32(res, 1);
+ double M2 = (double)vgetq_lane_s32(res, 2);
+ double M3 = (double)vgetq_lane_s32(res, 3) + 1;
+
+ // Invert matrix M.
+ double det = (M0 * M3) - (M1 * M2);
+ assert(det >= 1);
+ const double det_inv = 1 / det;
+
+ M_inv[0] = M3 * det_inv;
+ M_inv[1] = -M1 * det_inv;
+ M_inv[2] = -M2 * det_inv;
+ M_inv[3] = M0 * det_inv;
+}
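+
+// The closed form above is the standard 2x2 matrix inverse:
+//
+// M = | a b | => M^-1 = (1 / (a*d - b*c)) * | d -b |
+// | c d | | -c a |
+//
+// with a = M0, b = M1, c = M2 and d = M3.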
+
+static INLINE void compute_flow_vector(const int16_t *dx, int dx_stride,
+ const int16_t *dy, int dy_stride,
+ const int16_t *dt, int dt_stride,
+ int *b) {
+ int32x4_t b_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
+ int16x8_t dx16 = vld1q_s16(dx + i * dx_stride);
+ int16x8_t dy16 = vld1q_s16(dy + i * dy_stride);
+ int16x8_t dt16 = vld1q_s16(dt + i * dt_stride);
+
+ b_s32[0] = vmlal_s16(b_s32[0], vget_low_s16(dx16), vget_low_s16(dt16));
+ b_s32[0] = vmlal_s16(b_s32[0], vget_high_s16(dx16), vget_high_s16(dt16));
+
+ b_s32[1] = vmlal_s16(b_s32[1], vget_low_s16(dy16), vget_low_s16(dt16));
+ b_s32[1] = vmlal_s16(b_s32[1], vget_high_s16(dy16), vget_high_s16(dt16));
+ }
+
+ int32x4_t b_red = horizontal_add_2d_s32(b_s32[0], b_s32[1]);
+ vst1_s32(b, add_pairwise_s32x4(b_red));
+}
+
+void aom_compute_flow_at_point_neon(const uint8_t *src, const uint8_t *ref,
+ int x, int y, int width, int height,
+ int stride, double *u, double *v) {
+ double M_inv[4];
+ int b[2];
+ int16_t dt[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+ int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+ int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+
+ // Compute gradients within this patch
+ const uint8_t *src_patch = &src[y * stride + x];
+ sobel_filter_x(src_patch, stride, dx, DISFLOW_PATCH_SIZE);
+ sobel_filter_y(src_patch, stride, dy, DISFLOW_PATCH_SIZE);
+
+ compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M_inv);
+
+ for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) {
+ compute_flow_error(src, ref, width, height, stride, x, y, *u, *v, dt);
+ compute_flow_vector(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, dt,
+ DISFLOW_PATCH_SIZE, b);
+
+ // Solve flow equations to find a better estimate for the flow vector
+ // at this point
+ const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1];
+ const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1];
+ *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2);
+ *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2);
+
+ if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) {
+ // Stop iteration when we're close to convergence
+ break;
+ }
+ }
+}
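+
+// Example of how a caller might refine the flow estimate for one patch
+// (a sketch; `frame`, `ref_frame` and the patch origin are hypothetical):
+//
+// double u = 0.0, v = 0.0; // coarse initial estimate
+// aom_compute_flow_at_point_neon(frame, ref_frame, x0, y0, frame_width,
+// frame_height, frame_stride, &u, &v);
+// // (u, v) now holds the refined flow vector at (x0, y0).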
diff --git a/aom_dsp/flow_estimation/corner_detect.c b/aom_dsp/flow_estimation/corner_detect.c
index 784829576..284d1bd7b 100644
--- a/aom_dsp/flow_estimation/corner_detect.c
+++ b/aom_dsp/flow_estimation/corner_detect.c
@@ -24,10 +24,10 @@
#define FAST_BARRIER 18
-size_t av1_get_corner_list_size() { return sizeof(CornerList); }
+size_t av1_get_corner_list_size(void) { return sizeof(CornerList); }
-CornerList *av1_alloc_corner_list() {
- CornerList *corners = (CornerList *)aom_calloc(1, sizeof(CornerList));
+CornerList *av1_alloc_corner_list(void) {
+ CornerList *corners = (CornerList *)aom_calloc(1, sizeof(*corners));
if (!corners) {
return NULL;
}
@@ -39,7 +39,7 @@ CornerList *av1_alloc_corner_list() {
return corners;
}
-void compute_corner_list(const ImagePyramid *pyr, CornerList *corners) {
+static bool compute_corner_list(const ImagePyramid *pyr, CornerList *corners) {
const uint8_t *buf = pyr->layers[0].buffer;
int width = pyr->layers[0].width;
int height = pyr->layers[0].height;
@@ -49,14 +49,14 @@ void compute_corner_list(const ImagePyramid *pyr, CornerList *corners) {
int num_corners;
xy *const frame_corners_xy = aom_fast9_detect_nonmax(
buf, width, height, stride, FAST_BARRIER, &scores, &num_corners);
+ if (num_corners < 0) return false;
- if (num_corners <= 0) {
- // Some error occured, so no corners are available
- corners->num_corners = 0;
- } else if (num_corners <= MAX_CORNERS) {
+ if (num_corners <= MAX_CORNERS) {
// Use all detected corners
- memcpy(corners->corners, frame_corners_xy,
- sizeof(*frame_corners_xy) * num_corners);
+ if (num_corners != 0) {
+ memcpy(corners->corners, frame_corners_xy,
+ sizeof(*frame_corners_xy) * num_corners);
+ }
corners->num_corners = num_corners;
} else {
// There are more than MAX_CORNERS corners available, so pick out a subset
@@ -96,9 +96,10 @@ void compute_corner_list(const ImagePyramid *pyr, CornerList *corners) {
free(scores);
free(frame_corners_xy);
+ return true;
}
-void av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners) {
+bool av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners) {
assert(corners);
#if CONFIG_MULTITHREAD
@@ -106,13 +107,14 @@ void av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners) {
#endif // CONFIG_MULTITHREAD
if (!corners->valid) {
- compute_corner_list(pyr, corners);
- corners->valid = true;
+ corners->valid = compute_corner_list(pyr, corners);
}
+ bool valid = corners->valid;
#if CONFIG_MULTITHREAD
pthread_mutex_unlock(&corners->mutex);
#endif // CONFIG_MULTITHREAD
+ return valid;
}
#ifndef NDEBUG
diff --git a/aom_dsp/flow_estimation/corner_detect.h b/aom_dsp/flow_estimation/corner_detect.h
index c77813ef8..d05846ce5 100644
--- a/aom_dsp/flow_estimation/corner_detect.h
+++ b/aom_dsp/flow_estimation/corner_detect.h
@@ -53,11 +53,11 @@ typedef struct corner_list {
int corners[2 * MAX_CORNERS];
} CornerList;
-size_t av1_get_corner_list_size();
+size_t av1_get_corner_list_size(void);
-CornerList *av1_alloc_corner_list();
+CornerList *av1_alloc_corner_list(void);
-void av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners);
+bool av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners);
#ifndef NDEBUG
// Check if a corner list has already been computed.
diff --git a/aom_dsp/flow_estimation/corner_match.c b/aom_dsp/flow_estimation/corner_match.c
index f34178e75..cef719b68 100644
--- a/aom_dsp/flow_estimation/corner_match.c
+++ b/aom_dsp/flow_estimation/corner_match.c
@@ -147,13 +147,13 @@ static void improve_correspondence(const unsigned char *src,
}
}
-int aom_determine_correspondence(const unsigned char *src,
- const int *src_corners, int num_src_corners,
- const unsigned char *ref,
- const int *ref_corners, int num_ref_corners,
- int width, int height, int src_stride,
- int ref_stride,
- Correspondence *correspondences) {
+static int determine_correspondence(const unsigned char *src,
+ const int *src_corners, int num_src_corners,
+ const unsigned char *ref,
+ const int *ref_corners, int num_ref_corners,
+ int width, int height, int src_stride,
+ int ref_stride,
+ Correspondence *correspondences) {
// TODO(sarahparker) Improve this to include 2-way match
int i, j;
int num_correspondences = 0;
@@ -202,7 +202,8 @@ int aom_determine_correspondence(const unsigned char *src,
bool av1_compute_global_motion_feature_match(
TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref,
- int bit_depth, MotionModel *motion_models, int num_motion_models) {
+ int bit_depth, MotionModel *motion_models, int num_motion_models,
+ bool *mem_alloc_failed) {
int num_correspondences;
Correspondence *correspondences;
ImagePyramid *src_pyramid = src->y_pyramid;
@@ -211,10 +212,22 @@ bool av1_compute_global_motion_feature_match(
CornerList *ref_corners = ref->corners;
// Precompute information we will need about each frame
- aom_compute_pyramid(src, bit_depth, src_pyramid);
- av1_compute_corner_list(src_pyramid, src_corners);
- aom_compute_pyramid(ref, bit_depth, ref_pyramid);
- av1_compute_corner_list(ref_pyramid, ref_corners);
+ if (!aom_compute_pyramid(src, bit_depth, src_pyramid)) {
+ *mem_alloc_failed = true;
+ return false;
+ }
+ if (!av1_compute_corner_list(src_pyramid, src_corners)) {
+ *mem_alloc_failed = true;
+ return false;
+ }
+ if (!aom_compute_pyramid(ref, bit_depth, ref_pyramid)) {
+ *mem_alloc_failed = true;
+ return false;
+ }
+ if (!av1_compute_corner_list(ref_pyramid, ref_corners)) {
+ *mem_alloc_failed = true;
+ return false;
+ }
const uint8_t *src_buffer = src_pyramid->layers[0].buffer;
const int src_width = src_pyramid->layers[0].width;
@@ -229,14 +242,17 @@ bool av1_compute_global_motion_feature_match(
// find correspondences between the two images
correspondences = (Correspondence *)aom_malloc(src_corners->num_corners *
sizeof(*correspondences));
- if (!correspondences) return false;
- num_correspondences = aom_determine_correspondence(
+ if (!correspondences) {
+ *mem_alloc_failed = true;
+ return false;
+ }
+ num_correspondences = determine_correspondence(
src_buffer, src_corners->corners, src_corners->num_corners, ref_buffer,
ref_corners->corners, ref_corners->num_corners, src_width, src_height,
src_stride, ref_stride, correspondences);
bool result = ransac(correspondences, num_correspondences, type,
- motion_models, num_motion_models);
+ motion_models, num_motion_models, mem_alloc_failed);
aom_free(correspondences);
return result;
diff --git a/aom_dsp/flow_estimation/corner_match.h b/aom_dsp/flow_estimation/corner_match.h
index bb6994400..4435d2c76 100644
--- a/aom_dsp/flow_estimation/corner_match.h
+++ b/aom_dsp/flow_estimation/corner_match.h
@@ -29,17 +29,10 @@ extern "C" {
#define MATCH_SZ_BY2 ((MATCH_SZ - 1) / 2)
#define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ)
-int aom_determine_correspondence(const unsigned char *src,
- const int *src_corners, int num_src_corners,
- const unsigned char *ref,
- const int *ref_corners, int num_ref_corners,
- int width, int height, int src_stride,
- int ref_stride,
- Correspondence *correspondences);
-
bool av1_compute_global_motion_feature_match(
TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref,
- int bit_depth, MotionModel *motion_models, int num_motion_models);
+ int bit_depth, MotionModel *motion_models, int num_motion_models,
+ bool *mem_alloc_failed);
#ifdef __cplusplus
}
diff --git a/aom_dsp/flow_estimation/disflow.c b/aom_dsp/flow_estimation/disflow.c
index a8e7b06f0..ed5559c75 100644
--- a/aom_dsp/flow_estimation/disflow.c
+++ b/aom_dsp/flow_estimation/disflow.c
@@ -53,7 +53,14 @@
#define UPSAMPLE_CENTER_OFFSET ((DOWNSAMPLE_FACTOR - 1) / 2)
static INLINE void get_cubic_kernel_dbl(double x, double *kernel) {
- assert(0 <= x && x < 1);
+ // Check that the fractional position is in range.
+ //
+ // Note: x is calculated from (e.g.) `u_frac = u - floor(u)`.
+ // Mathematically, this implies that 0 <= x < 1. However, in practice it is
+ // possible to have x == 1 due to floating point rounding. This is fine,
+ // and we still interpolate correctly if we allow x = 1.
+ assert(0 <= x && x <= 1);
+
double x2 = x * x;
double x3 = x2 * x;
kernel[0] = -0.5 * x + x2 - 0.5 * x3;
@@ -154,9 +161,13 @@ static int determine_disflow_correspondence(CornerList *corners,
// (x, y) in src and the other at (x + u, y + v) in ref.
// This function accumulates the gradient-weighted differences between
// the two regions into b.
-static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref,
- int width, int height, int stride, int x,
- int y, double u, double v, int16_t *dt) {
+static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref,
+ int width, int height, int stride, int x,
+ int y, double u, double v,
+ const int16_t *dx, const int16_t *dy,
+ int *b) {
+ memset(b, 0, 2 * sizeof(*b));
+
// Split offset into integer and fractional parts, and compute cubic
// interpolation kernels
const int u_int = (int)floor(u);
@@ -230,8 +241,9 @@ static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref,
const int round_bits = DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2;
const int warped = ROUND_POWER_OF_TWO(result, round_bits);
const int src_px = src[(x + j) + (y + i) * stride] << 3;
- const int err = warped - src_px;
- dt[i * DISFLOW_PATCH_SIZE + j] = err;
+ const int dt = warped - src_px;
+ b[0] += dx[i * DISFLOW_PATCH_SIZE + j] * dt;
+ b[1] += dy[i * DISFLOW_PATCH_SIZE + j] * dt;
}
}
}
@@ -351,20 +363,6 @@ static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride,
M[3] = (double)tmp[3];
}
-static INLINE void compute_flow_vector(const int16_t *dx, int dx_stride,
- const int16_t *dy, int dy_stride,
- const int16_t *dt, int dt_stride,
- int *b) {
- memset(b, 0, 2 * sizeof(*b));
-
- for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
- for (int j = 0; j < DISFLOW_PATCH_SIZE; j++) {
- b[0] += dx[i * dx_stride + j] * dt[i * dt_stride + j];
- b[1] += dy[i * dy_stride + j] * dt[i * dt_stride + j];
- }
- }
-}
-
// Try to invert the matrix M
// Note: Due to the nature of how a least-squares matrix is constructed, all of
// the eigenvalues will be >= 0, and therefore det M >= 0 as well.
@@ -388,7 +386,6 @@ void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x,
double M[4];
double M_inv[4];
int b[2];
- int16_t dt[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
@@ -401,9 +398,8 @@ void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x,
invert_2x2(M, M_inv);
for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) {
- compute_flow_error(src, ref, width, height, stride, x, y, *u, *v, dt);
- compute_flow_vector(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, dt,
- DISFLOW_PATCH_SIZE, b);
+ compute_flow_vector(src, ref, width, height, stride, x, y, *u, *v, dx, dy,
+ b);
// Solve flow equations to find a better estimate for the flow vector
// at this point
@@ -463,8 +459,9 @@ static void fill_flow_field_borders(double *flow, int width, int height,
}
// make sure flow_u and flow_v start at 0
-static void compute_flow_field(const ImagePyramid *src_pyr,
+static bool compute_flow_field(const ImagePyramid *src_pyr,
const ImagePyramid *ref_pyr, FlowField *flow) {
+ bool mem_status = true;
assert(src_pyr->n_levels == ref_pyr->n_levels);
double *flow_u = flow->u;
@@ -473,6 +470,10 @@ static void compute_flow_field(const ImagePyramid *src_pyr,
const size_t flow_size = flow->stride * (size_t)flow->height;
double *u_upscale = aom_malloc(flow_size * sizeof(*u_upscale));
double *v_upscale = aom_malloc(flow_size * sizeof(*v_upscale));
+ if (!u_upscale || !v_upscale) {
+ mem_status = false;
+ goto free_uvscale;
+ }
// Compute flow field from coarsest to finest level of the pyramid
for (int level = src_pyr->n_levels - 1; level >= 0; --level) {
@@ -522,12 +523,16 @@ static void compute_flow_field(const ImagePyramid *src_pyr,
const int upscale_flow_height = cur_flow_height << 1;
const int upscale_stride = flow->stride;
- av1_upscale_plane_double_prec(
+ bool upscale_u_plane = av1_upscale_plane_double_prec(
flow_u, cur_flow_height, cur_flow_width, cur_flow_stride, u_upscale,
upscale_flow_height, upscale_flow_width, upscale_stride);
- av1_upscale_plane_double_prec(
+ bool upscale_v_plane = av1_upscale_plane_double_prec(
flow_v, cur_flow_height, cur_flow_width, cur_flow_stride, v_upscale,
upscale_flow_height, upscale_flow_width, upscale_stride);
+ if (!upscale_u_plane || !upscale_v_plane) {
+ mem_status = false;
+ goto free_uvscale;
+ }
// Multiply all flow vectors by 2.
// When we move down a pyramid level, the image resolution doubles.
@@ -569,8 +574,10 @@ static void compute_flow_field(const ImagePyramid *src_pyr,
}
}
}
+free_uvscale:
aom_free(u_upscale);
aom_free(v_upscale);
+ return mem_status;
}
static FlowField *alloc_flow_field(int frame_width, int frame_height) {
@@ -612,14 +619,24 @@ bool av1_compute_global_motion_disflow(TransformationType type,
YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *ref, int bit_depth,
MotionModel *motion_models,
- int num_motion_models) {
+ int num_motion_models,
+ bool *mem_alloc_failed) {
// Precompute information we will need about each frame
ImagePyramid *src_pyramid = src->y_pyramid;
CornerList *src_corners = src->corners;
ImagePyramid *ref_pyramid = ref->y_pyramid;
- aom_compute_pyramid(src, bit_depth, src_pyramid);
- av1_compute_corner_list(src_pyramid, src_corners);
- aom_compute_pyramid(ref, bit_depth, ref_pyramid);
+ if (!aom_compute_pyramid(src, bit_depth, src_pyramid)) {
+ *mem_alloc_failed = true;
+ return false;
+ }
+ if (!av1_compute_corner_list(src_pyramid, src_corners)) {
+ *mem_alloc_failed = true;
+ return false;
+ }
+ if (!aom_compute_pyramid(ref, bit_depth, ref_pyramid)) {
+ *mem_alloc_failed = true;
+ return false;
+ }
const int src_width = src_pyramid->layers[0].width;
const int src_height = src_pyramid->layers[0].height;
@@ -627,14 +644,22 @@ bool av1_compute_global_motion_disflow(TransformationType type,
assert(ref_pyramid->layers[0].height == src_height);
FlowField *flow = alloc_flow_field(src_width, src_height);
- if (!flow) return false;
+ if (!flow) {
+ *mem_alloc_failed = true;
+ return false;
+ }
- compute_flow_field(src_pyramid, ref_pyramid, flow);
+ if (!compute_flow_field(src_pyramid, ref_pyramid, flow)) {
+ *mem_alloc_failed = true;
+ free_flow_field(flow);
+ return false;
+ }
// find correspondences between the two images using the flow field
Correspondence *correspondences =
aom_malloc(src_corners->num_corners * sizeof(*correspondences));
if (!correspondences) {
+ *mem_alloc_failed = true;
free_flow_field(flow);
return false;
}
@@ -643,7 +668,7 @@ bool av1_compute_global_motion_disflow(TransformationType type,
determine_disflow_correspondence(src_corners, flow, correspondences);
bool result = ransac(correspondences, num_correspondences, type,
- motion_models, num_motion_models);
+ motion_models, num_motion_models, mem_alloc_failed);
aom_free(correspondences);
free_flow_field(flow);
diff --git a/aom_dsp/flow_estimation/disflow.h b/aom_dsp/flow_estimation/disflow.h
index 2e97ba221..d772c8a65 100644
--- a/aom_dsp/flow_estimation/disflow.h
+++ b/aom_dsp/flow_estimation/disflow.h
@@ -93,7 +93,8 @@ bool av1_compute_global_motion_disflow(TransformationType type,
YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *ref, int bit_depth,
MotionModel *motion_models,
- int num_motion_models);
+ int num_motion_models,
+ bool *mem_alloc_failed);
#ifdef __cplusplus
}
diff --git a/aom_dsp/flow_estimation/flow_estimation.c b/aom_dsp/flow_estimation/flow_estimation.c
index a6bf94200..0f47f86f5 100644
--- a/aom_dsp/flow_estimation/flow_estimation.c
+++ b/aom_dsp/flow_estimation/flow_estimation.c
@@ -44,15 +44,17 @@ bool aom_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *ref, int bit_depth,
GlobalMotionMethod gm_method,
MotionModel *motion_models,
- int num_motion_models) {
+ int num_motion_models, bool *mem_alloc_failed) {
switch (gm_method) {
case GLOBAL_MOTION_METHOD_FEATURE_MATCH:
return av1_compute_global_motion_feature_match(
- type, src, ref, bit_depth, motion_models, num_motion_models);
+ type, src, ref, bit_depth, motion_models, num_motion_models,
+ mem_alloc_failed);
case GLOBAL_MOTION_METHOD_DISFLOW:
- return av1_compute_global_motion_disflow(
- type, src, ref, bit_depth, motion_models, num_motion_models);
+ return av1_compute_global_motion_disflow(type, src, ref, bit_depth,
+ motion_models, num_motion_models,
+ mem_alloc_failed);
default: assert(0 && "Unknown global motion estimation type");
}
- return 0;
+ return false;
}
diff --git a/aom_dsp/flow_estimation/flow_estimation.h b/aom_dsp/flow_estimation/flow_estimation.h
index 4f2192cea..2dfae2498 100644
--- a/aom_dsp/flow_estimation/flow_estimation.h
+++ b/aom_dsp/flow_estimation/flow_estimation.h
@@ -86,7 +86,7 @@ bool aom_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *ref, int bit_depth,
GlobalMotionMethod gm_method,
MotionModel *motion_models,
- int num_motion_models);
+ int num_motion_models, bool *mem_alloc_failed);
#ifdef __cplusplus
}
diff --git a/aom_dsp/flow_estimation/ransac.c b/aom_dsp/flow_estimation/ransac.c
index 81c5f2c62..b88a07b02 100644
--- a/aom_dsp/flow_estimation/ransac.c
+++ b/aom_dsp/flow_estimation/ransac.c
@@ -246,7 +246,8 @@ static void copy_points_at_indices(double *dest, const double *src,
// Returns true on success, false on error
static bool ransac_internal(const Correspondence *matched_points, int npoints,
MotionModel *motion_models, int num_desired_motions,
- const RansacModelInfo *model_info) {
+ const RansacModelInfo *model_info,
+ bool *mem_alloc_failed) {
assert(npoints >= 0);
int i = 0;
int minpts = model_info->minpts;
@@ -297,6 +298,7 @@ static bool ransac_internal(const Correspondence *matched_points, int npoints,
if (!(points1 && points2 && corners1 && corners2 && projected_corners &&
motions && inlier_buffer)) {
ret_val = false;
+ *mem_alloc_failed = true;
goto finish_ransac;
}
@@ -469,7 +471,7 @@ static const RansacModelInfo ransac_model_info[TRANS_TYPES] = {
// Returns true on success, false on error
bool ransac(const Correspondence *matched_points, int npoints,
TransformationType type, MotionModel *motion_models,
- int num_desired_motions) {
+ int num_desired_motions, bool *mem_alloc_failed) {
#if ALLOW_TRANSLATION_MODELS
assert(type > IDENTITY && type < TRANS_TYPES);
#else
@@ -477,5 +479,6 @@ bool ransac(const Correspondence *matched_points, int npoints,
#endif // ALLOW_TRANSLATION_MODELS
return ransac_internal(matched_points, npoints, motion_models,
- num_desired_motions, &ransac_model_info[type]);
+ num_desired_motions, &ransac_model_info[type],
+ mem_alloc_failed);
}
diff --git a/aom_dsp/flow_estimation/ransac.h b/aom_dsp/flow_estimation/ransac.h
index 604758024..0529b6e13 100644
--- a/aom_dsp/flow_estimation/ransac.h
+++ b/aom_dsp/flow_estimation/ransac.h
@@ -26,7 +26,7 @@ extern "C" {
bool ransac(const Correspondence *matched_points, int npoints,
TransformationType type, MotionModel *motion_models,
- int num_desired_motions);
+ int num_desired_motions, bool *mem_alloc_failed);
#ifdef __cplusplus
}
diff --git a/aom_dsp/flow_estimation/x86/disflow_sse4.c b/aom_dsp/flow_estimation/x86/disflow_sse4.c
index a62e9a4d9..3c2159a60 100644
--- a/aom_dsp/flow_estimation/x86/disflow_sse4.c
+++ b/aom_dsp/flow_estimation/x86/disflow_sse4.c
@@ -28,7 +28,14 @@
// Note: Max sum(+ve coefficients) = 1.125 * scale
static INLINE void get_cubic_kernel_dbl(double x, double *kernel) {
- assert(0 <= x && x < 1);
+ // Check that the fractional position is in range.
+ //
+ // Note: x is calculated from (e.g.) `u_frac = u - floor(u)`.
+ // Mathematically, this implies that 0 <= x < 1. However, in practice it is
+ // possible to have x == 1 due to floating point rounding. This is fine,
+ // and we still interpolate correctly if we allow x = 1.
+ assert(0 <= x && x <= 1);
+
double x2 = x * x;
double x3 = x2 * x;
kernel[0] = -0.5 * x + x2 - 0.5 * x3;
@@ -61,12 +68,23 @@ static INLINE int get_cubic_value_int(const int *p, const int16_t *kernel) {
//
// TODO(rachelbarker): Test speed/quality impact of using bilinear interpolation
// instead of bicubic interpolation
-static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref,
- int width, int height, int stride, int x,
- int y, double u, double v, int16_t *dt) {
+static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref,
+ int width, int height, int stride, int x,
+ int y, double u, double v,
+ const int16_t *dx, const int16_t *dy,
+ int *b) {
// This function is written to do 8x8 convolutions only
assert(DISFLOW_PATCH_SIZE == 8);
+ // Accumulate 4 32-bit partial sums for each element of b
+ // These will be flattened at the end.
+ __m128i b0_acc = _mm_setzero_si128();
+ __m128i b1_acc = _mm_setzero_si128();
+#if CHECK_RESULTS
+ // Also keep a running sum using the C algorithm, for cross-checking
+ int c_result[2] = { 0 };
+#endif // CHECK_RESULTS
+
// Split offset into integer and fractional parts, and compute cubic
// interpolation kernels
const int u_int = (int)floor(u);
@@ -231,10 +249,20 @@ static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref,
__m128i src_pixels = _mm_slli_epi16(_mm_cvtepu8_epi16(src_pixels_u8), 3);
// Calculate delta from the target patch
- __m128i err = _mm_sub_epi16(warped, src_pixels);
- _mm_storeu_si128((__m128i *)&dt[i * DISFLOW_PATCH_SIZE], err);
+ __m128i dt = _mm_sub_epi16(warped, src_pixels);
+
+ // Load 8 elements each of dx and dy, to pair with the 8 elements of dt
+ // that we have just computed. Then compute 8 partial sums of dx * dt
+ // and dy * dt, implicitly sum to give 4 partial sums of each, and
+ // accumulate.
+ __m128i dx_row = _mm_loadu_si128((__m128i *)&dx[i * DISFLOW_PATCH_SIZE]);
+ __m128i dy_row = _mm_loadu_si128((__m128i *)&dy[i * DISFLOW_PATCH_SIZE]);
+ b0_acc = _mm_add_epi32(b0_acc, _mm_madd_epi16(dx_row, dt));
+ b1_acc = _mm_add_epi32(b1_acc, _mm_madd_epi16(dy_row, dt));
#if CHECK_RESULTS
+ int16_t dt_arr[8];
+ memcpy(dt_arr, &dt, 8 * sizeof(*dt_arr));
for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) {
int16_t *p = &tmp[i * DISFLOW_PATCH_SIZE + j];
int arr[4] = { p[-DISFLOW_PATCH_SIZE], p[0], p[DISFLOW_PATCH_SIZE],
@@ -247,12 +275,28 @@ static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref,
// of precision to match the scale of the dx and dy arrays.
const int c_warped = ROUND_POWER_OF_TWO(result, round_bits);
const int c_src_px = src[(x + j) + (y + i) * stride] << 3;
- const int c_err = c_warped - c_src_px;
- (void)c_err;
- assert(dt[i * DISFLOW_PATCH_SIZE + j] == c_err);
+ const int c_dt = c_warped - c_src_px;
+
+ assert(dt_arr[j] == c_dt);
+
+ c_result[0] += dx[i * DISFLOW_PATCH_SIZE + j] * c_dt;
+ c_result[1] += dy[i * DISFLOW_PATCH_SIZE + j] * c_dt;
}
#endif // CHECK_RESULTS
}
+
+ // Flatten the two sets of partial sums to find the final value of b
+ // We need to set b[0] = sum(b0_acc), b[1] = sum(b1_acc).
+ // We need to do 6 additions in total; a `hadd` instruction can take care
+ // of four of them, leaving two scalar additions.
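+ // _mm_hadd_epi32(a, b) returns { a0 + a1, a2 + a3, b0 + b1, b2 + b3 }, so
+ // lanes 0-1 hold the partials of b[0] and lanes 2-3 those of b[1].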
+ __m128i partial_sum = _mm_hadd_epi32(b0_acc, b1_acc);
+ b[0] = _mm_extract_epi32(partial_sum, 0) + _mm_extract_epi32(partial_sum, 1);
+ b[1] = _mm_extract_epi32(partial_sum, 2) + _mm_extract_epi32(partial_sum, 3);
+
+#if CHECK_RESULTS
+ assert(b[0] == c_result[0]);
+ assert(b[1] == c_result[1]);
+#endif // CHECK_RESULTS
}
static INLINE void sobel_filter_x(const uint8_t *src, int src_stride,
@@ -401,50 +445,6 @@ static INLINE void sobel_filter_y(const uint8_t *src, int src_stride,
}
}
-static INLINE void compute_flow_vector(const int16_t *dx, int dx_stride,
- const int16_t *dy, int dy_stride,
- const int16_t *dt, int dt_stride,
- int *b) {
- __m128i b0_acc = _mm_setzero_si128();
- __m128i b1_acc = _mm_setzero_si128();
-
- for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
- // Need to load 8 values of dx, 8 of dy, 8 of dt, which conveniently
- // works out to one register each. Then just calculate dx * dt, dy * dt,
- // and (implicitly) sum horizontally in pairs.
- // This gives four 32-bit partial sums for each of b[0] and b[1],
- // which can be accumulated and summed at the end.
- __m128i dx_row = _mm_loadu_si128((__m128i *)&dx[i * dx_stride]);
- __m128i dy_row = _mm_loadu_si128((__m128i *)&dy[i * dy_stride]);
- __m128i dt_row = _mm_loadu_si128((__m128i *)&dt[i * dt_stride]);
-
- b0_acc = _mm_add_epi32(b0_acc, _mm_madd_epi16(dx_row, dt_row));
- b1_acc = _mm_add_epi32(b1_acc, _mm_madd_epi16(dy_row, dt_row));
- }
-
- // We need to set b[0] = sum(b0_acc), b[1] = sum(b1_acc).
- // We might as well use a `hadd` instruction to do 4 of the additions
- // needed here. Then that just leaves two more additions, which can be
- // done in scalar code
- __m128i partial_sum = _mm_hadd_epi32(b0_acc, b1_acc);
- b[0] = _mm_extract_epi32(partial_sum, 0) + _mm_extract_epi32(partial_sum, 1);
- b[1] = _mm_extract_epi32(partial_sum, 2) + _mm_extract_epi32(partial_sum, 3);
-
-#if CHECK_RESULTS
- int c_result[2] = { 0 };
-
- for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
- for (int j = 0; j < DISFLOW_PATCH_SIZE; j++) {
- c_result[0] += dx[i * dx_stride + j] * dt[i * dt_stride + j];
- c_result[1] += dy[i * dy_stride + j] * dt[i * dt_stride + j];
- }
- }
-
- assert(b[0] == c_result[0]);
- assert(b[1] == c_result[1]);
-#endif // CHECK_RESULTS
-}
-
static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride,
const int16_t *dy, int dy_stride,
double *M) {
@@ -528,7 +528,6 @@ void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref,
double M[4];
double M_inv[4];
int b[2];
- int16_t dt[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
@@ -541,9 +540,8 @@ void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref,
invert_2x2(M, M_inv);
for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) {
- compute_flow_error(src, ref, width, height, stride, x, y, *u, *v, dt);
- compute_flow_vector(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, dt,
- DISFLOW_PATCH_SIZE, b);
+ compute_flow_vector(src, ref, width, height, stride, x, y, *u, *v, dx, dy,
+ b);
// Solve flow equations to find a better estimate for the flow vector
// at this point
diff --git a/aom_dsp/pyramid.c b/aom_dsp/pyramid.c
index a26d3026c..324a18bae 100644
--- a/aom_dsp/pyramid.c
+++ b/aom_dsp/pyramid.c
@@ -112,7 +112,7 @@ ImagePyramid *aom_alloc_pyramid(int width, int height, int n_levels,
return NULL;
}
- pyr->layers = aom_calloc(n_levels, sizeof(PyramidLayer));
+ pyr->layers = aom_calloc(n_levels, sizeof(*pyr->layers));
if (!pyr->layers) {
aom_free(pyr);
return NULL;
@@ -125,10 +125,10 @@ ImagePyramid *aom_alloc_pyramid(int width, int height, int n_levels,
// These are gathered up first, so that we can allocate all pyramid levels
// in a single buffer
size_t buffer_size = 0;
- size_t *layer_offsets = aom_calloc(n_levels, sizeof(size_t));
+ size_t *layer_offsets = aom_calloc(n_levels, sizeof(*layer_offsets));
if (!layer_offsets) {
- aom_free(pyr);
aom_free(pyr->layers);
+ aom_free(pyr);
return NULL;
}
@@ -195,8 +195,8 @@ ImagePyramid *aom_alloc_pyramid(int width, int height, int n_levels,
pyr->buffer_alloc =
aom_memalign(PYRAMID_ALIGNMENT, buffer_size * sizeof(*pyr->buffer_alloc));
if (!pyr->buffer_alloc) {
- aom_free(pyr);
aom_free(pyr->layers);
+ aom_free(pyr);
aom_free(layer_offsets);
return NULL;
}
@@ -250,7 +250,7 @@ static INLINE void fill_border(uint8_t *img_buf, const int width,
// Compute coarse to fine pyramids for a frame
// This must only be called while holding frame_pyr->mutex
-static INLINE void fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
+static INLINE bool fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
ImagePyramid *frame_pyr) {
int n_levels = frame_pyr->n_levels;
const int frame_width = frame->y_crop_width;
@@ -312,11 +312,13 @@ static INLINE void fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
// 2) Up/downsampling by a factor of 2 can be implemented much more
// efficiently than up/downsampling by a generic ratio.
// TODO(rachelbarker): Use optimized downsample-by-2 function
- av1_resize_plane(prev_buffer, this_height << 1, this_width << 1,
- prev_stride, this_buffer, this_height, this_width,
- this_stride);
+ if (!av1_resize_plane(prev_buffer, this_height << 1, this_width << 1,
+ prev_stride, this_buffer, this_height, this_width,
+ this_stride))
+ return false;
fill_border(this_buffer, this_width, this_height, this_stride);
}
+ return true;
}
// Fill out a downsampling pyramid for a given frame.
@@ -331,7 +333,7 @@ static INLINE void fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
//
// However, if the input frame has a side of length < MIN_PYRAMID_SIZE,
// we will still construct the top level.
-void aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
+bool aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
ImagePyramid *pyr) {
assert(pyr);
@@ -344,9 +346,9 @@ void aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
#endif // CONFIG_MULTITHREAD
if (!pyr->valid) {
- fill_pyramid(frame, bit_depth, pyr);
- pyr->valid = true;
+ pyr->valid = fill_pyramid(frame, bit_depth, pyr);
}
+ bool valid = pyr->valid;
// At this point, the pyramid is valid if the computation succeeded, and
// can be safely read from without holding the mutex any more
@@ -354,6 +356,7 @@ void aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
#if CONFIG_MULTITHREAD
pthread_mutex_unlock(&pyr->mutex);
#endif // CONFIG_MULTITHREAD
+ return valid;
}
#ifndef NDEBUG
diff --git a/aom_dsp/pyramid.h b/aom_dsp/pyramid.h
index 812aae181..9442a1ff0 100644
--- a/aom_dsp/pyramid.h
+++ b/aom_dsp/pyramid.h
@@ -100,7 +100,7 @@ ImagePyramid *aom_alloc_pyramid(int width, int height, int n_levels,
//
// However, if the input frame has a side of length < MIN_PYRAMID_SIZE,
// we will still construct the top level.
-void aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
+bool aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
ImagePyramid *pyr);
#ifndef NDEBUG
diff --git a/aom_dsp/quantize.c b/aom_dsp/quantize.c
index 8dd5b0b0f..e5c960b82 100644
--- a/aom_dsp/quantize.c
+++ b/aom_dsp/quantize.c
@@ -11,6 +11,7 @@
#include "aom_dsp/quantize.h"
#include "aom_mem/aom_mem.h"
+#include "config/aom_dsp_rtcd.h"
#if !CONFIG_REALTIME_ONLY
void aom_quantize_b_adaptive_helper_c(
diff --git a/aom_dsp/sad.c b/aom_dsp/sad.c
index 341a5ffa8..8d69e3bf1 100644
--- a/aom_dsp/sad.c
+++ b/aom_dsp/sad.c
@@ -257,32 +257,32 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
highbd_sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \
}
-#define HIGHBD_SAD_MXNX4D(m, n) \
- void aom_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
- const uint8_t *const ref_array[], \
- int ref_stride, uint32_t *sad_array) { \
- int i; \
- for (i = 0; i < 4; ++i) { \
- sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride, \
- ref_array[i], ref_stride); \
- } \
- } \
- void aom_highbd_sad_skip_##m##x##n##x4d_c( \
- const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \
- int ref_stride, uint32_t *sad_array) { \
- int i; \
- for (i = 0; i < 4; ++i) { \
- sad_array[i] = 2 * highbd_sad(src, 2 * src_stride, ref_array[i], \
- 2 * ref_stride, (m), (n / 2)); \
- } \
+#define HIGHBD_SAD_MXNX4D(m, n) \
+ void aom_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride, \
+ ref_array[i], ref_stride); \
+ } \
+ } \
+ void aom_highbd_sad_skip_##m##x##n##x4d_c( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = 2 * highbd_sad(src, 2 * src_stride, ref_array[i], \
+ 2 * ref_stride, (m), (n / 2)); \
+ } \
}
// Call SIMD version of aom_highbd_sad_mxnx4d if the 3d version is unavailable.
-#define HIGHBD_SAD_MXNX3D(m, n) \
- void aom_highbd_sad##m##x##n##x3d_c(const uint8_t *src, int src_stride, \
- const uint8_t *const ref_array[], \
- int ref_stride, uint32_t *sad_array) { \
- aom_highbd_sad##m##x##n##x4d(src, src_stride, ref_array, ref_stride, \
- sad_array); \
+#define HIGHBD_SAD_MXNX3D(m, n) \
+ void aom_highbd_sad##m##x##n##x3d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ aom_highbd_sad##m##x##n##x4d(src, src_stride, ref_array, ref_stride, \
+ sad_array); \
}
// 128x128
diff --git a/aom_dsp/simd/v64_intrinsics_arm.h b/aom_dsp/simd/v64_intrinsics_arm.h
index 8d07c3440..f38af8036 100644
--- a/aom_dsp/simd/v64_intrinsics_arm.h
+++ b/aom_dsp/simd/v64_intrinsics_arm.h
@@ -67,22 +67,7 @@ SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
*((uint32_t *)p) = a;
}
-SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
-#if defined(__clang__)
- vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
- 0);
-#elif defined(__CC_ARM)
- *(__packed uint32_t *)p) = a;
-#elif defined(__GNUC__)
- struct Unaligned32Struct {
- uint32_t value;
- uint8_t dummy; // To make the size non-power-of-two.
- } __attribute__((__packed__));
- ((struct Unaligned32Struct *)p)->value = a;
-#else
- memcpy(p, &a, 4);
-#endif
-}
+SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) { memcpy(p, &a, 4); }
SIMD_INLINE v64 v64_load_aligned(const void *p) {
return vreinterpret_s64_u8(vld1_u8((const uint8_t *)p));
diff --git a/aom_dsp/sse.c b/aom_dsp/sse.c
index 16f6b58bd..bfe76edc3 100644
--- a/aom_dsp/sse.c
+++ b/aom_dsp/sse.c
@@ -9,7 +9,12 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-/* Sum the difference between every corresponding element of the buffers. */
+/*
+ * Sum the square of the difference between every corresponding element of the
+ * buffers.
+ */
+
+#include <stdlib.h>
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c
index 63c1e5fcf..f02c3077a 100644
--- a/aom_dsp/variance.c
+++ b/aom_dsp/variance.c
@@ -1058,7 +1058,7 @@ static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
}
#define HIGHBD_OBMC_VAR(W, H) \
- unsigned int aom_highbd_obmc_variance##W##x##H##_c( \
+ unsigned int aom_highbd_8_obmc_variance##W##x##H##_c( \
const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
const int32_t *mask, unsigned int *sse) { \
int sum; \
@@ -1087,7 +1087,7 @@ static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
}
#define HIGHBD_OBMC_SUBPIX_VAR(W, H) \
- unsigned int aom_highbd_obmc_sub_pixel_variance##W##x##H##_c( \
+ unsigned int aom_highbd_8_obmc_sub_pixel_variance##W##x##H##_c( \
const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
uint16_t fdata3[(H + 1) * W]; \
@@ -1098,8 +1098,8 @@ static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
aom_highbd_var_filter_block2d_bil_second_pass( \
fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
- return aom_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
- wsrc, mask, sse); \
+ return aom_highbd_8_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+ W, wsrc, mask, sse); \
} \
\
unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c( \
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
index 582305957..245fda1e9 100644
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -23,32 +23,6 @@
#include "aom_ports/mem.h"
#include "aom_ports/emmintrin_compat.h"
-// filters only for the 4_h8 convolution
-DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { 0, 1, 1, 2, 2, 3,
- 3, 4, 2, 3, 3, 4,
- 4, 5, 5, 6 };
-
-DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { 4, 5, 5, 6, 6, 7,
- 7, 8, 6, 7, 7, 8,
- 8, 9, 9, 10 };
-
-// filters for 8_h8 and 16_h8
-DECLARE_ALIGNED(16, static const uint8_t,
- filt1_global[16]) = { 0, 1, 1, 2, 2, 3, 3, 4,
- 4, 5, 5, 6, 6, 7, 7, 8 };
-
-DECLARE_ALIGNED(16, static const uint8_t,
- filt2_global[16]) = { 2, 3, 3, 4, 4, 5, 5, 6,
- 6, 7, 7, 8, 8, 9, 9, 10 };
-
-DECLARE_ALIGNED(16, static const uint8_t,
- filt3_global[16]) = { 4, 5, 5, 6, 6, 7, 7, 8,
- 8, 9, 9, 10, 10, 11, 11, 12 };
-
-DECLARE_ALIGNED(16, static const uint8_t,
- filt4_global[16]) = { 6, 7, 7, 8, 8, 9, 9, 10,
- 10, 11, 11, 12, 12, 13, 13, 14 };
-
DECLARE_ALIGNED(32, static const uint8_t, filt_h4[]) = {
0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5,
@@ -64,11 +38,6 @@ DECLARE_ALIGNED(32, static const uint8_t, filtd4[]) = {
2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
};
-// These are reused by the avx2 intrinsics.
-filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3;
-
static void aom_filter_block1d4_h4_ssse3(
const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
@@ -196,74 +165,6 @@ static void aom_filter_block1d4_v4_ssse3(
}
}
-void aom_filter_block1d4_h8_intrin_ssse3(
- const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
- ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i firstFilters, secondFilters, shuffle1, shuffle2;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
- __m128i addFilterReg64, filtersReg, srcReg, minReg;
- unsigned int i;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits in the filter into the first lane
- firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
- // duplicate only the third 16 bit in the filter into the first lane
- secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
- // duplicate only the seconds 16 bits in the filter into the second lane
- // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
- firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
- // duplicate only the forth 16 bits in the filter into the second lane
- // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
- secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
-
- // loading the local filters
- shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
- shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
-
- for (i = 0; i < output_height; i++) {
- srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
-
- // filter the source buffer
- srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
- srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
- // extract the higher half of the lane
- srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
- srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
-
- minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
-
- // add and saturate all the results together
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
- srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
- // shift by 7 bit each 16 bits
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
- // shrink to 8 bit each 16 bits
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
- src_ptr += src_pixels_per_line;
-
- // save only 4 bytes
- *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);
-
- output_ptr += output_pitch;
- }
-}
-
static void aom_filter_block1d8_h4_ssse3(
const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
@@ -403,168 +304,6 @@ static void aom_filter_block1d8_v4_ssse3(
}
}
-void aom_filter_block1d8_h8_intrin_ssse3(
- const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
- ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
- __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
- __m128i addFilterReg64, filtersReg, minReg;
- unsigned int i;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits (first and second byte)
- // across 128 bit register
- firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
- // duplicate only the second 16 bits (third and forth byte)
- // across 128 bit register
- secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 128 bit register
- thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
- // duplicate only the forth 16 bits (seventh and eighth byte)
- // across 128 bit register
- forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
- filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
- filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
- filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
- filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
- for (i = 0; i < output_height; i++) {
- srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
-
- // filter the source buffer
- srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
- srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
- // filter the source buffer
- srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
- srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
- srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
-
- // add and saturate all the results together
- minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
- srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
- // shift by 7 bit each 16 bits
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
- // shrink to 8 bit each 16 bits
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
- src_ptr += src_pixels_per_line;
-
- // save only 8 bytes
- _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
-
- output_ptr += output_pitch;
- }
-}
-
-void aom_filter_block1d8_v8_intrin_ssse3(
- const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
- ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i addFilterReg64, filtersReg, minReg;
- __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
- __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
- __m128i srcReg8;
- unsigned int i;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits in the filter
- firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
- // duplicate only the second 16 bits in the filter
- secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
- // duplicate only the third 16 bits in the filter
- thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
- // duplicate only the forth 16 bits in the filter
- forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
- // load the first 7 rows of 8 bytes
- srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
- srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
- srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
- srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
- srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
- srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
- srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
-
- for (i = 0; i < output_height; i++) {
- // load the last 8 bytes
- srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
-
- // merge the result together
- srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
- srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
-
- // merge the result together
- srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
- srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
- srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
-
- // add and saturate the results together
- minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
- srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
- // shift by 7 bit each 16 bit
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
- // shrink to 8 bit each 16 bits
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
- src_ptr += src_pitch;
-
- // shift down a row
- srcReg1 = srcReg2;
- srcReg2 = srcReg3;
- srcReg3 = srcReg4;
- srcReg4 = srcReg5;
- srcReg5 = srcReg6;
- srcReg6 = srcReg7;
- srcReg7 = srcReg8;
-
- // save only 8 bytes convolve result
- _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
-
- output_ptr += out_pitch;
- }
-}
-
static void aom_filter_block1d16_h4_ssse3(
const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index ca2752eab..9ab9143ee 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -25,6 +25,11 @@ static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero,
*out_hi = _mm_unpackhi_epi16(in, sign_bits);
}
+static INLINE __m128i invert_sign_32_sse2(__m128i a, __m128i sign) {
+ a = _mm_xor_si128(a, sign);
+ return _mm_sub_epi32(a, sign);
+}
+
void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
int *min, int *max) {
__m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
@@ -583,21 +588,14 @@ void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
int aom_satd_sse2(const tran_low_t *coeff, int length) {
int i;
const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi16(1);
__m128i accum = zero;
- for (i = 0; i < length; i += 16) {
- const __m128i src_line0 = load_tran_low(coeff);
- const __m128i src_line1 = load_tran_low(coeff + 8);
- const __m128i inv0 = _mm_sub_epi16(zero, src_line0);
- const __m128i inv1 = _mm_sub_epi16(zero, src_line1);
- const __m128i abs0 = _mm_max_epi16(src_line0, inv0); // abs(src_line)
- const __m128i abs1 = _mm_max_epi16(src_line1, inv1); // abs(src_line)
- const __m128i sum0 = _mm_madd_epi16(abs0, one);
- const __m128i sum1 = _mm_madd_epi16(abs1, one);
- accum = _mm_add_epi32(accum, sum0);
- accum = _mm_add_epi32(accum, sum1);
- coeff += 16;
+ for (i = 0; i < length; i += 4) {
+ const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
+ const __m128i coeff_sign = _mm_srai_epi32(src_line, 31);
+ const __m128i abs_coeff = invert_sign_32_sse2(src_line, coeff_sign);
+ accum = _mm_add_epi32(accum, abs_coeff);
+ coeff += 4;
}
{ // cascading summation of accum
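
The rewritten loop loads 32-bit coefficients directly and takes absolute values with invert_sign_32_sse2(); a scalar sketch of that xor/subtract identity (abs32_sketch is a hypothetical name):

    #include <stdint.h>

    // sign is 0 for x >= 0 and -1 (all ones) for x < 0, so
    // (x ^ sign) - sign == x when sign == 0, and == ~x + 1 == -x otherwise.
    static int32_t abs32_sketch(int32_t x) {
      const int32_t sign = x >> 31;  // arithmetic shift, as the SSE2 code assumes
      return (x ^ sign) - sign;
    }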
diff --git a/aom_dsp/x86/fft_sse2.c b/aom_dsp/x86/fft_sse2.c
index c6023afab..bdd235bcd 100644
--- a/aom_dsp/x86/fft_sse2.c
+++ b/aom_dsp/x86/fft_sse2.c
@@ -28,6 +28,9 @@ static INLINE void transpose4x4(const float *A, float *B, const int lda,
_mm_store_ps(&B[3 * ldb], row4);
}
+// Referenced by fft_avx2.c.
+void aom_transpose_float_sse2(const float *A, float *B, int n);
+
void aom_transpose_float_sse2(const float *A, float *B, int n) {
for (int y = 0; y < n; y += 4) {
for (int x = 0; x < n; x += 4) {
@@ -36,6 +39,9 @@ void aom_transpose_float_sse2(const float *A, float *B, int n) {
}
}
+// Referenced by fft_avx2.c.
+void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n);
+
void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) {
const int n2 = n / 2;
output[0] = packed[0];
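
The added declarations only provide prototypes for functions that are defined here and called from fft_avx2.c; without a visible prototype, defining an externally visible function warns under -Wmissing-prototypes on gcc/clang. A minimal illustration (helper is a hypothetical name):

    void helper(int n);              // prototype: silences -Wmissing-prototypes
    void helper(int n) { (void)n; }  // externally visible definition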
diff --git a/aom_dsp/x86/highbd_sad_avx2.c b/aom_dsp/x86/highbd_sad_avx2.c
index e11754e59..6c78eeeef 100644
--- a/aom_dsp/x86/highbd_sad_avx2.c
+++ b/aom_dsp/x86/highbd_sad_avx2.c
@@ -604,7 +604,7 @@ static void init_sad(__m256i *s) {
static AOM_FORCE_INLINE void aom_highbd_sadMxNxD_avx2(
int M, int N, int D, const uint8_t *src, int src_stride,
- const uint8_t *const ref_array[], int ref_stride, uint32_t *sad_array) {
+ const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]) {
__m256i sad_vec[4];
const uint16_t *refp[4];
const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
@@ -639,26 +639,26 @@ static AOM_FORCE_INLINE void aom_highbd_sadMxNxD_avx2(
#define HIGHBD_SAD_MXNX4D_AVX2(m, n) \
void aom_highbd_sad##m##x##n##x4d_avx2( \
- const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \
- int ref_stride, uint32_t *sad_array) { \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
aom_highbd_sadMxNxD_avx2(m, n, 4, src, src_stride, ref_array, ref_stride, \
sad_array); \
}
-#define HIGHBD_SAD_SKIP_MXNX4D_AVX2(m, n) \
- void aom_highbd_sad_skip_##m##x##n##x4d_avx2( \
- const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \
- int ref_stride, uint32_t *sad_array) { \
- aom_highbd_sadMxNxD_avx2(m, (n / 2), 4, src, 2 * src_stride, ref_array, \
- 2 * ref_stride, sad_array); \
- sad_array[0] <<= 1; \
- sad_array[1] <<= 1; \
- sad_array[2] <<= 1; \
- sad_array[3] <<= 1; \
+#define HIGHBD_SAD_SKIP_MXNX4D_AVX2(m, n) \
+ void aom_highbd_sad_skip_##m##x##n##x4d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ aom_highbd_sadMxNxD_avx2(m, (n / 2), 4, src, 2 * src_stride, ref_array, \
+ 2 * ref_stride, sad_array); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
}
#define HIGHBD_SAD_MXNX3D_AVX2(m, n) \
void aom_highbd_sad##m##x##n##x3d_avx2( \
- const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \
- int ref_stride, uint32_t *sad_array) { \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
aom_highbd_sadMxNxD_avx2(m, n, 3, src, src_stride, ref_array, ref_stride, \
sad_array); \
}
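
The *_skip_* macros above halve the work by measuring SAD on every other row (both strides doubled, n halved) and then doubling the result; a scalar sketch of the same approximation (sad_skip_sketch is hypothetical):

    #include <stdint.h>

    static uint32_t sad_skip_sketch(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    int w, int h) {
      uint32_t sad = 0;
      for (int y = 0; y < h; y += 2) {  // every other row
        for (int x = 0; x < w; ++x) {
          const int d = src[x] - ref[x];
          sad += (uint32_t)(d < 0 ? -d : d);
        }
        src += 2 * src_stride;
        ref += 2 * ref_stride;
      }
      return sad << 1;  // compensate for the skipped rows
    }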
diff --git a/aom_dsp/x86/highbd_variance_avx2.c b/aom_dsp/x86/highbd_variance_avx2.c
index 36e647383..b4ff91d85 100644
--- a/aom_dsp/x86/highbd_variance_avx2.c
+++ b/aom_dsp/x86/highbd_variance_avx2.c
@@ -729,14 +729,16 @@ VAR_FN(32, 16, 16, 9)
VAR_FN(16, 32, 16, 9)
VAR_FN(16, 16, 16, 8)
VAR_FN(16, 8, 8, 7)
-VAR_FN(16, 4, 16, 6)
-VAR_FN(8, 32, 8, 8)
-VAR_FN(32, 8, 8, 8)
-VAR_FN(16, 64, 16, 10)
-VAR_FN(64, 16, 16, 10)
VAR_FN(8, 16, 8, 7)
VAR_FN(8, 8, 8, 6)
+#if !CONFIG_REALTIME_ONLY
+VAR_FN(16, 64, 16, 10)
+VAR_FN(32, 8, 8, 8)
+VAR_FN(64, 16, 16, 10)
+VAR_FN(8, 32, 8, 8)
+#endif // !CONFIG_REALTIME_ONLY
+
#undef VAR_FN
#define SSE2_HEIGHT(H) \
diff --git a/aom_dsp/x86/obmc_variance_sse4.c b/aom_dsp/x86/obmc_variance_sse4.c
index aa73c392d..d3f5f52df 100644
--- a/aom_dsp/x86/obmc_variance_sse4.c
+++ b/aom_dsp/x86/obmc_variance_sse4.c
@@ -258,10 +258,10 @@ static INLINE void hbd_obmc_variance_w8n(
*sse += xx_hsum_epi32_si64(v_sse_d);
}
-static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
- const int32_t *wsrc,
- const int32_t *mask, int w, int h,
- unsigned int *sse, int *sum) {
+static INLINE void highbd_8_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
int64_t sum64 = 0;
uint64_t sse64 = 0;
if (w == 4) {
@@ -328,11 +328,11 @@ static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
}
#define HBD_OBMCVARWXH(W, H) \
- unsigned int aom_highbd_obmc_variance##W##x##H##_sse4_1( \
+ unsigned int aom_highbd_8_obmc_variance##W##x##H##_sse4_1( \
const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
const int32_t *mask, unsigned int *sse) { \
int sum; \
- highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ highbd_8_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
} \
\
diff --git a/aom_dsp/x86/sse_avx2.c b/aom_dsp/x86/sse_avx2.c
index e6ee2fcab..c5a5f5c23 100644
--- a/aom_dsp/x86/sse_avx2.c
+++ b/aom_dsp/x86/sse_avx2.c
@@ -8,9 +8,11 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+
#include <smmintrin.h>
#include <immintrin.h>
+#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "aom_ports/mem.h"
@@ -85,6 +87,7 @@ static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride,
const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
*sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
}
+
static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride, __m256i *sum) {
const __m128i v_a0 = xx_loadl_64(a);
@@ -96,6 +99,7 @@ static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride,
const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
*sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
}
+
int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
int b_stride, int width, int height) {
int32_t y = 0;
@@ -249,6 +253,7 @@ static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a,
const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
*sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
}
+
int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8,
int b_stride, int width, int height) {
int32_t y = 0;
diff --git a/aom_dsp/x86/sse_sse4.c b/aom_dsp/x86/sse_sse4.c
index 5f95eb9ae..7e74554d7 100644
--- a/aom_dsp/x86/sse_sse4.c
+++ b/aom_dsp/x86/sse_sse4.c
@@ -13,6 +13,7 @@
#include <smmintrin.h>
#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
#include "aom_ports/mem.h"
#include "aom/aom_integer.h"
@@ -62,6 +63,7 @@ static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride,
const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
*sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
}
+
static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b,
__m128i *sum) {
const __m128i v_a0 = xx_loadl_64(a);
diff --git a/aom_dsp/x86/synonyms_avx2.h b/aom_dsp/x86/synonyms_avx2.h
index 4d6ee6ad6..b729e5f41 100644
--- a/aom_dsp/x86/synonyms_avx2.h
+++ b/aom_dsp/x86/synonyms_avx2.h
@@ -62,8 +62,8 @@ static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
}
static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
- __m128i mhi = _mm_loadu_si128((__m128i *)(hi));
- __m128i mlo = _mm_loadu_si128((__m128i *)(lo));
+ __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
+ __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
return yy_set_m128i(mhi, mlo);
}
diff --git a/aom_ports/aarch32_cpudetect.c b/aom_ports/aarch32_cpudetect.c
new file mode 100644
index 000000000..753f95711
--- /dev/null
+++ b/aom_ports/aarch32_cpudetect.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+// Feature detection code for Armv7-A / AArch32.
+
+#include "arm_cpudetect.h"
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+static int arm_get_cpu_caps(void) {
+ // This function should actually be a no-op. There is no way to adjust any of
+ // these because the RTCD tables do not exist: the functions are called
+ // statically.
+ int flags = 0;
+#if HAVE_NEON
+ flags |= HAS_NEON;
+#endif // HAVE_NEON
+ return flags;
+}
+
+#elif defined(_MSC_VER) // end !CONFIG_RUNTIME_CPU_DETECT
+
+static int arm_get_cpu_caps(void) {
+ int flags = 0;
+#if HAVE_NEON
+ // MSVC has no inline __asm support for Arm, but it does let you __emit
+ // instructions via their assembled hex code.
+ // All of these instructions should be essentially nops.
+ __try {
+ // VORR q0,q0,q0
+ __emit(0xF2200150);
+ flags |= HAS_NEON;
+ } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
+ // Ignore exception.
+ }
+#endif // HAVE_NEON
+ return flags;
+}
+
+#elif defined(ANDROID_USE_CPU_FEATURES_LIB)
+
+static int arm_get_cpu_caps(void) {
+ int flags = 0;
+#if HAVE_NEON
+ uint64_t features = android_getCpuFeatures();
+ if (features & ANDROID_CPU_ARM_FEATURE_NEON) flags |= HAS_NEON;
+#endif // HAVE_NEON
+ return flags;
+}
+
+#elif defined(__linux__) // end defined(ANDROID_USE_CPU_FEATURES_LIB)
+
+#include <sys/auxv.h>
+
+// Define hwcap values ourselves: building with an old auxv header where these
+// hwcap values are not defined should not prevent features from being enabled.
+#define AOM_AARCH32_HWCAP_NEON (1 << 12)
+
+static int arm_get_cpu_caps(void) {
+ int flags = 0;
+ unsigned long hwcap = getauxval(AT_HWCAP);
+#if HAVE_NEON
+ if (hwcap & AOM_AARCH32_HWCAP_NEON) flags |= HAS_NEON;
+#endif // HAVE_NEON
+ return flags;
+}
+#else // end __linux__
+#error \
+ "Runtime CPU detection selected, but no CPU detection method " \
+"available for your platform. Rerun cmake with -DCONFIG_RUNTIME_CPU_DETECT=0."
+#endif
+
+int aom_arm_cpu_caps(void) {
+ int flags = 0;
+ if (arm_cpu_env_flags(&flags)) {
+ return flags;
+ }
+ return arm_get_cpu_caps() & arm_cpu_env_mask();
+}
diff --git a/aom_ports/aarch64_cpudetect.c b/aom_ports/aarch64_cpudetect.c
new file mode 100644
index 000000000..43d5a149c
--- /dev/null
+++ b/aom_ports/aarch64_cpudetect.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "arm_cpudetect.h"
+
+#if defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+static int arm_get_cpu_caps(void) {
+ // This function should actually be a no-op. There is no way to adjust any of
+ // these because the RTCD tables do not exist: the functions are called
+ // statically.
+ int flags = 0;
+#if HAVE_NEON
+ flags |= HAS_NEON;
+#endif // HAVE_NEON
+ return flags;
+}
+
+#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT
+
+// sysctlbyname() parameter documentation for instruction set characteristics:
+// https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics
+static INLINE bool have_feature(const char *feature) {
+ int64_t feature_present = 0;
+ size_t size = sizeof(feature_present);
+ if (sysctlbyname(feature, &feature_present, &size, NULL, 0) != 0) {
+ return false;
+ }
+ return feature_present;
+}
+
+static int arm_get_cpu_caps(void) {
+ int flags = 0;
+#if HAVE_NEON
+ flags |= HAS_NEON;
+#endif // HAVE_NEON
+#if HAVE_ARM_CRC32
+ if (have_feature("hw.optional.armv8_crc32")) flags |= HAS_ARM_CRC32;
+#endif // HAVE_ARM_CRC32
+#if HAVE_NEON_DOTPROD
+ if (have_feature("hw.optional.arm.FEAT_DotProd")) flags |= HAS_NEON_DOTPROD;
+#endif // HAVE_NEON_DOTPROD
+#if HAVE_NEON_I8MM
+ if (have_feature("hw.optional.arm.FEAT_I8MM")) flags |= HAS_NEON_I8MM;
+#endif // HAVE_NEON_I8MM
+ return flags;
+}
+
+#elif defined(_WIN32) // end __APPLE__
+
+static int arm_get_cpu_caps(void) {
+ int flags = 0;
+// IsProcessorFeaturePresent() parameter documentation:
+// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent#parameters
+#if HAVE_NEON
+ flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A.
+#endif // HAVE_NEON
+#if HAVE_ARM_CRC32
+ if (IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)) {
+ flags |= HAS_ARM_CRC32;
+ }
+#endif // HAVE_ARM_CRC32
+#if HAVE_NEON_DOTPROD
+// Support for PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE was added in Windows SDK
+// 20348, supported by Windows 11 and Windows Server 2022.
+#if defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
+ if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) {
+ flags |= HAS_NEON_DOTPROD;
+ }
+#endif // defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
+#endif // HAVE_NEON_DOTPROD
+ // No I8MM or SVE feature detection available on Windows at time of writing.
+ return flags;
+}
+
+#elif defined(ANDROID_USE_CPU_FEATURES_LIB)
+
+static int arm_get_cpu_caps(void) {
+ int flags = 0;
+#if HAVE_NEON
+ flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A.
+#endif // HAVE_NEON
+ return flags;
+}
+
+#elif defined(__linux__) // end defined(ANDROID_USE_CPU_FEATURES_LIB)
+
+#include <sys/auxv.h>
+
+// Define hwcap values ourselves: building with an old auxv header where these
+// hwcap values are not defined should not prevent features from being enabled.
+#define AOM_AARCH64_HWCAP_CRC32 (1 << 7)
+#define AOM_AARCH64_HWCAP_ASIMDDP (1 << 20)
+#define AOM_AARCH64_HWCAP_SVE (1 << 22)
+#define AOM_AARCH64_HWCAP2_I8MM (1 << 13)
+
+static int arm_get_cpu_caps(void) {
+ int flags = 0;
+ unsigned long hwcap = getauxval(AT_HWCAP);
+ unsigned long hwcap2 = getauxval(AT_HWCAP2);
+#if HAVE_NEON
+ flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A.
+#endif // HAVE_NEON
+#if HAVE_ARM_CRC32
+ if (hwcap & AOM_AARCH64_HWCAP_CRC32) flags |= HAS_ARM_CRC32;
+#endif // HAVE_ARM_CRC32
+#if HAVE_NEON_DOTPROD
+ if (hwcap & AOM_AARCH64_HWCAP_ASIMDDP) flags |= HAS_NEON_DOTPROD;
+#endif // HAVE_NEON_DOTPROD
+#if HAVE_NEON_I8MM
+ if (hwcap2 & AOM_AARCH64_HWCAP2_I8MM) flags |= HAS_NEON_I8MM;
+#endif // HAVE_NEON_I8MM
+#if HAVE_SVE
+ if (hwcap & AOM_AARCH64_HWCAP_SVE) flags |= HAS_SVE;
+#endif // HAVE_SVE
+ return flags;
+}
+
+#elif defined(__Fuchsia__) // end __linux__
+
+#include <zircon/features.h>
+#include <zircon/syscalls.h>
+
+// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/894282.
+#ifndef ZX_ARM64_FEATURE_ISA_I8MM
+#define ZX_ARM64_FEATURE_ISA_I8MM ((uint32_t)(1u << 19))
+#endif
+// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/895083.
+#ifndef ZX_ARM64_FEATURE_ISA_SVE
+#define ZX_ARM64_FEATURE_ISA_SVE ((uint32_t)(1u << 20))
+#endif
+
+static int arm_get_cpu_caps(void) {
+ int flags = 0;
+#if HAVE_NEON
+ flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A.
+#endif // HAVE_NEON
+ uint32_t features;
+ zx_status_t status = zx_system_get_features(ZX_FEATURE_KIND_CPU, &features);
+ if (status != ZX_OK) return flags;
+#if HAVE_ARM_CRC32
+ if (features & ZX_ARM64_FEATURE_ISA_CRC32) flags |= HAS_ARM_CRC32;
+#endif // HAVE_ARM_CRC32
+#if HAVE_NEON_DOTPROD
+ if (features & ZX_ARM64_FEATURE_ISA_DP) flags |= HAS_NEON_DOTPROD;
+#endif // HAVE_NEON_DOTPROD
+#if HAVE_NEON_I8MM
+ if (features & ZX_ARM64_FEATURE_ISA_I8MM) flags |= HAS_NEON_I8MM;
+#endif // HAVE_NEON_I8MM
+#if HAVE_SVE
+ if (features & ZX_ARM64_FEATURE_ISA_SVE) flags |= HAS_SVE;
+#endif // HAVE_SVE
+ return flags;
+}
+
+#else // end __Fuchsia__
+#error \
+ "Runtime CPU detection selected, but no CPU detection method " \
+"available for your platform. Rerun cmake with -DCONFIG_RUNTIME_CPU_DETECT=0."
+#endif
+
+int aom_arm_cpu_caps(void) {
+ int flags = 0;
+ if (!arm_cpu_env_flags(&flags)) {
+ flags = arm_get_cpu_caps() & arm_cpu_env_mask();
+ }
+
+ // Restrict flags: FEAT_I8MM assumes that FEAT_DotProd is available.
+ if (!(flags & HAS_NEON_DOTPROD)) flags &= ~HAS_NEON_I8MM;
+
+ // Restrict flags: SVE assumes that FEAT_{DotProd,I8MM} are available.
+ if (!(flags & HAS_NEON_DOTPROD)) flags &= ~HAS_SVE;
+ if (!(flags & HAS_NEON_I8MM)) flags &= ~HAS_SVE;
+
+ return flags;
+}
diff --git a/aom_ports/aom_ports.cmake b/aom_ports/aom_ports.cmake
index e3b67e48a..8fd2ffd07 100644
--- a/aom_ports/aom_ports.cmake
+++ b/aom_ports/aom_ports.cmake
@@ -24,8 +24,10 @@ list(APPEND AOM_PORTS_ASM_X86 "${AOM_ROOT}/aom_ports/float.asm")
list(APPEND AOM_PORTS_INCLUDES_X86 "${AOM_ROOT}/aom_ports/x86_abi_support.asm")
-list(APPEND AOM_PORTS_SOURCES_ARM "${AOM_ROOT}/aom_ports/arm.h"
- "${AOM_ROOT}/aom_ports/arm_cpudetect.c")
+list(APPEND AOM_PORTS_SOURCES_AARCH32
+ "${AOM_ROOT}/aom_ports/aarch32_cpudetect.c")
+list(APPEND AOM_PORTS_SOURCES_AARCH64
+ "${AOM_ROOT}/aom_ports/aarch64_cpudetect.c")
if(CONFIG_RUNTIME_CPU_DETECT AND ANDROID_NDK)
include_directories(${ANDROID_NDK}/sources/android/cpufeatures)
@@ -57,8 +59,11 @@ function(setup_aom_ports_targets)
elseif(WIN32 AND "${AOM_TARGET_CPU}" STREQUAL "x86_64")
add_asm_library("aom_ports" "AOM_PORTS_ASM_X86")
set(aom_ports_has_symbols 1)
+ elseif("${AOM_TARGET_CPU}" STREQUAL "arm64")
+ add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_AARCH64})
+ set(aom_ports_has_symbols 1)
elseif("${AOM_TARGET_CPU}" MATCHES "arm")
- add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_ARM})
+ add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_AARCH32})
set(aom_ports_has_symbols 1)
elseif("${AOM_TARGET_CPU}" MATCHES "ppc")
add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_PPC})
diff --git a/aom_ports/arm.h b/aom_ports/arm.h
index cb1fb9bec..853741d19 100644
--- a/aom_ports/arm.h
+++ b/aom_ports/arm.h
@@ -19,12 +19,16 @@
extern "C" {
#endif
-/*ARMv5TE "Enhanced DSP" instructions.*/
-#define HAS_EDSP 0x01
-/*ARMv6 "Parallel" or "Media" instructions.*/
-#define HAS_MEDIA 0x02
-/*ARMv7 optional NEON instructions.*/
-#define HAS_NEON 0x04
+// Armv7-A optional Neon instructions, mandatory from Armv8.0-A.
+#define HAS_NEON (1 << 0)
+// Armv8.0-A optional CRC32 instructions, mandatory from Armv8.1-A.
+#define HAS_ARM_CRC32 (1 << 1)
+// Armv8.2-A optional Neon dot-product instructions, mandatory from Armv8.4-A.
+#define HAS_NEON_DOTPROD (1 << 2)
+// Armv8.2-A optional Neon i8mm instructions, mandatory from Armv8.6-A.
+#define HAS_NEON_I8MM (1 << 3)
+// Armv8.2-A optional SVE instructions, mandatory from Armv9.0-A.
+#define HAS_SVE (1 << 4)
int aom_arm_cpu_caps(void);
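
A hedged example of how a caller might dispatch on the new capability bits (pick_kernel and the kernel choices in the comments are hypothetical):

    #include "aom_ports/arm.h"

    static void pick_kernel(void) {
      const int caps = aom_arm_cpu_caps();
      if (caps & HAS_NEON_DOTPROD) {
        // prefer a *_neon_dotprod kernel
      } else if (caps & HAS_NEON) {
        // fall back to the baseline *_neon kernel
      }
    }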
diff --git a/aom_ports/arm_cpudetect.c b/aom_ports/arm_cpudetect.c
deleted file mode 100644
index 276ef6182..000000000
--- a/aom_ports/arm_cpudetect.c
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-#include <string.h>
-#include "aom_ports/arm.h"
-#include "config/aom_config.h"
-
-#ifdef WINAPI_FAMILY
-#include <winapifamily.h>
-#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
-#define getenv(x) NULL
-#endif
-#endif
-
-static int arm_cpu_env_flags(int *flags) {
- char *env;
- env = getenv("AOM_SIMD_CAPS");
- if (env && *env) {
- *flags = (int)strtol(env, NULL, 0);
- return 0;
- }
- *flags = 0;
- return -1;
-}
-
-static int arm_cpu_env_mask(void) {
- char *env;
- env = getenv("AOM_SIMD_CAPS_MASK");
- return env && *env ? (int)strtol(env, NULL, 0) : ~0;
-}
-
-#if !CONFIG_RUNTIME_CPU_DETECT || defined(__APPLE__)
-
-int aom_arm_cpu_caps(void) {
- /* This function should actually be a no-op. There is no way to adjust any of
- * these because the RTCD tables do not exist: the functions are called
- * statically */
- int flags;
- int mask;
- if (!arm_cpu_env_flags(&flags)) {
- return flags;
- }
- mask = arm_cpu_env_mask();
-#if HAVE_NEON
- flags |= HAS_NEON;
-#endif /* HAVE_NEON */
- return flags & mask;
-}
-
-#elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT || __APPLE__ */
-#if HAVE_NEON && !AOM_ARCH_AARCH64
-/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
-#undef WIN32_LEAN_AND_MEAN
-#define WIN32_LEAN_AND_MEAN
-#undef WIN32_EXTRA_LEAN
-#define WIN32_EXTRA_LEAN
-#include <windows.h>
-#endif // HAVE_NEON && !AOM_ARCH_AARCH64
-
-int aom_arm_cpu_caps(void) {
- int flags;
- int mask;
- if (!arm_cpu_env_flags(&flags)) {
- return flags;
- }
- mask = arm_cpu_env_mask();
-#if AOM_ARCH_AARCH64
- return HAS_NEON & mask;
-#else
-/* MSVC has no inline __asm support for ARM, but it does let you __emit
- * instructions via their assembled hex code.
- * All of these instructions should be essentially nops.
- */
-#if HAVE_NEON
- if (mask & HAS_NEON) {
- __try {
- /*VORR q0,q0,q0*/
- __emit(0xF2200150);
- flags |= HAS_NEON;
- } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
- /*Ignore exception.*/
- }
- }
-#endif /* HAVE_NEON */
- return flags & mask;
-#endif // AOM_ARCH_AARCH64
-}
-
-#elif defined(__ANDROID__) /* end _MSC_VER */
-#include <cpu-features.h>
-
-int aom_arm_cpu_caps(void) {
- int flags;
- int mask;
- uint64_t features;
- if (!arm_cpu_env_flags(&flags)) {
- return flags;
- }
- mask = arm_cpu_env_mask();
- features = android_getCpuFeatures();
-
-#if HAVE_NEON
- if (features & ANDROID_CPU_ARM_FEATURE_NEON) flags |= HAS_NEON;
-#endif /* HAVE_NEON */
- return flags & mask;
-}
-
-#elif defined(__linux__) /* end __ANDROID__ */
-
-#include <stdio.h>
-
-int aom_arm_cpu_caps(void) {
- FILE *fin;
- int flags;
- int mask;
- if (!arm_cpu_env_flags(&flags)) {
- return flags;
- }
- mask = arm_cpu_env_mask();
- /* Reading /proc/self/auxv would be easier, but that doesn't work reliably
- * on Android.
- * This also means that detection will fail in Scratchbox.
- */
- fin = fopen("/proc/cpuinfo", "r");
- if (fin != NULL) {
- /* 512 should be enough for anybody (it's even enough for all the flags
- * that x86 has accumulated... so far).
- */
- char buf[512];
- while (fgets(buf, 511, fin) != NULL) {
-#if HAVE_NEON
- if (memcmp(buf, "Features", 8) == 0) {
- char *p;
- p = strstr(buf, " neon");
- if (p != NULL && (p[5] == ' ' || p[5] == '\n')) {
- flags |= HAS_NEON;
- }
- }
-#endif /* HAVE_NEON */
- }
- fclose(fin);
- }
- return flags & mask;
-}
-#else /* end __linux__ */
-#error \
- "Runtime CPU detection selected, but no CPU detection method " \
-"available for your platform. Rerun cmake with -DCONFIG_RUNTIME_CPU_DETECT=0."
-#endif
diff --git a/aom_ports/arm_cpudetect.h b/aom_ports/arm_cpudetect.h
new file mode 100644
index 000000000..33c2d1bb6
--- /dev/null
+++ b/aom_ports/arm_cpudetect.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_ports/arm.h"
+#include "config/aom_config.h"
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if defined(_WIN32)
+#undef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#undef WIN32_EXTRA_LEAN
+#define WIN32_EXTRA_LEAN
+#include <windows.h>
+#endif
+
+#ifdef WINAPI_FAMILY
+#include <winapifamily.h>
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define getenv(x) NULL
+#endif
+#endif
+
+#if defined(__ANDROID__) && (__ANDROID_API__ < 18)
+#define ANDROID_USE_CPU_FEATURES_LIB 1
+// Use getauxval() when targeting (64-bit) Android with API level >= 18;
+// getauxval() has been supported since Android API level 18 (Android 4.3).
+// The first Android version with 64-bit support was Android 5.x (API
+// level 21).
+#include <cpu-features.h>
+#endif
+
+static bool arm_cpu_env_flags(int *flags) {
+ const char *env = getenv("AOM_SIMD_CAPS");
+ if (env && *env) {
+ *flags = (int)strtol(env, NULL, 0);
+ return true;
+ }
+ return false;
+}
+
+static int arm_cpu_env_mask(void) {
+ const char *env = getenv("AOM_SIMD_CAPS_MASK");
+ return env && *env ? (int)strtol(env, NULL, 0) : ~0;
+}
diff --git a/aom_ports/bitops.h b/aom_ports/bitops.h
index 3c5b992bd..7f4c165f5 100644
--- a/aom_ports/bitops.h
+++ b/aom_ports/bitops.h
@@ -13,7 +13,6 @@
#define AOM_AOM_PORTS_BITOPS_H_
#include <assert.h>
-#include <stdint.h>
#include "aom_ports/msvc.h"
#include "config/aom_config.h"
@@ -34,12 +33,8 @@ extern "C" {
// These versions of get_msb() are only valid when n != 0 because all
// of the optimized versions are undefined when n == 0:
-// get_byteswap64:
-// Returns the number (uint64_t) with byte-positions reversed
-// e.g. input 0x123456789ABCDEF0 returns 0xF0DEBC9A78563412
-
// GCC compiler: https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
-// MSVC: https://learn.microsoft.com/en-us/cpp/c-runtime-library/
+// MSVC: https://learn.microsoft.com/en-us/cpp/intrinsics/compiler-intrinsics
// use GNU builtins where available.
#if defined(__GNUC__) && \
@@ -48,10 +43,6 @@ static INLINE int get_msb(unsigned int n) {
assert(n != 0);
return 31 ^ __builtin_clz(n);
}
-
-static INLINE uint64_t get_byteswap64(uint64_t num) {
- return __builtin_bswap64(num);
-}
#elif defined(USE_MSC_INTRINSICS)
#pragma intrinsic(_BitScanReverse)
@@ -61,10 +52,6 @@ static INLINE int get_msb(unsigned int n) {
_BitScanReverse(&first_set_bit, n);
return first_set_bit;
}
-
-static INLINE uint64_t get_byteswap64(uint64_t num) {
- return _byteswap_uint64(num);
-}
#undef USE_MSC_INTRINSICS
#else
static INLINE int get_msb(unsigned int n) {
@@ -82,26 +69,6 @@ static INLINE int get_msb(unsigned int n) {
}
return log;
}
-
-static INLINE uint64_t get_byteswap64(uint64_t num) {
- uint64_t out = 0x00;
- uint64_t mask = 0xFF00000000000000;
- int bit_shift = 56; // 7 bytes
- // 4 ms bytes
- do {
- out |= (num & mask) >> bit_shift;
- mask >>= 8;
- bit_shift -= 16;
- } while (bit_shift >= 0);
- // 4 ls bytes
- bit_shift = 8; // 1 byte
- do {
- out |= (num & mask) << bit_shift;
- mask >>= 8;
- bit_shift += 16;
- } while (bit_shift <= 56);
- return out;
-}
#endif
#ifdef __cplusplus
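
All three get_msb() variants compute floor(log2(n)) for n != 0. In the GNU path, __builtin_clz(n) lies in [0, 31], so 31 ^ clz equals 31 - clz; worked values:

    // n = 1        -> clz = 31 -> get_msb = 0
    // n = 255      -> clz = 24 -> get_msb = 7
    // n = 1u << 31 -> clz = 0  -> get_msb = 31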
diff --git a/aom_ports/mem.h b/aom_ports/mem.h
index e39684202..a70ce825b 100644
--- a/aom_ports/mem.h
+++ b/aom_ports/mem.h
@@ -24,16 +24,6 @@
#define DECLARE_ALIGNED(n, typ, val) typ val
#endif
-/* Indicates that the usage of the specified variable has been audited to assure
- * that it's safe to use uninitialized. Silences 'may be used uninitialized'
- * warnings on gcc.
- */
-#if defined(__GNUC__) && __GNUC__
-#define UNINITIALIZED_IS_SAFE(x) x = x
-#else
-#define UNINITIALIZED_IS_SAFE(x) x
-#endif
-
#if HAVE_NEON && defined(_MSC_VER)
#define __builtin_prefetch(x)
#endif
diff --git a/aom_scale/generic/yv12config.c b/aom_scale/generic/yv12config.c
index 82376f4df..94b400b9e 100644
--- a/aom_scale/generic/yv12config.c
+++ b/aom_scale/generic/yv12config.c
@@ -193,7 +193,9 @@ static int realloc_frame_buffer_aligned(
if (num_pyramid_levels > 0) {
ybf->y_pyramid = aom_alloc_pyramid(width, height, num_pyramid_levels,
use_highbitdepth);
+ if (!ybf->y_pyramid) return AOM_CODEC_MEM_ERROR;
ybf->corners = av1_alloc_corner_list();
+ if (!ybf->corners) return AOM_CODEC_MEM_ERROR;
}
#endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
diff --git a/aom_util/aom_thread.c b/aom_util/aom_thread.c
index 2c62b24ae..fa3b0a25e 100644
--- a/aom_util/aom_thread.c
+++ b/aom_util/aom_thread.c
@@ -24,6 +24,7 @@
#include <string.h> // for memset()
#include "aom_mem/aom_mem.h"
+#include "aom_ports/sanitizer.h"
#include "aom_util/aom_thread.h"
#if CONFIG_MULTITHREAD
@@ -144,11 +145,30 @@ static int reset(AVxWorker *const worker) {
pthread_mutex_destroy(&worker->impl_->mutex_);
goto Error;
}
+ pthread_attr_t attr;
+ if (pthread_attr_init(&attr)) goto Error2;
+ // Debug ASan builds require at least ~1 MiB of stack; this prevents
+ // failures on macOS arm64, where the default is 512 KiB.
+ // See: https://crbug.com/aomedia/3379
+#if defined(AOM_ADDRESS_SANITIZER) && defined(__APPLE__) && AOM_ARCH_ARM && \
+ !defined(NDEBUG)
+ size_t stacksize;
+ if (!pthread_attr_getstacksize(&attr, &stacksize)) {
+ const size_t kMinStackSize = 1 << 20; // 1 MiB
+ if (stacksize < kMinStackSize &&
+ pthread_attr_setstacksize(&attr, kMinStackSize)) {
+ pthread_attr_destroy(&attr);
+ goto Error2;
+ }
+ }
+#endif
pthread_mutex_lock(&worker->impl_->mutex_);
- ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker);
+ ok = !pthread_create(&worker->impl_->thread_, &attr, thread_loop, worker);
if (ok) worker->status_ = OK;
pthread_mutex_unlock(&worker->impl_->mutex_);
+ pthread_attr_destroy(&attr);
if (!ok) {
+ Error2:
pthread_mutex_destroy(&worker->impl_->mutex_);
pthread_cond_destroy(&worker->impl_->condition_);
Error:
diff --git a/aom_util/aom_thread.h b/aom_util/aom_thread.h
index 2df190f90..ec2ea4349 100644
--- a/aom_util/aom_thread.h
+++ b/aom_util/aom_thread.h
@@ -37,6 +37,7 @@ extern "C" {
#include <process.h> // NOLINT
#include <windows.h> // NOLINT
typedef HANDLE pthread_t;
+typedef int pthread_attr_t;
typedef CRITICAL_SECTION pthread_mutex_t;
#if _WIN32_WINNT < 0x0600
@@ -60,7 +61,18 @@ typedef CONDITION_VARIABLE pthread_cond_t;
#define THREADFN unsigned int __stdcall
#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
-static INLINE int pthread_create(pthread_t *const thread, const void *attr,
+static INLINE int pthread_attr_init(pthread_attr_t *attr) {
+ (void)attr;
+ return 0;
+}
+
+static INLINE int pthread_attr_destroy(pthread_attr_t *attr) {
+ (void)attr;
+ return 0;
+}
+
+static INLINE int pthread_create(pthread_t *const thread,
+ const pthread_attr_t *attr,
unsigned int(__stdcall *start)(void *),
void *arg) {
(void)attr;
diff --git a/apps/aomenc.c b/apps/aomenc.c
index 09306f2cd..c3f5c33e6 100644
--- a/apps/aomenc.c
+++ b/apps/aomenc.c
@@ -74,7 +74,10 @@ static AOM_TOOLS_FORMAT_PRINTF(3, 0) void warn_or_exit_on_errorv(
if (detail) fprintf(stderr, " %s\n", detail);
- if (fatal) exit(EXIT_FAILURE);
+ if (fatal) {
+ aom_codec_destroy(ctx);
+ exit(EXIT_FAILURE);
+ }
}
}
diff --git a/av1/arg_defs.c b/av1/arg_defs.c
index 35a2ab4cf..057565411 100644
--- a/av1/arg_defs.c
+++ b/av1/arg_defs.c
@@ -303,7 +303,7 @@ const av1_codec_arg_definitions_t g_av1_codec_arg_defs = {
ARG_DEF(NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)"),
#if CONFIG_AV1_ENCODER
.cpu_used_av1 = ARG_DEF(NULL, "cpu-used", 1,
- "Speed setting (0..6 in good mode, 5..10 in realtime "
+ "Speed setting (0..6 in good mode, 5..11 in realtime "
"mode, 0..9 in all intra mode)"),
.rowmtarg =
ARG_DEF(NULL, "row-mt", 1,
diff --git a/av1/arg_defs.h b/av1/arg_defs.h
index b9d0cfe46..73c78caec 100644
--- a/av1/arg_defs.h
+++ b/av1/arg_defs.h
@@ -21,7 +21,6 @@ extern "C" {
#include "common/webmenc.h"
#endif
#include "aom/aomcx.h"
-#include "aom_dsp/flow_estimation/flow_estimation.h"
enum TestDecodeFatality {
TEST_DECODE_OFF,
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 43b7665bb..1bb0539fc 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -267,7 +267,6 @@ list(APPEND AOM_AV1_COMMON_INTRIN_SSE2
"${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c"
"${AOM_ROOT}/av1/common/x86/convolve_sse2.c"
"${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c"
- "${AOM_ROOT}/av1/common/x86/warp_plane_sse2.c"
"${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c")
list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3
@@ -319,7 +318,8 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2
"${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse2.c"
"${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c")
-list(APPEND AOM_AV1_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/av1/encoder/x86/ml_sse3.c")
+list(APPEND AOM_AV1_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/av1/encoder/x86/ml_sse3.c"
+ "${AOM_ROOT}/av1/encoder/x86/ml_sse3.h")
list(APPEND AOM_AV1_ENCODER_INTRIN_SSSE3
"${AOM_ROOT}/av1/encoder/x86/reconinter_enc_ssse3.c")
@@ -347,26 +347,30 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
"${AOM_ROOT}/av1/encoder/x86/av1_k_means_avx2.c"
"${AOM_ROOT}/av1/encoder/x86/temporal_filter_avx2.c"
"${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c"
- "${AOM_ROOT}/av1/encoder/x86/cnn_avx2.c")
+ "${AOM_ROOT}/av1/encoder/x86/cnn_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/ml_avx2.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
- "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c"
- "${AOM_ROOT}/av1/encoder/arm/neon/av1_highbd_quantize_neon.c"
- "${AOM_ROOT}/av1/encoder/arm/neon/ml_neon.c"
- "${AOM_ROOT}/av1/encoder/arm/neon/picksrt_neon.c"
- "${AOM_ROOT}/av1/encoder/arm/neon/rdopt_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/av1_error_neon.c"
- "${AOM_ROOT}/av1/encoder/arm/neon/encodetxb_neon.c"
- "${AOM_ROOT}/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c"
- "${AOM_ROOT}/av1/encoder/arm/neon/av1_k_means_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/av1_highbd_quantize_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/av1_k_means_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/encodetxb_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c"
- "${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/ml_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/pickrst_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/rdopt_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/reconinter_enc_neon.c"
- "${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon.c")
+ "${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_neon.c")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD
+ "${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_ARM_CRC32
- "${AOM_ROOT}/av1/encoder/arm/crc32/hash_crc32.c")
+ "${AOM_ROOT}/av1/encoder/arm/crc32/hash_arm_crc32.c")
list(APPEND AOM_AV1_COMMON_INTRIN_NEON
"${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c"
@@ -376,10 +380,10 @@ list(APPEND AOM_AV1_COMMON_INTRIN_NEON
"${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c"
"${AOM_ROOT}/av1/common/arm/cdef_block_neon.c"
"${AOM_ROOT}/av1/common/arm/cfl_neon.c"
+ "${AOM_ROOT}/av1/common/arm/compound_convolve_neon.c"
"${AOM_ROOT}/av1/common/arm/convolve_neon.c"
"${AOM_ROOT}/av1/common/arm/convolve_neon.h"
"${AOM_ROOT}/av1/common/arm/highbd_inv_txfm_neon.c"
- "${AOM_ROOT}/av1/common/arm/jnt_convolve_neon.c"
"${AOM_ROOT}/av1/common/arm/reconinter_neon.c"
"${AOM_ROOT}/av1/common/arm/reconintra_neon.c"
"${AOM_ROOT}/av1/common/arm/resize_neon.c"
@@ -387,6 +391,18 @@ list(APPEND AOM_AV1_COMMON_INTRIN_NEON
"${AOM_ROOT}/av1/common/arm/warp_plane_neon.c"
"${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c")
+list(APPEND AOM_AV1_COMMON_INTRIN_NEON_DOTPROD
+ "${AOM_ROOT}/av1/common/arm/compound_convolve_neon_dotprod.c"
+ "${AOM_ROOT}/av1/common/arm/convolve_neon_dotprod.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_NEON_I8MM
+ "${AOM_ROOT}/av1/common/arm/compound_convolve_neon_i8mm.c"
+ "${AOM_ROOT}/av1/common/arm/convolve_neon_i8mm.c"
+ "${AOM_ROOT}/av1/common/arm/warp_plane_neon_i8mm.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_SVE
+ "${AOM_ROOT}/av1/common/arm/warp_plane_sve.c")
+
list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2
"${AOM_ROOT}/av1/encoder/x86/hash_sse42.c")
@@ -446,7 +462,14 @@ if(CONFIG_AV1_HIGHBITDEPTH)
"${AOM_ROOT}/av1/common/x86/highbd_warp_affine_avx2.c")
list(APPEND AOM_AV1_COMMON_INTRIN_NEON
- "${AOM_ROOT}/av1/common/arm/highbd_convolve_neon.c")
+ "${AOM_ROOT}/av1/common/arm/highbd_compound_convolve_neon.c"
+ "${AOM_ROOT}/av1/common/arm/highbd_convolve_horiz_rs_neon.c"
+ "${AOM_ROOT}/av1/common/arm/highbd_convolve_neon.c"
+ "${AOM_ROOT}/av1/common/arm/highbd_convolve_scale_neon.c"
+ "${AOM_ROOT}/av1/common/arm/highbd_reconinter_neon.c"
+ "${AOM_ROOT}/av1/common/arm/highbd_reconintra_neon.c"
+ "${AOM_ROOT}/av1/common/arm/highbd_warp_plane_neon.c"
+ "${AOM_ROOT}/av1/common/arm/highbd_wiener_convolve_neon.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2
"${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c"
@@ -459,6 +482,11 @@ if(CONFIG_AV1_HIGHBITDEPTH)
"${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c"
"${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_avx2.c"
"${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_avx2.c")
+
+ list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
+ "${AOM_ROOT}/av1/encoder/arm/neon/highbd_pickrst_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/highbd_rdopt_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/highbd_temporal_filter_neon.c")
endif()
if(CONFIG_ACCOUNTING)
@@ -623,31 +651,45 @@ function(setup_av1_targets)
endif()
if(HAVE_NEON)
- if(AOM_AV1_COMMON_INTRIN_NEON)
+ add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
+ "aom_av1_common" "AOM_AV1_COMMON_INTRIN_NEON")
+ if(CONFIG_AV1_ENCODER)
add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
- "aom_av1_common"
- "AOM_AV1_COMMON_INTRIN_NEON")
+ "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_NEON")
endif()
+ endif()
+ if(HAVE_ARM_CRC32)
if(CONFIG_AV1_ENCODER)
- if(AOM_AV1_ENCODER_INTRIN_NEON)
- add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
- "aom_av1_encoder"
- "AOM_AV1_ENCODER_INTRIN_NEON")
- endif()
+ add_intrinsics_object_library("${AOM_ARM_CRC32_FLAG}" "arm_crc32"
+ "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_ARM_CRC32")
endif()
+ endif()
- if(HAVE_ARM_CRC32)
- if(CONFIG_AV1_ENCODER)
- if(AOM_AV1_ENCODER_INTRIN_ARM_CRC32)
- add_intrinsics_object_library("${AOM_ARM_CRC32_FLAG}" "crc32"
- "aom_av1_encoder"
- "AOM_AV1_ENCODER_INTRIN_ARM_CRC32")
- endif()
- endif()
+ if(HAVE_NEON_DOTPROD)
+ add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod"
+ "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_NEON_DOTPROD")
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod"
+ "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD")
endif()
endif()
+ if(HAVE_NEON_I8MM)
+ add_intrinsics_object_library("${AOM_NEON_I8MM_FLAG}" "neon_i8mm"
+ "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_NEON_I8MM")
+ endif()
+
+ if(HAVE_SVE)
+ add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_SVE")
+ endif()
+
if(HAVE_VSX)
if(AOM_AV1_COMMON_INTRIN_VSX)
add_intrinsics_object_library("-mvsx -maltivec" "vsx" "aom_av1_common"
diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index 182a09816..1175a32ef 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -8,6 +8,7 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include <limits.h>
#include <stdlib.h>
#include <string.h>
@@ -543,6 +544,7 @@ struct aom_codec_alg_priv {
// Number of stats buffers required for look ahead
int num_lap_buffers;
STATS_BUFFER_CTX stats_buf_context;
+ bool monochrome_on_init;
};
static INLINE int gcd(int64_t a, int b) {
@@ -644,6 +646,18 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
if (cfg->g_forced_max_frame_height) {
RANGE_CHECK_HI(cfg, g_h, cfg->g_forced_max_frame_height);
}
+ // To avoid integer overflows when multiplying width by height (or values
+ // derived from width and height) using the int type, impose a maximum frame
+ // area (width * height) of 2^30.
+ const unsigned int max_frame_width =
+ cfg->g_forced_max_frame_width ? cfg->g_forced_max_frame_width : cfg->g_w;
+ const unsigned int max_frame_height = cfg->g_forced_max_frame_height
+ ? cfg->g_forced_max_frame_height
+ : cfg->g_h;
+ const int64_t max_frame_area = (int64_t)max_frame_width * max_frame_height;
+ if (max_frame_area > (1 << 30)) {
+ ERROR("max_frame_area out of range [..2^30]");
+ }
RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000);
RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den);
RANGE_CHECK_HI(cfg, g_profile, MAX_PROFILES - 1);
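
Worked numbers for the new area cap, assuming a 32-bit int (maximum 2^31 - 1): keeping width * height at or below 2^30 leaves headroom for small multiples of the area in int expressions.

    // 32768 * 32768 = 1073741824 = 2^30      -> accepted, exactly at the limit
    // 46341 * 46341 = 2147488281 > 2^31 - 1  -> would overflow int; rejected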
@@ -668,11 +682,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
RANGE_CHECK(cfg, kf_mode, AOM_KF_DISABLED, AOM_KF_AUTO);
RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100);
RANGE_CHECK(cfg, g_pass, AOM_RC_ONE_PASS, AOM_RC_THIRD_PASS);
- if (cfg->g_pass == AOM_RC_ONE_PASS) {
- RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_TOTAL_BUFFERS);
- } else {
- RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
- }
+ RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
if (cfg->g_usage == AOM_USAGE_ALL_INTRA) {
RANGE_CHECK_HI(cfg, g_lag_in_frames, 0);
RANGE_CHECK_HI(cfg, kf_max_dist, 0);
@@ -1489,20 +1499,33 @@ AV1EncoderConfig av1_get_encoder_config(const aom_codec_enc_cfg_t *cfg) {
static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx,
const aom_codec_enc_cfg_t *cfg) {
- InitialDimensions *const initial_dimensions =
- &ctx->ppi->cpi->initial_dimensions;
aom_codec_err_t res;
int force_key = 0;
if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) {
if (cfg->g_lag_in_frames > 1 || cfg->g_pass != AOM_RC_ONE_PASS)
ERROR("Cannot change width or height after initialization");
- if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h) ||
- (initial_dimensions->width &&
- (int)cfg->g_w > initial_dimensions->width) ||
- (initial_dimensions->height &&
- (int)cfg->g_h > initial_dimensions->height))
+ // Note: encoder_set_config() may be called multiple times. However, when
+ // the original frame width or height is less than two times the new frame
+ // width or height, a forced key frame should be used. To ensure correct
+ // detection of a forced key frame, the frame width and height are updated
+ // only when the actual encoding is performed. cpi->last_coded_width and
+ // cpi->last_coded_height track the actual coded frame size.
+ if (ctx->ppi->cpi->last_coded_width && ctx->ppi->cpi->last_coded_height &&
+ (!valid_ref_frame_size(ctx->ppi->cpi->last_coded_width,
+ ctx->ppi->cpi->last_coded_height, cfg->g_w,
+ cfg->g_h) ||
+ ((int)cfg->g_w > ctx->ppi->cpi->last_coded_width) ||
+ ((int)cfg->g_h > ctx->ppi->cpi->last_coded_height))) {
force_key = 1;
+ }
+ }
+
+ if (ctx->monochrome_on_init && cfg->monochrome == 0) {
+ // TODO(aomedia:3465): Allow this case to work without requiring re-init
+ // of encoder.
+ ERROR("Cannot change to monochrome = 0 after init with monochrome");
}
// Prevent increasing lag_in_frames. This check is stricter than it needs
@@ -2569,6 +2592,17 @@ static aom_codec_err_t ctrl_set_bitrate_one_pass_cbr(aom_codec_alg_priv_t *ctx,
return AOM_CODEC_OK;
}
+static aom_codec_err_t ctrl_set_max_consec_frame_drop_cbr(
+ aom_codec_alg_priv_t *ctx, va_list args) {
+ AV1_PRIMARY *const ppi = ctx->ppi;
+ AV1_COMP *const cpi = ppi->cpi;
+ const int max_consec_drop = CAST(AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, args);
+ if (max_consec_drop < 0) return AOM_CODEC_INVALID_PARAM;
+ cpi->rc.max_consec_drop = max_consec_drop;
+ cpi->rc.drop_count_consec = 0;
+ return AOM_CODEC_OK;
+}
+
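
A hedged caller-side sketch of the new control; the codec context is assumed to be an encoder already initialized for CBR rate control:

    aom_codec_ctx_t codec;  // assumed initialized with aom_codec_enc_init()
    // Allow at most 8 consecutive dropped frames; a negative value is
    // rejected with AOM_CODEC_INVALID_PARAM (see the handler above).
    if (aom_codec_control(&codec, AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, 8) !=
        AOM_CODEC_OK) {
      // handle the error
    }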
#if !CONFIG_REALTIME_ONLY
aom_codec_err_t av1_create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer,
STATS_BUFFER_CTX *stats_buf_context,
@@ -2722,6 +2756,8 @@ static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx) {
priv->oxcf.use_highbitdepth =
(ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
+ priv->monochrome_on_init = priv->cfg.monochrome;
+
priv->ppi = av1_create_primary_compressor(&priv->pkt_list.head,
*num_lap_buffers, &priv->oxcf);
if (!priv->ppi) return AOM_CODEC_MEM_ERROR;
@@ -2895,6 +2931,9 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
AV1_COMP *cpi_lap = ppi->cpi_lap;
if (ppi->cpi == NULL) return AOM_CODEC_INVALID_PARAM;
+ ppi->cpi->last_coded_width = ppi->cpi->oxcf.frm_dim_cfg.width;
+ ppi->cpi->last_coded_height = ppi->cpi->oxcf.frm_dim_cfg.height;
+
if (ppi->lap_enabled && cpi_lap == NULL &&
ppi->cpi->oxcf.pass == AOM_RC_ONE_PASS)
return AOM_CODEC_INVALID_PARAM;
@@ -3094,12 +3133,25 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
av1_compute_num_workers_for_mt(cpi);
num_workers = av1_get_max_num_workers(cpi);
}
- if ((num_workers > 1) && (ppi->p_mt_info.num_workers == 0)) {
+ if (num_workers > 1 && ppi->p_mt_info.num_workers < num_workers) {
// Obtain the maximum no. of frames that can be supported in a parallel
// encode set.
if (is_stat_consumption_stage(cpi)) {
ppi->num_fp_contexts = av1_compute_num_fp_contexts(ppi, &cpi->oxcf);
}
+ if (ppi->p_mt_info.num_workers > 0) {
+ av1_terminate_workers(ppi);
+ free_thread_data(ppi);
+ aom_free(ppi->p_mt_info.tile_thr_data);
+ ppi->p_mt_info.tile_thr_data = NULL;
+ aom_free(ppi->p_mt_info.workers);
+ ppi->p_mt_info.workers = NULL;
+ ppi->p_mt_info.num_workers = 0;
+ for (int j = 0; j < ppi->num_fp_contexts; j++) {
+ aom_free(ppi->parallel_cpi[j]->td.tctx);
+ ppi->parallel_cpi[j]->td.tctx = NULL;
+ }
+ }
av1_create_workers(ppi, num_workers);
av1_init_tile_thread_data(ppi, cpi->oxcf.pass == AOM_RC_FIRST_PASS);
}
@@ -3144,7 +3196,8 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
const int status = av1_get_compressed_data(cpi_lap, &cpi_lap_data);
if (status != -1) {
if (status != AOM_CODEC_OK) {
- aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL);
+ aom_internal_error(&ppi->error, cpi->common.error->error_code, "%s",
+ cpi->common.error->detail);
}
}
av1_post_encode_updates(cpi_lap, &cpi_lap_data);
@@ -3157,12 +3210,6 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
ppi->num_fp_contexts = av1_compute_num_fp_contexts(ppi, &cpi->oxcf);
}
- // Reset gf_frame_index in case it reaches MAX_STATIC_GF_GROUP_LENGTH for
- // real time encoding.
- if (is_one_pass_rt_params(cpi) &&
- cpi->gf_frame_index == MAX_STATIC_GF_GROUP_LENGTH)
- cpi->gf_frame_index = 0;
-
// Get the next visible frame. Invisible frames get packed with the next
// visible frame.
while (cpi_data.cx_data_sz >= ctx->cx_data_sz / 2 && !is_frame_visible) {
@@ -3200,7 +3247,8 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
}
if (status == -1) break;
if (status != AOM_CODEC_OK) {
- aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL);
+ aom_internal_error(&ppi->error, cpi->common.error->error_code, "%s",
+ cpi->common.error->detail);
}
if (ppi->num_fp_contexts > 0 && frame_is_intra_only(&cpi->common)) {
av1_init_sc_decisions(ppi);
@@ -3557,7 +3605,12 @@ static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx,
lc->min_q = params->min_quantizers[layer];
lc->scaling_factor_num = params->scaling_factor_num[sl];
lc->scaling_factor_den = params->scaling_factor_den[sl];
- lc->layer_target_bitrate = 1000 * params->layer_target_bitrate[layer];
+ const int layer_target_bitrate = params->layer_target_bitrate[layer];
+ if (layer_target_bitrate > INT_MAX / 1000) {
+ lc->layer_target_bitrate = INT_MAX;
+ } else {
+ lc->layer_target_bitrate = 1000 * layer_target_bitrate;
+ }
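+    // Note: INT_MAX / 1000 is 2147483, so layer bitrates above 2147483 kbps
+    // (~2.1 Gbps) would overflow the 32-bit multiply; the clamp saturates
+    // them to INT_MAX instead.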
lc->framerate_factor = params->framerate_factor[tl];
if (tl == ppi->number_temporal_layers - 1)
target_bandwidth += lc->layer_target_bitrate;
@@ -4344,6 +4397,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ AV1E_SET_RTC_EXTERNAL_RC, ctrl_set_rtc_external_rc },
{ AV1E_SET_QUANTIZER_ONE_PASS, ctrl_set_quantizer_one_pass },
{ AV1E_SET_BITRATE_ONE_PASS_CBR, ctrl_set_bitrate_one_pass_cbr },
+ { AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, ctrl_set_max_consec_frame_drop_cbr },
// Getters
{ AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer },
diff --git a/av1/av1_dx_iface.c b/av1/av1_dx_iface.c
index a1e75589d..29c63e24b 100644
--- a/av1/av1_dx_iface.c
+++ b/av1/av1_dx_iface.c
@@ -144,6 +144,7 @@ static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) {
aom_free(ctx->frame_worker);
aom_free(ctx->buffer_pool);
+ assert(!ctx->img.self_allocd);
aom_img_free(&ctx->img);
aom_free(ctx);
return AOM_CODEC_OK;
diff --git a/av1/common/alloccommon.c b/av1/common/alloccommon.c
index 6e95f704a..5e6ffc99b 100644
--- a/av1/common/alloccommon.c
+++ b/av1/common/alloccommon.c
@@ -288,11 +288,9 @@ void av1_alloc_cdef_buffers(AV1_COMMON *const cm,
cdef_info->allocated_mi_rows);
}
-// Assumes cm->rst_info[p].restoration_unit_size is already initialized
+// Allocate buffers which are independent of restoration_unit_size
void av1_alloc_restoration_buffers(AV1_COMMON *cm, bool is_sgr_enabled) {
const int num_planes = av1_num_planes(cm);
- for (int p = 0; p < num_planes; ++p)
- av1_alloc_restoration_struct(cm, &cm->rst_info[p], p > 0);
if (cm->rst_tmpbuf == NULL && is_sgr_enabled) {
CHECK_MEM_ERROR(cm, cm->rst_tmpbuf,
@@ -303,21 +301,13 @@ void av1_alloc_restoration_buffers(AV1_COMMON *cm, bool is_sgr_enabled) {
CHECK_MEM_ERROR(cm, cm->rlbs, aom_malloc(sizeof(RestorationLineBuffers)));
}
- // For striped loop restoration, we divide each row of tiles into "stripes",
+ // For striped loop restoration, we divide each plane into "stripes",
   // of height 64 luma pixels but offset by RESTORATION_UNIT_OFFSET
// luma pixels to match the output from CDEF. We will need to store 2 *
- // RESTORATION_CTX_VERT lines of data for each stripe, and also need to be
- // able to quickly answer the question "Where is the <n>'th stripe for tile
- // row <m>?" To make that efficient, we generate the rst_last_stripe array.
- int num_stripes = 0;
- for (int i = 0; i < cm->tiles.rows; ++i) {
- TileInfo tile_info;
- av1_tile_set_row(&tile_info, cm, i);
- const int mi_h = tile_info.mi_row_end - tile_info.mi_row_start;
- const int ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2);
- const int tile_stripes = (ext_h + 63) / 64;
- num_stripes += tile_stripes;
- }
+ // RESTORATION_CTX_VERT lines of data for each stripe.
+  const int mi_h = cm->mi_params.mi_rows;
+ const int ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2);
+ const int num_stripes = (ext_h + 63) / 64;
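+  // e.g. for a 1080p frame, mi_rows = 270, so with RESTORATION_UNIT_OFFSET
+  // = 8 and MI_SIZE_LOG2 = 2: ext_h = 8 + (270 << 2) = 1088, giving
+  // num_stripes = (1088 + 63) / 64 = 17.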
// Now we need to allocate enough space to store the line buffers for the
// stripes
diff --git a/av1/common/arm/av1_inv_txfm_neon.c b/av1/common/arm/av1_inv_txfm_neon.c
index 8afcd1fb5..09e5166b1 100644
--- a/av1/common/arm/av1_inv_txfm_neon.c
+++ b/av1/common/arm/av1_inv_txfm_neon.c
@@ -3606,7 +3606,7 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
identity_txfm_round_neon(cur_a, cur_a, txw_idx, buf_size_nonzero_w,
-shift[0]);
for (int j = 0; j < buf_size_w_div8; ++j) {
- transpose_s16_8x8q(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
+ transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
}
temp_b += 8;
}
@@ -3665,14 +3665,14 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
if (lr_flip == 1) {
for (int j = 0; j < buf_size_w_div8; ++j) {
flip_buf_ud_neon(&cur_a[j * 8], 8);
- transpose_s16_8x8q(
+ transpose_arrays_s16_8x8(
&cur_a[j * 8],
&b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
}
temp_b += 8;
} else {
for (int j = 0; j < buf_size_w_div8; ++j) {
- transpose_s16_8x8q(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
+ transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
}
temp_b += 8;
}
@@ -3730,7 +3730,7 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
identity_txfm_round_neon(cur_a, cur_a, txw_idx, buf_size_nonzero_w,
-shift[0]);
for (int j = 0; j < buf_size_w_div8; ++j) {
- transpose_s16_8x8q(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
+ transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
}
temp_b += 8;
}
@@ -3768,7 +3768,7 @@ static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
int32_t *buf = temp_out + buf_offset;
int32_t *buf_ptr = buf;
const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16 };
- int r, bd = 8;
+ int r;
const transform_1d_neon row_txfm =
lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
const transform_1d_neon col_txfm =
@@ -3795,20 +3795,20 @@ static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
for (r = 0; r < txfm_size_row; ++r)
temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
}
- clamp_buf(temp_in, txfm_size_row, bd + 8);
+ clamp_buf(temp_in, txfm_size_row, 16);
col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
if (ud_flip == 0) {
for (r = 0; r < txfm_size_row; ++r) {
output[r * stride + c] =
- highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ clip_pixel(output[r * stride + c] + temp_out[r]);
}
} else {
// flip upside down
for (r = 0; r < txfm_size_row; ++r) {
- output[r * stride + c] = highbd_clip_pixel_add(
- output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ output[r * stride + c] = clip_pixel(output[r * stride + c] +
+ temp_out[txfm_size_row - r - 1]);
}
}
}
@@ -3832,7 +3832,7 @@ void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
int32_t *buf_ptr = buf;
const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16,
16, 16, 16, 16 };
- int r, bd = 8;
+ int r;
const transform_1d_neon row_txfm =
lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
const transform_1d_neon col_txfm =
@@ -3860,20 +3860,20 @@ void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
for (r = 0; r < txfm_size_row; ++r)
temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
}
- clamp_buf(temp_in, txfm_size_row, bd + 8);
+ clamp_buf(temp_in, txfm_size_row, 16);
col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
if (ud_flip == 0) {
for (r = 0; r < txfm_size_row; ++r) {
output[r * stride + c] =
- highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ clip_pixel(output[r * stride + c] + temp_out[r]);
}
} else {
// flip upside down
for (r = 0; r < txfm_size_row; ++r) {
- output[r * stride + c] = highbd_clip_pixel_add(
- output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ output[r * stride + c] = clip_pixel(output[r * stride + c] +
+ temp_out[txfm_size_row - r - 1]);
}
}
}
@@ -3897,7 +3897,7 @@ void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
int32_t *buf_ptr = buf;
const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16,
16, 16, 16, 16 };
- int r, bd = 8;
+ int r;
const transform_1d_neon row_txfm =
lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
const transform_1d_neon col_txfm =
@@ -3925,20 +3925,20 @@ void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
for (r = 0; r < txfm_size_row; ++r)
temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
}
- clamp_buf(temp_in, txfm_size_row, bd + 8);
+ clamp_buf(temp_in, txfm_size_row, 16);
col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
if (ud_flip == 0) {
for (r = 0; r < txfm_size_row; ++r) {
output[r * stride + c] =
- highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ clip_pixel(output[r * stride + c] + temp_out[r]);
}
} else {
// flip upside down
for (r = 0; r < txfm_size_row; ++r) {
- output[r * stride + c] = highbd_clip_pixel_add(
- output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ output[r * stride + c] = clip_pixel(output[r * stride + c] +
+ temp_out[txfm_size_row - r - 1]);
}
}
}
@@ -3962,7 +3962,7 @@ void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
int32_t *buf_ptr = buf;
const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16,
16, 16, 16, 16, 16 };
- int r, bd = 8;
+ int r;
const transform_1d_neon row_txfm =
lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
const transform_1d_neon col_txfm =
@@ -3989,20 +3989,20 @@ void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
for (r = 0; r < txfm_size_row; ++r)
temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
}
- clamp_buf(temp_in, txfm_size_row, bd + 8);
+ clamp_buf(temp_in, txfm_size_row, 16);
col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
if (ud_flip == 0) {
for (r = 0; r < txfm_size_row; ++r) {
output[r * stride + c] =
- highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ clip_pixel(output[r * stride + c] + temp_out[r]);
}
} else {
// flip upside down
for (r = 0; r < txfm_size_row; ++r) {
- output[r * stride + c] = highbd_clip_pixel_add(
- output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ output[r * stride + c] = clip_pixel(output[r * stride + c] +
+ temp_out[txfm_size_row - r - 1]);
}
}
}
@@ -4026,7 +4026,7 @@ void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
int32_t *buf_ptr = buf;
const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16,
16, 16, 16, 16, 16 };
- int r, bd = 8;
+ int r;
const transform_1d_neon row_txfm =
lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
const transform_1d_neon col_txfm =
@@ -4053,20 +4053,20 @@ void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
for (r = 0; r < txfm_size_row; ++r)
temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
}
- clamp_buf(temp_in, txfm_size_row, bd + 8);
+ clamp_buf(temp_in, txfm_size_row, 16);
col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
if (ud_flip == 0) {
for (r = 0; r < txfm_size_row; ++r) {
output[r * stride + c] =
- highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ clip_pixel(output[r * stride + c] + temp_out[r]);
}
} else {
// flip upside down
for (r = 0; r < txfm_size_row; ++r) {
- output[r * stride + c] = highbd_clip_pixel_add(
- output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ output[r * stride + c] = clip_pixel(output[r * stride + c] +
+ temp_out[txfm_size_row - r - 1]);
}
}
}
@@ -4116,14 +4116,14 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
if (lr_flip == 1) {
for (int j = 0; j < buf_size_w_div8; ++j) {
flip_buf_ud_neon(&cur_a[j * 8], 8);
- transpose_s16_8x8q(
+ transpose_arrays_s16_8x8(
&cur_a[j * 8],
&b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
}
temp_b += 8;
} else {
for (int j = 0; j < buf_size_w_div8; ++j) {
- transpose_s16_8x8q(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
+ transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
}
temp_b += 8;
}
diff --git a/av1/common/arm/blend_a64_hmask_neon.c b/av1/common/arm/blend_a64_hmask_neon.c
index baad3287d..22d297721 100644
--- a/av1/common/arm/blend_a64_hmask_neon.c
+++ b/av1/common/arm/blend_a64_hmask_neon.c
@@ -13,12 +13,12 @@
#include <arm_neon.h>
#include <assert.h>
+#include "config/aom_dsp_rtcd.h"
+
#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/blend.h"
+#include "aom_dsp/arm/blend_neon.h"
#include "aom_dsp/arm/mem_neon.h"
-#include "aom_ports/mem.h"
-#include "config/aom_dsp_rtcd.h"
void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride,
const uint8_t *src0, uint32_t src0_stride,
@@ -31,94 +31,72 @@ void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride,
assert(w >= 2);
assert(IS_POWER_OF_TWO(h));
assert(IS_POWER_OF_TWO(w));
- uint8x8_t tmp0, tmp1;
- uint8x16_t res_q;
- uint16x8_t res, res_low, res_high;
- const uint8x8_t vdup_64 = vdup_n_u8((uint8_t)64);
-
- if (w >= 16) {
- const uint8x16_t vdup_64_q = vdupq_n_u8((uint8_t)64);
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 16) {
- __builtin_prefetch(src0);
- __builtin_prefetch(src1);
- const uint8x16_t tmp0_q = vld1q_u8(src0);
- const uint8x16_t tmp1_q = vld1q_u8(src1);
- const uint8x16_t m_q = vld1q_u8(mask);
- const uint8x16_t max_minus_m_q = vsubq_u8(vdup_64_q, m_q);
- res_low = vmull_u8(vget_low_u8(m_q), vget_low_u8(tmp0_q));
- res_low =
- vmlal_u8(res_low, vget_low_u8(max_minus_m_q), vget_low_u8(tmp1_q));
- res_high = vmull_u8(vget_high_u8(m_q), vget_high_u8(tmp0_q));
- res_high = vmlal_u8(res_high, vget_high_u8(max_minus_m_q),
- vget_high_u8(tmp1_q));
- res_q = vcombine_u8(vrshrn_n_u16(res_low, AOM_BLEND_A64_ROUND_BITS),
- vrshrn_n_u16(res_high, AOM_BLEND_A64_ROUND_BITS));
- vst1q_u8(dst, res_q);
- src0 += 16;
- src1 += 16;
- dst += 16;
- mask += 16;
- }
- src0 += src0_stride - w;
- src1 += src1_stride - w;
- dst += dst_stride - w;
- mask -= w;
- }
+
+ if (w > 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x16_t m0 = vld1q_u8(mask + i);
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+
+ i += 16;
+ } while (i < w);
+
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
} else if (w == 8) {
- const uint8x8_t m = vld1_u8(mask);
- const uint8x8_t max_minus_m = vsub_u8(vdup_64, m);
- for (int i = 0; i < h; ++i) {
- __builtin_prefetch(src0);
- __builtin_prefetch(src1);
- tmp0 = vld1_u8(src0);
- tmp1 = vld1_u8(src1);
- res = vmull_u8(m, tmp0);
- res = vmlal_u8(res, max_minus_m, tmp1);
- vst1_u8(dst, vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS));
+ const uint8x8_t m0 = vld1_u8(mask);
+ do {
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+ vst1_u8(dst, blend);
+
src0 += src0_stride;
src1 += src1_stride;
dst += dst_stride;
- }
+ } while (--h != 0);
} else if (w == 4) {
- assert(((uintptr_t)mask & 3) == 0);
- const uint8x8_t m = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)mask));
- const uint8x8_t max_minus_m = vsub_u8(vdup_64, m);
- for (int i = 0; i < h; i += 2) {
- __builtin_prefetch(src0 + 0 * src0_stride);
- __builtin_prefetch(src0 + 1 * src0_stride);
- __builtin_prefetch(src1 + 0 * src1_stride);
- __builtin_prefetch(src1 + 1 * src1_stride);
- tmp0 = load_unaligned_u8_4x2(src0, src0_stride);
- tmp1 = load_unaligned_u8_4x2(src1, src1_stride);
- res = vmull_u8(m, tmp0);
- res = vmlal_u8(res, max_minus_m, tmp1);
- const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
- store_unaligned_u8_4x1(dst + 0 * dst_stride, result, 0);
- store_unaligned_u8_4x1(dst + 1 * dst_stride, result, 1);
- src0 += (2 * src0_stride);
- src1 += (2 * src1_stride);
- dst += (2 * dst_stride);
- }
- } else if (w == 2) {
- assert(((uintptr_t)mask & 1) == 0);
- const uint8x8_t m = vreinterpret_u8_u16(vld1_dup_u16((uint16_t *)mask));
- const uint8x8_t max_minus_m = vsub_u8(vdup_64, m);
- for (int i = 0; i < h; i += 2) {
- __builtin_prefetch(src0 + 0 * src0_stride);
- __builtin_prefetch(src0 + 1 * src0_stride);
- __builtin_prefetch(src1 + 0 * src1_stride);
- __builtin_prefetch(src1 + 1 * src1_stride);
- tmp0 = load_unaligned_u8_2x2(src0, src0_stride);
- tmp1 = load_unaligned_u8_2x2(src1, src1_stride);
- res = vmull_u8(m, tmp0);
- res = vmlal_u8(res, max_minus_m, tmp1);
- const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
- store_unaligned_u8_2x1(dst + 0 * dst_stride, result, 0);
- store_unaligned_u8_2x1(dst + 1 * dst_stride, result, 1);
- src0 += (2 * src0_stride);
- src1 += (2 * src1_stride);
- dst += (2 * dst_stride);
- }
+ const uint8x8_t m0 = load_unaligned_dup_u8_4x2(mask);
+ do {
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+ store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 2 && h >= 16) {
+ const uint8x8_t m0 = vreinterpret_u8_u16(vld1_dup_u16((uint16_t *)mask));
+ do {
+ uint8x8_t s0 = load_unaligned_u8_2x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_2x2(src1, src1_stride);
+
+ uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+ store_unaligned_u8_2x2(dst, dst_stride, blend);
+
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else {
+ aom_blend_a64_hmask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, w, h);
}
}
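For reference, a scalar model of the blend that the alpha_blend_a64_u8x8/_u8x16 helpers implement (a sketch assuming the standard A64 blend constants: maximum alpha 64 and a 6-bit rounded shift):

    #include <stdint.h>

    // dst = round((m * s0 + (64 - m) * s1) / 64), with 0 <= m <= 64.
    static uint8_t blend_a64_scalar(uint8_t m, uint8_t s0, uint8_t s1) {
      const unsigned sum = (unsigned)m * s0 + (64u - m) * s1;
      return (uint8_t)((sum + 32) >> 6);
    }

The same weighting applies to the vmask variant below; only the direction in which the mask varies differs.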
diff --git a/av1/common/arm/blend_a64_vmask_neon.c b/av1/common/arm/blend_a64_vmask_neon.c
index c316977d8..d53d363fc 100644
--- a/av1/common/arm/blend_a64_vmask_neon.c
+++ b/av1/common/arm/blend_a64_vmask_neon.c
@@ -16,6 +16,7 @@
#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/blend.h"
+#include "aom_dsp/arm/blend_neon.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_ports/mem.h"
#include "config/aom_dsp_rtcd.h"
@@ -24,9 +25,6 @@ void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride,
const uint8_t *src0, uint32_t src0_stride,
const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, int w, int h) {
- uint8x8_t tmp0, tmp1;
- uint8x16_t tmp0_q, tmp1_q, res_q;
- uint16x8_t res, res_low, res_high;
assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
@@ -35,95 +33,80 @@ void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride,
assert(IS_POWER_OF_TWO(h));
assert(IS_POWER_OF_TWO(w));
- if (w >= 16) {
- for (int i = 0; i < h; ++i) {
- const uint8x8_t m = vdup_n_u8((uint8_t)mask[i]);
- const uint8x8_t max_minus_m = vdup_n_u8(64 - (uint8_t)mask[i]);
- for (int j = 0; j < w; j += 16) {
- __builtin_prefetch(src0);
- __builtin_prefetch(src1);
- tmp0_q = vld1q_u8(src0);
- tmp1_q = vld1q_u8(src1);
- res_low = vmull_u8(m, vget_low_u8(tmp0_q));
- res_low = vmlal_u8(res_low, max_minus_m, vget_low_u8(tmp1_q));
- res_high = vmull_u8(m, vget_high_u8(tmp0_q));
- res_high = vmlal_u8(res_high, max_minus_m, vget_high_u8(tmp1_q));
- res_q = vcombine_u8(vrshrn_n_u16(res_low, AOM_BLEND_A64_ROUND_BITS),
- vrshrn_n_u16(res_high, AOM_BLEND_A64_ROUND_BITS));
- vst1q_u8(dst, res_q);
- src0 += 16;
- src1 += 16;
- dst += 16;
- }
- src0 += src0_stride - w;
- src1 += src1_stride - w;
- dst += dst_stride - w;
- }
+ if (w > 8) {
+ do {
+ uint8x16_t m0 = vdupq_n_u8(mask[0]);
+ int i = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+
+ i += 16;
+ } while (i < w);
+
+ mask += 1;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
} else if (w == 8) {
- for (int i = 0; i < h; ++i) {
- __builtin_prefetch(src0);
- __builtin_prefetch(src1);
- const uint8x8_t m = vdup_n_u8((uint8_t)mask[i]);
- const uint8x8_t max_minus_m = vdup_n_u8(64 - (uint8_t)mask[i]);
- tmp0 = vld1_u8(src0);
- tmp1 = vld1_u8(src1);
- res = vmull_u8(m, tmp0);
- res = vmlal_u8(res, max_minus_m, tmp1);
- vst1_u8(dst, vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS));
+ do {
+ uint8x8_t m0 = vdup_n_u8(mask[0]);
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+ vst1_u8(dst, blend);
+
+ mask += 1;
src0 += src0_stride;
src1 += src1_stride;
dst += dst_stride;
- }
+ } while (--h != 0);
} else if (w == 4) {
- for (int i = 0; i < h; i += 2) {
- __builtin_prefetch(src0 + 0 * src0_stride);
- __builtin_prefetch(src0 + 1 * src0_stride);
- __builtin_prefetch(src1 + 0 * src1_stride);
- __builtin_prefetch(src1 + 1 * src1_stride);
- const uint16x4_t m1 = vdup_n_u16((uint16_t)mask[i]);
- const uint16x4_t m2 = vdup_n_u16((uint16_t)mask[i + 1]);
- const uint8x8_t m = vmovn_u16(vcombine_u16(m1, m2));
- const uint16x4_t max_minus_m1 = vdup_n_u16(64 - (uint16_t)mask[i]);
- const uint16x4_t max_minus_m2 = vdup_n_u16(64 - (uint16_t)mask[i + 1]);
- const uint8x8_t max_minus_m =
- vmovn_u16(vcombine_u16(max_minus_m1, max_minus_m2));
- tmp0 = load_unaligned_u8_4x2(src0, src0_stride);
- tmp1 = load_unaligned_u8_4x2(src1, src1_stride);
- res = vmull_u8(m, tmp0);
- res = vmlal_u8(res, max_minus_m, tmp1);
- const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
- store_unaligned_u8_4x1(dst + 0 * dst_stride, result, 0);
- store_unaligned_u8_4x1(dst + 1 * dst_stride, result, 1);
- src0 += (2 * src0_stride);
- src1 += (2 * src1_stride);
- dst += (2 * dst_stride);
- }
- } else if (w == 2) {
- for (int i = 0; i < h; i += 2) {
- __builtin_prefetch(src0 + 0 * src0_stride);
- __builtin_prefetch(src0 + 1 * src0_stride);
- __builtin_prefetch(src1 + 0 * src1_stride);
- __builtin_prefetch(src1 + 1 * src1_stride);
- const uint8x8_t m1 = vdup_n_u8(mask[i]);
- const uint8x8_t m2 = vdup_n_u8(mask[i + 1]);
- const uint16x4x2_t m_trn =
- vtrn_u16(vreinterpret_u16_u8(m1), vreinterpret_u16_u8(m2));
- const uint8x8_t m = vreinterpret_u8_u16(m_trn.val[0]);
- const uint8x8_t max_minus_m1 = vdup_n_u8(64 - mask[i]);
- const uint8x8_t max_minus_m2 = vdup_n_u8(64 - mask[i + 1]);
- const uint16x4x2_t max_minus_m_trn = vtrn_u16(
- vreinterpret_u16_u8(max_minus_m1), vreinterpret_u16_u8(max_minus_m2));
- const uint8x8_t max_minus_m = vreinterpret_u8_u16(max_minus_m_trn.val[0]);
- tmp0 = load_unaligned_u8_2x2(src0, src0_stride);
- tmp1 = load_unaligned_u8_2x2(src1, src1_stride);
- res = vmull_u8(m, tmp0);
- res = vmlal_u8(res, max_minus_m, tmp1);
- const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
- store_unaligned_u8_2x1(dst + 0 * dst_stride, result, 0);
- store_unaligned_u8_2x1(dst + 1 * dst_stride, result, 1);
- src0 += (2 * src0_stride);
- src1 += (2 * src1_stride);
- dst += (2 * dst_stride);
- }
+ do {
+ const uint16x4_t m0 = vdup_n_u16((uint16_t)mask[0]);
+ const uint16x4_t m1 = vdup_n_u16((uint16_t)mask[1]);
+ const uint8x8_t m = vmovn_u16(vcombine_u16(m0, m1));
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t blend = alpha_blend_a64_u8x8(m, s0, s1);
+
+ store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+ mask += 2;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 2 && h >= 16) {
+ do {
+ uint16x4_t m0 = vdup_n_u16(0);
+ m0 = vld1_lane_u16((uint16_t *)mask, m0, 0);
+ uint8x8_t m =
+ vzip_u8(vreinterpret_u8_u16(m0), vreinterpret_u8_u16(m0)).val[0];
+ uint8x8_t s0 = load_unaligned_u8_2x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_2x2(src1, src1_stride);
+
+ uint8x8_t blend = alpha_blend_a64_u8x8(m, s0, s1);
+
+ store_unaligned_u8_2x2(dst, dst_stride, blend);
+
+ mask += 2;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else {
+ aom_blend_a64_vmask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, w, h);
}
}
diff --git a/av1/common/arm/cdef_block_neon.c b/av1/common/arm/cdef_block_neon.c
index 4397a47e7..4465e0b33 100644
--- a/av1/common/arm/cdef_block_neon.c
+++ b/av1/common/arm/cdef_block_neon.c
@@ -10,6 +10,8 @@
*/
#include "aom_dsp/aom_simd.h"
+#include "aom_dsp/arm/mem_neon.h"
+
#define SIMD_FUNC(name) name##_neon
#include "av1/common/cdef_block_simd.h"
@@ -28,6 +30,348 @@ void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride,
}
}
+static INLINE int16x8_t v128_from_64_neon(int64_t a, int64_t b) {
+ return vreinterpretq_s16_s64(vcombine_s64(vcreate_s64(a), vcreate_s64(b)));
+}
+
+#define SHL_HIGH_NEON(n) \
+ static INLINE int16x8_t v128_shl_##n##_byte_neon(int16x8_t a) { \
+ int64x2_t a_s64 = vreinterpretq_s64_s16(a); \
+ return v128_from_64_neon( \
+ 0, vget_lane_u64(vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), \
+ (n - 8) * 8), \
+ 0)); \
+ }
+
+#define SHL_NEON(n) \
+ static INLINE int16x8_t v128_shl_##n##_byte_neon(int16x8_t a) { \
+ int64x2_t a_s64 = vreinterpretq_s64_s16(a); \
+ return v128_from_64_neon( \
+ 0, vget_lane_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), 0)); \
+ }
+
+#define SHL_LOW_NEON(n) \
+ static INLINE int16x8_t v128_shl_##n##_byte_neon(int16x8_t a) { \
+ int64x2_t a_s64 = vreinterpretq_s64_s16(a); \
+ return v128_from_64_neon( \
+ vget_lane_u64( \
+ vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), n * 8), 0), \
+ vget_lane_u64( \
+ vorr_u64( \
+ vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), n * 8), \
+ vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), \
+ (8 - n) * 8)), \
+ 0)); \
+ }
+
+SHL_HIGH_NEON(14)
+SHL_HIGH_NEON(12)
+SHL_HIGH_NEON(10)
+SHL_NEON(8)
+SHL_LOW_NEON(6)
+SHL_LOW_NEON(4)
+SHL_LOW_NEON(2)
+
+#define v128_shl_n_byte_neon(a, n) v128_shl_##n##_byte_neon(a)
+
+#define SHR_HIGH_NEON(n) \
+ static INLINE int16x8_t v128_shr_##n##_byte_neon(int16x8_t a) { \
+ int64x2_t a_s64 = vreinterpretq_s64_s16(a); \
+ return v128_from_64_neon( \
+ vget_lane_u64(vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), \
+ (n - 8) * 8), \
+ 0), \
+ 0); \
+ }
+
+#define SHR_NEON(n) \
+ static INLINE int16x8_t v128_shr_##n##_byte_neon(int16x8_t a) { \
+ int64x2_t a_s64 = vreinterpretq_s64_s16(a); \
+ return v128_from_64_neon( \
+ vget_lane_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), 0), 0); \
+ }
+
+#define SHR_LOW_NEON(n) \
+ static INLINE int16x8_t v128_shr_##n##_byte_neon(int16x8_t a) { \
+ int64x2_t a_s64 = vreinterpretq_s64_s16(a); \
+ return v128_from_64_neon( \
+ vget_lane_u64( \
+ vorr_u64( \
+ vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), n * 8), \
+ vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), \
+ (8 - n) * 8)), \
+ 0), \
+ vget_lane_u64( \
+ vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), n * 8), \
+ 0)); \
+ }
+
+SHR_HIGH_NEON(14)
+SHR_HIGH_NEON(12)
+SHR_HIGH_NEON(10)
+SHR_NEON(8)
+SHR_LOW_NEON(6)
+SHR_LOW_NEON(4)
+SHR_LOW_NEON(2)
+
+#define v128_shr_n_byte_neon(a, n) v128_shr_##n##_byte_neon(a)
+
+static INLINE uint32x4_t v128_madd_s16_neon(int16x8_t a, int16x8_t b) {
+ uint32x4_t t1 =
+ vreinterpretq_u32_s32(vmull_s16(vget_low_s16(a), vget_low_s16(b)));
+ uint32x4_t t2 =
+ vreinterpretq_u32_s32(vmull_s16(vget_high_s16(a), vget_high_s16(b)));
+#if AOM_ARCH_AARCH64
+ return vpaddq_u32(t1, t2);
+#else
+ return vcombine_u32(vpadd_u32(vget_low_u32(t1), vget_high_u32(t1)),
+ vpadd_u32(vget_low_u32(t2), vget_high_u32(t2)));
+#endif
+}
+
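A scalar model of v128_madd_s16_neon(): each 32-bit output lane is the sum of one adjacent pair of 16-bit products (sketch; the lane order matches the vpaddq/vpadd pairing above):

    #include <stdint.h>

    static void madd_s16_scalar(const int16_t a[8], const int16_t b[8],
                                uint32_t out[4]) {
      for (int i = 0; i < 4; i++) {
        out[i] = (uint32_t)((int32_t)a[2 * i + 0] * b[2 * i + 0] +
                            (int32_t)a[2 * i + 1] * b[2 * i + 1]);
      }
    }
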
+// partial A is a 16-bit vector of the form:
+// [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
+// [0 y1 y2 y3 y4 y5 y6 y7].
+// This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
+// (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
+// and const2.
+static INLINE uint32x4_t fold_mul_and_sum_neon(int16x8_t partiala,
+ int16x8_t partialb,
+ uint32x4_t const1,
+ uint32x4_t const2) {
+ int16x8_t tmp;
+ // Reverse partial B.
+ uint8x16_t pattern = vreinterpretq_u8_u64(
+ vcombine_u64(vcreate_u64((uint64_t)0x07060908 << 32 | 0x0b0a0d0c),
+ vcreate_u64((uint64_t)0x0f0e0100 << 32 | 0x03020504)));
+
+#if AOM_ARCH_AARCH64
+ partialb =
+ vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialb), pattern));
+#else
+ int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialb)),
+ vget_high_s8(vreinterpretq_s8_s16(partialb)) } };
+ int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern)));
+ int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern)));
+ partialb = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi));
+#endif
+
+ // Interleave the x and y values of identical indices and pair x8 with 0.
+ tmp = partiala;
+ partiala = vzipq_s16(partiala, partialb).val[0];
+ partialb = vzipq_s16(tmp, partialb).val[1];
+ // Square and add the corresponding x and y values.
+ uint32x4_t partiala_u32 = v128_madd_s16_neon(partiala, partiala);
+ uint32x4_t partialb_u32 = v128_madd_s16_neon(partialb, partialb);
+
+ // Multiply by constant.
+ partiala_u32 = vmulq_u32(partiala_u32, const1);
+ partialb_u32 = vmulq_u32(partialb_u32, const2);
+
+ // Sum all results.
+ partiala_u32 = vaddq_u32(partiala_u32, partialb_u32);
+ return partiala_u32;
+}
+
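Summed across its four output lanes (see hsum4_neon() below), fold_mul_and_sum_neon() computes the weighted cost described in the comment above. A scalar sketch, with x holding partial A and y the byte-reversed partial B (so y[7] = 0):

    #include <stdint.h>

    static uint32_t fold_mul_and_sum_scalar(const int16_t x[8],
                                            const int16_t y[8],
                                            const uint32_t c[8]) {
      uint32_t cost = 0;
      for (int i = 0; i < 8; i++) {
        cost += (uint32_t)(x[i] * x[i] + y[i] * y[i]) * c[i];
      }
      return cost;
    }
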
+static INLINE uint64x2_t ziplo_u64(uint32x4_t a, uint32x4_t b) {
+ return vcombine_u64(vget_low_u64(vreinterpretq_u64_u32(a)),
+ vget_low_u64(vreinterpretq_u64_u32(b)));
+}
+
+static INLINE uint64x2_t ziphi_u64(uint32x4_t a, uint32x4_t b) {
+ return vcombine_u64(vget_high_u64(vreinterpretq_u64_u32(a)),
+ vget_high_u64(vreinterpretq_u64_u32(b)));
+}
+
+static INLINE uint32x4_t hsum4_neon(uint32x4_t x0, uint32x4_t x1, uint32x4_t x2,
+ uint32x4_t x3) {
+ uint32x4_t t0, t1, t2, t3;
+ t0 = vzipq_u32(x0, x1).val[0];
+ t1 = vzipq_u32(x2, x3).val[0];
+ t2 = vzipq_u32(x0, x1).val[1];
+ t3 = vzipq_u32(x2, x3).val[1];
+ x0 = vreinterpretq_u32_u64(ziplo_u64(t0, t1));
+ x1 = vreinterpretq_u32_u64(ziphi_u64(t0, t1));
+ x2 = vreinterpretq_u32_u64(ziplo_u64(t2, t3));
+ x3 = vreinterpretq_u32_u64(ziphi_u64(t2, t3));
+ return vaddq_u32(vaddq_u32(x0, x1), vaddq_u32(x2, x3));
+}
+
+static INLINE uint32x4_t compute_directions_neon(int16x8_t lines[8],
+ uint32_t cost[4]) {
+ int16x8_t partial4a, partial4b, partial5a, partial5b, partial6, partial7a,
+ partial7b;
+ int16x8_t tmp;
+
+ // Partial sums for lines 0 and 1.
+ partial4a = v128_shl_n_byte_neon(lines[0], 14);
+ partial4b = v128_shr_n_byte_neon(lines[0], 2);
+ partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[1], 12));
+ partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[1], 4));
+ tmp = vaddq_s16(lines[0], lines[1]);
+ partial5a = v128_shl_n_byte_neon(tmp, 10);
+ partial5b = v128_shr_n_byte_neon(tmp, 6);
+ partial7a = v128_shl_n_byte_neon(tmp, 4);
+ partial7b = v128_shr_n_byte_neon(tmp, 12);
+ partial6 = tmp;
+
+ // Partial sums for lines 2 and 3.
+ partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[2], 10));
+ partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[2], 6));
+ partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[3], 8));
+ partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[3], 8));
+ tmp = vaddq_s16(lines[2], lines[3]);
+ partial5a = vaddq_s16(partial5a, v128_shl_n_byte_neon(tmp, 8));
+ partial5b = vaddq_s16(partial5b, v128_shr_n_byte_neon(tmp, 8));
+ partial7a = vaddq_s16(partial7a, v128_shl_n_byte_neon(tmp, 6));
+ partial7b = vaddq_s16(partial7b, v128_shr_n_byte_neon(tmp, 10));
+ partial6 = vaddq_s16(partial6, tmp);
+
+ // Partial sums for lines 4 and 5.
+ partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[4], 6));
+ partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[4], 10));
+ partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[5], 4));
+ partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[5], 12));
+ tmp = vaddq_s16(lines[4], lines[5]);
+ partial5a = vaddq_s16(partial5a, v128_shl_n_byte_neon(tmp, 6));
+ partial5b = vaddq_s16(partial5b, v128_shr_n_byte_neon(tmp, 10));
+ partial7a = vaddq_s16(partial7a, v128_shl_n_byte_neon(tmp, 8));
+ partial7b = vaddq_s16(partial7b, v128_shr_n_byte_neon(tmp, 8));
+ partial6 = vaddq_s16(partial6, tmp);
+
+ // Partial sums for lines 6 and 7.
+ partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[6], 2));
+ partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[6], 14));
+ partial4a = vaddq_s16(partial4a, lines[7]);
+ tmp = vaddq_s16(lines[6], lines[7]);
+ partial5a = vaddq_s16(partial5a, v128_shl_n_byte_neon(tmp, 4));
+ partial5b = vaddq_s16(partial5b, v128_shr_n_byte_neon(tmp, 12));
+ partial7a = vaddq_s16(partial7a, v128_shl_n_byte_neon(tmp, 10));
+ partial7b = vaddq_s16(partial7b, v128_shr_n_byte_neon(tmp, 6));
+ partial6 = vaddq_s16(partial6, tmp);
+
+ uint32x4_t const0 = vreinterpretq_u32_u64(
+ vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840),
+ vcreate_u64((uint64_t)210 << 32 | 280)));
+ uint32x4_t const1 = vreinterpretq_u32_u64(
+ vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168),
+ vcreate_u64((uint64_t)105 << 32 | 120)));
+ uint32x4_t const2 = vreinterpretq_u32_u64(
+ vcombine_u64(vcreate_u64(0), vcreate_u64((uint64_t)210 << 32 | 420)));
+ uint32x4_t const3 = vreinterpretq_u32_u64(
+ vcombine_u64(vcreate_u64((uint64_t)105 << 32 | 140),
+ vcreate_u64((uint64_t)105 << 32 | 105)));
+
+ // Compute costs in terms of partial sums.
+ uint32x4_t partial4a_u32 =
+ fold_mul_and_sum_neon(partial4a, partial4b, const0, const1);
+ uint32x4_t partial7a_u32 =
+ fold_mul_and_sum_neon(partial7a, partial7b, const2, const3);
+ uint32x4_t partial5a_u32 =
+ fold_mul_and_sum_neon(partial5a, partial5b, const2, const3);
+ uint32x4_t partial6_u32 = v128_madd_s16_neon(partial6, partial6);
+ partial6_u32 = vmulq_u32(partial6_u32, vdupq_n_u32(105));
+
+ partial4a_u32 =
+ hsum4_neon(partial4a_u32, partial5a_u32, partial6_u32, partial7a_u32);
+ vst1q_u32(cost, partial4a_u32);
+ return partial4a_u32;
+}
+
+static INLINE int64x2_t ziplo_s64(int32x4_t a, int32x4_t b) {
+ return vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(a)),
+ vget_low_s64(vreinterpretq_s64_s32(b)));
+}
+
+static INLINE int64x2_t ziphi_s64(int32x4_t a, int32x4_t b) {
+ return vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(a)),
+ vget_high_s64(vreinterpretq_s64_s32(b)));
+}
+
+// Transpose and reverse the order of the lines -- equivalent to a 90-degree
+// counter-clockwise rotation of the pixels.
+static INLINE void array_reverse_transpose_8x8_neon(int16x8_t *in,
+ int16x8_t *res) {
+ const int32x4_t tr0_0 = vreinterpretq_s32_s16(vzipq_s16(in[0], in[1]).val[0]);
+ const int32x4_t tr0_1 = vreinterpretq_s32_s16(vzipq_s16(in[2], in[3]).val[0]);
+ const int32x4_t tr0_2 = vreinterpretq_s32_s16(vzipq_s16(in[0], in[1]).val[1]);
+ const int32x4_t tr0_3 = vreinterpretq_s32_s16(vzipq_s16(in[2], in[3]).val[1]);
+ const int32x4_t tr0_4 = vreinterpretq_s32_s16(vzipq_s16(in[4], in[5]).val[0]);
+ const int32x4_t tr0_5 = vreinterpretq_s32_s16(vzipq_s16(in[6], in[7]).val[0]);
+ const int32x4_t tr0_6 = vreinterpretq_s32_s16(vzipq_s16(in[4], in[5]).val[1]);
+ const int32x4_t tr0_7 = vreinterpretq_s32_s16(vzipq_s16(in[6], in[7]).val[1]);
+
+ const int32x4_t tr1_0 = vzipq_s32(tr0_0, tr0_1).val[0];
+ const int32x4_t tr1_1 = vzipq_s32(tr0_4, tr0_5).val[0];
+ const int32x4_t tr1_2 = vzipq_s32(tr0_0, tr0_1).val[1];
+ const int32x4_t tr1_3 = vzipq_s32(tr0_4, tr0_5).val[1];
+ const int32x4_t tr1_4 = vzipq_s32(tr0_2, tr0_3).val[0];
+ const int32x4_t tr1_5 = vzipq_s32(tr0_6, tr0_7).val[0];
+ const int32x4_t tr1_6 = vzipq_s32(tr0_2, tr0_3).val[1];
+ const int32x4_t tr1_7 = vzipq_s32(tr0_6, tr0_7).val[1];
+
+ res[7] = vreinterpretq_s16_s64(ziplo_s64(tr1_0, tr1_1));
+ res[6] = vreinterpretq_s16_s64(ziphi_s64(tr1_0, tr1_1));
+ res[5] = vreinterpretq_s16_s64(ziplo_s64(tr1_2, tr1_3));
+ res[4] = vreinterpretq_s16_s64(ziphi_s64(tr1_2, tr1_3));
+ res[3] = vreinterpretq_s16_s64(ziplo_s64(tr1_4, tr1_5));
+ res[2] = vreinterpretq_s16_s64(ziphi_s64(tr1_4, tr1_5));
+ res[1] = vreinterpretq_s16_s64(ziplo_s64(tr1_6, tr1_7));
+ res[0] = vreinterpretq_s16_s64(ziphi_s64(tr1_6, tr1_7));
+}
+
+static INLINE uint32_t compute_best_dir(uint8x16_t a) {
+ uint8x16_t idx =
+ vandq_u8(a, vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL)));
+#if AOM_ARCH_AARCH64
+ return vaddv_u8(vget_low_u8(idx)) + (vaddv_u8(vget_high_u8(idx)) << 8);
+#else
+ uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(idx)));
+ uint8x16_t s = vreinterpretq_u8_u64(m);
+ return vget_lane_u32(
+ vreinterpret_u32_u8(vzip_u8(vget_low_u8(s), vget_high_u8(s)).val[0]), 0);
+#endif
+}
+
+int cdef_find_dir_neon(const uint16_t *img, int stride, int32_t *var,
+ int coeff_shift) {
+ uint32_t cost[8];
+ uint32_t best_cost = 0;
+ int best_dir = 0;
+ int16x8_t lines[8];
+ for (int i = 0; i < 8; i++) {
+ uint16x8_t s = vld1q_u16(&img[i * stride]);
+ lines[i] = vreinterpretq_s16_u16(
+ vsubq_u16(vshlq_u16(s, vdupq_n_s16(-coeff_shift)), vdupq_n_u16(128)));
+ }
+
+ // Compute "mostly vertical" directions.
+ uint32x4_t cost47 = compute_directions_neon(lines, cost + 4);
+
+ array_reverse_transpose_8x8_neon(lines, lines);
+
+ // Compute "mostly horizontal" directions.
+ uint32x4_t cost03 = compute_directions_neon(lines, cost);
+
+ uint32x4_t max_cost = vmaxq_u32(cost03, cost47);
+ max_cost = vmaxq_u32(max_cost, vextq_u32(max_cost, max_cost, 2));
+ max_cost = vmaxq_u32(max_cost, vextq_u32(max_cost, max_cost, 1));
+ best_cost = vgetq_lane_u32(max_cost, 0);
+ uint16x8_t idx = vcombine_u16(vqmovn_u32(vceqq_u32(max_cost, cost03)),
+ vqmovn_u32(vceqq_u32(max_cost, cost47)));
+ uint8x16_t idx_u8 = vcombine_u8(vqmovn_u16(idx), vqmovn_u16(idx));
+ best_dir = compute_best_dir(idx_u8);
+ best_dir = get_msb(best_dir ^ (best_dir - 1)); // Count trailing zeros
+
+ // Difference between the optimal variance and the variance along the
+ // orthogonal direction. Again, the sum(x^2) terms cancel out.
+ *var = best_cost - cost[(best_dir + 4) & 7];
+ // We'd normally divide by 840, but dividing by 1024 is close enough
+ // for what we're going to do with this.
+ *var >>= 10;
+ return best_dir;
+}
+
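The final `best_dir` line above uses a classic bit trick: x ^ (x - 1) sets every bit up to and including the lowest set bit of x, so get_msb() of that value is the index of the lowest set bit, i.e. the number of trailing zeros. Worked example:

    // x         = 0b00101000  (lowest set bit at index 3)
    // x - 1     = 0b00100111
    // x ^ (x-1) = 0b00001111  -> get_msb(...) == 3 == trailing zeros of x
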
void cdef_find_dir_dual_neon(const uint16_t *img1, const uint16_t *img2,
int stride, int32_t *var_out_1st,
int32_t *var_out_2nd, int coeff_shift,
@@ -38,3 +382,532 @@ void cdef_find_dir_dual_neon(const uint16_t *img1, const uint16_t *img2,
// Process second 8x8.
*out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift);
}
+
+// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
+static INLINE int16x8_t constrain16(uint16x8_t a, uint16x8_t b,
+ unsigned int threshold, int adjdamp) {
+ int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, b));
+ const int16x8_t sign = vshrq_n_s16(diff, 15);
+ diff = vabsq_s16(diff);
+ const uint16x8_t s =
+ vqsubq_u16(vdupq_n_u16(threshold),
+ vreinterpretq_u16_s16(vshlq_s16(diff, vdupq_n_s16(-adjdamp))));
+ return veorq_s16(vaddq_s16(sign, vminq_s16(diff, vreinterpretq_s16_u16(s))),
+ sign);
+}
+
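A scalar reference for constrain16(), one lane at a time (sketch; adjdamp is the damping already reduced by get_msb(strength), as done in the filter functions below):

    #include <stdlib.h>

    static int constrain_scalar(int a, int b, int threshold, int adjdamp) {
      const int diff = a - b;
      const int adiff = abs(diff);
      int clamp = threshold - (adiff >> adjdamp);
      if (clamp < 0) clamp = 0;  // matches the saturating vqsubq_u16
      const int mag = adiff < clamp ? adiff : clamp;
      return diff < 0 ? -mag : mag;  // matches the (x + sign) ^ sign negation
    }
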
+static INLINE uint16x8_t get_max_primary(const int is_lowbd, uint16x8_t *tap,
+ uint16x8_t max,
+ uint16x8_t cdef_large_value_mask) {
+ if (is_lowbd) {
+ uint8x16_t max_u8 = vreinterpretq_u8_u16(tap[0]);
+ max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[1]));
+ max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[2]));
+ max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[3]));
+    /* The source is 16 bits; however, we only care about the lower 8 bits.
+       The upper 8 bits contain the "large" flag. After the final primary
+       max has been calculated, zero out the upper 8 bits. Use this to find
+       the "16 bit" max. */
+ max = vmaxq_u16(
+ max, vandq_u16(vreinterpretq_u16_u8(max_u8), cdef_large_value_mask));
+ } else {
+ /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
+ max = vmaxq_u16(max, vandq_u16(tap[0], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(tap[1], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(tap[2], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(tap[3], cdef_large_value_mask));
+ }
+ return max;
+}
+
+static INLINE uint16x8_t get_max_secondary(const int is_lowbd, uint16x8_t *tap,
+ uint16x8_t max,
+ uint16x8_t cdef_large_value_mask) {
+ if (is_lowbd) {
+ uint8x16_t max_u8 = vreinterpretq_u8_u16(tap[0]);
+ max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[1]));
+ max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[2]));
+ max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[3]));
+ max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[4]));
+ max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[5]));
+ max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[6]));
+ max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[7]));
+    /* The source is 16 bits; however, we only care about the lower 8 bits.
+       The upper 8 bits contain the "large" flag. After the final secondary
+       max has been calculated, zero out the upper 8 bits. Use this to find
+       the "16 bit" max. */
+ max = vmaxq_u16(
+ max, vandq_u16(vreinterpretq_u16_u8(max_u8), cdef_large_value_mask));
+ } else {
+ /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
+ max = vmaxq_u16(max, vandq_u16(tap[0], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(tap[1], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(tap[2], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(tap[3], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(tap[4], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(tap[5], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(tap[6], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(tap[7], cdef_large_value_mask));
+ }
+ return max;
+}
+
+static INLINE void filter_block_4x4(const int is_lowbd, void *dest, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir, int pri_damping,
+ int sec_damping, int coeff_shift,
+ int height, int enable_primary,
+ int enable_secondary) {
+ uint8_t *dst8 = (uint8_t *)dest;
+ uint16_t *dst16 = (uint16_t *)dest;
+ const int clipping_required = enable_primary && enable_secondary;
+ uint16x8_t max, min;
+ const uint16x8_t cdef_large_value_mask =
+ vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE));
+ const int po1 = cdef_directions[dir][0];
+ const int po2 = cdef_directions[dir][1];
+ const int s1o1 = cdef_directions[dir + 2][0];
+ const int s1o2 = cdef_directions[dir + 2][1];
+ const int s2o1 = cdef_directions[dir - 2][0];
+ const int s2o2 = cdef_directions[dir - 2][1];
+ const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+ const int *sec_taps = cdef_sec_taps;
+
+ if (enable_primary && pri_strength) {
+ pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
+ }
+ if (enable_secondary && sec_strength) {
+ sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
+ }
+
+ int h = height;
+ do {
+ int16x8_t sum = vdupq_n_s16(0);
+ uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
+ max = min = s;
+
+ if (enable_primary) {
+ uint16x8_t tap[4];
+
+ // Primary near taps
+ tap[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
+ tap[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);
+ int16x8_t p0 = constrain16(tap[0], s, pri_strength, pri_damping);
+ int16x8_t p1 = constrain16(tap[1], s, pri_strength, pri_damping);
+
+ // sum += pri_taps[0] * (p0 + p1)
+ p0 = vaddq_s16(p0, p1);
+ sum = vmlaq_s16(sum, p0, vdupq_n_s16(pri_taps[0]));
+
+ // Primary far taps
+ tap[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
+ tap[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);
+ p0 = constrain16(tap[2], s, pri_strength, pri_damping);
+ p1 = constrain16(tap[3], s, pri_strength, pri_damping);
+
+ // sum += pri_taps[1] * (p0 + p1)
+ p0 = vaddq_s16(p0, p1);
+ sum = vmlaq_s16(sum, p0, vdupq_n_s16(pri_taps[1]));
+
+ if (clipping_required) {
+ max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask);
+
+ min = vminq_u16(min, tap[0]);
+ min = vminq_u16(min, tap[1]);
+ min = vminq_u16(min, tap[2]);
+ min = vminq_u16(min, tap[3]);
+ }
+ }
+
+ if (enable_secondary) {
+ uint16x8_t tap[8];
+
+ // Secondary near taps
+ tap[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
+ tap[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
+ tap[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
+ tap[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);
+ int16x8_t p0 = constrain16(tap[0], s, sec_strength, sec_damping);
+ int16x8_t p1 = constrain16(tap[1], s, sec_strength, sec_damping);
+ int16x8_t p2 = constrain16(tap[2], s, sec_strength, sec_damping);
+ int16x8_t p3 = constrain16(tap[3], s, sec_strength, sec_damping);
+
+ // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+ p0 = vaddq_s16(p0, p1);
+ p2 = vaddq_s16(p2, p3);
+ p0 = vaddq_s16(p0, p2);
+ sum = vmlaq_s16(sum, p0, vdupq_n_s16(sec_taps[0]));
+
+ // Secondary far taps
+ tap[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
+ tap[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
+ tap[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
+ tap[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);
+ p0 = constrain16(tap[4], s, sec_strength, sec_damping);
+ p1 = constrain16(tap[5], s, sec_strength, sec_damping);
+ p2 = constrain16(tap[6], s, sec_strength, sec_damping);
+ p3 = constrain16(tap[7], s, sec_strength, sec_damping);
+
+ // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+ p0 = vaddq_s16(p0, p1);
+ p2 = vaddq_s16(p2, p3);
+ p0 = vaddq_s16(p0, p2);
+ sum = vmlaq_s16(sum, p0, vdupq_n_s16(sec_taps[1]));
+
+ if (clipping_required) {
+ max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask);
+
+ min = vminq_u16(min, tap[0]);
+ min = vminq_u16(min, tap[1]);
+ min = vminq_u16(min, tap[2]);
+ min = vminq_u16(min, tap[3]);
+ min = vminq_u16(min, tap[4]);
+ min = vminq_u16(min, tap[5]);
+ min = vminq_u16(min, tap[6]);
+ min = vminq_u16(min, tap[7]);
+ }
+ }
+
+ // res = row + ((sum - (sum < 0) + 8) >> 4)
+ sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
+ int16x8_t res = vaddq_s16(sum, vdupq_n_s16(8));
+ res = vshrq_n_s16(res, 4);
+ res = vaddq_s16(vreinterpretq_s16_u16(s), res);
+
+ if (clipping_required) {
+ res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)),
+ vreinterpretq_s16_u16(max));
+ }
+
+ if (is_lowbd) {
+ const uint8x8_t res_128 = vqmovun_s16(res);
+ store_unaligned_u8_4x2(dst8, dstride, res_128);
+ } else {
+ store_unaligned_u16_4x2(dst16, dstride, vreinterpretq_u16_s16(res));
+ }
+
+ in += 2 * CDEF_BSTRIDE;
+ dst8 += 2 * dstride;
+ dst16 += 2 * dstride;
+ h -= 2;
+ } while (h != 0);
+}
+
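filter_block_4x4() above and filter_block_8x8() below finish with the same rounding step; a scalar sketch of one output sample:

    // Divide the tap sum by 16, rounding to nearest with ties away from
    // zero; the (sum < 0) correction makes the rounding symmetric.
    static int cdef_apply_sum(int row, int sum) {
      return row + ((sum - (sum < 0 ? 1 : 0) + 8) >> 4);
    }
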
+static INLINE void filter_block_8x8(const int is_lowbd, void *dest, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir, int pri_damping,
+ int sec_damping, int coeff_shift,
+ int height, int enable_primary,
+ int enable_secondary) {
+ uint8_t *dst8 = (uint8_t *)dest;
+ uint16_t *dst16 = (uint16_t *)dest;
+ const int clipping_required = enable_primary && enable_secondary;
+ uint16x8_t max, min;
+ const uint16x8_t cdef_large_value_mask =
+ vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE));
+ const int po1 = cdef_directions[dir][0];
+ const int po2 = cdef_directions[dir][1];
+ const int s1o1 = cdef_directions[dir + 2][0];
+ const int s1o2 = cdef_directions[dir + 2][1];
+ const int s2o1 = cdef_directions[dir - 2][0];
+ const int s2o2 = cdef_directions[dir - 2][1];
+ const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+ const int *sec_taps = cdef_sec_taps;
+
+ if (enable_primary && pri_strength) {
+ pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
+ }
+ if (enable_secondary && sec_strength) {
+ sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
+ }
+
+ int h = height;
+ do {
+ int16x8_t sum = vdupq_n_s16(0);
+ uint16x8_t s = vld1q_u16(in);
+ max = min = s;
+
+ if (enable_primary) {
+ uint16x8_t tap[4];
+
+ // Primary near taps
+ tap[0] = vld1q_u16(in + po1);
+ tap[1] = vld1q_u16(in - po1);
+ int16x8_t p0 = constrain16(tap[0], s, pri_strength, pri_damping);
+ int16x8_t p1 = constrain16(tap[1], s, pri_strength, pri_damping);
+
+ // sum += pri_taps[0] * (p0 + p1)
+ p0 = vaddq_s16(p0, p1);
+ sum = vmlaq_s16(sum, p0, vdupq_n_s16(pri_taps[0]));
+
+ // Primary far taps
+ tap[2] = vld1q_u16(in + po2);
+ p0 = constrain16(tap[2], s, pri_strength, pri_damping);
+ tap[3] = vld1q_u16(in - po2);
+ p1 = constrain16(tap[3], s, pri_strength, pri_damping);
+
+ // sum += pri_taps[1] * (p0 + p1)
+ p0 = vaddq_s16(p0, p1);
+ sum = vmlaq_s16(sum, p0, vdupq_n_s16(pri_taps[1]));
+ if (clipping_required) {
+ max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask);
+
+ min = vminq_u16(min, tap[0]);
+ min = vminq_u16(min, tap[1]);
+ min = vminq_u16(min, tap[2]);
+ min = vminq_u16(min, tap[3]);
+ }
+ }
+
+ if (enable_secondary) {
+ uint16x8_t tap[8];
+
+ // Secondary near taps
+ tap[0] = vld1q_u16(in + s1o1);
+ tap[1] = vld1q_u16(in - s1o1);
+ tap[2] = vld1q_u16(in + s2o1);
+ tap[3] = vld1q_u16(in - s2o1);
+ int16x8_t p0 = constrain16(tap[0], s, sec_strength, sec_damping);
+ int16x8_t p1 = constrain16(tap[1], s, sec_strength, sec_damping);
+ int16x8_t p2 = constrain16(tap[2], s, sec_strength, sec_damping);
+ int16x8_t p3 = constrain16(tap[3], s, sec_strength, sec_damping);
+
+ // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+ p0 = vaddq_s16(p0, p1);
+ p2 = vaddq_s16(p2, p3);
+ p0 = vaddq_s16(p0, p2);
+ sum = vmlaq_s16(sum, p0, vdupq_n_s16(sec_taps[0]));
+
+ // Secondary far taps
+ tap[4] = vld1q_u16(in + s1o2);
+ tap[5] = vld1q_u16(in - s1o2);
+ tap[6] = vld1q_u16(in + s2o2);
+ tap[7] = vld1q_u16(in - s2o2);
+ p0 = constrain16(tap[4], s, sec_strength, sec_damping);
+ p1 = constrain16(tap[5], s, sec_strength, sec_damping);
+ p2 = constrain16(tap[6], s, sec_strength, sec_damping);
+ p3 = constrain16(tap[7], s, sec_strength, sec_damping);
+
+ // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+ p0 = vaddq_s16(p0, p1);
+ p2 = vaddq_s16(p2, p3);
+ p0 = vaddq_s16(p0, p2);
+ sum = vmlaq_s16(sum, p0, vdupq_n_s16(sec_taps[1]));
+
+ if (clipping_required) {
+ max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask);
+
+ min = vminq_u16(min, tap[0]);
+ min = vminq_u16(min, tap[1]);
+ min = vminq_u16(min, tap[2]);
+ min = vminq_u16(min, tap[3]);
+ min = vminq_u16(min, tap[4]);
+ min = vminq_u16(min, tap[5]);
+ min = vminq_u16(min, tap[6]);
+ min = vminq_u16(min, tap[7]);
+ }
+ }
+
+ // res = row + ((sum - (sum < 0) + 8) >> 4)
+ sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
+ int16x8_t res = vaddq_s16(sum, vdupq_n_s16(8));
+ res = vshrq_n_s16(res, 4);
+ res = vaddq_s16(vreinterpretq_s16_u16(s), res);
+ if (clipping_required) {
+ res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)),
+ vreinterpretq_s16_u16(max));
+ }
+
+ if (is_lowbd) {
+ const uint8x8_t res_128 = vqmovun_s16(res);
+ vst1_u8(dst8, res_128);
+ } else {
+ vst1q_u16(dst16, vreinterpretq_u16_s16(res));
+ }
+
+ in += CDEF_BSTRIDE;
+ dst8 += dstride;
+ dst16 += dstride;
+ } while (--h != 0);
+}
+
+static INLINE void copy_block_4xh(const int is_lowbd, void *dest, int dstride,
+ const uint16_t *in, int height) {
+ uint8_t *dst8 = (uint8_t *)dest;
+ uint16_t *dst16 = (uint16_t *)dest;
+
+ int h = height;
+ do {
+ const uint16x8_t row = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
+ if (is_lowbd) {
+ const uint8x8_t res_128 = vqmovn_u16(row);
+ store_unaligned_u8_4x2(dst8, dstride, res_128);
+ } else {
+ store_unaligned_u16_4x2(dst16, dstride, row);
+ }
+
+ in += 2 * CDEF_BSTRIDE;
+ dst8 += 2 * dstride;
+ dst16 += 2 * dstride;
+ h -= 2;
+ } while (h != 0);
+}
+
+static INLINE void copy_block_8xh(const int is_lowbd, void *dest, int dstride,
+ const uint16_t *in, int height) {
+ uint8_t *dst8 = (uint8_t *)dest;
+ uint16_t *dst16 = (uint16_t *)dest;
+
+ int h = height;
+ do {
+ const uint16x8_t row = vld1q_u16(in);
+ if (is_lowbd) {
+ const uint8x8_t res_128 = vqmovn_u16(row);
+ vst1_u8(dst8, res_128);
+ } else {
+ vst1q_u16(dst16, row);
+ }
+
+ in += CDEF_BSTRIDE;
+ dst8 += dstride;
+ dst16 += dstride;
+ } while (--h != 0);
+}
+
+void cdef_filter_8_0_neon(void *dest, int dstride, const uint16_t *in,
+ int pri_strength, int sec_strength, int dir,
+ int pri_damping, int sec_damping, int coeff_shift,
+ int block_width, int block_height) {
+ if (block_width == 8) {
+ filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+ block_height, /*enable_primary=*/1,
+ /*enable_secondary=*/1);
+ } else {
+ filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+ block_height, /*enable_primary=*/1,
+ /*enable_secondary=*/1);
+ }
+}
+
+void cdef_filter_8_1_neon(void *dest, int dstride, const uint16_t *in,
+ int pri_strength, int sec_strength, int dir,
+ int pri_damping, int sec_damping, int coeff_shift,
+ int block_width, int block_height) {
+ if (block_width == 8) {
+ filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+ block_height, /*enable_primary=*/1,
+ /*enable_secondary=*/0);
+ } else {
+ filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+ block_height, /*enable_primary=*/1,
+ /*enable_secondary=*/0);
+ }
+}
+
+void cdef_filter_8_2_neon(void *dest, int dstride, const uint16_t *in,
+ int pri_strength, int sec_strength, int dir,
+ int pri_damping, int sec_damping, int coeff_shift,
+ int block_width, int block_height) {
+ if (block_width == 8) {
+ filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+ block_height, /*enable_primary=*/0,
+ /*enable_secondary=*/1);
+ } else {
+ filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+ block_height, /*enable_primary=*/0,
+ /*enable_secondary=*/1);
+ }
+}
+
+void cdef_filter_8_3_neon(void *dest, int dstride, const uint16_t *in,
+ int pri_strength, int sec_strength, int dir,
+ int pri_damping, int sec_damping, int coeff_shift,
+ int block_width, int block_height) {
+ (void)pri_strength;
+ (void)sec_strength;
+ (void)dir;
+ (void)pri_damping;
+ (void)sec_damping;
+ (void)coeff_shift;
+ (void)block_width;
+ if (block_width == 8) {
+ copy_block_8xh(/*is_lowbd=*/1, dest, dstride, in, block_height);
+ } else {
+ copy_block_4xh(/*is_lowbd=*/1, dest, dstride, in, block_height);
+ }
+}
+
+void cdef_filter_16_0_neon(void *dest, int dstride, const uint16_t *in,
+ int pri_strength, int sec_strength, int dir,
+ int pri_damping, int sec_damping, int coeff_shift,
+ int block_width, int block_height) {
+ if (block_width == 8) {
+ filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+ block_height, /*enable_primary=*/1,
+ /*enable_secondary=*/1);
+ } else {
+ filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+ block_height, /*enable_primary=*/1,
+ /*enable_secondary=*/1);
+ }
+}
+
+void cdef_filter_16_1_neon(void *dest, int dstride, const uint16_t *in,
+ int pri_strength, int sec_strength, int dir,
+ int pri_damping, int sec_damping, int coeff_shift,
+ int block_width, int block_height) {
+ if (block_width == 8) {
+ filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+ block_height, /*enable_primary=*/1,
+ /*enable_secondary=*/0);
+ } else {
+ filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+ block_height, /*enable_primary=*/1,
+ /*enable_secondary=*/0);
+ }
+}
+
+void cdef_filter_16_2_neon(void *dest, int dstride, const uint16_t *in,
+ int pri_strength, int sec_strength, int dir,
+ int pri_damping, int sec_damping, int coeff_shift,
+ int block_width, int block_height) {
+ if (block_width == 8) {
+ filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+ block_height, /*enable_primary=*/0,
+ /*enable_secondary=*/1);
+ } else {
+ filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+ block_height, /*enable_primary=*/0,
+ /*enable_secondary=*/1);
+ }
+}
+
+void cdef_filter_16_3_neon(void *dest, int dstride, const uint16_t *in,
+ int pri_strength, int sec_strength, int dir,
+ int pri_damping, int sec_damping, int coeff_shift,
+ int block_width, int block_height) {
+ (void)pri_strength;
+ (void)sec_strength;
+ (void)dir;
+ (void)pri_damping;
+ (void)sec_damping;
+ (void)coeff_shift;
+ if (block_width == 8) {
+ copy_block_8xh(/*is_lowbd=*/0, dest, dstride, in, block_height);
+ } else {
+ copy_block_4xh(/*is_lowbd=*/0, dest, dstride, in, block_height);
+ }
+}
diff --git a/av1/common/arm/compound_convolve_neon.c b/av1/common/arm/compound_convolve_neon.c
new file mode 100644
index 000000000..2e6af68da
--- /dev/null
+++ b/av1/common/arm/compound_convolve_neon.c
@@ -0,0 +1,2731 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "av1/common/arm/compound_convolve_neon.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+static INLINE int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t x_filter,
+ const int16x4_t horiz_const) {
+ int16x4_t sum = horiz_const;
+ sum = vmla_lane_s16(sum, s0, x_filter, 0);
+ sum = vmla_lane_s16(sum, s1, x_filter, 1);
+ sum = vmla_lane_s16(sum, s2, x_filter, 2);
+ sum = vmla_lane_s16(sum, s3, x_filter, 3);
+
+ // We halved the convolution filter values so -1 from the right shift.
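+ // (Halving is safe because the taps are all even; shifting the halved sum
+ // right by ROUND0_BITS - 1 gives the same result as shifting the full sum
+ // right by ROUND0_BITS.)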
+ return vshr_n_s16(sum, ROUND0_BITS - 1);
+}
+
+static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t x_filter,
+ const int16x8_t horiz_const) {
+ const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
+ const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
+
+ int16x8_t sum = horiz_const;
+ sum = vmlaq_lane_s16(sum, s0, x_filter_0_3, 0);
+ sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1);
+ sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2);
+ sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3);
+ sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0);
+ sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1);
+ sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2);
+ sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3);
+
+ // We halved the convolution filter values so -1 from the right shift.
+ return vshrq_n_s16(sum, ROUND0_BITS - 1);
+}
+
+static INLINE void dist_wtd_convolve_2d_horiz_neon(
+ const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
+ const int16_t *x_filter_ptr, const int im_h, int w) {
+ const int bd = 8;
+
+ const uint8_t *src_ptr = src;
+ int16_t *dst_ptr = im_block;
+ int dst_stride = im_stride;
+ int height = im_h;
+
+ if (w == 4) {
+ // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // (The extra -1 is needed because we halved the filter values.)
+ const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) +
+ (1 << ((ROUND0_BITS - 1) - 1)));
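+    // With bd == 8 and the usual AV1 constants (FILTER_BITS == 7,
+    // ROUND0_BITS == 3) this is (1 << 13) + (1 << 1): the intermediate-buffer
+    // offset (1 << (bd + FILTER_BITS - 1)) pre-halved like the filter values,
+    // plus the rounding shim for the plain shift right by 2 that follows.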
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
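+    // The four nonzero taps sit at indices 2..5 of the 8-tap kernel, hence
+    // the load from x_filter_ptr + 2 and the src_ptr += 2 below.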
+ const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
+
+ src_ptr += 2;
+
+ do {
+ uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+ __builtin_prefetch(dst_ptr);
+
+ int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+
+ int16x4_t d0 = convolve4_4_2d_h(s0, s1, s2, s3, x_filter, horiz_const);
+
+ vst1_s16(dst_ptr, d0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ } else {
+ // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // (The extra -1 is needed because we halved the filter values.)
+ const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
+ (1 << ((ROUND0_BITS - 1) - 1)));
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+#if AOM_ARCH_AARCH64
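+  // Process 8 rows per iteration: load an 8x8 tile, transpose it so each
+  // vector holds one source column, run the filter as lane-wise
+  // multiply-accumulates, then transpose the results back before storing.
+  // This path is AArch64-only; 32-bit Arm uses the row-at-a-time loop below.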
+ do {
+ const uint8_t *s;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ __builtin_prefetch(src_ptr + 0 * src_stride);
+ __builtin_prefetch(src_ptr + 1 * src_stride);
+ __builtin_prefetch(src_ptr + 2 * src_stride);
+ __builtin_prefetch(src_ptr + 3 * src_stride);
+ __builtin_prefetch(src_ptr + 4 * src_stride);
+ __builtin_prefetch(src_ptr + 5 * src_stride);
+ __builtin_prefetch(src_ptr + 6 * src_stride);
+ __builtin_prefetch(src_ptr + 7 * src_stride);
+
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ s = src_ptr + 7;
+
+ __builtin_prefetch(dst_ptr + 0 * dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * dst_stride);
+ __builtin_prefetch(dst_ptr + 4 * dst_stride);
+ __builtin_prefetch(dst_ptr + 5 * dst_stride);
+ __builtin_prefetch(dst_ptr + 6 * dst_stride);
+ __builtin_prefetch(dst_ptr + 7 * dst_stride);
+
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
+ x_filter, horiz_const);
+ int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8,
+ x_filter, horiz_const);
+ int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9,
+ x_filter, horiz_const);
+ int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10,
+ x_filter, horiz_const);
+ int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11,
+ x_filter, horiz_const);
+ int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter, horiz_const);
+ int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter, horiz_const);
+ int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter, horiz_const);
+
+ transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+ store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += 8 * src_stride;
+ dst_ptr += 8 * dst_stride;
+ height -= 8;
+ } while (height > 8);
+#endif // AOM_ARCH_AARCH64
+
+ do {
+ const uint8_t *s;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ uint8x8_t t0 = vld1_u8(src_ptr);
+ int16x8_t s0 =
+ vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+
+ s = src_ptr + 8;
+ __builtin_prefetch(dst_ptr);
+
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+ int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
+ x_filter, horiz_const);
+ vst1q_s16(d, d0);
+
+ s0 = s8;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ }
+}
+
+void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride,
+ uint8_t *dst8, int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
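+  // Scratch buffer for the horizontal pass, sized for the tallest case:
+  // block height plus SUBPEL_TAPS - 1 extra rows of vertical filter support.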
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+
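+  // Only 6-tap and 8-tap vertical paths are implemented below, so shorter
+  // kernels are clamped up to 6 taps; their outer taps are zero, so the
+  // 6-tap path produces identical results.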
+ const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+ const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
+
+ const int im_h = h + clamped_y_taps - 1;
+ const int im_stride = MAX_SB_SIZE;
+ const int vert_offset = clamped_y_taps / 2 - 1;
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+
+ dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride,
+ x_filter_ptr, im_h, w);
+
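+  // Vertical pass: dispatch on tap count and on whether the result is
+  // averaged back into dst8 (with or without distance weighting) or only
+  // written to the intermediate CONV_BUF_TYPE buffer.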
+ if (clamped_y_taps == 6) {
+ if (conv_params->do_average) {
+ if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+ dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
+ im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
+ w);
+ } else {
+ dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8,
+ dst8_stride, conv_params,
+ y_filter, h, w);
+ }
+ } else {
+ dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params,
+ y_filter, h, w);
+ }
+ } else {
+ if (conv_params->do_average) {
+ if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+ dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
+ im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
+ w);
+ } else {
+ dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8,
+ dst8_stride, conv_params,
+ y_filter, h, w);
+ }
+ } else {
+ dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params,
+ y_filter, h, w);
+ }
+ }
+}
+
+static INLINE void dist_wtd_convolve_2d_copy_dist_wtd_avg_neon(
+ const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+ int h, ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
+ const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));
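+  // vmlal_u8(round_offset_vec, sN, shift_by_bits) computes
+  // round_offset + (sN << (FILTER_BITS - ROUND0_BITS)) in one instruction.
+  // With the usual constants (FILTER_BITS == 7, ROUND0_BITS == 3,
+  // COMPOUND_ROUND1_BITS == 7, bd == 8) that is 6144 + 16 * src.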
+
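+  // fwd_offset and bck_offset are the distance-based compound weights; they
+  // sum to 1 << DIST_PRECISION_BITS, which the compute_dist_wtd_avg_*
+  // helpers shift back out after forming the weighted sum.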
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ int height = h;
+
+ if (w == 4) {
+ do {
+ uint8x8_t s0, s1, s2, s3;
+ load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ uint16x4_t d0 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
+ uint16x4_t d1 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
+ uint16x4_t d2 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
+ uint16x4_t d3 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));
+
+ uint16x4_t dd0, dd1, dd2, dd3;
+ load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d01, d23;
+ compute_dist_wtd_avg_4x4(
+ dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset,
+ vreinterpretq_s16_u16(round_offset_vec), &d01, &d23);
+
+ store_u8_4x1(dst8 + 0 * dst8_stride, d01, 0);
+ store_u8_4x1(dst8 + 1 * dst8_stride, d01, 1);
+ store_u8_4x1(dst8 + 2 * dst8_stride, d23, 0);
+ store_u8_4x1(dst8 + 3 * dst8_stride, d23, 1);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ dst8 += 4 * dst8_stride;
+ height -= 4;
+ } while (height != 0);
+ } else {
+ do {
+ const uint8_t *s = src;
+ CONV_BUF_TYPE *d = dst;
+ uint8_t *d_u8 = dst8;
+ int width = w;
+
+ do {
+ uint8x8_t s0, s1, s2, s3;
+ load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
+ uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
+ uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
+ uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);
+
+ uint16x8_t dd0, dd1, dd2, dd3;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset,
+ vreinterpretq_s16_u16(round_offset_vec),
+ &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+
+ s += 8;
+ d += 8;
+ d_u8 += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ dst8 += 4 * dst8_stride;
+ height -= 4;
+ } while (height != 0);
+ }
+}
+
+static INLINE void dist_wtd_convolve_2d_copy_avg_neon(
+ const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+ int h, ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
+ const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));
+
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ int height = h;
+
+ if (w == 4) {
+ do {
+ uint8x8_t s0, s1, s2, s3;
+ load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ uint16x4_t d0 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
+ uint16x4_t d1 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
+ uint16x4_t d2 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
+ uint16x4_t d3 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));
+
+ uint16x4_t dd0, dd1, dd2, dd3;
+ load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d01, d23;
+ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ vreinterpretq_s16_u16(round_offset_vec), &d01,
+ &d23);
+
+ store_u8_4x1(dst8 + 0 * dst8_stride, d01, 0);
+ store_u8_4x1(dst8 + 1 * dst8_stride, d01, 1);
+ store_u8_4x1(dst8 + 2 * dst8_stride, d23, 0);
+ store_u8_4x1(dst8 + 3 * dst8_stride, d23, 1);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ dst8 += 4 * dst8_stride;
+ height -= 4;
+ } while (height != 0);
+ } else {
+ do {
+ const uint8_t *s = src;
+ CONV_BUF_TYPE *d = dst;
+ uint8_t *d_u8 = dst8;
+ int width = w;
+
+ do {
+ uint8x8_t s0, s1, s2, s3;
+ load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
+ uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
+ uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
+ uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);
+
+ uint16x8_t dd0, dd1, dd2, dd3;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ vreinterpretq_s16_u16(round_offset_vec), &d0_u8,
+ &d1_u8, &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+
+ s += 8;
+ d += 8;
+ d_u8 += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ dst8 += 4 * dst8_stride;
+ height -= 4;
+ } while (height != 0);
+ }
+}
+
+static INLINE void dist_wtd_convolve_2d_copy_neon(const uint8_t *src,
+ int src_stride, int w, int h,
+ ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
+ const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));
+
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ int height = h;
+
+ if (w == 4) {
+ do {
+ uint8x8_t s0, s1, s2, s3;
+ load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ uint16x4_t d0 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
+ uint16x4_t d1 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
+ uint16x4_t d2 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
+ uint16x4_t d3 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));
+
+ store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ } else {
+ do {
+ const uint8_t *s = src;
+ CONV_BUF_TYPE *d = dst;
+ int width = w;
+
+ do {
+ uint8x8_t s0, s1, s2, s3;
+ load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
+ uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
+ uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
+ uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ }
+}
+
+void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
+ uint8_t *dst8, int dst8_stride, int w,
+ int h, ConvolveParams *conv_params) {
+ if (conv_params->do_average) {
+ if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+ dist_wtd_convolve_2d_copy_dist_wtd_avg_neon(
+ src, src_stride, dst8, dst8_stride, w, h, conv_params);
+ } else {
+ dist_wtd_convolve_2d_copy_avg_neon(src, src_stride, dst8, dst8_stride, w,
+ h, conv_params);
+ }
+ } else {
+ dist_wtd_convolve_2d_copy_neon(src, src_stride, w, h, conv_params);
+ }
+}
+
+static INLINE uint16x4_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t x_filter,
+ const int16x4_t round_offset) {
+ int16x4_t sum = vmul_lane_s16(s0, x_filter, 0);
+ sum = vmla_lane_s16(sum, s1, x_filter, 1);
+ sum = vmla_lane_s16(sum, s2, x_filter, 2);
+ sum = vmla_lane_s16(sum, s3, x_filter, 3);
+
+ // We halved the convolution filter values so -1 from the right shift.
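+  // vrsra_n_s16 fuses that shift with the compound offset:
+  // res = round_offset + ((sum + (1 << (ROUND0_BITS - 2))) >> (ROUND0_BITS - 1)).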
+ int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
+ return vreinterpret_u16_s16(res);
+}
+
+static INLINE uint16x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t x_filter,
+ const int16x8_t round_offset) {
+ const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
+ const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
+
+ int16x8_t sum = vmulq_lane_s16(s0, x_filter_0_3, 0);
+ sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1);
+ sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2);
+ sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3);
+ sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0);
+ sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1);
+ sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2);
+ sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3);
+
+ // We halved the convolution filter values so -1 from the right shift.
+ int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
+ return vreinterpretq_u16_s16(res);
+}
+
+static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon(
+ const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+ int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+
+ // Horizontal filter.
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - horiz_offset;
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ uint8_t *dst8_ptr = dst8;
+ int dst_stride = conv_params->dst_stride;
+ int height = h;
+
+ if (w == 4) {
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
+
+ src_ptr += 2;
+
+ do {
+ uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+ __builtin_prefetch(dst_ptr);
+ __builtin_prefetch(dst8_ptr);
+
+ int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+
+ uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter,
+ vget_low_s16(round_offset_vec));
+
+ uint16x4_t dd0 = vld1_u16(dst_ptr);
+
+ uint8x8_t d01;
+ compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
+ vget_low_s16(round_offset_vec), &d01);
+
+ store_u8_4x1(dst8_ptr, d01, 0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ dst8_ptr += dst8_stride;
+ } while (--height != 0);
+ } else {
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+#if AOM_ARCH_AARCH64
+ while (height >= 8) {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int width = w;
+
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(d + 4 * dst_stride);
+ __builtin_prefetch(d + 5 * dst_stride);
+ __builtin_prefetch(d + 6 * dst_stride);
+ __builtin_prefetch(d + 7 * dst_stride);
+
+ s += 7;
+
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ round_offset_vec);
+ uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+ round_offset_vec);
+ uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+ round_offset_vec);
+ uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+ round_offset_vec);
+ uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11,
+ x_filter, round_offset_vec);
+ uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter, round_offset_vec);
+ uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter, round_offset_vec);
+ uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter, round_offset_vec);
+
+ transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+ uint16x8_t dd0, dd1, dd2, dd3;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, round_offset_vec, &d0_u8, &d1_u8,
+ &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+
+ uint16x8_t dd4, dd5, dd6, dd7;
+ load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
+
+ uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
+ compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
+ bck_offset, round_offset_vec, &d4_u8, &d5_u8,
+ &d6_u8, &d7_u8);
+
+ store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8,
+ d7_u8);
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ d_u8 += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 8 * src_stride;
+ dst_ptr += 8 * dst_stride;
+ dst8_ptr += 8 * dst8_stride;
+ height -= 8;
+ }
+#endif // AOM_ARCH_AARCH64
+
+ while (height > 0) {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int width = w;
+
+ uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+ __builtin_prefetch(d);
+
+ s += 8;
+
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+ int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ round_offset_vec);
+
+ uint16x8_t dd0 = vld1q_u16(d);
+
+ uint8x8_t d0_u8;
+ compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
+ round_offset_vec, &d0_u8);
+
+ vst1_u8(d_u8, d0_u8);
+
+ s0 = s8;
+ s += 8;
+ d += 8;
+ d_u8 += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ dst8_ptr += dst8_stride;
+ height--;
+ }
+ }
+}
+
+static INLINE void dist_wtd_convolve_x_avg_neon(
+ const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+ int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+ // Horizontal filter.
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - horiz_offset;
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ uint8_t *dst8_ptr = dst8;
+ int dst_stride = conv_params->dst_stride;
+ int height = h;
+
+ if (w == 4) {
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
+
+ src_ptr += 2;
+
+ do {
+ uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+ __builtin_prefetch(dst_ptr);
+ __builtin_prefetch(dst8_ptr);
+
+ int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+
+ uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter,
+ vget_low_s16(round_offset_vec));
+
+ uint16x4_t dd0 = vld1_u16(dst_ptr);
+
+ uint8x8_t d01;
+ compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
+
+ store_u8_4x1(dst8_ptr, d01, 0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ dst8_ptr += dst8_stride;
+ } while (--height != 0);
+ } else {
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+#if AOM_ARCH_AARCH64
+ while (height >= 8) {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int width = w;
+
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(d + 4 * dst_stride);
+ __builtin_prefetch(d + 5 * dst_stride);
+ __builtin_prefetch(d + 6 * dst_stride);
+ __builtin_prefetch(d + 7 * dst_stride);
+
+ s += 7;
+
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ round_offset_vec);
+ uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+ round_offset_vec);
+ uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+ round_offset_vec);
+ uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+ round_offset_vec);
+ uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11,
+ x_filter, round_offset_vec);
+ uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter, round_offset_vec);
+ uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter, round_offset_vec);
+ uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter, round_offset_vec);
+
+ transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+ uint16x8_t dd0, dd1, dd2, dd3;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+
+ uint16x8_t dd4, dd5, dd6, dd7;
+ load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
+
+ uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
+ compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
+ round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
+
+ store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8,
+ d7_u8);
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ d_u8 += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 8 * src_stride;
+ dst_ptr += 8 * dst_stride;
+ dst8_ptr += 8 * dst8_stride;
+ height -= 8;
+ }
+#endif // AOM_ARCH_AARCH64
+
+ while (height > 0) {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int width = w;
+
+ uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+ __builtin_prefetch(d);
+
+ s += 8;
+
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+ int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ round_offset_vec);
+
+ uint16x8_t dd0 = vld1q_u16(d);
+
+ uint8x8_t d0_u8;
+ compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
+
+ vst1_u8(d_u8, d0_u8);
+
+ s0 = s8;
+ s += 8;
+ d += 8;
+ d_u8 += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ dst8_ptr += dst8_stride;
+ height--;
+ }
+ }
+}
+
+static INLINE void dist_wtd_convolve_x_neon(
+ const uint8_t *src, int src_stride, int w, int h,
+ const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+ // Horizontal filter.
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - horiz_offset;
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ int height = h;
+
+ if (w == 4) {
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
+
+ src_ptr += 2;
+
+ do {
+ uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+ __builtin_prefetch(dst_ptr);
+
+ int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+
+ uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter,
+ vget_low_s16(round_offset_vec));
+
+ vst1_u16(dst_ptr, d0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ } else {
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+#if AOM_ARCH_AARCH64
+ while (height >= 8) {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ int width = w;
+
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(d + 4 * dst_stride);
+ __builtin_prefetch(d + 5 * dst_stride);
+ __builtin_prefetch(d + 6 * dst_stride);
+ __builtin_prefetch(d + 7 * dst_stride);
+
+ s += 7;
+
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ round_offset_vec);
+ uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+ round_offset_vec);
+ uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+ round_offset_vec);
+ uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+ round_offset_vec);
+ uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11,
+ x_filter, round_offset_vec);
+ uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter, round_offset_vec);
+ uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter, round_offset_vec);
+ uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter, round_offset_vec);
+
+ transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+ store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 8 * src_stride;
+ dst_ptr += 8 * dst_stride;
+ height -= 8;
+ }
+#endif // AOM_ARCH_AARCH64
+
+ while (height > 0) {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ int width = w;
+
+ uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+ __builtin_prefetch(d);
+
+ s = src_ptr + 8;
+
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+ int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ round_offset_vec);
+
+ vst1q_u16(d, d0);
+
+ s0 = s8;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ height--;
+ }
+ }
+}
+
+void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
+ uint8_t *dst8, int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ if (conv_params->do_average) {
+ if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+ dist_wtd_convolve_x_dist_wtd_avg_neon(src, src_stride, dst8, dst8_stride,
+ w, h, filter_params_x, subpel_x_qn,
+ conv_params);
+ } else {
+ dist_wtd_convolve_x_avg_neon(src, src_stride, dst8, dst8_stride, w, h,
+ filter_params_x, subpel_x_qn, conv_params);
+ }
+ } else {
+ dist_wtd_convolve_x_neon(src, src_stride, w, h, filter_params_x,
+ subpel_x_qn, conv_params);
+ }
+}
+
+static INLINE uint16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t s4, const int16x4_t s5,
+ const int16x8_t y_filter,
+ const int16x4_t round_offset) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ // Filter values at indices 0 and 7 are 0.
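+  // (This also lets the 6-tap path serve shorter kernels stored in the same
+  // 8-tap array, since their outer taps are zero too.)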
+ int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 1);
+ sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2);
+ sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3);
+ sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0);
+ sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1);
+ sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2);
+
+ // We halved the convolution filter values so -1 from the right shift.
+ int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
+ return vreinterpret_u16_s16(res);
+}
+
+static INLINE uint16x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t y_filter,
+ const int16x8_t round_offset) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ // Filter values at indices 0 and 7 are 0.
+ int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 1);
+ sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 2);
+ sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 3);
+ sum = vmlaq_lane_s16(sum, s3, y_filter_4_7, 0);
+ sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 1);
+ sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 2);
+
+ // We halved the convolution filter values so -1 from the right shift.
+ int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
+ return vreinterpretq_u16_s16(res);
+}
+
+static INLINE void dist_wtd_convolve_y_6tap_dist_wtd_avg_neon(
+ const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
+ const int dst8_stride, int w, int h, const int16x8_t y_filter,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ int width = w;
+
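+  // Blocks with either dimension equal to 4 take the 4-wide column-strip
+  // path; the wider path consumes 8 rows per iteration on AArch64 and so
+  // needs the height to be a multiple of 8.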
+ if (w == 4 || h == 4) {
+ do {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int height = h;
+
+ uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+ uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+ uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+ uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+ uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
+
+ int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+
+ s += 5 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+ t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+ t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+ t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+
+ uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
+ vget_low_s16(round_offset_vec));
+
+ uint16x4_t dd0, dd1, dd2, dd3;
+ load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d01, d23;
+ compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, round_offset_vec, &d01, &d23);
+
+ store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0);
+ store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1);
+ store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0);
+ store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ d_u8 += 4 * dst8_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ t0 = load_unaligned_u8_4x1(s);
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+
+ uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
+ vget_low_s16(round_offset_vec));
+
+ uint16x4_t dd0 = vld1_u16(d);
+
+ uint8x8_t d01;
+ compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
+ vget_low_s16(round_offset_vec), &d01);
+
+ store_u8_4x1(d_u8, d01, 0);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s += src_stride;
+ d += dst_stride;
+ d_u8 += dst8_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 4;
+ dst_ptr += 4;
+ dst8_ptr += 4;
+ width -= 4;
+ } while (width != 0);
+ } else {
+ do {
+ const uint8_t *s = src_ptr + (5 * src_stride);
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int height = h;
+
+ uint8x8_t t0, t1, t2, t3, t4;
+ load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+
+ do {
+#if AOM_ARCH_AARCH64
+ uint8x8_t t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ uint16x8_t d0 =
+ convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
+ uint16x8_t d1 =
+ convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
+ uint16x8_t d2 =
+ convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
+ uint16x8_t d3 =
+ convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
+ uint16x8_t d4 =
+ convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
+ uint16x8_t d5 =
+ convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
+ uint16x8_t d6 =
+ convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
+ uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
+ round_offset_vec);
+
+ uint16x8_t dd0, dd1, dd2, dd3;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, round_offset_vec, &d0_u8, &d1_u8,
+ &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+ d_u8 += 4 * dst8_stride;
+
+ uint16x8_t dd4, dd5, dd6, dd7;
+ load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
+
+ uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
+ compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
+ bck_offset, round_offset_vec, &d4_u8, &d5_u8,
+ &d6_u8, &d7_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
+ d_u8 += 4 * dst8_stride;
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s += 8 * src_stride;
+ d += 8 * dst_stride;
+ height -= 8;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+ uint16x8_t d0 =
+ convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+
+ uint16x8_t dd0 = vld1q_u16(d);
+
+ uint8x8_t d0_u8;
+ compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
+ round_offset_vec, &d0_u8);
+
+ vst1_u8(d_u8, d0_u8);
+ d_u8 += dst8_stride;
+
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ dst8_ptr += 8;
+ width -= 8;
+ } while (width != 0);
+ }
+}
+
+static INLINE void dist_wtd_convolve_y_6tap_avg_neon(
+ const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
+ const int dst8_stride, int w, int h, const int16x8_t y_filter,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ int width = w;
+
+ if (w == 4 || h == 4) {
+ do {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int height = h;
+
+ uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+ uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+ uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+ uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+ uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
+
+ int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+
+ s += 5 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+ t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+ t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+ t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+
+ uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
+ vget_low_s16(round_offset_vec));
+
+ uint16x4_t dd0, dd1, dd2, dd3;
+ load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d01, d23;
+ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ round_offset_vec, &d01, &d23);
+
+ store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0);
+ store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1);
+ store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0);
+ store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ d_u8 += 4 * dst8_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ t0 = load_unaligned_u8_4x1(s);
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+
+ uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
+ vget_low_s16(round_offset_vec));
+
+ uint16x4_t dd0 = vld1_u16(d);
+
+ uint8x8_t d01;
+ compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
+
+ store_u8_4x1(d_u8, d01, 0);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s += src_stride;
+ d += dst_stride;
+ d_u8 += dst8_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 4;
+ dst_ptr += 4;
+ dst8_ptr += 4;
+ width -= 4;
+ } while (width != 0);
+ } else {
+ do {
+ const uint8_t *s = src_ptr + (5 * src_stride);
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int height = h;
+
+ uint8x8_t t0, t1, t2, t3, t4;
+ load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+
+ do {
+#if AOM_ARCH_AARCH64
+ uint8x8_t t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ uint16x8_t d0 =
+ convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
+ uint16x8_t d1 =
+ convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
+ uint16x8_t d2 =
+ convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
+ uint16x8_t d3 =
+ convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
+ uint16x8_t d4 =
+ convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
+ uint16x8_t d5 =
+ convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
+ uint16x8_t d6 =
+ convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
+ uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
+ round_offset_vec);
+
+ uint16x8_t dd0, dd1, dd2, dd3;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+ d_u8 += 4 * dst8_stride;
+
+ uint16x8_t dd4, dd5, dd6, dd7;
+ load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
+
+ uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
+ compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
+ round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
+ d_u8 += 4 * dst8_stride;
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s += 8 * src_stride;
+ d += 8 * dst_stride;
+ height -= 8;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+ uint16x8_t d0 =
+ convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+
+ uint16x8_t dd0 = vld1q_u16(d);
+
+ uint8x8_t d0_u8;
+ compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
+
+ vst1_u8(d_u8, d0_u8);
+ d_u8 += dst8_stride;
+
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ dst8_ptr += 8;
+ width -= 8;
+ } while (width != 0);
+ }
+}
+
+static INLINE void dist_wtd_convolve_y_6tap_neon(const uint8_t *src_ptr,
+ int src_stride, int w, int h,
+ const int16x8_t y_filter,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
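+  // round_offset is the constant DC offset that the compound rounding adds
+  // to the intermediate result. Assuming the standard libaom constants
+  // (bd == 8, FILTER_BITS == 7, ROUND0_BITS == 3, COMPOUND_ROUND1_BITS == 7),
+  // offset_bits == 19 and round_offset == (1 << 12) + (1 << 11) == 6144.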
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ int width = w;
+
+ if (w == 4 || h == 4) {
+ do {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ int height = h;
+
+ uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+ uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+ uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+ uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+ uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
+
+ int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+
+ s += 5 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+ t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+ t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+ t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+
+ uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
+ vget_low_s16(round_offset_vec));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ t0 = load_unaligned_u8_4x1(s);
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+
+ uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
+ vget_low_s16(round_offset_vec));
+
+ vst1_u16(d, d0);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 4;
+ dst_ptr += 4;
+ width -= 4;
+ } while (width != 0);
+ } else {
+ do {
+ const uint8_t *s = src_ptr + (5 * src_stride);
+ CONV_BUF_TYPE *d = dst_ptr;
+ int height = h;
+
+ uint8x8_t t0, t1, t2, t3, t4;
+ load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+
+ do {
+#if AOM_ARCH_AARCH64
+ uint8x8_t t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ uint16x8_t d0 =
+ convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
+ uint16x8_t d1 =
+ convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
+ uint16x8_t d2 =
+ convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
+ uint16x8_t d3 =
+ convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
+ uint16x8_t d4 =
+ convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
+ uint16x8_t d5 =
+ convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
+ uint16x8_t d6 =
+ convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
+ uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
+ round_offset_vec);
+
+ store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s += 8 * src_stride;
+ d += 8 * dst_stride;
+ height -= 8;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+ uint16x8_t d0 =
+ convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+
+ vst1q_u16(d, d0);
+
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ width -= 8;
+ } while (width != 0);
+ }
+}
+
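+// Vertical 8-tap convolution helpers. The caller pre-halves the filter taps
+// (all AV1 interpolation filter coefficients are even), so a rounding shift
+// by (ROUND0_BITS - 1) here is equivalent to shifting the un-halved sum by
+// ROUND0_BITS:
+//   round(sum / 2^ROUND0_BITS) == round((sum / 2) / 2^(ROUND0_BITS - 1))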
+static INLINE uint16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16x8_t y_filter,
+ const int16x4_t round_offset) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 0);
+ sum = vmla_lane_s16(sum, s1, y_filter_0_3, 1);
+ sum = vmla_lane_s16(sum, s2, y_filter_0_3, 2);
+ sum = vmla_lane_s16(sum, s3, y_filter_0_3, 3);
+ sum = vmla_lane_s16(sum, s4, y_filter_4_7, 0);
+ sum = vmla_lane_s16(sum, s5, y_filter_4_7, 1);
+ sum = vmla_lane_s16(sum, s6, y_filter_4_7, 2);
+ sum = vmla_lane_s16(sum, s7, y_filter_4_7, 3);
+
+  // The convolution filter values have been halved, so subtract 1 from the
+  // right shift.
+ int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
+ return vreinterpret_u16_s16(res);
+}
+
+static INLINE uint16x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t y_filter,
+ const int16x8_t round_offset) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 0);
+ sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 1);
+ sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 2);
+ sum = vmlaq_lane_s16(sum, s3, y_filter_0_3, 3);
+ sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 0);
+ sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 1);
+ sum = vmlaq_lane_s16(sum, s6, y_filter_4_7, 2);
+ sum = vmlaq_lane_s16(sum, s7, y_filter_4_7, 3);
+
+  // The convolution filter values have been halved, so subtract 1 from the
+  // right shift.
+ int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
+ return vreinterpretq_u16_s16(res);
+}
+
+static INLINE void dist_wtd_convolve_y_8tap_dist_wtd_avg_neon(
+ const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
+ const int dst8_stride, int w, int h, const int16x8_t y_filter,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
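+  // fwd_offset and bck_offset are the distance-based compound weights for
+  // the two predictions; they sum to (1 << DIST_PRECISION_BITS), so the
+  // blend applied below is a weighted average.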
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ int width = w;
+
+ if (w == 4 || h == 4) {
+ do {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int height = h;
+
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+
+ uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+ uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+ uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+ uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+ uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
+ uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
+ uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
+
+ int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+ int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+
+ s += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+ t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+ t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+ t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+
+ uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ vget_low_s16(round_offset_vec));
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+
+ __builtin_prefetch(d_u8 + 0 * dst8_stride);
+ __builtin_prefetch(d_u8 + 1 * dst8_stride);
+ __builtin_prefetch(d_u8 + 2 * dst8_stride);
+ __builtin_prefetch(d_u8 + 3 * dst8_stride);
+
+ uint16x4_t dd0, dd1, dd2, dd3;
+ load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d01, d23;
+ compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, round_offset_vec, &d01, &d23);
+
+ store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0);
+ store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1);
+ store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0);
+ store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ d_u8 += 4 * dst8_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ t0 = load_unaligned_u8_4x1(s);
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+
+ uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ vget_low_s16(round_offset_vec));
+
+ __builtin_prefetch(d);
+
+ uint16x4_t dd0 = vld1_u16(d);
+
+ uint8x8_t d01;
+ compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
+ vget_low_s16(round_offset_vec), &d01);
+
+ store_u8_4x1(d_u8, d01, 0);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ s += src_stride;
+ d += dst_stride;
+ d_u8 += dst8_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 4;
+ dst_ptr += 4;
+ dst8_ptr += 4;
+ width -= 4;
+ } while (width != 0);
+ } else {
+ do {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int height = h;
+
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+ __builtin_prefetch(s + 4 * src_stride);
+ __builtin_prefetch(s + 5 * src_stride);
+ __builtin_prefetch(s + 6 * src_stride);
+ __builtin_prefetch(s + 7 * src_stride);
+
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+ load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ s += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ uint8x8_t t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ __builtin_prefetch(dst_ptr + 0 * dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * dst_stride);
+
+ uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_offset_vec);
+ uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ round_offset_vec);
+ uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ round_offset_vec);
+ uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ round_offset_vec);
+ uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11,
+ y_filter, round_offset_vec);
+ uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12,
+ y_filter, round_offset_vec);
+ uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13,
+ y_filter, round_offset_vec);
+ uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14,
+ y_filter, round_offset_vec);
+
+        __builtin_prefetch(d + 0 * dst_stride);
+        __builtin_prefetch(d + 1 * dst_stride);
+        __builtin_prefetch(d + 2 * dst_stride);
+        __builtin_prefetch(d + 3 * dst_stride);
+
+ uint16x8_t dd0, dd1, dd2, dd3;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, round_offset_vec, &d0_u8, &d1_u8,
+ &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+ d_u8 += 4 * dst8_stride;
+
+ uint16x8_t dd4, dd5, dd6, dd7;
+ load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
+
+ uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
+ compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
+ bck_offset, round_offset_vec, &d4_u8, &d5_u8,
+ &d6_u8, &d7_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
+ d_u8 += 4 * dst8_stride;
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8 * src_stride;
+ d += 8 * dst_stride;
+ height -= 8;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+ __builtin_prefetch(dst_ptr);
+
+ uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_offset_vec);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+
+ __builtin_prefetch(d);
+
+ uint16x8_t dd0 = vld1q_u16(d);
+
+ uint8x8_t d0_u8;
+ compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
+ round_offset_vec, &d0_u8);
+
+ vst1_u8(d_u8, d0_u8);
+ d_u8 += dst8_stride;
+
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ dst8_ptr += 8;
+ width -= 8;
+ } while (width != 0);
+ }
+}
+
+static INLINE void dist_wtd_convolve_y_8tap_avg_neon(
+ const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
+ const int dst8_stride, int w, int h, const int16x8_t y_filter,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ int width = w;
+
+ if (w == 4 || h == 4) {
+ do {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int height = h;
+
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+
+ uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+ uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+ uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+ uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+ uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
+ uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
+ uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
+
+ int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+ int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+
+ s += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+ t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+ t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+ t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+
+ uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ vget_low_s16(round_offset_vec));
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+
+ __builtin_prefetch(d_u8 + 0 * dst8_stride);
+ __builtin_prefetch(d_u8 + 1 * dst8_stride);
+ __builtin_prefetch(d_u8 + 2 * dst8_stride);
+ __builtin_prefetch(d_u8 + 3 * dst8_stride);
+
+ uint16x4_t dd0, dd1, dd2, dd3;
+ load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d01, d23;
+ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ round_offset_vec, &d01, &d23);
+
+ store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0);
+ store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1);
+ store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0);
+ store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ d_u8 += 4 * dst8_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ t0 = load_unaligned_u8_4x1(s);
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+
+ uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ vget_low_s16(round_offset_vec));
+
+ __builtin_prefetch(d);
+
+ uint16x4_t dd0 = vld1_u16(d);
+
+ uint8x8_t d01;
+ compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
+
+ store_u8_4x1(d_u8, d01, 0);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ s += src_stride;
+ d += dst_stride;
+ d_u8 += dst8_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 4;
+ dst_ptr += 4;
+ dst8_ptr += 4;
+ width -= 4;
+ } while (width != 0);
+ } else {
+ do {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int height = h;
+
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+ __builtin_prefetch(s + 4 * src_stride);
+ __builtin_prefetch(s + 5 * src_stride);
+ __builtin_prefetch(s + 6 * src_stride);
+ __builtin_prefetch(s + 7 * src_stride);
+
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+ load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ s += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ uint8x8_t t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ __builtin_prefetch(dst_ptr + 0 * dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * dst_stride);
+
+ uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_offset_vec);
+ uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ round_offset_vec);
+ uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ round_offset_vec);
+ uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ round_offset_vec);
+ uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11,
+ y_filter, round_offset_vec);
+ uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12,
+ y_filter, round_offset_vec);
+ uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13,
+ y_filter, round_offset_vec);
+ uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14,
+ y_filter, round_offset_vec);
+
+        __builtin_prefetch(d + 0 * dst_stride);
+        __builtin_prefetch(d + 1 * dst_stride);
+        __builtin_prefetch(d + 2 * dst_stride);
+        __builtin_prefetch(d + 3 * dst_stride);
+
+ uint16x8_t dd0, dd1, dd2, dd3;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+ d_u8 += 4 * dst8_stride;
+
+ uint16x8_t dd4, dd5, dd6, dd7;
+ load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
+
+ uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
+ compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
+ round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
+ d_u8 += 4 * dst8_stride;
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8 * src_stride;
+ d += 8 * dst_stride;
+ height -= 8;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+ __builtin_prefetch(dst_ptr);
+
+ uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_offset_vec);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+
+ __builtin_prefetch(d);
+
+ uint16x8_t dd0 = vld1q_u16(d);
+
+ uint8x8_t d0_u8;
+ compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
+
+ vst1_u8(d_u8, d0_u8);
+ d_u8 += dst8_stride;
+
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ dst8_ptr += 8;
+ width -= 8;
+ } while (width != 0);
+ }
+}
+
+static INLINE void dist_wtd_convolve_y_8tap_neon(const uint8_t *src_ptr,
+ int src_stride, int w, int h,
+ const int16x8_t y_filter,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ int width = w;
+
+ if (w == 4 || h == 4) {
+ do {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ int height = h;
+
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+
+ uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+ uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+ uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+ uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+ uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
+ uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
+ uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
+
+ int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+ int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+
+ s += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+ t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+ t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+ t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+
+ uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ vget_low_s16(round_offset_vec));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ t0 = load_unaligned_u8_4x1(s);
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+
+ uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ vget_low_s16(round_offset_vec));
+
+ vst1_u16(d, d0);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 4;
+ dst_ptr += 4;
+ width -= 4;
+ } while (width != 0);
+ } else {
+ do {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ int height = h;
+
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+ __builtin_prefetch(s + 4 * src_stride);
+ __builtin_prefetch(s + 5 * src_stride);
+ __builtin_prefetch(s + 6 * src_stride);
+ __builtin_prefetch(s + 7 * src_stride);
+
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+ load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ s += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ uint8x8_t t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ __builtin_prefetch(dst_ptr + 0 * dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * dst_stride);
+
+ uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_offset_vec);
+ uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ round_offset_vec);
+ uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ round_offset_vec);
+ uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ round_offset_vec);
+ uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11,
+ y_filter, round_offset_vec);
+ uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12,
+ y_filter, round_offset_vec);
+ uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13,
+ y_filter, round_offset_vec);
+ uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14,
+ y_filter, round_offset_vec);
+
+ store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8 * src_stride;
+ d += 8 * dst_stride;
+ height -= 8;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+ __builtin_prefetch(dst_ptr);
+
+ uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_offset_vec);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+
+ vst1q_u16(d, d0);
+
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ width -= 8;
+ } while (width != 0);
+ }
+}
+
+void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
+ uint8_t *dst8, int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ // Vertical filter.
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ // Filter values are even, so downshift by 1 to reduce intermediate
+ // precision requirements.
+ const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
+
+ const int vert_offset = filter_params_y->taps / 2 - 1;
+ const uint8_t *src_ptr = src - (vert_offset * src_stride);
+
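+  // Filters with 6 or fewer taps have zero coefficients at indices 0 and 7,
+  // so the 6-tap paths start one row further down (src_ptr + src_stride) and
+  // apply only taps 1..6.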
+ if (get_filter_tap(filter_params_y, subpel_y_qn) <= 6) {
+ if (conv_params->do_average) {
+ if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+ dist_wtd_convolve_y_6tap_dist_wtd_avg_neon(
+ src_ptr + src_stride, src_stride, dst8, dst8_stride, w, h, y_filter,
+ conv_params);
+ } else {
+ dist_wtd_convolve_y_6tap_avg_neon(src_ptr + src_stride, src_stride,
+ dst8, dst8_stride, w, h, y_filter,
+ conv_params);
+ }
+ } else {
+ dist_wtd_convolve_y_6tap_neon(src_ptr + src_stride, src_stride, w, h,
+ y_filter, conv_params);
+ }
+ } else {
+ if (conv_params->do_average) {
+ if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+ dist_wtd_convolve_y_8tap_dist_wtd_avg_neon(src_ptr, src_stride, dst8,
+ dst8_stride, w, h, y_filter,
+ conv_params);
+ } else {
+ dist_wtd_convolve_y_8tap_avg_neon(src_ptr, src_stride, dst8,
+ dst8_stride, w, h, y_filter,
+ conv_params);
+ }
+ } else {
+ dist_wtd_convolve_y_8tap_neon(src_ptr, src_stride, w, h, y_filter,
+ conv_params);
+ }
+ }
+}
diff --git a/av1/common/arm/compound_convolve_neon.h b/av1/common/arm/compound_convolve_neon.h
new file mode 100644
index 000000000..cff6838fe
--- /dev/null
+++ b/av1/common/arm/compound_convolve_neon.h
@@ -0,0 +1,1172 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_ARM_COMPOUND_CONVOLVE_NEON_H_
+#define AOM_AV1_COMMON_ARM_COMPOUND_CONVOLVE_NEON_H_
+
+#include <arm_neon.h>
+
+#include "av1/common/convolve.h"
+#include "av1/common/enums.h"
+#include "av1/common/filter.h"
+
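+// Helpers for the two compound averaging modes. A scalar sketch of the
+// distance-weighted blend (illustrative only; dd is the value already in the
+// compound buffer, d the newly computed one):
+//   avg = (dd * fwd_offset + d * bck_offset) >> DIST_PRECISION_BITS;
+//   out = saturate_u8((avg - round_offset + rnd) >> (FILTER_BITS - ROUND0_BITS));
+// where rnd == 1 << (FILTER_BITS - ROUND0_BITS - 1) rounds to nearest and
+// saturate_u8 stands for the unsigned-saturating narrow done by vqrshrun.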
+static INLINE void compute_dist_wtd_avg_4x1(uint16x4_t dd0, uint16x4_t d0,
+ const uint16_t fwd_offset,
+ const uint16_t bck_offset,
+ const int16x4_t round_offset,
+ uint8x8_t *d0_u8) {
+ uint32x4_t blend0 = vmull_n_u16(dd0, fwd_offset);
+ blend0 = vmlal_n_u16(blend0, d0, bck_offset);
+
+ uint16x4_t avg0 = vshrn_n_u32(blend0, DIST_PRECISION_BITS);
+
+ int16x4_t dst0 = vsub_s16(vreinterpret_s16_u16(avg0), round_offset);
+
+ int16x8_t dst0q = vcombine_s16(dst0, vdup_n_s16(0));
+
+ *d0_u8 = vqrshrun_n_s16(dst0q, FILTER_BITS - ROUND0_BITS);
+}
+
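+// The basic (non-weighted) average replaces the weighted blend with a
+// halving add, avg = (dd + d) >> 1, followed by the same offset-subtract
+// and rounding narrow to 8 bits.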
+static INLINE void compute_basic_avg_4x1(uint16x4_t dd0, uint16x4_t d0,
+ const int16x4_t round_offset,
+ uint8x8_t *d0_u8) {
+ uint16x4_t avg0 = vhadd_u16(dd0, d0);
+
+ int16x4_t dst0 = vsub_s16(vreinterpret_s16_u16(avg0), round_offset);
+
+ int16x8_t dst0q = vcombine_s16(dst0, vdup_n_s16(0));
+
+ *d0_u8 = vqrshrun_n_s16(dst0q, FILTER_BITS - ROUND0_BITS);
+}
+
+static INLINE void compute_dist_wtd_avg_8x1(uint16x8_t dd0, uint16x8_t d0,
+ const uint16_t fwd_offset,
+ const uint16_t bck_offset,
+ const int16x8_t round_offset,
+ uint8x8_t *d0_u8) {
+ uint32x4_t blend0_lo = vmull_n_u16(vget_low_u16(dd0), fwd_offset);
+ blend0_lo = vmlal_n_u16(blend0_lo, vget_low_u16(d0), bck_offset);
+ uint32x4_t blend0_hi = vmull_n_u16(vget_high_u16(dd0), fwd_offset);
+ blend0_hi = vmlal_n_u16(blend0_hi, vget_high_u16(d0), bck_offset);
+
+ uint16x8_t avg0 = vcombine_u16(vshrn_n_u32(blend0_lo, DIST_PRECISION_BITS),
+ vshrn_n_u32(blend0_hi, DIST_PRECISION_BITS));
+
+ int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
+
+ *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
+}
+
+static INLINE void compute_basic_avg_8x1(uint16x8_t dd0, uint16x8_t d0,
+ const int16x8_t round_offset,
+ uint8x8_t *d0_u8) {
+ uint16x8_t avg0 = vhaddq_u16(dd0, d0);
+
+ int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
+
+ *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
+}
+
+static INLINE void compute_dist_wtd_avg_4x4(
+ uint16x4_t dd0, uint16x4_t dd1, uint16x4_t dd2, uint16x4_t dd3,
+ uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3,
+ const uint16_t fwd_offset, const uint16_t bck_offset,
+ const int16x8_t round_offset, uint8x8_t *d01_u8, uint8x8_t *d23_u8) {
+ uint32x4_t blend0 = vmull_n_u16(dd0, fwd_offset);
+ blend0 = vmlal_n_u16(blend0, d0, bck_offset);
+ uint32x4_t blend1 = vmull_n_u16(dd1, fwd_offset);
+ blend1 = vmlal_n_u16(blend1, d1, bck_offset);
+ uint32x4_t blend2 = vmull_n_u16(dd2, fwd_offset);
+ blend2 = vmlal_n_u16(blend2, d2, bck_offset);
+ uint32x4_t blend3 = vmull_n_u16(dd3, fwd_offset);
+ blend3 = vmlal_n_u16(blend3, d3, bck_offset);
+
+ uint16x4_t avg0 = vshrn_n_u32(blend0, DIST_PRECISION_BITS);
+ uint16x4_t avg1 = vshrn_n_u32(blend1, DIST_PRECISION_BITS);
+ uint16x4_t avg2 = vshrn_n_u32(blend2, DIST_PRECISION_BITS);
+ uint16x4_t avg3 = vshrn_n_u32(blend3, DIST_PRECISION_BITS);
+
+ int16x8_t dst_01 = vreinterpretq_s16_u16(vcombine_u16(avg0, avg1));
+ int16x8_t dst_23 = vreinterpretq_s16_u16(vcombine_u16(avg2, avg3));
+
+ dst_01 = vsubq_s16(dst_01, round_offset);
+ dst_23 = vsubq_s16(dst_23, round_offset);
+
+ *d01_u8 = vqrshrun_n_s16(dst_01, FILTER_BITS - ROUND0_BITS);
+ *d23_u8 = vqrshrun_n_s16(dst_23, FILTER_BITS - ROUND0_BITS);
+}
+
+static INLINE void compute_basic_avg_4x4(uint16x4_t dd0, uint16x4_t dd1,
+ uint16x4_t dd2, uint16x4_t dd3,
+ uint16x4_t d0, uint16x4_t d1,
+ uint16x4_t d2, uint16x4_t d3,
+ const int16x8_t round_offset,
+ uint8x8_t *d01_u8, uint8x8_t *d23_u8) {
+ uint16x4_t avg0 = vhadd_u16(dd0, d0);
+ uint16x4_t avg1 = vhadd_u16(dd1, d1);
+ uint16x4_t avg2 = vhadd_u16(dd2, d2);
+ uint16x4_t avg3 = vhadd_u16(dd3, d3);
+
+ int16x8_t dst_01 = vreinterpretq_s16_u16(vcombine_u16(avg0, avg1));
+ int16x8_t dst_23 = vreinterpretq_s16_u16(vcombine_u16(avg2, avg3));
+
+ dst_01 = vsubq_s16(dst_01, round_offset);
+ dst_23 = vsubq_s16(dst_23, round_offset);
+
+ *d01_u8 = vqrshrun_n_s16(dst_01, FILTER_BITS - ROUND0_BITS);
+ *d23_u8 = vqrshrun_n_s16(dst_23, FILTER_BITS - ROUND0_BITS);
+}
+
+static INLINE void compute_dist_wtd_avg_8x4(
+ uint16x8_t dd0, uint16x8_t dd1, uint16x8_t dd2, uint16x8_t dd3,
+ uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3,
+ const uint16_t fwd_offset, const uint16_t bck_offset,
+ const int16x8_t round_offset, uint8x8_t *d0_u8, uint8x8_t *d1_u8,
+ uint8x8_t *d2_u8, uint8x8_t *d3_u8) {
+ uint32x4_t blend0_lo = vmull_n_u16(vget_low_u16(dd0), fwd_offset);
+ blend0_lo = vmlal_n_u16(blend0_lo, vget_low_u16(d0), bck_offset);
+ uint32x4_t blend0_hi = vmull_n_u16(vget_high_u16(dd0), fwd_offset);
+ blend0_hi = vmlal_n_u16(blend0_hi, vget_high_u16(d0), bck_offset);
+
+ uint32x4_t blend1_lo = vmull_n_u16(vget_low_u16(dd1), fwd_offset);
+ blend1_lo = vmlal_n_u16(blend1_lo, vget_low_u16(d1), bck_offset);
+ uint32x4_t blend1_hi = vmull_n_u16(vget_high_u16(dd1), fwd_offset);
+ blend1_hi = vmlal_n_u16(blend1_hi, vget_high_u16(d1), bck_offset);
+
+ uint32x4_t blend2_lo = vmull_n_u16(vget_low_u16(dd2), fwd_offset);
+ blend2_lo = vmlal_n_u16(blend2_lo, vget_low_u16(d2), bck_offset);
+ uint32x4_t blend2_hi = vmull_n_u16(vget_high_u16(dd2), fwd_offset);
+ blend2_hi = vmlal_n_u16(blend2_hi, vget_high_u16(d2), bck_offset);
+
+ uint32x4_t blend3_lo = vmull_n_u16(vget_low_u16(dd3), fwd_offset);
+ blend3_lo = vmlal_n_u16(blend3_lo, vget_low_u16(d3), bck_offset);
+ uint32x4_t blend3_hi = vmull_n_u16(vget_high_u16(dd3), fwd_offset);
+ blend3_hi = vmlal_n_u16(blend3_hi, vget_high_u16(d3), bck_offset);
+
+ uint16x8_t avg0 = vcombine_u16(vshrn_n_u32(blend0_lo, DIST_PRECISION_BITS),
+ vshrn_n_u32(blend0_hi, DIST_PRECISION_BITS));
+ uint16x8_t avg1 = vcombine_u16(vshrn_n_u32(blend1_lo, DIST_PRECISION_BITS),
+ vshrn_n_u32(blend1_hi, DIST_PRECISION_BITS));
+ uint16x8_t avg2 = vcombine_u16(vshrn_n_u32(blend2_lo, DIST_PRECISION_BITS),
+ vshrn_n_u32(blend2_hi, DIST_PRECISION_BITS));
+ uint16x8_t avg3 = vcombine_u16(vshrn_n_u32(blend3_lo, DIST_PRECISION_BITS),
+ vshrn_n_u32(blend3_hi, DIST_PRECISION_BITS));
+
+ int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
+ int16x8_t dst1 = vsubq_s16(vreinterpretq_s16_u16(avg1), round_offset);
+ int16x8_t dst2 = vsubq_s16(vreinterpretq_s16_u16(avg2), round_offset);
+ int16x8_t dst3 = vsubq_s16(vreinterpretq_s16_u16(avg3), round_offset);
+
+ *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
+ *d1_u8 = vqrshrun_n_s16(dst1, FILTER_BITS - ROUND0_BITS);
+ *d2_u8 = vqrshrun_n_s16(dst2, FILTER_BITS - ROUND0_BITS);
+ *d3_u8 = vqrshrun_n_s16(dst3, FILTER_BITS - ROUND0_BITS);
+}
+
+static INLINE void compute_basic_avg_8x4(uint16x8_t dd0, uint16x8_t dd1,
+ uint16x8_t dd2, uint16x8_t dd3,
+ uint16x8_t d0, uint16x8_t d1,
+ uint16x8_t d2, uint16x8_t d3,
+ const int16x8_t round_offset,
+ uint8x8_t *d0_u8, uint8x8_t *d1_u8,
+ uint8x8_t *d2_u8, uint8x8_t *d3_u8) {
+ uint16x8_t avg0 = vhaddq_u16(dd0, d0);
+ uint16x8_t avg1 = vhaddq_u16(dd1, d1);
+ uint16x8_t avg2 = vhaddq_u16(dd2, d2);
+ uint16x8_t avg3 = vhaddq_u16(dd3, d3);
+
+ int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
+ int16x8_t dst1 = vsubq_s16(vreinterpretq_s16_u16(avg1), round_offset);
+ int16x8_t dst2 = vsubq_s16(vreinterpretq_s16_u16(avg2), round_offset);
+ int16x8_t dst3 = vsubq_s16(vreinterpretq_s16_u16(avg3), round_offset);
+
+ *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
+ *d1_u8 = vqrshrun_n_s16(dst1, FILTER_BITS - ROUND0_BITS);
+ *d2_u8 = vqrshrun_n_s16(dst2, FILTER_BITS - ROUND0_BITS);
+ *d3_u8 = vqrshrun_n_s16(dst3, FILTER_BITS - ROUND0_BITS);
+}
+
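+// Vertical-pass helpers for the compound 2D path. These accumulate in
+// 32 bits: the accumulator is seeded with offset_const == (1 << offset_bits)
+// and the result is narrowed with a rounding shift by COMPOUND_ROUND1_BITS,
+// i.e. the second compound rounding stage.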
+static INLINE uint16x4_t
+convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x8_t y_filter, const int32x4_t offset_const) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int32x4_t sum = offset_const;
+ // Filter values at indices 0 and 7 are 0.
+ sum = vmlal_lane_s16(sum, s0, y_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2);
+
+ return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
+}
+
+static INLINE uint16x8_t
+convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t y_filter, const int32x4_t offset_const) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int32x4_t sum0 = offset_const;
+ // Filter values at indices 0 and 7 are 0.
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2);
+
+ int32x4_t sum1 = offset_const;
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
+ vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
+
+static INLINE void dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
+ int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
+ ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+
+ if (w == 4) {
+ int16x4_t s0, s1, s2, s3, s4;
+ load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
+ src_ptr += 5 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x4_t s5, s6, s7, s8;
+ load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x4_t d0 =
+ convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+ uint16x4_t d1 =
+ convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
+ uint16x4_t d2 =
+ convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
+ uint16x4_t d3 =
+ convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
+
+ uint16x4_t dd0, dd1, dd2, dd3;
+ load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d01_u8, d23_u8;
+ compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, round_offset_vec, &d01_u8, &d23_u8);
+
+ store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
+ store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
+ store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
+ store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
+ dst8_ptr += 4 * dst8_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x4_t s5 = vld1_s16(src_ptr);
+
+ uint16x4_t d0 =
+ convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+
+ uint16x4_t dd0 = vld1_u16(dst_ptr);
+
+ uint8x8_t d01_u8;
+ compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
+ vget_low_s16(round_offset_vec), &d01_u8);
+
+ store_u8_4x1(dst8_ptr, d01_u8, 0);
+ dst8_ptr += dst8_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ h--;
+#endif // AOM_ARCH_AARCH64
+ } while (h != 0);
+ } else {
+ do {
+ int16_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4;
+ load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+ s += 5 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x8_t s5, s6, s7, s8;
+ load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x8_t d0 =
+ convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+ uint16x8_t d1 =
+ convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
+ uint16x8_t d2 =
+ convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
+ uint16x8_t d3 =
+ convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
+
+ uint16x8_t dd0, dd1, dd2, dd3;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, round_offset_vec, &d0_u8, &d1_u8,
+ &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+ d_u8 += 4 * dst8_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s5 = vld1q_s16(s);
+
+ uint16x8_t d0 =
+ convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+
+ uint16x8_t dd0 = vld1q_u16(d);
+
+ uint8x8_t d0_u8;
+ compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
+ round_offset_vec, &d0_u8);
+
+ vst1_u8(d_u8, d0_u8);
+ d_u8 += dst8_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ dst8_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+static INLINE void dist_wtd_convolve_2d_vert_6tap_avg_neon(
+ int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
+ ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+
+ if (w == 4) {
+ int16x4_t s0, s1, s2, s3, s4;
+ load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
+ src_ptr += 5 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x4_t s5, s6, s7, s8;
+ load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x4_t d0 =
+ convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+ uint16x4_t d1 =
+ convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
+ uint16x4_t d2 =
+ convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
+ uint16x4_t d3 =
+ convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
+
+ uint16x4_t dd0, dd1, dd2, dd3;
+ load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d01_u8, d23_u8;
+ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ round_offset_vec, &d01_u8, &d23_u8);
+
+ store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
+ store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
+ store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
+ store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
+ dst8_ptr += 4 * dst8_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x4_t s5 = vld1_s16(src_ptr);
+
+ uint16x4_t d0 =
+ convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+
+ uint16x4_t dd0 = vld1_u16(dst_ptr);
+
+ uint8x8_t d01_u8;
+ compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8);
+
+ store_u8_4x1(dst8_ptr, d01_u8, 0);
+ dst8_ptr += dst8_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ h--;
+#endif // AOM_ARCH_AARCH64
+ } while (h != 0);
+ } else {
+ do {
+ int16_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4;
+ load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+ s += 5 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x8_t s5, s6, s7, s8;
+ load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x8_t d0 =
+ convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+ uint16x8_t d1 =
+ convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
+ uint16x8_t d2 =
+ convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
+ uint16x8_t d3 =
+ convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
+
+ uint16x8_t dd0, dd1, dd2, dd3;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+ d_u8 += 4 * dst8_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s5 = vld1q_s16(s);
+
+ uint16x8_t d0 =
+ convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+
+ uint16x8_t dd0 = vld1q_u16(d);
+
+ uint8x8_t d0_u8;
+ compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
+
+ vst1_u8(d_u8, d0_u8);
+ d_u8 += dst8_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ dst8_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+static INLINE void dist_wtd_convolve_2d_vert_6tap_neon(
+ int16_t *src_ptr, const int src_stride, ConvolveParams *conv_params,
+ const int16x8_t y_filter, int h, int w) {
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+
+ if (w == 4) {
+ int16x4_t s0, s1, s2, s3, s4;
+ load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
+ src_ptr += 5 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x4_t s5, s6, s7, s8;
+ load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x4_t d0 =
+ convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+ uint16x4_t d1 =
+ convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
+ uint16x4_t d2 =
+ convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
+ uint16x4_t d3 =
+ convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
+
+ store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x4_t s5 = vld1_s16(src_ptr);
+
+ uint16x4_t d0 =
+ convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+
+ vst1_u16(dst_ptr, d0);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ h--;
+#endif // AOM_ARCH_AARCH64
+ } while (h != 0);
+ } else {
+ do {
+ int16_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4;
+ load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+ s += 5 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x8_t s5, s6, s7, s8;
+ load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x8_t d0 =
+ convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+ uint16x8_t d1 =
+ convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
+ uint16x8_t d2 =
+ convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
+ uint16x8_t d3 =
+ convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s5 = vld1q_s16(s);
+
+ uint16x8_t d0 =
+ convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+
+ vst1q_u16(d, d0);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
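+// 8-tap variants of the 2D vertical pass. All eight taps are applied, so,
+// unlike the 6-tap helpers above, no zero-tap row can be skipped.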
+static INLINE uint16x4_t
+convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16x8_t y_filter, const int32x4_t offset_const) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int32x4_t sum = offset_const;
+ sum = vmlal_lane_s16(sum, s0, y_filter_0_3, 0);
+ sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
+
+ return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
+}
+
+static INLINE uint16x8_t
+convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t y_filter, const int32x4_t offset_const) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int32x4_t sum0 = offset_const;
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_0_3, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
+
+ int32x4_t sum1 = offset_const;
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_0_3, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
+ vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
+
+static INLINE void dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
+ int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
+ ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
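+  // Forward/backward offsets supply the distance-based weights applied to
+  // the two predictions in the compound average.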
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+
+ if (w == 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ src_ptr += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ offset_const);
+ uint16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ offset_const);
+ uint16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ offset_const);
+ uint16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_const);
+
+ uint16x4_t dd0, dd1, dd2, dd3;
+ load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d01_u8, d23_u8;
+ compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, round_offset_vec, &d01_u8, &d23_u8);
+
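+      // d01_u8 and d23_u8 each pack two 4-pixel rows, so store lane by lane.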
+ store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
+ store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
+ store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
+ store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
+ dst8_ptr += 4 * dst8_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x4_t s7 = vld1_s16(src_ptr);
+
+ uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ offset_const);
+
+ uint16x4_t dd0 = vld1_u16(dst_ptr);
+
+ uint8x8_t d01_u8;
+ compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
+ vget_low_s16(round_offset_vec), &d01_u8);
+
+ store_u8_4x1(dst8_ptr, d01_u8, 0);
+ dst8_ptr += dst8_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ h--;
+#endif // AOM_ARCH_AARCH64
+ } while (h != 0);
+ } else {
+ do {
+ int16_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_const);
+ uint16x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, offset_const);
+ uint16x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, offset_const);
+ uint16x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_const);
+
+ uint16x8_t dd0, dd1, dd2, dd3;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, round_offset_vec, &d0_u8, &d1_u8,
+ &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+ d_u8 += 4 * dst8_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s7 = vld1q_s16(s);
+
+ uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_const);
+
+ uint16x8_t dd0 = vld1q_u16(d);
+
+ uint8x8_t d0_u8;
+ compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
+ round_offset_vec, &d0_u8);
+
+ vst1_u8(d_u8, d0_u8);
+ d_u8 += dst8_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ dst8_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+static INLINE void dist_wtd_convolve_2d_vert_8tap_avg_neon(
+ int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
+ ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+
+ if (w == 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ src_ptr += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ offset_const);
+ uint16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ offset_const);
+ uint16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ offset_const);
+ uint16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_const);
+
+ uint16x4_t dd0, dd1, dd2, dd3;
+ load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d01_u8, d23_u8;
+ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ round_offset_vec, &d01_u8, &d23_u8);
+
+ store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
+ store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
+ store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
+ store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
+ dst8_ptr += 4 * dst8_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x4_t s7 = vld1_s16(src_ptr);
+
+ uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ offset_const);
+
+ uint16x4_t dd0 = vld1_u16(dst_ptr);
+
+ uint8x8_t d01_u8;
+ compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8);
+
+ store_u8_4x1(dst8_ptr, d01_u8, 0);
+ dst8_ptr += dst8_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ h--;
+#endif // AOM_ARCH_AARCH64
+ } while (h != 0);
+ } else {
+ do {
+ int16_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_const);
+ uint16x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, offset_const);
+ uint16x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, offset_const);
+ uint16x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_const);
+
+ uint16x8_t dd0, dd1, dd2, dd3;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+ d_u8 += 4 * dst8_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s7 = vld1q_s16(s);
+
+ uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_const);
+
+ uint16x8_t dd0 = vld1q_u16(d);
+
+ uint8x8_t d0_u8;
+ compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
+
+ vst1_u8(d_u8, d0_u8);
+ d_u8 += dst8_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ dst8_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+static INLINE void dist_wtd_convolve_2d_vert_8tap_neon(
+ int16_t *src_ptr, const int src_stride, ConvolveParams *conv_params,
+ const int16x8_t y_filter, int h, int w) {
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+
+ if (w == 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ src_ptr += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ offset_const);
+ uint16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ offset_const);
+ uint16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ offset_const);
+ uint16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_const);
+
+ store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x4_t s7 = vld1_s16(src_ptr);
+
+ uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ offset_const);
+
+ vst1_u16(dst_ptr, d0);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ h--;
+#endif // AOM_ARCH_AARCH64
+ } while (h != 0);
+ } else {
+ do {
+ int16_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_const);
+ uint16x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, offset_const);
+ uint16x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, offset_const);
+ uint16x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_const);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s7 = vld1q_s16(s);
+
+ uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_const);
+
+ vst1q_u16(d, d0);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+#endif // AOM_AV1_COMMON_ARM_COMPOUND_CONVOLVE_NEON_H_
diff --git a/av1/common/arm/compound_convolve_neon_dotprod.c b/av1/common/arm/compound_convolve_neon_dotprod.c
new file mode 100644
index 000000000..8ab613d98
--- /dev/null
+++ b/av1/common/arm/compound_convolve_neon_dotprod.c
@@ -0,0 +1,679 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/arm/compound_convolve_neon.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
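+// Each 16-byte row of the permute table gathers four overlapping 4-sample
+// windows whose start positions advance by one sample per 32-bit lane; the
+// three rows cover window start offsets 0-3, 4-7 and 8-11.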
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples,
+ const int8x8_t x_filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16_t permute_tbl) {
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ int8x16_t clamped_samples =
+ vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, x_filter, 0);
+
+ // We halved the convolution filter values so -1 from the right shift.
+ return vshrn_n_s32(sum, ROUND0_BITS - 1);
+}
+
+static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
+ const int8x8_t x_filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[3];
+ int32x4_t sum[2];
+
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ // First 4 output values.
+ sum[0] = vdotq_lane_s32(correction, permuted_samples[0], x_filter, 0);
+ sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
+ // Second 4 output values.
+ sum[1] = vdotq_lane_s32(correction, permuted_samples[1], x_filter, 0);
+ sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);
+
+ // Narrow and re-pack.
+ // We halved the convolution filter values so -1 from the right shift.
+ return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
+ vshrn_n_s32(sum[1], ROUND0_BITS - 1));
+}
+
+static INLINE void dist_wtd_convolve_2d_horiz_neon_dotprod(
+ const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
+ const int16_t *x_filter_ptr, const int im_h, int w) {
+ const int bd = 8;
+ const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2));
+  // Dot-product constants and other shims.
+ const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
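+  // Clamping the samples to [-128, 127] subtracts 128 * sum(filter) from
+  // every dot product; pre-compute that amount here so it can be added back.
+  // The shift is FILTER_BITS - 1 rather than FILTER_BITS because the filter
+  // values are halved before use.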
+ const int32_t correction_s32 =
+ vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
+ // Fold horiz_const into the dot-product filter correction constant. The
+ // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
+ // rounding shifts - which are generally faster than rounding shifts on
+ // modern CPUs. (The extra -1 is needed because we halved the filter values.)
+ const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const +
+ (1 << ((ROUND0_BITS - 1) - 1)));
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+
+ const uint8_t *src_ptr = src;
+ int16_t *dst_ptr = im_block;
+ int dst_stride = im_stride;
+ int height = im_h;
+
+ if (w == 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter =
+ vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
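+    // The 4-tap filter occupies taps 2-5 of the 8-tap array, so advance the
+    // source pointer by 2 to compensate for the caller's 8-tap horiz_offset.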
+ src_ptr += 2;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x4_t d0 =
+ convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl);
+ int16x4_t d1 =
+ convolve4_4_2d_h(s1, x_filter, correction, range_limit, permute_tbl);
+ int16x4_t d2 =
+ convolve4_4_2d_h(s2, x_filter, correction, range_limit, permute_tbl);
+ int16x4_t d3 =
+ convolve4_4_2d_h(s3, x_filter, correction, range_limit, permute_tbl);
+
+ store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+
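+    // The intermediate block height (h + filter taps - 1) is odd, so one or
+    // three rows remain; process them one at a time.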
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+
+ int16x4_t d0 =
+ convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl);
+
+ vst1_s16(dst_ptr, d0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
+
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit,
+ permute_tbl);
+ int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, correction, range_limit,
+ permute_tbl);
+ int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, correction, range_limit,
+ permute_tbl);
+ int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, correction, range_limit,
+ permute_tbl);
+
+ store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0 = vld1q_u8(s);
+
+ int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit,
+ permute_tbl);
+
+ vst1q_s16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ }
+}
+
+void av1_dist_wtd_convolve_2d_neon_dotprod(
+ const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
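+  // Split the 2D convolution into two passes: horizontal filtering into an
+  // intermediate buffer, then vertical filtering of that buffer.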
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+
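+  // Only 6-tap and 8-tap vertical kernels are implemented; shorter filters
+  // have zero-valued outer taps and are handled by the 6-tap path.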
+ const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+ const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
+
+ const int im_h = h + clamped_y_taps - 1;
+ const int im_stride = MAX_SB_SIZE;
+ const int vert_offset = clamped_y_taps / 2 - 1;
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+
+ dist_wtd_convolve_2d_horiz_neon_dotprod(src_ptr, src_stride, im_block,
+ im_stride, x_filter_ptr, im_h, w);
+
+ if (clamped_y_taps == 6) {
+ if (conv_params->do_average) {
+ if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+ dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
+ im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
+ w);
+ } else {
+ dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8,
+ dst8_stride, conv_params,
+ y_filter, h, w);
+ }
+ } else {
+ dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params,
+ y_filter, h, w);
+ }
+ } else {
+ if (conv_params->do_average) {
+ if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+ dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
+ im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
+ w);
+ } else {
+ dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8,
+ dst8_stride, conv_params,
+ y_filter, h, w);
+ }
+ } else {
+ dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params,
+ y_filter, h, w);
+ }
+ }
+}
+
+static INLINE uint16x4_t convolve4_4_x(uint8x16_t samples,
+ const int8x8_t x_filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16_t permute_tbl) {
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ int8x16_t clamped_samples =
+ vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, x_filter, 0);
+
+ // We halved the convolution filter values so -1 from the right shift.
+ return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1));
+}
+
+static INLINE uint16x8_t convolve8_8_x(uint8x16_t samples,
+ const int8x8_t x_filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[3];
+ int32x4_t sum[2];
+
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ // First 4 output values.
+ sum[0] = vdotq_lane_s32(correction, permuted_samples[0], x_filter, 0);
+ sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
+ // Second 4 output values.
+ sum[1] = vdotq_lane_s32(correction, permuted_samples[1], x_filter, 0);
+ sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);
+
+ // Narrow and re-pack.
+ // We halved the convolution filter values so -1 from the right shift.
+ int16x8_t res = vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
+ vshrn_n_s32(sum[1], ROUND0_BITS - 1));
+ return vreinterpretq_u16_s16(res);
+}
+
+static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_dotprod(
+ const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+ int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
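+  // Compound rounding offset, equal to
+  // 3 << (offset_bits - COMPOUND_ROUND1_BITS - 1).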
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+
+ // Horizontal filter.
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
+
+ // Dot-product constants and other shims.
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+ const int32_t correction_s32 =
+ vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
+ // Fold round_offset into the dot-product filter correction constant. The
+ // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
+ // rounding shifts - which are generally faster than rounding shifts on
+ // modern CPUs. (The extra -1 is needed because we halved the filter values.)
+ int32x4_t correction =
+ vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
+ (1 << ((ROUND0_BITS - 1) - 1)));
+
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - horiz_offset;
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ uint8_t *dst8_ptr = dst8;
+ int dst_stride = conv_params->dst_stride;
+ int height = h;
+
+ if (w == 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter =
+ vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+ src_ptr += 2;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+ uint16x4_t d0 =
+ convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl);
+ uint16x4_t d1 =
+ convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl);
+ uint16x4_t d2 =
+ convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl);
+ uint16x4_t d3 =
+ convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl);
+
+ uint16x4_t dd0, dd1, dd2, dd3;
+ load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d01_u8, d23_u8;
+ compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, round_offset_vec, &d01_u8, &d23_u8);
+
+ store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
+ store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
+ store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
+ store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ dst8_ptr += 4 * dst8_stride;
+ height -= 4;
+ } while (height != 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
+
+ do {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ uint16x8_t d0 =
+ convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl);
+ uint16x8_t d1 =
+ convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl);
+ uint16x8_t d2 =
+ convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl);
+ uint16x8_t d3 =
+ convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl);
+
+ uint16x8_t dd0, dd1, dd2, dd3;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, round_offset_vec, &d0_u8, &d1_u8,
+ &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+
+ s += 8;
+ d += 8;
+ d_u8 += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ dst8_ptr += 4 * dst8_stride;
+ height -= 4;
+ } while (height != 0);
+ }
+}
+
+static INLINE void dist_wtd_convolve_x_avg_neon_dotprod(
+ const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+ int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+ // Horizontal filter.
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
+
+ // Dot-product constants and other shims.
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+ const int32_t correction_s32 =
+ vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
+ // Fold round_offset into the dot-product filter correction constant. The
+ // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
+ // rounding shifts - which are generally faster than rounding shifts on
+ // modern CPUs. (The extra -1 is needed because we halved the filter values.)
+ int32x4_t correction =
+ vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
+ (1 << ((ROUND0_BITS - 1) - 1)));
+
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - horiz_offset;
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ uint8_t *dst8_ptr = dst8;
+ int dst_stride = conv_params->dst_stride;
+ int height = h;
+
+ if (w == 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter =
+ vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+ src_ptr += 2;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+ uint16x4_t d0 =
+ convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl);
+ uint16x4_t d1 =
+ convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl);
+ uint16x4_t d2 =
+ convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl);
+ uint16x4_t d3 =
+ convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl);
+
+ uint16x4_t dd0, dd1, dd2, dd3;
+ load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d01_u8, d23_u8;
+ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ round_offset_vec, &d01_u8, &d23_u8);
+
+ store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
+ store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
+ store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
+ store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ dst8_ptr += 4 * dst8_stride;
+ height -= 4;
+ } while (height != 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
+
+ do {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ uint16x8_t d0 =
+ convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl);
+ uint16x8_t d1 =
+ convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl);
+ uint16x8_t d2 =
+ convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl);
+ uint16x8_t d3 =
+ convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl);
+
+ uint16x8_t dd0, dd1, dd2, dd3;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+
+ s += 8;
+ d += 8;
+ d_u8 += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ dst8_ptr += 4 * dst8_stride;
+ height -= 4;
+ } while (height != 0);
+ }
+}
+
+static INLINE void dist_wtd_convolve_x_neon_dotprod(
+ const uint8_t *src, int src_stride, int w, int h,
+ const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+
+ // Horizontal filter.
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
+
+ // Dot-product constants and other shims.
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+ const int32_t correction_s32 =
+ vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
+ // Fold round_offset into the dot-product filter correction constant. The
+ // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
+ // rounding shifts - which are generally faster than rounding shifts on
+ // modern CPUs. (The extra -1 is needed because we halved the filter values.)
+ int32x4_t correction =
+ vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
+ (1 << ((ROUND0_BITS - 1) - 1)));
+
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - horiz_offset;
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ int height = h;
+
+ if (w == 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter =
+ vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+ src_ptr += 2;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+ uint16x4_t d0 =
+ convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl);
+ uint16x4_t d1 =
+ convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl);
+ uint16x4_t d2 =
+ convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl);
+ uint16x4_t d3 =
+ convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl);
+
+ store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
+
+ do {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ uint16x8_t d0 =
+ convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl);
+ uint16x8_t d1 =
+ convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl);
+ uint16x8_t d2 =
+ convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl);
+ uint16x8_t d3 =
+ convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ }
+}
+
+void av1_dist_wtd_convolve_x_neon_dotprod(
+ const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+ int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ if (conv_params->do_average) {
+ if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+ dist_wtd_convolve_x_dist_wtd_avg_neon_dotprod(
+ src, src_stride, dst8, dst8_stride, w, h, filter_params_x,
+ subpel_x_qn, conv_params);
+ } else {
+ dist_wtd_convolve_x_avg_neon_dotprod(src, src_stride, dst8, dst8_stride,
+ w, h, filter_params_x, subpel_x_qn,
+ conv_params);
+ }
+ } else {
+ dist_wtd_convolve_x_neon_dotprod(src, src_stride, w, h, filter_params_x,
+ subpel_x_qn, conv_params);
+ }
+}
diff --git a/av1/common/arm/compound_convolve_neon_i8mm.c b/av1/common/arm/compound_convolve_neon_i8mm.c
new file mode 100644
index 000000000..70d7da9b8
--- /dev/null
+++ b/av1/common/arm/compound_convolve_neon_i8mm.c
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/arm/compound_convolve_neon.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
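+// Unlike the Armv8.0 dot-product path, the USDOT instruction used here
+// multiplies unsigned 8-bit samples by signed 8-bit filter values directly,
+// so no range clamp or compensating correction constant is needed.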
+static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples,
+ const int8x8_t x_filter,
+ const uint8x16_t permute_tbl,
+ const int32x4_t horiz_const) {
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);
+
+ // First 4 output values.
+ int32x4_t sum = vusdotq_lane_s32(horiz_const, permuted_samples, x_filter, 0);
+
+ // We halved the convolution filter values so -1 from the right shift.
+ return vshrn_n_s32(sum, ROUND0_BITS - 1);
+}
+
+static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
+ const int8x8_t x_filter,
+ const uint8x16x3_t permute_tbl,
+ const int32x4_t horiz_const) {
+ uint8x16_t permuted_samples[3];
+ int32x4_t sum[2];
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+ // First 4 output values.
+ sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], x_filter, 0);
+ sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
+ // Second 4 output values.
+ sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], x_filter, 0);
+ sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);
+
+ // Narrow and re-pack.
+ // We halved the convolution filter values so -1 from the right shift.
+ return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
+ vshrn_n_s32(sum[1], ROUND0_BITS - 1));
+}
+
+static INLINE void dist_wtd_convolve_2d_horiz_neon_i8mm(
+ const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
+ const int16_t *x_filter_ptr, const int im_h, int w) {
+ const int bd = 8;
+ // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // (The extra -1 is needed because we halved the filter values.)
+ const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) +
+ (1 << ((ROUND0_BITS - 1) - 1)));
+
+ const uint8_t *src_ptr = src;
+ int16_t *dst_ptr = im_block;
+ int dst_stride = im_stride;
+ int height = im_h;
+
+ if (w == 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter =
+ vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+ src_ptr += 2;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
+ int16x4_t d1 = convolve4_4_2d_h(s1, x_filter, permute_tbl, horiz_const);
+ int16x4_t d2 = convolve4_4_2d_h(s2, x_filter, permute_tbl, horiz_const);
+ int16x4_t d3 = convolve4_4_2d_h(s3, x_filter, permute_tbl, horiz_const);
+
+ store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+
+ int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
+
+ vst1_s16(dst_ptr, d0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
+ int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const);
+ int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const);
+ int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const);
+
+ store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0 = vld1q_u8(s);
+
+ int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
+
+ vst1q_s16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ }
+}
+
+void av1_dist_wtd_convolve_2d_neon_i8mm(
+ const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+
+ const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+ const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
+
+ const int im_h = h + clamped_y_taps - 1;
+ const int im_stride = MAX_SB_SIZE;
+ const int vert_offset = clamped_y_taps / 2 - 1;
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+
+ dist_wtd_convolve_2d_horiz_neon_i8mm(src_ptr, src_stride, im_block, im_stride,
+ x_filter_ptr, im_h, w);
+
+ if (clamped_y_taps == 6) {
+ if (conv_params->do_average) {
+ if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+ dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
+ im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
+ w);
+ } else {
+ dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8,
+ dst8_stride, conv_params,
+ y_filter, h, w);
+ }
+ } else {
+ dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params,
+ y_filter, h, w);
+ }
+ } else {
+ if (conv_params->do_average) {
+ if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+ dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
+ im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
+ w);
+ } else {
+ dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8,
+ dst8_stride, conv_params,
+ y_filter, h, w);
+ }
+ } else {
+ dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params,
+ y_filter, h, w);
+ }
+ }
+}
+
+static INLINE uint16x4_t convolve4_4_x(uint8x16_t samples,
+ const int8x8_t x_filter,
+ const uint8x16_t permute_tbl,
+ const int32x4_t round_offset) {
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);
+
+ // First 4 output values.
+ int32x4_t sum = vusdotq_lane_s32(round_offset, permuted_samples, x_filter, 0);
+
+ // We halved the convolution filter values so -1 from the right shift.
+ return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1));
+}
+
+static INLINE uint16x8_t convolve8_8_x(uint8x16_t samples,
+ const int8x8_t x_filter,
+ const uint8x16x3_t permute_tbl,
+ const int32x4_t round_offset) {
+ uint8x16_t permuted_samples[3];
+ int32x4_t sum[2];
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+ // First 4 output values.
+ sum[0] = vusdotq_lane_s32(round_offset, permuted_samples[0], x_filter, 0);
+ sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
+ // Second 4 output values.
+ sum[1] = vusdotq_lane_s32(round_offset, permuted_samples[1], x_filter, 0);
+ sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);
+
+ // Narrow and re-pack.
+ // We halved the convolution filter values so -1 from the right shift.
+ int16x8_t res = vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
+ vshrn_n_s32(sum[1], ROUND0_BITS - 1));
+ return vreinterpretq_u16_s16(res);
+}
+
+static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm(
+ const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+ int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+ // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // (The extra -1 is needed because we halved the filter values.)
+ const int32x4_t round_offset_shim = vdupq_n_s32(
+ (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1)));
+
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+
+ // Horizontal filter.
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - horiz_offset;
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ uint8_t *dst8_ptr = dst8;
+ int dst_stride = conv_params->dst_stride;
+ int height = h;
+
+ if (w == 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter =
+ vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+ src_ptr += 2;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+ uint16x4_t d0 =
+ convolve4_4_x(s0, x_filter, permute_tbl, round_offset_shim);
+ uint16x4_t d1 =
+ convolve4_4_x(s1, x_filter, permute_tbl, round_offset_shim);
+ uint16x4_t d2 =
+ convolve4_4_x(s2, x_filter, permute_tbl, round_offset_shim);
+ uint16x4_t d3 =
+ convolve4_4_x(s3, x_filter, permute_tbl, round_offset_shim);
+
+ uint16x4_t dd0, dd1, dd2, dd3;
+ load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d01_u8, d23_u8;
+ compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, round_offset_vec, &d01_u8, &d23_u8);
+
+ store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
+ store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
+ store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
+ store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ dst8_ptr += 4 * dst8_stride;
+ height -= 4;
+ } while (height != 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+ do {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ uint16x8_t d0 =
+ convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim);
+ uint16x8_t d1 =
+ convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim);
+ uint16x8_t d2 =
+ convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim);
+ uint16x8_t d3 =
+ convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim);
+
+ uint16x8_t dd0, dd1, dd2, dd3;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, round_offset_vec, &d0_u8, &d1_u8,
+ &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+
+ s += 8;
+ d += 8;
+ d_u8 += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ dst8_ptr += 4 * dst8_stride;
+ height -= 4;
+ } while (height != 0);
+ }
+}
+
+static INLINE void dist_wtd_convolve_x_avg_neon_i8mm(
+ const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+ int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+ // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // (The extra -1 is needed because we halved the filter values.)
+ const int32x4_t round_offset_shim = vdupq_n_s32(
+ (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1)));
+
+ // Horizontal filter.
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - horiz_offset;
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ uint8_t *dst8_ptr = dst8;
+ int dst_stride = conv_params->dst_stride;
+ int height = h;
+
+ if (w == 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter =
+ vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+ src_ptr += 2;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+ uint16x4_t d0 =
+ convolve4_4_x(s0, x_filter, permute_tbl, round_offset_shim);
+ uint16x4_t d1 =
+ convolve4_4_x(s1, x_filter, permute_tbl, round_offset_shim);
+ uint16x4_t d2 =
+ convolve4_4_x(s2, x_filter, permute_tbl, round_offset_shim);
+ uint16x4_t d3 =
+ convolve4_4_x(s3, x_filter, permute_tbl, round_offset_shim);
+
+ uint16x4_t dd0, dd1, dd2, dd3;
+ load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d01_u8, d23_u8;
+ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ round_offset_vec, &d01_u8, &d23_u8);
+
+ store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
+ store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
+ store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
+ store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ dst8_ptr += 4 * dst8_stride;
+ height -= 4;
+ } while (height != 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+ do {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ uint16x8_t d0 =
+ convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim);
+ uint16x8_t d1 =
+ convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim);
+ uint16x8_t d2 =
+ convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim);
+ uint16x8_t d3 =
+ convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim);
+
+ uint16x8_t dd0, dd1, dd2, dd3;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+
+ s += 8;
+ d += 8;
+ d_u8 += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ dst8_ptr += 4 * dst8_stride;
+ height -= 4;
+ } while (height != 0);
+ }
+}
+
+static INLINE void dist_wtd_convolve_x_neon_i8mm(
+ const uint8_t *src, int src_stride, int w, int h,
+ const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
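+  // With bd == 8, FILTER_BITS == 7, ROUND0_BITS == 3 and
+  // COMPOUND_ROUND1_BITS == 7, this gives offset_bits == 19 and
+  // round_offset == (1 << 12) + (1 << 11) == 6144.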
+ // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // (The extra -1 is needed because we halved the filter values.)
+ const int32x4_t round_offset_shim = vdupq_n_s32(
+ (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1)));
+
+ // Horizontal filter.
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - horiz_offset;
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ int height = h;
+
+ if (w == 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter =
+ vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+ src_ptr += 2;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+ uint16x4_t d0 =
+ convolve4_4_x(s0, x_filter, permute_tbl, round_offset_shim);
+ uint16x4_t d1 =
+ convolve4_4_x(s1, x_filter, permute_tbl, round_offset_shim);
+ uint16x4_t d2 =
+ convolve4_4_x(s2, x_filter, permute_tbl, round_offset_shim);
+ uint16x4_t d3 =
+ convolve4_4_x(s3, x_filter, permute_tbl, round_offset_shim);
+
+ store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+ do {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ uint16x8_t d0 =
+ convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim);
+ uint16x8_t d1 =
+ convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim);
+ uint16x8_t d2 =
+ convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim);
+ uint16x8_t d3 =
+ convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ }
+}
+
+void av1_dist_wtd_convolve_x_neon_i8mm(
+ const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+ int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ if (conv_params->do_average) {
+ if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+ dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm(
+ src, src_stride, dst8, dst8_stride, w, h, filter_params_x,
+ subpel_x_qn, conv_params);
+ } else {
+ dist_wtd_convolve_x_avg_neon_i8mm(src, src_stride, dst8, dst8_stride, w,
+ h, filter_params_x, subpel_x_qn,
+ conv_params);
+ }
+ } else {
+ dist_wtd_convolve_x_neon_i8mm(src, src_stride, w, h, filter_params_x,
+ subpel_x_qn, conv_params);
+ }
+}
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index 713aaad16..fa9892273 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -24,251 +24,197 @@
#include "av1/common/filter.h"
#include "av1/common/arm/convolve_neon.h"
-static INLINE int16x4_t convolve8_4x4(const int16x4_t s0, const int16x4_t s1,
- const int16x4_t s2, const int16x4_t s3,
- const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7,
- const int16x8_t filter) {
- const int16x4_t filter_lo = vget_low_s16(filter);
- const int16x4_t filter_hi = vget_high_s16(filter);
- int16x4_t sum;
+static INLINE int16x4_t convolve12_4_x(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16x4_t s8, const int16x4_t s9,
+ const int16x4_t s10, const int16x4_t s11,
+ const int16x8_t x_filter_0_7,
+ const int16x4_t x_filter_8_11,
+ const int32x4_t horiz_const) {
+ const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
+ const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
- sum = vmul_lane_s16(s0, filter_lo, 0);
- sum = vmla_lane_s16(sum, s1, filter_lo, 1);
- sum = vmla_lane_s16(sum, s2, filter_lo, 2);
- sum = vmla_lane_s16(sum, s3, filter_lo, 3);
- sum = vmla_lane_s16(sum, s4, filter_hi, 0);
- sum = vmla_lane_s16(sum, s5, filter_hi, 1);
- sum = vmla_lane_s16(sum, s6, filter_hi, 2);
- sum = vmla_lane_s16(sum, s7, filter_hi, 3);
+ int32x4_t sum = horiz_const;
+ sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0);
+ sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s3, x_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s4, x_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s5, x_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s6, x_filter_4_7, 2);
+ sum = vmlal_lane_s16(sum, s7, x_filter_4_7, 3);
+ sum = vmlal_lane_s16(sum, s8, x_filter_8_11, 0);
+ sum = vmlal_lane_s16(sum, s9, x_filter_8_11, 1);
+ sum = vmlal_lane_s16(sum, s10, x_filter_8_11, 2);
+ sum = vmlal_lane_s16(sum, s11, x_filter_8_11, 3);
- return sum;
+ return vqrshrn_n_s32(sum, FILTER_BITS);
}
-#if !AOM_ARCH_AARCH64
-static INLINE uint8x8_t convolve8_x_4x1(const int16x4_t s0, const int16x4_t s1,
- const int16x4_t s2, const int16x4_t s3,
- const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7,
- const int16x8_t filter,
- const int16x4_t horiz_const) {
- const int16x4_t filter_lo = vget_low_s16(filter);
- const int16x4_t filter_hi = vget_high_s16(filter);
- int16x4_t sum = horiz_const;
+static INLINE void convolve_x_sr_12tap_neon(const uint8_t *src_ptr,
+ int src_stride, uint8_t *dst_ptr,
+ const int dst_stride, int w, int h,
+ const int16_t *x_filter_ptr) {
+ const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
+ const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
- sum = vmla_lane_s16(sum, s0, filter_lo, 0);
- sum = vmla_lane_s16(sum, s1, filter_lo, 1);
- sum = vmla_lane_s16(sum, s2, filter_lo, 2);
- sum = vmla_lane_s16(sum, s3, filter_lo, 3);
- sum = vmla_lane_s16(sum, s4, filter_hi, 0);
- sum = vmla_lane_s16(sum, s5, filter_hi, 1);
- sum = vmla_lane_s16(sum, s6, filter_hi, 2);
- sum = vmla_lane_s16(sum, s7, filter_hi, 3);
+  // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
+  // right shift by FILTER_BITS - instead of a first rounding right shift by
+  // ROUND0_BITS, followed by a second rounding right shift by
+  // FILTER_BITS - ROUND0_BITS.
+ const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1));
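+  // For example, with ROUND0_BITS == 3 and FILTER_BITS == 7, the two-stage
+  // pipeline computes (((x + 4) >> 3) + 8) >> 4, which is bit-exact with the
+  // single rounding shift (x + 4 + 64) >> 7 for all integer x.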
- // We halved the convolution filter values so - 1 from the right shift.
- return vqrshrun_n_s16(vcombine_s16(sum, vdup_n_s16(0)), FILTER_BITS - 1);
-}
-#endif // !AOM_ARCH_AARCH64
-
-#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE int32x4_t convolve12_4_usdot(uint8x16_t samples,
- const int8x16_t filters,
- const uint8x16x3_t permute_tbl,
- const int32x4_t horiz_const) {
- uint8x16_t permuted_samples[3];
- int32x4_t sum;
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
-
- /* First 4 output values. */
- sum = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0);
- sum = vusdotq_laneq_s32(sum, permuted_samples[1], filters, 1);
- sum = vusdotq_laneq_s32(sum, permuted_samples[2], filters, 2);
+#if AOM_ARCH_AARCH64
+ do {
+ const uint8_t *s = src_ptr;
+ uint8_t *d = dst_ptr;
+ int width = w;
- return sum;
-}
+ uint8x8_t t0, t1, t2, t3;
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
-static INLINE int16x8_t convolve12_8_usdot(uint8x16_t samples0,
- uint8x16_t samples1,
- const int8x16_t filters,
- const uint8x16x3_t permute_tbl,
- const int32x4_t horiz_const) {
- uint8x16_t permuted_samples[4];
- int32x4_t sum[2];
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_u8(samples0, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_u8(samples0, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_u8(samples0, permute_tbl.val[2]);
- /* {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } */
- permuted_samples[3] = vqtbl1q_u8(samples1, permute_tbl.val[2]);
-
- /* First 4 output values. */
- sum[0] = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0);
- sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1);
- sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2);
- /* Second 4 output values. */
- sum[1] = vusdotq_laneq_s32(horiz_const, permuted_samples[1], filters, 0);
- sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1);
- sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2);
-
- /* Narrow and re-pack. */
- return vcombine_s16(vqrshrn_n_s32(sum[0], FILTER_BITS),
- vqrshrn_n_s32(sum[1], FILTER_BITS));
-}
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-#elif AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
+ load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
+
+ int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
+ s += 11;
+
+ do {
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
+
+ int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ int16x4_t d0 =
+ convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ x_filter_0_7, x_filter_8_11, horiz_const);
+ int16x4_t d1 =
+ convolve12_4_x(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter_0_7, x_filter_8_11, horiz_const);
+ int16x4_t d2 =
+ convolve12_4_x(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter_0_7, x_filter_8_11, horiz_const);
+ int16x4_t d3 =
+ convolve12_4_x(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter_0_7, x_filter_8_11, horiz_const);
+
+ transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);
+
+ uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1));
+ uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3));
+
+ store_u8_4x1(d + 0 * dst_stride, d01, 0);
+ store_u8_4x1(d + 1 * dst_stride, d01, 1);
+ store_u8_4x1(d + 2 * dst_stride, d23, 0);
+ store_u8_4x1(d + 3 * dst_stride, d23, 1);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s7 = s11;
+ s8 = s12;
+ s9 = s13;
+ s10 = s14;
+ s += 4;
+ d += 4;
+ width -= 4;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
-static INLINE int16x4_t convolve12_horiz_4_sdot(
- uint8x16_t samples, const int8x16_t filters, const int32x4_t correction,
- const uint8x16_t range_limit, const uint8x16x3_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[3];
- int32x4_t sum;
+#else // !AOM_ARCH_AARCH64
+ do {
+ const uint8_t *s = src_ptr;
+ uint8_t *d = dst_ptr;
+ int width = w;
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+ do {
+ uint8x16_t t0 = vld1q_u8(s);
+ int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
+ int16x8_t tt8 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+ int16x4_t s0 = vget_low_s16(tt0);
+ int16x4_t s4 = vget_high_s16(tt0);
+ int16x4_t s8 = vget_low_s16(tt8);
+ int16x4_t s12 = vget_high_s16(tt8);
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- sum = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0);
- sum = vdotq_laneq_s32(sum, permuted_samples[1], filters, 1);
- sum = vdotq_laneq_s32(sum, permuted_samples[2], filters, 2);
+ int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+ int16x4_t s5 = vext_s16(s4, s8, 1); // a5 a6 a7 a8
+ int16x4_t s6 = vext_s16(s4, s8, 2); // a6 a7 a8 a9
+ int16x4_t s7 = vext_s16(s4, s8, 3); // a7 a8 a9 a10
+ int16x4_t s9 = vext_s16(s8, s12, 1); // a9 a10 a11 a12
+ int16x4_t s10 = vext_s16(s8, s12, 2); // a10 a11 a12 a13
+ int16x4_t s11 = vext_s16(s8, s12, 3); // a11 a12 a13 a14
- /* Narrow and re-pack. */
- return vshrn_n_s32(sum, ROUND0_BITS);
-}
+ int16x4_t d0 =
+ convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ x_filter_0_7, x_filter_8_11, horiz_const);
-static INLINE int16x8_t convolve12_horiz_8_sdot(
- uint8x16_t samples0, uint8x16_t samples1, const int8x16_t filters,
- const int32x4_t correction, const uint8x16_t range_limit,
- const uint8x16x3_t permute_tbl) {
- int8x16_t clamped_samples[2], permuted_samples[4];
- int32x4_t sum[2];
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples[0] = vreinterpretq_s8_u8(vsubq_u8(samples0, range_limit));
- clamped_samples[1] = vreinterpretq_s8_u8(vsubq_u8(samples1, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[2]);
- /* {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } */
- permuted_samples[3] = vqtbl1q_s8(clamped_samples[1], permute_tbl.val[2]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- sum[0] = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0);
- sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1);
- sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2);
- /* Second 4 output values. */
- sum[1] = vdotq_laneq_s32(correction, permuted_samples[1], filters, 0);
- sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1);
- sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2);
-
- /* Narrow and re-pack. */
- return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS),
- vshrn_n_s32(sum[1], ROUND0_BITS));
-}
+ uint8x8_t dd0 = vqmovun_s16(vcombine_s16(d0, vdup_n_s16(0)));
-static INLINE int32x4_t convolve12_4_sdot(uint8x16_t samples,
- const int8x16_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x3_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[3];
- int32x4_t sum;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- sum = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0);
- sum = vdotq_laneq_s32(sum, permuted_samples[1], filters, 1);
- sum = vdotq_laneq_s32(sum, permuted_samples[2], filters, 2);
+ store_u8_4x1(d, dd0, 0);
- return sum;
+ s += 4;
+ d += 4;
+ width -= 4;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+#endif // AOM_ARCH_AARCH64
}
-static INLINE int16x8_t convolve12_8_sdot(uint8x16_t samples0,
- uint8x16_t samples1,
- const int8x16_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x3_t permute_tbl) {
- int8x16_t clamped_samples[2], permuted_samples[4];
- int32x4_t sum[2];
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples[0] = vreinterpretq_s8_u8(vsubq_u8(samples0, range_limit));
- clamped_samples[1] = vreinterpretq_s8_u8(vsubq_u8(samples1, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[2]);
- /* {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } */
- permuted_samples[3] = vqtbl1q_s8(clamped_samples[1], permute_tbl.val[2]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- sum[0] = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0);
- sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1);
- sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2);
- /* Second 4 output values. */
- sum[1] = vdotq_laneq_s32(correction, permuted_samples[1], filters, 0);
- sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1);
- sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2);
-
- /* Narrow and re-pack. */
- return vcombine_s16(vqrshrn_n_s32(sum[0], FILTER_BITS),
- vqrshrn_n_s32(sum[1], FILTER_BITS));
-}
+static INLINE uint8x8_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t filter,
+ const int16x4_t horiz_const) {
+ int16x4_t sum = horiz_const;
+ sum = vmla_lane_s16(sum, s0, filter, 0);
+ sum = vmla_lane_s16(sum, s1, filter, 1);
+ sum = vmla_lane_s16(sum, s2, filter, 2);
+ sum = vmla_lane_s16(sum, s3, filter, 3);
-#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
+  // We halved the convolution filter values so -1 from the right shift.
+ return vqrshrun_n_s16(vcombine_s16(sum, vdup_n_s16(0)), FILTER_BITS - 1);
+}
-static INLINE uint8x8_t convolve8_vert_8x4(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7, const int16x8_t filter) {
+static INLINE uint8x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t filter,
+ const int16x8_t horiz_const) {
const int16x4_t filter_lo = vget_low_s16(filter);
const int16x4_t filter_hi = vget_high_s16(filter);
- int16x8_t sum;
- sum = vmulq_lane_s16(s0, filter_lo, 0);
+ int16x8_t sum = horiz_const;
+ sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
@@ -277,198 +223,26 @@ static INLINE uint8x8_t convolve8_vert_8x4(
sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
+  // We halved the convolution filter values so -1 from the right shift.
return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
-static INLINE int16x4_t convolve8_vert_4_s32(
- const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
- const int16x4_t y_filter_lo = vget_low_s16(y_filter);
- const int16x4_t y_filter_hi = vget_high_s16(y_filter);
- int32x4_t sum;
-
- sum = vmull_lane_s16(s0, y_filter_lo, 0);
- sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
- sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
- sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
- sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
- sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
- sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
- sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
-
- return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
-}
-
-static INLINE uint8x8_t
-convolve8_vert_8_s32(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7,
- const int16x8_t y_filter, const int16x8_t sub_const) {
- const int16x4_t y_filter_lo = vget_low_s16(y_filter);
- const int16x4_t y_filter_hi = vget_high_s16(y_filter);
- int32x4_t sum0, sum1;
- int16x8_t res;
-
- sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3);
-
- sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3);
-
- res = vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
- vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
- res = vsubq_s16(res, sub_const);
-
- return vqmovun_s16(res);
-}
-
-#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-void convolve_x_sr_12tap_neon(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, int w, int h,
- const int16_t *x_filter_ptr) {
- const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr);
- const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8);
- const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0));
- const int8x16_t filter =
- vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15));
-
- // Special case the following no-op filter as 128 won't fit into the
- // 8-bit signed dot-product instruction:
- // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
- if (vgetq_lane_s16(filter_0_7, 5) == 128) {
- uint8x8_t d0;
-
- // Undo the horizontal offset in the calling function.
- src += 5;
-
- for (int i = 0; i < h; i++) {
- for (int j = 0; j < w; j += 8) {
- d0 = vld1_u8(src + i * src_stride + j);
- if (w == 2) {
- store_u8_2x1(dst + i * dst_stride, d0, 0);
- } else if (w == 4) {
- store_u8_4x1(dst + i * dst_stride, d0, 0);
- } else {
- vst1_u8(dst + i * dst_stride + j, d0);
- }
- }
- }
- } else {
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
- // right shift by FILTER_BITS - instead of a first rounding right shift by
- // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
- // ROUND0_BITS.
- const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1));
-
- if (w <= 4) {
- uint8x16_t s0, s1, s2, s3;
- int32x4_t d0, d1, d2, d3;
- int16x8_t t01, t23;
- uint8x8_t d01, d23;
-
- do {
- load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve12_4_usdot(s0, filter, permute_tbl, horiz_const);
- d1 = convolve12_4_usdot(s1, filter, permute_tbl, horiz_const);
- d2 = convolve12_4_usdot(s2, filter, permute_tbl, horiz_const);
- d3 = convolve12_4_usdot(s3, filter, permute_tbl, horiz_const);
-
- t01 = vcombine_s16(vqrshrn_n_s32(d0, FILTER_BITS),
- vqrshrn_n_s32(d1, FILTER_BITS));
- t23 = vcombine_s16(vqrshrn_n_s32(d2, FILTER_BITS),
- vqrshrn_n_s32(d3, FILTER_BITS));
-
- d01 = vqmovun_s16(t01);
- d23 = vqmovun_s16(t23);
-
- if (w == 2) {
- store_u8_2x1(dst + 0 * dst_stride, d01, 0);
- store_u8_2x1(dst + 1 * dst_stride, d01, 2);
- if (h != 2) {
- store_u8_2x1(dst + 2 * dst_stride, d23, 0);
- store_u8_2x1(dst + 3 * dst_stride, d23, 2);
- }
- } else {
- store_u8_4x1(dst + 0 * dst_stride, d01, 0);
- store_u8_4x1(dst + 1 * dst_stride, d01, 1);
- if (h != 2) {
- store_u8_4x1(dst + 2 * dst_stride, d23, 0);
- store_u8_4x1(dst + 3 * dst_stride, d23, 1);
- }
- }
-
- dst += 4 * dst_stride;
- src += 4 * src_stride;
- h -= 4;
- } while (h > 0);
- } else {
- uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7;
- int16x8_t d0, d1, d2, d3;
- uint8x8_t dd0, dd1, dd2, dd3;
-
- do {
- const uint8_t *s = src;
- uint8_t *d = dst;
- int width = w;
-
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- load_u8_16x4(s + 4, src_stride, &s4, &s5, &s6, &s7);
-
- d0 = convolve12_8_usdot(s0, s4, filter, permute_tbl, horiz_const);
- d1 = convolve12_8_usdot(s1, s5, filter, permute_tbl, horiz_const);
- d2 = convolve12_8_usdot(s2, s6, filter, permute_tbl, horiz_const);
- d3 = convolve12_8_usdot(s3, s7, filter, permute_tbl, horiz_const);
-
- dd0 = vqmovun_s16(d0);
- dd1 = vqmovun_s16(d1);
- dd2 = vqmovun_s16(d2);
- dd3 = vqmovun_s16(d3);
-
- store_u8_8x2(d + 0 * dst_stride, dst_stride, dd0, dd1);
- if (h != 2) {
- store_u8_8x2(d + 2 * dst_stride, dst_stride, dd2, dd3);
- }
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 0);
- }
- }
-}
-
void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const int subpel_x_qn,
ConvolveParams *conv_params) {
- (void)conv_params;
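+  // The NEON kernels below do not handle w == 2 or h == 2, so defer those
+  // block sizes to the scalar C implementation.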
+ if (w == 2 || h == 2) {
+ av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ subpel_x_qn, conv_params);
+ return;
+ }
+
const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
+ src -= horiz_offset;
const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_x, subpel_x_qn & SUBPEL_MASK);
- src -= horiz_offset;
if (filter_params_x->taps > 8) {
convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
@@ -476,1125 +250,509 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
return;
}
- // Filter values are even, so downshift by 1 to reduce intermediate precision
- // requirements.
- const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
// This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
// rounding right shift by FILTER_BITS - instead of a first rounding right
// shift by ROUND0_BITS, followed by a second rounding right shift by
// FILTER_BITS - ROUND0_BITS.
- // The outermost -1 is needed because we halved the filter values.
- const int32x4_t horiz_const = vdupq_n_s32(1 << ((ROUND0_BITS - 1) - 1));
+ // The outermost -1 is needed because we will halve the filter values.
+ const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));
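+  // Halving the filter taps scales every intermediate sum by exactly 1/2 (the
+  // taps are all even), so the shim is halved to 1 << (ROUND0_BITS - 2) and
+  // the kernels below shift right by FILTER_BITS - 1 instead of FILTER_BITS.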
if (w <= 4) {
- const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- uint8x16_t s0, s1, s2, s3;
- int32x4_t t0, t1, t2, t3;
- int16x8_t t01, t23;
- uint8x8_t d01, d23;
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
+
+ src += 2;
do {
- load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+ uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- t0 = convolve8_4_usdot(s0, x_filter, permute_tbl, horiz_const);
- t1 = convolve8_4_usdot(s1, x_filter, permute_tbl, horiz_const);
- t2 = convolve8_4_usdot(s2, x_filter, permute_tbl, horiz_const);
- t3 = convolve8_4_usdot(s3, x_filter, permute_tbl, horiz_const);
+ int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
- t01 = vcombine_s16(vmovn_s32(t0), vmovn_s32(t1));
- t23 = vcombine_s16(vmovn_s32(t2), vmovn_s32(t3));
+ uint8x8_t d0 =
+ convolve4_4_x(s0, s1, s2, s3, x_filter, vget_low_s16(horiz_const));
- // We halved the convolution filter values so - 1 from the right shift.
- d01 = vqrshrun_n_s16(t01, FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(t23, FILTER_BITS - 1);
+ store_u8_4x1(dst, d0, 0);
- if (w == 2) {
- store_u8_2x1(dst + 0 * dst_stride, d01, 0);
- store_u8_2x1(dst + 1 * dst_stride, d01, 2);
- if (h != 2) {
- store_u8_2x1(dst + 2 * dst_stride, d23, 0);
- store_u8_2x1(dst + 3 * dst_stride, d23, 2);
- }
- } else {
- store_u8_4x1(dst + 0 * dst_stride, d01, 0);
- store_u8_4x1(dst + 1 * dst_stride, d01, 1);
- if (h != 2) {
- store_u8_4x1(dst + 2 * dst_stride, d23, 0);
- store_u8_4x1(dst + 3 * dst_stride, d23, 1);
- }
- }
+ src += src_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+    // Filter values are even, so halve to reduce precision requirements.
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
- h -= 4;
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- } while (h > 0);
+#if AOM_ARCH_AARCH64
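+    // Process 8 rows per iteration: transpose an 8x8 tile so that each vector
+    // lane carries a different row, apply the horizontal filter across the
+    // transposed vectors, then transpose the results back before storing.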
+ while (h >= 8) {
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- } else {
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- uint8x16_t s0, s1, s2, s3;
- int16x8_t t0, t1, t2, t3;
- uint8x8_t d0, d1, d2, d3;
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
- do {
int width = w;
- const uint8_t *s = src;
+ const uint8_t *s = src + 7;
uint8_t *d = dst;
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(d + 4 * dst_stride);
+ __builtin_prefetch(d + 5 * dst_stride);
+ __builtin_prefetch(d + 6 * dst_stride);
+ __builtin_prefetch(d + 7 * dst_stride);
+
do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- t0 = convolve8_x_8_usdot(s0, x_filter, permute_tbl, horiz_const);
- t1 = convolve8_x_8_usdot(s1, x_filter, permute_tbl, horiz_const);
- t2 = convolve8_x_8_usdot(s2, x_filter, permute_tbl, horiz_const);
- t3 = convolve8_x_8_usdot(s3, x_filter, permute_tbl, horiz_const);
-
- // We halved the convolution filter values so - 1 from the right shift.
- d0 = vqrshrun_n_s16(t0, FILTER_BITS - 1);
- d1 = vqrshrun_n_s16(t1, FILTER_BITS - 1);
- d2 = vqrshrun_n_s16(t2, FILTER_BITS - 1);
- d3 = vqrshrun_n_s16(t3, FILTER_BITS - 1);
-
- vst1_u8(d + 0 * dst_stride, d0);
- vst1_u8(d + 1 * dst_stride, d1);
- if (h != 2) {
- vst1_u8(d + 2 * dst_stride, d2);
- vst1_u8(d + 3 * dst_stride, d3);
- }
+ uint8x8_t t8, t9, t10, t11, t12, t13, t14;
+ load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
+
+ transpose_elems_inplace_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13,
+ &t14);
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
+
+ uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ horiz_const);
+ uint8x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+ horiz_const);
+ uint8x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+ horiz_const);
+ uint8x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+ horiz_const);
+ uint8x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
+ horiz_const);
+ uint8x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter, horiz_const);
+ uint8x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter, horiz_const);
+ uint8x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter, horiz_const);
+
+ transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+ store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
s += 8;
d += 8;
width -= 8;
- } while (width > 0);
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 0);
- }
-}
-
-#elif AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-void convolve_x_sr_12tap_neon(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, int w, int h,
- const int16_t *x_filter_ptr) {
- const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr);
- const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8);
- const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0));
- const int8x16_t filter =
- vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15));
-
- const int32x4_t correct_tmp =
- vaddq_s32(vpaddlq_s16(vshlq_n_s16(filter_0_7, 7)),
- vpaddlq_s16(vshlq_n_s16(filter_8_15, 7)));
- // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
- // right shift by FILTER_BITS - instead of a first rounding right shift by
- // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
- // ROUND0_BITS.
- int32x4_t correction =
- vdupq_n_s32(vaddvq_s32(correct_tmp) + (1 << (ROUND0_BITS - 1)));
- const uint8x16_t range_limit = vdupq_n_u8(128);
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-
- // Special case the following no-op filter as 128 won't fit into the
- // 8-bit signed dot-product instruction:
- // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
- if (vgetq_lane_s16(filter_0_7, 5) == 128) {
- uint8x8_t d0;
-
- // Undo the horizontal offset in the calling function.
- src += 5;
-
- for (int i = 0; i < h; i++) {
- for (int j = 0; j < w; j += 8) {
- d0 = vld1_u8(src + i * src_stride + j);
- if (w == 2) {
- store_u8_2x1(dst + i * dst_stride, d0, 0);
- } else if (w == 4) {
- store_u8_4x1(dst + i * dst_stride, d0, 0);
- } else {
- vst1_u8(dst + i * dst_stride + j, d0);
- }
- }
- }
- } else {
- if (w <= 4) {
- uint8x16_t s0, s1, s2, s3;
- int32x4_t d0, d1, d2, d3;
- int16x8_t t01, t23;
- uint8x8_t d01, d23;
-
- do {
- load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- d0 =
- convolve12_4_sdot(s0, filter, correction, range_limit, permute_tbl);
- d1 =
- convolve12_4_sdot(s1, filter, correction, range_limit, permute_tbl);
- d2 =
- convolve12_4_sdot(s2, filter, correction, range_limit, permute_tbl);
- d3 =
- convolve12_4_sdot(s3, filter, correction, range_limit, permute_tbl);
-
- t01 = vcombine_s16(vqrshrn_n_s32(d0, FILTER_BITS),
- vqrshrn_n_s32(d1, FILTER_BITS));
- t23 = vcombine_s16(vqrshrn_n_s32(d2, FILTER_BITS),
- vqrshrn_n_s32(d3, FILTER_BITS));
-
- d01 = vqmovun_s16(t01);
- d23 = vqmovun_s16(t23);
-
- if (w == 2) {
- store_u8_2x1(dst + 0 * dst_stride, d01, 0);
- store_u8_2x1(dst + 1 * dst_stride, d01, 2);
- if (h != 2) {
- store_u8_2x1(dst + 2 * dst_stride, d23, 0);
- store_u8_2x1(dst + 3 * dst_stride, d23, 2);
- }
- } else {
- store_u8_4x1(dst + 0 * dst_stride, d01, 0);
- store_u8_4x1(dst + 1 * dst_stride, d01, 1);
- if (h != 2) {
- store_u8_4x1(dst + 2 * dst_stride, d23, 0);
- store_u8_4x1(dst + 3 * dst_stride, d23, 1);
- }
- }
-
- dst += 4 * dst_stride;
- src += 4 * src_stride;
- h -= 4;
- } while (h > 0);
- } else {
- uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7;
- int16x8_t d0, d1, d2, d3;
- uint8x8_t dd0, dd1, dd2, dd3;
-
- do {
- const uint8_t *s = src;
- uint8_t *d = dst;
- int width = w;
-
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- load_u8_16x4(s + 4, src_stride, &s4, &s5, &s6, &s7);
-
- d0 = convolve12_8_sdot(s0, s4, filter, correction, range_limit,
- permute_tbl);
- d1 = convolve12_8_sdot(s1, s5, filter, correction, range_limit,
- permute_tbl);
- d2 = convolve12_8_sdot(s2, s6, filter, correction, range_limit,
- permute_tbl);
- d3 = convolve12_8_sdot(s3, s7, filter, correction, range_limit,
- permute_tbl);
-
- dd0 = vqmovun_s16(d0);
- dd1 = vqmovun_s16(d1);
- dd2 = vqmovun_s16(d2);
- dd3 = vqmovun_s16(d3);
-
- store_u8_8x2(d + 0 * dst_stride, dst_stride, dd0, dd1);
- if (h != 2) {
- store_u8_8x2(d + 2 * dst_stride, dst_stride, dd2, dd3);
- }
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 0);
+ } while (width != 0);
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
}
- }
-}
-
-void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const int subpel_x_qn,
- ConvolveParams *conv_params) {
- (void)conv_params;
- const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
-
- const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
- src -= horiz_offset;
-
- if (filter_params_x->taps > 8) {
- convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
- x_filter_ptr);
- return;
- }
-
- // Filter values are even, so downshift by 1 to reduce intermediate precision
- // requirements.
- const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
- // Dot product constants.
- const int16x8_t correct_tmp = vshll_n_s8(x_filter, 7);
- // This shim of (1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
- // rounding right shift by FILTER_BITS - instead of a first rounding right
- // shift by ROUND0_BITS, followed by second rounding right shift by
- // FILTER_BITS - ROUND0_BITS.
- // The outermost -1 is needed because we halved the filter values.
- const int32x4_t correction =
- vdupq_n_s32(vaddlvq_s16(correct_tmp) + (1 << ((ROUND0_BITS - 1) - 1)));
- const uint8x16_t range_limit = vdupq_n_u8(128);
-
- if (w <= 4) {
- const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- uint8x16_t s0, s1, s2, s3;
- int32x4_t t0, t1, t2, t3;
- int16x8_t t01, t23;
- uint8x8_t d01, d23;
-
- do {
- load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- t0 = convolve8_4_sdot(s0, x_filter, correction, range_limit, permute_tbl);
- t1 = convolve8_4_sdot(s1, x_filter, correction, range_limit, permute_tbl);
- t2 = convolve8_4_sdot(s2, x_filter, correction, range_limit, permute_tbl);
- t3 = convolve8_4_sdot(s3, x_filter, correction, range_limit, permute_tbl);
-
- t01 = vcombine_s16(vmovn_s32(t0), vmovn_s32(t1));
- t23 = vcombine_s16(vmovn_s32(t2), vmovn_s32(t3));
-
- // We halved the convolution filter values so - 1 from the right shift.
- d01 = vqrshrun_n_s16(t01, FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(t23, FILTER_BITS - 1);
-
- if (w == 2) {
- store_u8_2x1(dst + 0 * dst_stride, d01, 0);
- store_u8_2x1(dst + 1 * dst_stride, d01, 2);
- if (h != 2) {
- store_u8_2x1(dst + 2 * dst_stride, d23, 0);
- store_u8_2x1(dst + 3 * dst_stride, d23, 2);
- }
- } else {
- store_u8_4x1(dst + 0 * dst_stride, d01, 0);
- store_u8_4x1(dst + 1 * dst_stride, d01, 1);
- if (h != 2) {
- store_u8_4x1(dst + 2 * dst_stride, d23, 0);
- store_u8_4x1(dst + 3 * dst_stride, d23, 1);
- }
- }
+#endif // AOM_ARCH_AARCH64
- h -= 4;
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- } while (h > 0);
- } else {
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- uint8x16_t s0, s1, s2, s3;
- int16x8_t t0, t1, t2, t3;
- uint8x8_t d0, d1, d2, d3;
+ while (h-- != 0) {
+ uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- do {
int width = w;
- const uint8_t *s = src;
+ const uint8_t *s = src + 8;
uint8_t *d = dst;
+ __builtin_prefetch(d);
+
do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- t0 = convolve8_x_8_sdot(s0, x_filter, correction, range_limit,
- permute_tbl);
- t1 = convolve8_x_8_sdot(s1, x_filter, correction, range_limit,
- permute_tbl);
- t2 = convolve8_x_8_sdot(s2, x_filter, correction, range_limit,
- permute_tbl);
- t3 = convolve8_x_8_sdot(s3, x_filter, correction, range_limit,
- permute_tbl);
-
- // We halved the convolution filter values so - 1 from the right shift.
- d0 = vqrshrun_n_s16(t0, FILTER_BITS - 1);
- d1 = vqrshrun_n_s16(t1, FILTER_BITS - 1);
- d2 = vqrshrun_n_s16(t2, FILTER_BITS - 1);
- d3 = vqrshrun_n_s16(t3, FILTER_BITS - 1);
-
- vst1_u8(d + 0 * dst_stride, d0);
- vst1_u8(d + 1 * dst_stride, d1);
- if (h != 2) {
- vst1_u8(d + 2 * dst_stride, d2);
- vst1_u8(d + 3 * dst_stride, d3);
- }
+ uint8x8_t t8 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+
+ int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ horiz_const);
+
+ vst1_u8(d, d0);
+ s0 = s8;
s += 8;
d += 8;
width -= 8;
- } while (width > 0);
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 0);
+ } while (width != 0);
+ src += src_stride;
+ dst += dst_stride;
+ }
}
}
-#else // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
+static INLINE int16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t s4, const int16x4_t s5,
+ const int16x8_t y_filter_0_7) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
-static INLINE uint8x8_t
-convolve8_horiz_8x8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7,
- const int16x8_t filter, const int16x8_t horiz_const) {
- const int16x4_t filter_lo = vget_low_s16(filter);
- const int16x4_t filter_hi = vget_high_s16(filter);
- int16x8_t sum = horiz_const;
+ // Filter values at indices 0 and 7 are 0.
+ int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 1);
+ sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2);
+ sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3);
+ sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0);
+ sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1);
+ sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2);
- sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
- sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
- sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
- sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
- sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
- sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
- sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
- sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
-
- // We halved the convolution filter values so - 1 from the right shift.
- return vqrshrun_n_s16(sum, FILTER_BITS - 1);
+ return sum;
}
-static INLINE int16x4_t convolve12_x_4x4_s16(
- const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
- const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
- const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11,
- const int32x4_t horiz_const) {
- const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
- const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
- int32x4_t sum = horiz_const;
-
- sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0);
- sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1);
- sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2);
- sum = vmlal_lane_s16(sum, s3, x_filter_0_3, 3);
- sum = vmlal_lane_s16(sum, s4, x_filter_4_7, 0);
- sum = vmlal_lane_s16(sum, s5, x_filter_4_7, 1);
- sum = vmlal_lane_s16(sum, s6, x_filter_4_7, 2);
- sum = vmlal_lane_s16(sum, s7, x_filter_4_7, 3);
- sum = vmlal_lane_s16(sum, s8, x_filter_8_11, 0);
- sum = vmlal_lane_s16(sum, s9, x_filter_8_11, 1);
- sum = vmlal_lane_s16(sum, s10, x_filter_8_11, 2);
- sum = vmlal_lane_s16(sum, s11, x_filter_8_11, 3);
-
- return vqrshrn_n_s32(sum, FILTER_BITS);
+static INLINE uint8x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t y_filters) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filters);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filters);
+
+ // Filter values at indices 0 and 7 are 0.
+ int16x8_t sum = vmulq_lane_s16(s0, y_filter_lo, 1);
+ sum = vmlaq_lane_s16(sum, s1, y_filter_lo, 2);
+ sum = vmlaq_lane_s16(sum, s2, y_filter_lo, 3);
+ sum = vmlaq_lane_s16(sum, s3, y_filter_hi, 0);
+ sum = vmlaq_lane_s16(sum, s4, y_filter_hi, 1);
+ sum = vmlaq_lane_s16(sum, s5, y_filter_hi, 2);
+ // We halved the convolution filter values so -1 from the right shift.
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
-// 4 column per iteration filtering for 12-tap convolve_x_sr.
-// Processes one row at a time.
-static INLINE void x_filter_12tap_w4_single_row(
- const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr,
- const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
- const int16x4_t x_filter_8_11) {
- // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single
- // rounding right shift by FILTER_BITS - instead of a first rounding right
- // shift by ROUND0_BITS, followed by second rounding right shift by
- // FILTER_BITS - ROUND0_BITS.
- const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1));
+static INLINE void convolve_y_sr_6tap_neon(const uint8_t *src_ptr,
+ int src_stride, uint8_t *dst_ptr,
+ const int dst_stride, int w, int h,
+ const int16x8_t y_filter) {
+ if (w <= 4) {
+ uint8x8_t t0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
+ uint8x8_t t1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
+ uint8x8_t t2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
+ uint8x8_t t3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
+ uint8x8_t t4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride);
+
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
- do {
- const uint8_t *s = src_ptr;
- uint8_t *d = dst_ptr;
- int width = w;
+ src_ptr += 5 * src_stride;
do {
- uint8x8_t dd0;
- uint8x16_t t0;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, d0;
- int16x8_t tt0, tt1;
-
- t0 = vld1q_u8(s);
- tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
- tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
-
- s0 = vget_low_s16(tt0);
- s4 = vget_high_s16(tt0);
- s8 = vget_low_s16(tt1);
- s12 = vget_high_s16(tt1);
-
- s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
- s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
- s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
- s5 = vext_s16(s4, s8, 1); // a5 a6 a7 a8
- s6 = vext_s16(s4, s8, 2); // a6 a7 a8 a9
- s7 = vext_s16(s4, s8, 3); // a7 a8 a9 a10
- s9 = vext_s16(s8, s12, 1); // a9 a10 a11 a12
- s10 = vext_s16(s8, s12, 2); // a10 a11 a12 a13
- s11 = vext_s16(s8, s12, 3); // a11 a12 a13 a14
-
- d0 = convolve12_x_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
- s11, x_filter_0_7, x_filter_8_11, horiz_const);
-
- dd0 = vqmovun_s16(vcombine_s16(d0, vdup_n_s16(0)));
-
- if (w == 2) {
- store_u8_2x1(d, dd0, 0);
- } else {
- store_u8_4x1(d, dd0, 0);
- }
-
- s += 4;
- d += 4;
- width -= 4;
- } while (width > 0);
-
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- } while (--h != 0);
-}
-
-static INLINE void convolve_x_sr_12tap_neon(const uint8_t *src_ptr,
- int src_stride, uint8_t *dst_ptr,
- const int dst_stride, int w, int h,
- const int16_t *x_filter_ptr) {
- const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
- const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
-
#if AOM_ARCH_AARCH64
- // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single
- // rounding right shift by FILTER_BITS - instead of a first rounding right
- // shift by ROUND0_BITS, followed by second rounding right shift by
- // FILTER_BITS - ROUND0_BITS.
- const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1));
-
- do {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint8x8_t t0, t1, t2, t3;
+ uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
+ uint8x8_t t6 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
+ uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
+ uint8x8_t t8 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
- const uint8_t *s = src_ptr;
- uint8_t *d = dst_ptr;
- int width = w;
-
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
+ int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
+ int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
+ int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-
- load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
- s += 11;
+ int16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter);
+ int16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter);
+ int16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter);
+ int16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter);
- do {
- int16x4_t s11, s12, s13, s14, d0, d1, d2, d3;
- int16x8_t d01, d23;
- uint8x8_t dd01, dd23;
-
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
- s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-
- d0 = convolve12_x_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
- s11, x_filter_0_7, x_filter_8_11, horiz_const);
- d1 = convolve12_x_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
- s12, x_filter_0_7, x_filter_8_11, horiz_const);
- d2 = convolve12_x_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
- s13, x_filter_0_7, x_filter_8_11, horiz_const);
- d3 = convolve12_x_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
- s14, x_filter_0_7, x_filter_8_11, horiz_const);
-
- transpose_s16_4x4d(&d0, &d1, &d2, &d3);
-
- d01 = vcombine_s16(d0, d1);
- d23 = vcombine_s16(d2, d3);
+ // We halved the convolution filter values so -1 from the right shift.
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
- dd01 = vqmovun_s16(d01);
- dd23 = vqmovun_s16(d23);
-
- if (w == 2) {
- store_u8_2x1(d + 0 * dst_stride, dd01, 0);
- store_u8_2x1(d + 1 * dst_stride, dd01, 2);
- if (h != 2) {
- store_u8_2x1(d + 2 * dst_stride, dd23, 0);
- store_u8_2x1(d + 3 * dst_stride, dd23, 2);
- }
- } else {
- store_u8_4x1(d + 0 * dst_stride, dd01, 0);
- store_u8_4x1(d + 1 * dst_stride, dd01, 1);
- if (h != 2) {
- store_u8_4x1(d + 2 * dst_stride, dd23, 0);
- store_u8_4x1(d + 3 * dst_stride, dd23, 1);
- }
- }
+ store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0);
+ store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1);
+ store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0);
+ store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1);
s0 = s4;
s1 = s5;
s2 = s6;
s3 = s7;
s4 = s8;
- s5 = s9;
- s6 = s10;
- s7 = s11;
- s8 = s12;
- s9 = s13;
- s10 = s14;
- s += 4;
- d += 4;
- width -= 4;
- } while (width > 0);
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- h -= 4;
- } while (h >= 4);
-
- if (h > 0) {
- x_filter_12tap_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w, h,
- x_filter_0_7, x_filter_8_11);
- }
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
#else // !AOM_ARCH_AARCH64
- x_filter_12tap_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w, h,
- x_filter_0_7, x_filter_8_11);
-#endif // AOM_ARCH_AARCH64
-}
+ uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr);
+ int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
-void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const int subpel_x_qn,
- ConvolveParams *conv_params) {
- (void)conv_params;
- const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
-
- const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
- src -= horiz_offset;
+ int16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter);
+ // We halved the convolution filter values so -1 from the right shift.
+ uint8x8_t d01 =
+ vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1);
- if (filter_params_x->taps > 8) {
- convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
- x_filter_ptr);
- return;
- }
+ store_u8_4x1(dst_ptr, d01, 0);
- uint8x8_t t0;
-#if AOM_ARCH_AARCH64
- uint8x8_t t1, t2, t3;
- // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
- // rounding right shift by FILTER_BITS - instead of a first rounding right
- // shift by ROUND0_BITS, followed by second rounding right shift by
- // FILTER_BITS - ROUND0_BITS.
- // The outermost -1 is needed because we halved the filter values.
- const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ h--;
#endif // AOM_ARCH_AARCH64
- // Filter values are even so downshift by 1 to reduce precision requirements.
- const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
-
-#if AOM_ARCH_AARCH64
- if (h == 4) {
- uint8x8_t d01, d23;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
- int16x8_t d01_temp, d23_temp;
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
-
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- src += 7;
+ } while (h != 0);
+ } else {
do {
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ const uint8_t *s = src_ptr;
+ uint8_t *d = dst_ptr;
+ int height = h;
- s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ uint8x8_t t0, t1, t2, t3, t4;
+ load_u8_8x5(s, src_stride, &t0, &t1, &t2, &t3, &t4);
- d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter);
- d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter);
- d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter);
- d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter);
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- d01_temp = vcombine_s16(d0, d1);
- d23_temp = vcombine_s16(d2, d3);
+ s += 5 * src_stride;
- d01_temp = vaddq_s16(d01_temp, horiz_const);
- d23_temp = vaddq_s16(d23_temp, horiz_const);
+ do {
+#if AOM_ARCH_AARCH64
+ uint8x8_t t5, t6, t7, t8;
+ load_u8_8x4(s, src_stride, &t5, &t6, &t7, &t8);
- // We halved the convolution filter values so - 1 from the right shift.
- d01 = vqrshrun_n_s16(d01_temp, FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(d23_temp, FILTER_BITS - 1);
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
- transpose_u8_4x4(&d01, &d23);
+ uint8x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter);
+ uint8x8_t d1 = convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter);
+ uint8x8_t d2 = convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter);
+ uint8x8_t d3 = convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter);
- if (w == 2) {
- store_u8_2x1(dst + 0 * dst_stride, d01, 0);
- store_u8_2x1(dst + 1 * dst_stride, d23, 0);
- store_u8_2x1(dst + 2 * dst_stride, d01, 2);
- store_u8_2x1(dst + 3 * dst_stride, d23, 2);
- } else {
- store_u8_4x1(dst + 0 * dst_stride, d01, 0);
- store_u8_4x1(dst + 1 * dst_stride, d23, 0);
- store_u8_4x1(dst + 2 * dst_stride, d01, 1);
- store_u8_4x1(dst + 3 * dst_stride, d23, 1);
- }
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- src += 4;
- dst += 4;
- w -= 4;
- } while (w > 0);
- } else {
-#endif // AOM_ARCH_AARCH64
- int width;
- const uint8_t *s;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-
-#if AOM_ARCH_AARCH64
- int16x8_t s8, s9, s10;
- uint8x8_t t4, t5, t6, t7;
-#endif // AOM_ARCH_AARCH64
-
- if (w <= 4) {
-#if AOM_ARCH_AARCH64
- do {
- load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
- &t7);
- src += 8 * src_stride;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
-
- transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
-
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
- t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- horiz_const);
- t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
- horiz_const);
- t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
- horiz_const);
- t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
- horiz_const);
-
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
- if (w == 4) {
- store_u8_4x1(dst + 0 * dst_stride, t0, 0);
- store_u8_4x1(dst + 1 * dst_stride, t1, 0);
- if (h > 4) {
- store_u8_4x1(dst + 2 * dst_stride, t2, 0);
- store_u8_4x1(dst + 3 * dst_stride, t3, 0);
- store_u8_4x1(dst + 4 * dst_stride, t0, 1);
- store_u8_4x1(dst + 5 * dst_stride, t1, 1);
- store_u8_4x1(dst + 6 * dst_stride, t2, 1);
- store_u8_4x1(dst + 7 * dst_stride, t3, 1);
- }
- } else if (w == 2) {
- store_u8_2x1(dst + 0 * dst_stride, t0, 0);
- store_u8_2x1(dst + 1 * dst_stride, t1, 0);
- if (h > 4) {
- store_u8_2x1(dst + 2 * dst_stride, t2, 0);
- store_u8_2x1(dst + 3 * dst_stride, t3, 0);
- store_u8_2x1(dst + 4 * dst_stride, t0, 2);
- store_u8_2x1(dst + 5 * dst_stride, t1, 2);
- store_u8_2x1(dst + 6 * dst_stride, t2, 2);
- store_u8_2x1(dst + 7 * dst_stride, t3, 2);
- }
- }
-
- dst += 8 * dst_stride;
- h -= 8;
- } while (h > 0);
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
#else // !AOM_ARCH_AARCH64
- // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
- // rounding right shift by FILTER_BITS - instead of a first rounding right
- // shift by ROUND0_BITS, followed by second rounding right shift by
- // FILTER_BITS - ROUND0_BITS.
- // The outermost -1 is needed because we halved the filter values.
- const int16x4_t horiz_const = vdup_n_s16(1 << ((ROUND0_BITS - 1) - 1));
- int16x8_t tt0;
- int16x4_t x0, x1, x2, x3, x4, x5, x6, x7;
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- do {
- t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
- tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- x0 = vget_low_s16(tt0); // a0 a1 a2 a3
- x4 = vget_high_s16(tt0); // a4 a5 a6 a7
-
- t0 = vld1_u8(src + 8); // a8 a9 a10 a11 a12 a13 a14 a15
- tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- x7 = vget_low_s16(tt0); // a8 a9 a10 a11
-
- x1 = vext_s16(x0, x4, 1); // a1 a2 a3 a4
- x2 = vext_s16(x0, x4, 2); // a2 a3 a4 a5
- x3 = vext_s16(x0, x4, 3); // a3 a4 a5 a6
- x5 = vext_s16(x4, x7, 1); // a5 a6 a7 a8
- x6 = vext_s16(x4, x7, 2); // a6 a7 a8 a9
- x7 = vext_s16(x4, x7, 3); // a7 a8 a9 a10
+ uint8x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter);
- src += src_stride;
-
- t0 = convolve8_x_4x1(x0, x1, x2, x3, x4, x5, x6, x7, x_filter,
- horiz_const);
+ vst1_u8(d, d0);
- if (w == 4) {
- store_u8_4x1(dst, t0, 0);
- dst += dst_stride;
- } else if (w == 2) {
- store_u8_2x1(dst, t0, 0);
- dst += dst_stride;
- }
- h -= 1;
- } while (h > 0);
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s += src_stride;
+ d += dst_stride;
+ height--;
#endif // AOM_ARCH_AARCH64
- } else {
- uint8_t *d;
- int16x8_t s11;
-#if AOM_ARCH_AARCH64
- int16x8_t s12, s13, s14;
- do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
- load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- width = w;
- s = src + 7;
- d = dst;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
-
- do {
- load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
- t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- horiz_const);
- t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
- horiz_const);
- t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
- horiz_const);
- t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
- horiz_const);
- t4 = convolve8_horiz_8x8(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
- horiz_const);
- t5 = convolve8_horiz_8x8(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
- horiz_const);
- t6 = convolve8_horiz_8x8(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
- horiz_const);
- t7 = convolve8_horiz_8x8(s7, s8, s9, s10, s11, s12, s13, s14,
- x_filter, horiz_const);
-
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
- if (h != 2) {
- store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
- } else {
- store_u8_8x2(d, dst_stride, t0, t1);
- }
-
- s0 = s8;
- s1 = s9;
- s2 = s10;
- s3 = s11;
- s4 = s12;
- s5 = s13;
- s6 = s14;
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- src += 8 * src_stride;
- dst += 8 * dst_stride;
- h -= 8;
- } while (h > 0);
-#else // !AOM_ARCH_AARCH64
- // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
- // rounding right shift by FILTER_BITS - instead of a first rounding right
- // shift by ROUND0_BITS, followed by second rounding right shift by
- // FILTER_BITS - ROUND0_BITS.
- // The outermost -1 is needed because we halved the filter values.
- const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));
-
- do {
- t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
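
The rewritten 6-tap path above keeps a sliding window of source rows in registers: the AArch64 branch loads four fresh rows and emits four output rows per iteration, while the Armv7 branch loads one row and rotates the window by one. A minimal scalar sketch of the same idea, assuming FILTER_BITS == 7 and an illustrative filter whose taps sum to 128 (the NEON code additionally halves the taps and shifts by one bit less):

#include <stdint.h>

// Scalar model of the sliding-window vertical convolution: each output row
// reuses five of the six source rows from the previous step, so only one new
// row is loaded per output row (or four per iteration in the unrolled
// AArch64 path). Illustrative only; clamping mirrors vqrshrun behaviour.
static void convolve_y_6tap_scalar(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride, int w, int h,
                                   const int16_t filter[6]) {
  for (int x = 0; x < w; ++x) {
    int16_t s[6];
    for (int i = 0; i < 5; ++i) s[i] = src[i * src_stride + x];  // Prime window.
    for (int y = 0; y < h; ++y) {
      s[5] = src[(y + 5) * src_stride + x];  // One new row per output row.
      int32_t sum = 0;
      for (int k = 0; k < 6; ++k) sum += s[k] * filter[k];
      const int32_t out = (sum + 64) >> 7;  // Rounding shift by FILTER_BITS.
      dst[y * dst_stride + x] =
          (uint8_t)(out < 0 ? 0 : (out > 255 ? 255 : out));
      for (int k = 0; k < 5; ++k) s[k] = s[k + 1];  // Slide the window down.
    }
  }
}
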
- width = w;
- s = src + 8;
- d = dst;
- __builtin_prefetch(dst);
+static INLINE int16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16x8_t filter) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
- do {
- t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s11 = s0;
- s0 = s7;
+ int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0);
+ sum = vmla_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmla_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmla_lane_s16(sum, s3, filter_lo, 3);
+ sum = vmla_lane_s16(sum, s4, filter_hi, 0);
+ sum = vmla_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmla_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmla_lane_s16(sum, s7, filter_hi, 3);
- s1 = vextq_s16(s11, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
- s2 = vextq_s16(s11, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
- s3 = vextq_s16(s11, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
- s4 = vextq_s16(s11, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
- s5 = vextq_s16(s11, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
- s6 = vextq_s16(s11, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
- s7 = vextq_s16(s11, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+ return sum;
+}
- t0 = convolve8_horiz_8x8(s11, s1, s2, s3, s4, s5, s6, s7, x_filter,
- horiz_const);
+static INLINE uint8x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t filter) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
- vst1_u8(d, t0);
+ int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0);
+ sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
+ sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
+ sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- src += src_stride;
- dst += dst_stride;
- h -= 1;
- } while (h > 0);
-#endif // AOM_ARCH_AARCH64
- }
-#if AOM_ARCH_AARCH64
- }
-#endif // AOM_ARCH_AARCH64
+  // We halved the filter values, so subtract 1 from the right shift.
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
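
The FILTER_BITS - 1 shifts above rest on a simple identity: halving even filter taps halves the accumulated sum, so shifting by one bit less (with a correspondingly halved rounding constant) gives the same result as the full FILTER_BITS rounding shift. A minimal sketch, using an illustrative even-valued 8-tap kernel rather than one taken from the AV1 filter tables:

#include <assert.h>
#include <stdint.h>

#define FILTER_BITS 7  // As in the AV1 convolution pipeline.

// Rounding arithmetic right shift; assumes arithmetic shift semantics for
// negative values, as on the targets this file supports.
static int32_t rounded_shift(int32_t x, int bits) {
  return (x + (1 << (bits - 1))) >> bits;
}

static void check_halved_filter(void) {
  // Illustrative even 8-tap kernel summing to 128, and its halved form.
  const int16_t filter[8] = { -2, 8, -20, 78, 78, -20, 8, -2 };
  const int16_t half[8] = { -1, 4, -10, 39, 39, -10, 4, -1 };
  const int16_t src[8] = { 3, 250, 17, 128, 64, 9, 200, 1 };
  int32_t sum = 0, half_sum = 0;
  for (int k = 0; k < 8; ++k) {
    sum += filter[k] * src[k];
    half_sum += half[k] * src[k];
  }
  // sum == 2 * half_sum, so the two rounded shifts agree exactly.
  assert(rounded_shift(sum, FILTER_BITS) ==
         rounded_shift(half_sum, FILTER_BITS - 1));
}
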
-#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE void convolve_y_sr_6tap_neon(const uint8_t *src_ptr,
+static INLINE void convolve_y_sr_8tap_neon(const uint8_t *src_ptr,
int src_stride, uint8_t *dst_ptr,
const int dst_stride, int w, int h,
- const int16x8_t y_filter_0_7) {
+ const int16x8_t y_filter) {
if (w <= 4) {
- uint8x8_t t0, t1, t2, t3, t4, t5;
- int16x4_t s0, s1, s2, s3, s4, s5, d0;
- uint8x8_t d01;
-
-#if AOM_ARCH_AARCH64
- uint8x8_t t6, t7, t8;
- int16x4_t s6, s7, s8, d1, d2, d3;
- uint8x8_t d23;
-#endif // AOM_ARCH_AARCH64
-
- const uint8_t *s = src_ptr + src_stride;
- uint8_t *d = dst_ptr;
-
- load_u8_8x5(s, src_stride, &t0, &t1, &t2, &t3, &t4);
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
- s += 5 * src_stride;
+ uint8x8_t t0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
+ uint8x8_t t1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
+ uint8x8_t t2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
+ uint8x8_t t3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
+ uint8x8_t t4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride);
+ uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr + 5 * src_stride);
+ uint8x8_t t6 = load_unaligned_u8_4x1(src_ptr + 6 * src_stride);
+
+ int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+ int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+
+ src_ptr += 7 * src_stride;
do {
#if AOM_ARCH_AARCH64
- load_u8_8x4(s, src_stride, &t5, &t6, &t7, &t8);
- s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
- s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
- s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
+ uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
+ uint8x8_t t8 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
+ uint8x8_t t9 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
+ uint8x8_t t10 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
- d0 = convolve6_4x4(s0, s1, s2, s3, s4, s5, y_filter_0_7);
- d1 = convolve6_4x4(s1, s2, s3, s4, s5, s6, y_filter_0_7);
- d2 = convolve6_4x4(s2, s3, s4, s5, s6, s7, y_filter_0_7);
- d3 = convolve6_4x4(s3, s4, s5, s6, s7, s8, y_filter_0_7);
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
+ int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8)));
+ int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9)));
+ int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10)));
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+ int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ int16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+ int16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+ int16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
- if (w == 2) {
- store_u8_2x1(d + 0 * dst_stride, d01, 0);
- store_u8_2x1(d + 1 * dst_stride, d01, 2);
- if (h != 2) {
- store_u8_2x1(d + 2 * dst_stride, d23, 0);
- store_u8_2x1(d + 3 * dst_stride, d23, 2);
- }
- } else {
- store_u8_4x1(d + 0 * dst_stride, d01, 0);
- store_u8_4x1(d + 1 * dst_stride, d01, 1);
- if (h != 2) {
- store_u8_4x1(d + 2 * dst_stride, d23, 0);
- store_u8_4x1(d + 3 * dst_stride, d23, 1);
- }
- }
+      // We halved the filter values, so subtract 1 from the right shift.
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+ store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0);
+ store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1);
+ store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0);
+ store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1);
s0 = s4;
s1 = s5;
s2 = s6;
s3 = s7;
s4 = s8;
- s += 4 * src_stride;
- d += 4 * dst_stride;
+ s5 = s9;
+ s6 = s10;
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
h -= 4;
#else // !AOM_ARCH_AARCH64
- t5 = vld1_u8(s);
- s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
+ uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr);
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
- d0 = convolve6_4x4(s0, s1, s2, s3, s4, s5, y_filter_0_7);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1);
+ int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+      // We halved the filter values, so subtract 1 from the right shift.
+ uint8x8_t d01 =
+ vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1);
- if (w == 2) {
- store_u8_2x1(d, d01, 0);
- } else {
- store_u8_4x1(d, d01, 0);
- }
+ store_u8_4x1(dst_ptr, d01, 0);
s0 = s1;
s1 = s2;
s2 = s3;
s3 = s4;
s4 = s5;
- s += src_stride;
- d += dst_stride;
+ s5 = s6;
+ s6 = s7;
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
h--;
#endif // AOM_ARCH_AARCH64
- } while (h > 0);
+ } while (h != 0);
} else {
- // if width is a multiple of 8 & height is a multiple of 4
- uint8x8_t t0, t1, t2, t3, t4, t5;
- int16x8_t s0, s1, s2, s3, s4, s5, dd0;
- uint8x8_t d0;
-#if AOM_ARCH_AARCH64
- uint8x8_t t6, t7, t8;
- int16x8_t s6, s7, s8, dd1, dd2, dd3;
- uint8x8_t d1, d2, d3;
-#endif // AOM_ARCH_AARCH64
-
do {
- int height = h;
- const uint8_t *s = src_ptr + src_stride;
+ const uint8_t *s = src_ptr;
uint8_t *d = dst_ptr;
+ int height = h;
- load_u8_8x5(s, src_stride, &t0, &t1, &t2, &t3, &t4);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s += 5 * src_stride;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+ load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ s += 7 * src_stride;
do {
#if AOM_ARCH_AARCH64
- load_u8_8x4(s, src_stride, &t5, &t6, &t7, &t8);
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
-
- dd0 = convolve6_8x4(s0, s1, s2, s3, s4, s5, y_filter_0_7);
- dd1 = convolve6_8x4(s1, s2, s3, s4, s5, s6, y_filter_0_7);
- dd2 = convolve6_8x4(s2, s3, s4, s5, s6, s7, y_filter_0_7);
- dd3 = convolve6_8x4(s3, s4, s5, s6, s7, s8, y_filter_0_7);
-
- d0 = vqrshrun_n_s16(dd0, FILTER_BITS - 1);
- d1 = vqrshrun_n_s16(dd1, FILTER_BITS - 1);
- d2 = vqrshrun_n_s16(dd2, FILTER_BITS - 1);
- d3 = vqrshrun_n_s16(dd3, FILTER_BITS - 1);
-
- if (h != 2) {
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
- } else {
- store_u8_8x2(d, dst_stride, d0, d1);
- }
+ uint8x8_t t7, t8, t9, t10;
+ load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
+
+ uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ uint8x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+ uint8x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+ uint8x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
s2 = s6;
s3 = s7;
s4 = s8;
+ s5 = s9;
+ s6 = s10;
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
#else // !AOM_ARCH_AARCH64
- t5 = vld1_u8(s);
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- dd0 = convolve6_8x4(s0, s1, s2, s3, s4, s5, y_filter_0_7);
- d0 = vqrshrun_n_s16(dd0, FILTER_BITS - 1);
+ uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
vst1_u8(d, d0);
@@ -1603,25 +761,28 @@ static INLINE void convolve_y_sr_6tap_neon(const uint8_t *src_ptr,
s2 = s3;
s3 = s4;
s4 = s5;
+ s5 = s6;
+ s6 = s7;
s += src_stride;
d += dst_stride;
height--;
#endif // AOM_ARCH_AARCH64
- } while (height > 0);
-
+ } while (height != 0);
src_ptr += 8;
dst_ptr += 8;
w -= 8;
- } while (w > 0);
+ } while (w != 0);
}
}
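
One idiom worth calling out in both vertical kernels: pixels are widened with vmovl_u8 and then reinterpreted as signed via vreinterpretq_s16_u16. Since 8-bit samples are at most 255, they always land in the positive half of the int16 range, so the reinterpret is lossless and lets the multiplies use signed filter taps. A minimal sketch:

#include <arm_neon.h>

// Lossless u8 -> s16 widening: 0..255 fits in positive int16, so the
// bit-pattern reinterpret after vmovl_u8 cannot change any value.
static int16x8_t widen_u8_to_s16(uint8x8_t pixels) {
  return vreinterpretq_s16_u16(vmovl_u8(pixels));
}
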
-static INLINE int16x4_t convolve12_y_4x4_s32(
- const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
- const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
- const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
+static INLINE int16x4_t convolve12_4_y(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16x4_t s8, const int16x4_t s9,
+ const int16x4_t s10, const int16x4_t s11,
+ const int16x8_t y_filter_0_7,
+ const int16x4_t y_filter_8_11) {
const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
int16x4_t sum;
@@ -1638,22 +799,22 @@ static INLINE int16x4_t convolve12_y_4x4_s32(
sum = vmla_lane_s16(sum, s10, y_filter_8_11, 2);
sum = vmla_lane_s16(sum, s11, y_filter_8_11, 3);
- // Separate out the two filter values in the middle of the kernel that have
- // the largest magnitude and use saturating addition to prevent overflow. This
- // means we can stay at 16-bit elements, rather than having to widen
- // everything to a 32-bit result, requiring twice the number of instructions.
+  // Saturating addition is required for the two largest filter taps to avoid
+  // overflow while staying in 16-bit elements.
sum = vqadd_s16(sum, vmul_lane_s16(s5, y_filter_4_7, 1));
sum = vqadd_s16(sum, vmul_lane_s16(s6, y_filter_4_7, 2));
return sum;
}
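
The accumulation order in convolve12_4_y is deliberate: ten of the twelve products go through plain vmla, and only the two centre taps (lanes 1 and 2 of y_filter_4_7, i.e. taps 5 and 6) are applied last with saturating vqadd. A rough worst-case bound shows why; the tap magnitudes below are illustrative, not the actual AV1 kernel values:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  // Illustrative 12-tap magnitudes; the two centre taps dominate.
  const int16_t filter[12] = { -1, 2, -5, 10, -23, 90, 73, -21, 9, -4, 2, -1 };
  int32_t ten_tap_bound = 0, full_bound = 0;
  for (int k = 0; k < 12; ++k) {
    const int32_t worst = abs(filter[k]) * 255;  // 8-bit samples peak at 255.
    full_bound += worst;
    if (k != 5 && k != 6) ten_tap_bound += worst;  // Skip the centre taps.
  }
  // The ten small taps stay within int16; adding the centre taps may not.
  printf("worst case without centre taps: %d (int16 max: 32767)\n",
         (int)ten_tap_bound);
  printf("worst case with centre taps:    %d\n", (int)full_bound);
  return 0;
}
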
-static INLINE uint8x8_t convolve12_y_8x4_s32(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
- const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
- const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
+static INLINE uint8x8_t convolve12_8_y(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t s8, const int16x8_t s9,
+ const int16x8_t s10, const int16x8_t s11,
+ const int16x8_t y_filter_0_7,
+ const int16x4_t y_filter_8_11) {
const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
int16x8_t sum;
@@ -1670,10 +831,8 @@ static INLINE uint8x8_t convolve12_y_8x4_s32(
sum = vmlaq_lane_s16(sum, s10, y_filter_8_11, 2);
sum = vmlaq_lane_s16(sum, s11, y_filter_8_11, 3);
- // Separate out the two filter values in the middle of the kernel that have
- // the largest magnitude and use saturating addition to prevent overflow. This
- // means we can stay at 16-bit elements, rather than having to widen
- // everything to a 32-bit result, requiring twice the number of instructions.
+  // Saturating addition is required for the two largest filter taps to avoid
+  // overflow while staying in 16-bit elements.
sum = vqaddq_s16(sum, vmulq_lane_s16(s5, y_filter_4_7, 1));
sum = vqaddq_s16(sum, vmulq_lane_s16(s6, y_filter_4_7, 2));
@@ -1684,98 +843,52 @@ static INLINE void convolve_y_sr_12tap_neon(const uint8_t *src_ptr,
int src_stride, uint8_t *dst_ptr,
int dst_stride, int w, int h,
const int16_t *y_filter_ptr) {
- // Special case the following no-op filter as 128 won't fit into the
- // 8-bit signed dot-product instruction:
- // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
- if (y_filter_ptr[5] == 128) {
- // Undo the horizontal offset in the calling function
- src_ptr += 5 * src_stride;
-
- if (w <= 4) {
- for (int i = 0; i < h; i += 2) {
- uint8x8_t d0 = load_unaligned_u8(src_ptr + i * src_stride, src_stride);
- if (w == 2) {
- store_u8_2x1(dst_ptr + i * dst_stride, d0, 0);
- store_u8_2x1(dst_ptr + (i + 1) * dst_stride, d0, 1);
- } else if (w == 4) {
- store_u8_4x1(dst_ptr + i * dst_stride, d0, 0);
- store_u8_4x1(dst_ptr + (i + 1) * dst_stride, d0, 1);
- }
- }
- } else {
- for (int i = 0; i < h; i++) {
- for (int j = 0; j < w; j += 8) {
- uint8x8_t d0 = vld1_u8(src_ptr + i * src_stride + j);
- vst1_u8(dst_ptr + i * dst_stride + j, d0);
- }
- }
- }
- return;
- }
-
const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
if (w <= 4) {
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
- int16x4_t d0, d1, d2, d3;
- int16x8_t dd01, dd23;
- uint8x8_t d01, d23;
-
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7,
&t8, &t9, &t10);
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
- s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
- s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
- s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
- s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
- s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
+ int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
+ int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
+ int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
+ int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
+ int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
+ int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));
src_ptr += 11 * src_stride;
do {
+ uint8x8_t t11, t12, t13, t14;
load_u8_8x4(src_ptr, src_stride, &t11, &t12, &t13, &t14);
- s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t11)));
- s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t12)));
- s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t13)));
- s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t14)));
-
- d0 = convolve12_y_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
- s11, y_filter_0_7, y_filter_8_11);
- d1 = convolve12_y_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
- s12, y_filter_0_7, y_filter_8_11);
- d2 = convolve12_y_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
- s13, y_filter_0_7, y_filter_8_11);
- d3 = convolve12_y_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
- s14, y_filter_0_7, y_filter_8_11);
-
- dd01 = vcombine_s16(d0, d1);
- dd23 = vcombine_s16(d2, d3);
-
- d01 = vqrshrun_n_s16(dd01, FILTER_BITS);
- d23 = vqrshrun_n_s16(dd23, FILTER_BITS);
- if (w == 2) {
- store_u8_2x1(dst_ptr + 0 * dst_stride, d01, 0);
- store_u8_2x1(dst_ptr + 1 * dst_stride, d01, 2);
- if (h != 2) {
- store_u8_2x1(dst_ptr + 2 * dst_stride, d23, 0);
- store_u8_2x1(dst_ptr + 3 * dst_stride, d23, 2);
- }
- } else {
- store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0);
- store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1);
- if (h != 2) {
- store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0);
- store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1);
- }
- }
+ int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t11)));
+ int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t12)));
+ int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t13)));
+ int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t14)));
+
+ int16x4_t d0 = convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+ s11, y_filter_0_7, y_filter_8_11);
+ int16x4_t d1 = convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+ s11, s12, y_filter_0_7, y_filter_8_11);
+ int16x4_t d2 = convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ s12, s13, y_filter_0_7, y_filter_8_11);
+ int16x4_t d3 = convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+ s13, s14, y_filter_0_7, y_filter_8_11);
+
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+ store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0);
+ store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1);
+ store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0);
+ store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1);
s0 = s4;
s1 = s5;
@@ -1791,54 +904,50 @@ static INLINE void convolve_y_sr_12tap_neon(const uint8_t *src_ptr,
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
h -= 4;
- } while (h > 0);
- } else {
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
- uint8x8_t d0, d1, d2, d3;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
+ } while (h != 0);
+ } else {
do {
const uint8_t *s = src_ptr;
uint8_t *d = dst_ptr;
int height = h;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8,
&t9, &t10);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
s += 11 * src_stride;
do {
+ uint8x8_t t11, t12, t13, t14;
load_u8_8x4(s, src_stride, &t11, &t12, &t13, &t14);
- s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
- s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
- s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
- s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
-
- d0 = convolve12_y_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
- s11, y_filter_0_7, y_filter_8_11);
- d1 = convolve12_y_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
- s12, y_filter_0_7, y_filter_8_11);
- d2 = convolve12_y_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
- s13, y_filter_0_7, y_filter_8_11);
- d3 = convolve12_y_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
- s13, s14, y_filter_0_7, y_filter_8_11);
-
- if (h != 2) {
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
- } else {
- store_u8_8x2(d, dst_stride, d0, d1);
- }
+
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
+
+ uint8x8_t d0 = convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9,
+ s10, s11, y_filter_0_7, y_filter_8_11);
+ uint8x8_t d1 = convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+ s11, s12, y_filter_0_7, y_filter_8_11);
+ uint8x8_t d2 = convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ s12, s13, y_filter_0_7, y_filter_8_11);
+ uint8x8_t d3 = convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+ s13, s14, y_filter_0_7, y_filter_8_11);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@@ -1854,12 +963,11 @@ static INLINE void convolve_y_sr_12tap_neon(const uint8_t *src_ptr,
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
- } while (height > 0);
-
+ } while (height != 0);
src_ptr += 8;
dst_ptr += 8;
w -= 8;
- } while (w > 0);
+ } while (w != 0);
}
}
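
Note that the narrowing in the 12-tap path uses the full FILTER_BITS shift, unlike the FILTER_BITS - 1 used by the 6- and 8-tap paths: halving a filter is only exact when every coefficient is even, and the 12-tap kernels are presumably left unhalved because they contain odd values. A hypothetical guard for that precondition, sketched for illustration only:

#include <stdbool.h>
#include <stdint.h>

// Halving a filter is lossless only when every tap is even; a kernel with
// any odd tap must keep the full-precision path. Hypothetical helper.
static bool filter_is_halvable(const int16_t *filter, int taps) {
  for (int k = 0; k < taps; ++k) {
    if (filter[k] & 1) return false;  // Odd tap: halving would lose precision.
  }
  return true;
}
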
@@ -1867,8 +975,15 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams *filter_params_y,
const int subpel_y_qn) {
+ if (w == 2 || h == 2) {
+ av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y,
+ subpel_y_qn);
+ return;
+ }
+
const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
- const int vert_offset = filter_params_y->taps / 2 - 1;
+ const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
+ const int vert_offset = clamped_y_taps / 2 - 1;
src -= vert_offset * src_stride;
@@ -1881,635 +996,27 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
return;
}
- // Filter values are even so downshift by 1 to reduce precision requirements.
+  // Filter values are even, so halve them to reduce precision requirements.
const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
if (y_filter_taps < 8) {
convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
- return;
- }
-
- if (w <= 4) {
- uint8x8_t d01;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
-#if AOM_ARCH_AARCH64
- uint8x8_t d23;
- int16x4_t s8, s9, s10, d1, d2, d3;
-#endif // AOM_ARCH_AARCH64
- s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
-
- do {
- s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
-#if AOM_ARCH_AARCH64
- s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
- s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
- src += src_stride;
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
- d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
- d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
- d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
-
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
- if (w == 2) {
- store_u8_2x1(dst + 0 * dst_stride, d01, 0);
- store_u8_2x1(dst + 1 * dst_stride, d01, 2);
- if (h != 2) {
- store_u8_2x1(dst + 2 * dst_stride, d23, 0);
- store_u8_2x1(dst + 3 * dst_stride, d23, 2);
- }
- } else {
- store_u8_4x1(dst + 0 * dst_stride, d01, 0);
- store_u8_4x1(dst + 1 * dst_stride, d01, 1);
- if (h != 2) {
- store_u8_4x1(dst + 2 * dst_stride, d23, 0);
- store_u8_4x1(dst + 3 * dst_stride, d23, 1);
- }
- }
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- dst += 4 * dst_stride;
- h -= 4;
-#else // !AOM_ARCH_AARCH64
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(src + 0 * src_stride);
-
- d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
-
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS - 1);
-
- if (w == 4) {
- store_u8_4x1(dst, d01, 0);
- } else if (w == 2) {
- store_u8_2x1(dst, d01, 0);
- }
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s5 = s6;
- s6 = s7;
- dst += dst_stride;
- h -= 1;
-#endif // AOM_ARCH_AARCH64
- } while (h > 0);
} else {
- int height;
- const uint8_t *s;
- uint8_t *d;
- uint8x8_t t0;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-#if AOM_ARCH_AARCH64
- uint8x8_t t1, t2, t3;
- int16x8_t s8, s9, s10;
-#endif // AOM_ARCH_AARCH64
- do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- s = src;
- s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- d = dst;
- height = h;
-
- do {
- s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
-#if AOM_ARCH_AARCH64
- s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
- s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- s += src_stride;
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
- t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
- t1 = convolve8_vert_8x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
- t2 = convolve8_vert_8x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
- t3 = convolve8_vert_8x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
-
- if (h != 2) {
- store_u8_8x4(d, dst_stride, t0, t1, t2, t3);
- } else {
- store_u8_8x2(d, dst_stride, t0, t1);
- }
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- d += 4 * dst_stride;
- height -= 4;
-#else // !AOM_ARCH_AARCH64
- __builtin_prefetch(d);
- __builtin_prefetch(s);
-
- t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
-
- vst1_u8(d, t0);
- d += dst_stride;
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s5 = s6;
- s6 = s7;
- height -= 1;
-#endif // AOM_ARCH_AARCH64
- } while (height > 0);
- src += 8;
- dst += 8;
- w -= 8;
- } while (w > 0);
+ convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
}
}
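
Two dispatch details above are easy to miss: 2xN and Nx2 blocks now fall back to the C implementation (the NEON kernels assume at least 4-wide rows and 4-high row batches), and the vertical source offset is derived from a tap count clamped to at least 6, so that narrower nominal filters still line up with the 6-tap kernel. A small sketch of the offset arithmetic; the driver below is hypothetical:

#include <stdio.h>

static int vert_offset_for_taps(int y_filter_taps) {
  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
  return clamped_y_taps / 2 - 1;
}

int main(void) {
  printf("4-tap  -> offset %d rows\n", vert_offset_for_taps(4));   // 2
  printf("6-tap  -> offset %d rows\n", vert_offset_for_taps(6));   // 2
  printf("8-tap  -> offset %d rows\n", vert_offset_for_taps(8));   // 3
  printf("12-tap -> offset %d rows\n", vert_offset_for_taps(12));  // 5
  return 0;
}
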
-#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE int16x4_t convolve12_horiz_4_usdot(uint8x16_t samples,
- const int8x16_t filters,
- const uint8x16x3_t permute_tbl,
- int32x4_t horiz_const) {
- uint8x16_t permuted_samples[3];
- int32x4_t sum;
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
-
- /* First 4 output values. */
- sum = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0);
- sum = vusdotq_laneq_s32(sum, permuted_samples[1], filters, 1);
- sum = vusdotq_laneq_s32(sum, permuted_samples[2], filters, 2);
-
- /* Narrow and re-pack. */
- return vshrn_n_s32(sum, ROUND0_BITS);
-}
-
-static INLINE int16x8_t convolve12_horiz_8_usdot(uint8x16_t samples0,
- uint8x16_t samples1,
- const int8x16_t filters,
- const uint8x16x3_t permute_tbl,
- const int32x4_t horiz_const) {
- uint8x16_t permuted_samples[4];
- int32x4_t sum[2];
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_u8(samples0, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_u8(samples0, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_u8(samples0, permute_tbl.val[2]);
- /* {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } */
- permuted_samples[3] = vqtbl1q_u8(samples1, permute_tbl.val[2]);
-
- /* First 4 output values. */
- sum[0] = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0);
- sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1);
- sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2);
- /* Second 4 output values. */
- sum[1] = vusdotq_laneq_s32(horiz_const, permuted_samples[1], filters, 0);
- sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1);
- sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2);
-
- /* Narrow and re-pack. */
- return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS),
- vshrn_n_s32(sum[1], ROUND0_BITS));
-}
-
-static INLINE void convolve_2d_sr_horiz_12tap_neon(
- const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
- const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
- const int16x4_t x_filter_8_11) {
- const int bd = 8;
-
- // Special case the following no-op filter as 128 won't fit into the
- // 8-bit signed dot-product instruction:
- // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
- if (vgetq_lane_s16(x_filter_0_7, 5) == 128) {
- const int16x8_t horiz_const = vdupq_n_s16((1 << (bd - 1)));
- // Undo the horizontal offset in the calling function.
- src_ptr += 5;
-
- for (int i = 0; i < h; i++) {
- for (int j = 0; j < w; j += 8) {
- uint8x8_t s0 = vld1_u8(src_ptr + i * src_stride + j);
- uint16x8_t t0 = vaddw_u8(vreinterpretq_u16_s16(horiz_const), s0);
- int16x8_t d0 =
- vshlq_n_s16(vreinterpretq_s16_u16(t0), FILTER_BITS - ROUND0_BITS);
- if (w == 2) {
- store_s16_2x1(dst_ptr + i * dst_stride, vget_low_s16(d0), 0);
- } else if (w == 4) {
- vst1_s16(dst_ptr + i * dst_stride, vget_low_s16(d0));
- } else {
- vst1q_s16(dst_ptr + i * dst_stride + j, d0);
- }
- }
- }
- } else {
- // Narrow filter values to 8-bit.
- const int16x8x2_t x_filter_s16 = {
- { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
- };
- const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
- vmovn_s16(x_filter_s16.val[1]));
- // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts
- // - which are generally faster than rounding shifts on modern CPUs.
- const int32x4_t horiz_const =
- vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-
- if (w <= 4) {
- do {
- const uint8_t *s = src_ptr;
- int16_t *d = dst_ptr;
- int width = w;
-
- do {
- uint8x16_t s0, s1, s2, s3;
- int16x4_t d0, d1, d2, d3;
-
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve12_horiz_4_usdot(s0, x_filter, permute_tbl, horiz_const);
- d1 = convolve12_horiz_4_usdot(s1, x_filter, permute_tbl, horiz_const);
- d2 = convolve12_horiz_4_usdot(s2, x_filter, permute_tbl, horiz_const);
- d3 = convolve12_horiz_4_usdot(s3, x_filter, permute_tbl, horiz_const);
-
- if (w == 2) {
- store_s16_2x1(d + 0 * dst_stride, d0, 0);
- store_s16_2x1(d + 1 * dst_stride, d1, 0);
- store_s16_2x1(d + 2 * dst_stride, d2, 0);
- store_s16_2x1(d + 3 * dst_stride, d3, 0);
- } else {
- store_s16_4x4(d, dst_stride, d0, d1, d2, d3);
- }
-
- s += 4;
- d += 4;
- width -= 4;
- } while (width > 0);
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- h -= 4;
- } while (h >= 4);
-
- for (; h > 0; h--) {
- const uint8_t *s = src_ptr;
- int16_t *d = dst_ptr;
- int width = w;
-
- do {
- uint8x16_t s0;
- int16x4_t d0;
-
- s0 = vld1q_u8(s);
-
- d0 = convolve12_horiz_4_usdot(s0, x_filter, permute_tbl, horiz_const);
-
- if (w == 2) {
- store_s16_2x1(d, d0, 0);
- } else {
- vst1_s16(d, d0);
- }
-
- s += 4;
- d += 4;
- width -= 4;
- } while (width > 0);
-
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- }
- } else {
- do {
- const uint8_t *s = src_ptr;
- int16_t *d = dst_ptr;
- int width = w;
-
- do {
- uint8x16_t s0[2], s1[2], s2[2], s3[2];
- int16x8_t d0, d1, d2, d3;
-
- load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
- load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
-
- d0 = convolve12_horiz_8_usdot(s0[0], s0[1], x_filter, permute_tbl,
- horiz_const);
- d1 = convolve12_horiz_8_usdot(s1[0], s1[1], x_filter, permute_tbl,
- horiz_const);
- d2 = convolve12_horiz_8_usdot(s2[0], s2[1], x_filter, permute_tbl,
- horiz_const);
- d3 = convolve12_horiz_8_usdot(s3[0], s3[1], x_filter, permute_tbl,
- horiz_const);
-
- store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- h -= 4;
- } while (h >= 4);
-
- for (; h > 0; h--) {
- const uint8_t *s = src_ptr;
- int16_t *d = dst_ptr;
- int width = w;
-
- do {
- uint8x16_t s0[2];
- int16x8_t d0;
-
- s0[0] = vld1q_u8(s);
- s0[1] = vld1q_u8(s + 4);
-
- d0 = convolve12_horiz_8_usdot(s0[0], s0[1], x_filter, permute_tbl,
- horiz_const);
-
- vst1q_s16(d, d0);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
-
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- }
- }
- }
-}
-
-#elif AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE void convolve_2d_sr_horiz_12tap_neon(
- const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
- const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
- const int16x4_t x_filter_8_11) {
- const int bd = 8;
-
- // Special case the following no-op filter as 128 won't fit into the
- // 8-bit signed dot-product instruction:
- // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
- if (vgetq_lane_s16(x_filter_0_7, 5) == 128) {
- const int16x8_t horiz_const = vdupq_n_s16((1 << (bd - 1)));
- // Undo the horizontal offset in the calling function.
- src_ptr += 5;
-
- for (int i = 0; i < h; i++) {
- for (int j = 0; j < w; j += 8) {
- uint8x8_t s0 = vld1_u8(src_ptr + i * src_stride + j);
- uint16x8_t t0 = vaddw_u8(vreinterpretq_u16_s16(horiz_const), s0);
- int16x8_t d0 =
- vshlq_n_s16(vreinterpretq_s16_u16(t0), FILTER_BITS - ROUND0_BITS);
- if (w == 2) {
- store_s16_2x1(dst_ptr + i * dst_stride, vget_low_s16(d0), 0);
- } else if (w == 4) {
- vst1_s16(dst_ptr + i * dst_stride, vget_low_s16(d0));
- } else {
- vst1q_s16(dst_ptr + i * dst_stride + j, d0);
- }
- }
- }
- } else {
- // Narrow filter values to 8-bit.
- const int16x8x2_t x_filter_s16 = {
- { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
- };
- const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
- vmovn_s16(x_filter_s16.val[1]));
-
- // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts
- // - which are generally faster than rounding shifts on modern CPUs.
- const int32_t horiz_const =
- ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
- // Dot product constants.
- const int32x4_t correct_tmp =
- vaddq_s32(vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[0], 7)),
- vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[1], 7)));
- const int32x4_t correction =
- vdupq_n_s32(vaddvq_s32(correct_tmp) + horiz_const);
- const uint8x16_t range_limit = vdupq_n_u8(128);
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-
- if (w <= 4) {
- do {
- const uint8_t *s = src_ptr;
- int16_t *d = dst_ptr;
- int width = w;
-
- do {
- uint8x16_t s0, s1, s2, s3;
- int16x4_t d0, d1, d2, d3;
-
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve12_horiz_4_sdot(s0, x_filter, correction, range_limit,
- permute_tbl);
- d1 = convolve12_horiz_4_sdot(s1, x_filter, correction, range_limit,
- permute_tbl);
- d2 = convolve12_horiz_4_sdot(s2, x_filter, correction, range_limit,
- permute_tbl);
- d3 = convolve12_horiz_4_sdot(s3, x_filter, correction, range_limit,
- permute_tbl);
-
- if (w == 2) {
- store_s16_2x1(d + 0 * dst_stride, d0, 0);
- store_s16_2x1(d + 1 * dst_stride, d1, 0);
- store_s16_2x1(d + 2 * dst_stride, d2, 0);
- store_s16_2x1(d + 3 * dst_stride, d3, 0);
- } else {
- store_s16_4x4(d, dst_stride, d0, d1, d2, d3);
- }
-
- s += 4;
- d += 4;
- width -= 4;
- } while (width > 0);
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- h -= 4;
- } while (h >= 4);
-
- for (; h > 0; h--) {
- const uint8_t *s = src_ptr;
- int16_t *d = dst_ptr;
- int width = w;
-
- do {
- uint8x16_t s0;
- int16x4_t d0;
-
- s0 = vld1q_u8(s);
-
- d0 = convolve12_horiz_4_sdot(s0, x_filter, correction, range_limit,
- permute_tbl);
-
- if (w == 2) {
- store_s16_2x1(d, d0, 0);
- } else {
- vst1_s16(d, d0);
- }
-
- s += 4;
- d += 4;
- width -= 4;
- } while (width > 0);
-
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- }
- } else {
- do {
- const uint8_t *s = src_ptr;
- int16_t *d = dst_ptr;
- int width = w;
-
- do {
- uint8x16_t s0[2], s1[2], s2[2], s3[2];
- int16x8_t d0, d1, d2, d3;
-
- load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
- load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
-
- d0 = convolve12_horiz_8_sdot(s0[0], s0[1], x_filter, correction,
- range_limit, permute_tbl);
- d1 = convolve12_horiz_8_sdot(s1[0], s1[1], x_filter, correction,
- range_limit, permute_tbl);
- d2 = convolve12_horiz_8_sdot(s2[0], s2[1], x_filter, correction,
- range_limit, permute_tbl);
- d3 = convolve12_horiz_8_sdot(s3[0], s3[1], x_filter, correction,
- range_limit, permute_tbl);
-
- store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- h -= 4;
- } while (h >= 4);
-
- for (; h > 0; h--) {
- const uint8_t *s = src_ptr;
- int16_t *d = dst_ptr;
- int width = w;
-
- do {
- uint8x16_t s0[2];
- int16x8_t d0;
-
- s0[0] = vld1q_u8(s);
- s0[1] = vld1q_u8(s + 4);
-
- d0 = convolve12_horiz_8_sdot(s0[0], s0[1], x_filter, correction,
- range_limit, permute_tbl);
-
- vst1q_s16(d, d0);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
-
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- }
- }
- }
-}
-
-#else // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
-
-static INLINE int16x4_t convolve12_horiz_4x4_s16(
- const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
- const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
- const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11,
- const int32x4_t horiz_const) {
+static INLINE int16x4_t
+convolve12_4_2d_h(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
+ const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
+ const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11,
+ const int32x4_t horiz_const) {
const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
- int32x4_t sum;
- sum = horiz_const;
+ int32x4_t sum = horiz_const;
sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0);
sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1);
sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2);
@@ -2526,136 +1033,68 @@ static INLINE int16x4_t convolve12_horiz_4x4_s16(
return vshrn_n_s32(sum, ROUND0_BITS);
}
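
convolve12_4_2d_h narrows with a plain (non-rounding) vshrn_n_s32 because the rounding term is pre-folded into horiz_const, alongside the 1 << (bd + FILTER_BITS - 1) offset that keeps the intermediate non-negative. A minimal NEON sketch of the equivalence, assuming ROUND0_BITS == 3 as in the 8-bit AV1 pipeline:

#include <arm_neon.h>
#include <assert.h>

#define ROUND0_BITS 3  // Assumed first-pass rounding, as in AV1 for 8-bit.

// vrshrn_n_s32 computes (x + (1 << (n - 1))) >> n per lane, so pre-adding the
// same constant lets a plain vshrn_n_s32 produce identical results (for
// accumulators away from INT32_MAX, which the convolution range guarantees).
static void check_shim(int32x4_t sum) {
  const int32x4_t shim = vdupq_n_s32(1 << (ROUND0_BITS - 1));
  int16x4_t rounded = vrshrn_n_s32(sum, ROUND0_BITS);
  int16x4_t shimmed = vshrn_n_s32(vaddq_s32(sum, shim), ROUND0_BITS);
  assert(vget_lane_s16(rounded, 0) == vget_lane_s16(shimmed, 0));
}
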
-// 4 column per iteration horizontal filtering for 12-tap convolve_2d_sr.
-// Processes one row at a time.
-static INLINE void horiz_filter_12tap_w4_single_row(
- const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
- const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
- const int16x4_t x_filter_8_11, const int32x4_t horiz_const) {
- do {
- const uint8_t *s = src_ptr;
- int16_t *d = dst_ptr;
- int width = w;
-
- do {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, d0;
- uint8x16_t t0;
- int16x8_t tt0, tt1;
-
- t0 = vld1q_u8(s);
- tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
- tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
-
- s0 = vget_low_s16(tt0);
- s4 = vget_high_s16(tt0);
- s8 = vget_low_s16(tt1);
- s12 = vget_high_s16(tt1);
-
- s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
- s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
- s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
- s5 = vext_s16(s4, s8, 1); // a5 a6 a7 a8
- s6 = vext_s16(s4, s8, 2); // a6 a7 a8 a9
- s7 = vext_s16(s4, s8, 3); // a7 a8 a9 a10
- s9 = vext_s16(s8, s12, 1); // a9 a10 a11 a12
- s10 = vext_s16(s8, s12, 2); // a10 a11 a12 a13
- s11 = vext_s16(s8, s12, 3); // a11 a12 a13 a14
-
- d0 = convolve12_horiz_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
- s11, x_filter_0_7, x_filter_8_11,
- horiz_const);
-
- if (w == 2) {
- store_s16_2x1(d, d0, 0);
- } else {
- vst1_s16(d, d0);
- }
-
- s += 4;
- d += 4;
- width -= 4;
- } while (width > 0);
-
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- h--;
- } while (h > 0);
-}
-
static INLINE void convolve_2d_sr_horiz_12tap_neon(
const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
const int16x4_t x_filter_8_11) {
const int bd = 8;
- // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts -
+ // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts -
   // which are generally faster than rounding shifts on modern CPUs. (A
   // sketch of this equivalence follows this function.)
const int32x4_t horiz_const =
vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
#if AOM_ARCH_AARCH64
do {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint8x8_t t0, t1, t2, t3;
-
const uint8_t *s = src_ptr;
int16_t *d = dst_ptr;
int width = w;
+ uint8x8_t t0, t1, t2, t3;
load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
s += 11;
do {
- int16x4_t s11, s12, s13, s14, d0, d1, d2, d3;
-
load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
- s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-
- d0 = convolve12_horiz_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
- s11, x_filter_0_7, x_filter_8_11,
- horiz_const);
- d1 = convolve12_horiz_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
- s11, s12, x_filter_0_7, x_filter_8_11,
- horiz_const);
- d2 = convolve12_horiz_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
- s12, s13, x_filter_0_7, x_filter_8_11,
- horiz_const);
- d3 = convolve12_horiz_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
- s13, s14, x_filter_0_7, x_filter_8_11,
- horiz_const);
-
- transpose_s16_4x4d(&d0, &d1, &d2, &d3);
-
- if (w == 2) {
- store_s16_2x1(d + 0 * dst_stride, d0, 0);
- store_s16_2x1(d + 1 * dst_stride, d1, 0);
- store_s16_2x1(d + 2 * dst_stride, d2, 0);
- store_s16_2x1(d + 3 * dst_stride, d3, 0);
- } else {
- store_s16_4x4(d, dst_stride, d0, d1, d2, d3);
- }
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
+
+ int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ int16x4_t d0 =
+ convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ x_filter_0_7, x_filter_8_11, horiz_const);
+ int16x4_t d1 =
+ convolve12_4_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter_0_7, x_filter_8_11, horiz_const);
+ int16x4_t d2 =
+ convolve12_4_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter_0_7, x_filter_8_11, horiz_const);
+ int16x4_t d3 =
+ convolve12_4_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter_0_7, x_filter_8_11, horiz_const);
+
+ transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);
+ store_s16_4x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@@ -2668,562 +1107,192 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon(
s8 = s12;
s9 = s13;
s10 = s14;
-
s += 4;
d += 4;
width -= 4;
- } while (width > 0);
-
+ } while (width != 0);
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
h -= 4;
- } while (h >= 4);
-
- if (h) {
- horiz_filter_12tap_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride,
- w, h, x_filter_0_7, x_filter_8_11,
- horiz_const);
- }
-#else // !AOM_ARCH_AARCH64
- horiz_filter_12tap_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
- h, x_filter_0_7, x_filter_8_11, horiz_const);
+ } while (h > 4);
#endif // AOM_ARCH_AARCH64
-}
-
-#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE void convolve_2d_sr_horiz_8tap_neon(
- const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
- int im_h, const int16x8_t x_filter_s16) {
- const int bd = 8;
-
- const uint8_t *src_ptr = src;
- int16_t *dst_ptr = im_block;
- int dst_stride = im_stride;
-
- int height = im_h;
-
- // Filter values are even, so downshift by 1 to reduce intermediate precision
- // requirements.
- const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
- // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
- // shifts - which are generally faster than rounding shifts on modern CPUs.
- // The outermost -1 is needed because we halved the filter values.
- const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) +
- (1 << ((ROUND0_BITS - 1) - 1)));
-
- if (w <= 4) {
- const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- uint8x16_t s0, s1, s2, s3;
- int32x4_t t0, t1, t2, t3;
- int16x4_t d0, d1, d2, d3;
-
- do {
- assert(height >= 4);
-
- load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
- t0 = convolve8_4_usdot(s0, x_filter, permute_tbl, horiz_const);
- t1 = convolve8_4_usdot(s1, x_filter, permute_tbl, horiz_const);
- t2 = convolve8_4_usdot(s2, x_filter, permute_tbl, horiz_const);
- t3 = convolve8_4_usdot(s3, x_filter, permute_tbl, horiz_const);
-
- // We halved the convolution filter values so -1 from the right shift.
- d0 = vshrn_n_s32(t0, ROUND0_BITS - 1);
- d1 = vshrn_n_s32(t1, ROUND0_BITS - 1);
- d2 = vshrn_n_s32(t2, ROUND0_BITS - 1);
- d3 = vshrn_n_s32(t3, ROUND0_BITS - 1);
-
- if (w == 2) {
- store_s16_2x1(dst_ptr + 0 * dst_stride, d0, 0);
- store_s16_2x1(dst_ptr + 1 * dst_stride, d1, 0);
- store_s16_2x1(dst_ptr + 2 * dst_stride, d2, 0);
- store_s16_2x1(dst_ptr + 3 * dst_stride, d3, 0);
- } else {
- store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
- }
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- height -= 4;
- } while (height >= 4);
-
- if (height) {
- assert(height < 4);
-
- do {
- s0 = vld1q_u8(src_ptr);
- t0 = convolve8_4_usdot(s0, x_filter, permute_tbl, horiz_const);
- // We halved the convolution filter values so -1 from the right shift.
- d0 = vshrn_n_s32(t0, ROUND0_BITS - 1);
-
- if (w == 2) {
- store_s16_2x1(dst_ptr, d0, 0);
- } else {
- vst1_s16(dst_ptr, d0);
- }
-
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- height--;
- } while (height > 0);
- }
- } else {
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- uint8x16_t s0, s1, s2, s3;
- int16x8_t d0, d1, d2, d3;
-
- do {
- assert(height >= 4);
-
- const uint8_t *s = src_ptr;
- int16_t *d = dst_ptr;
- int width = w;
-
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_horiz_8_usdot(s0, x_filter, permute_tbl, horiz_const);
- d1 = convolve8_horiz_8_usdot(s1, x_filter, permute_tbl, horiz_const);
- d2 = convolve8_horiz_8_usdot(s2, x_filter, permute_tbl, horiz_const);
- d3 = convolve8_horiz_8_usdot(s3, x_filter, permute_tbl, horiz_const);
-
- store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- height -= 4;
- } while (height >= 4);
-
- if (height) {
- assert(height < 4);
-
- do {
- const uint8_t *s = src_ptr;
- int16_t *d = dst_ptr;
- int width = w;
-
- do {
- s0 = vld1q_u8(s);
- d0 = convolve8_horiz_8_usdot(s0, x_filter, permute_tbl, horiz_const);
- vst1q_s16(d, d0);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
-
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- height--;
- } while (height > 0);
- }
- }
-}
-
-#elif AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE void convolve_2d_sr_horiz_8tap_neon(
- const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
- int im_h, const int16x8_t x_filter_s16) {
- const int bd = 8;
-
- const uint8_t *src_ptr = src;
- int16_t *dst_ptr = im_block;
- int dst_stride = im_stride;
-
- int height = im_h;
-
- // Filter values are even, so downshift by 1 to reduce intermediate precision
- // requirements.
- const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
- // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
- // shifts - which are generally faster than rounding shifts on modern CPUs.
- // The outermost -1 is needed because we halved the filter values.
- const int32_t horiz_const =
- ((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1)));
- // Dot product constants.
- const int16x8_t correct_tmp = vshlq_n_s16(x_filter_s16, 6);
- int32x4_t correction = vdupq_n_s32(vaddlvq_s16(correct_tmp) + horiz_const);
- const uint8x16_t range_limit = vdupq_n_u8(128);
-
- if (w <= 4) {
- const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- uint8x16_t s0, s1, s2, s3;
- int32x4_t t0, t1, t2, t3;
- int16x4_t d0, d1, d2, d3;
-
- do {
- assert(height >= 4);
-
- load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
- t0 = convolve8_4_sdot(s0, x_filter, correction, range_limit, permute_tbl);
- t1 = convolve8_4_sdot(s1, x_filter, correction, range_limit, permute_tbl);
- t2 = convolve8_4_sdot(s2, x_filter, correction, range_limit, permute_tbl);
- t3 = convolve8_4_sdot(s3, x_filter, correction, range_limit, permute_tbl);
-
- // We halved the convolution filter values so -1 from the right shift.
- d0 = vshrn_n_s32(t0, ROUND0_BITS - 1);
- d1 = vshrn_n_s32(t1, ROUND0_BITS - 1);
- d2 = vshrn_n_s32(t2, ROUND0_BITS - 1);
- d3 = vshrn_n_s32(t3, ROUND0_BITS - 1);
-
- if (w == 2) {
- store_s16_2x1(dst_ptr + 0 * dst_stride, d0, 0);
- store_s16_2x1(dst_ptr + 1 * dst_stride, d1, 0);
- store_s16_2x1(dst_ptr + 2 * dst_stride, d2, 0);
- store_s16_2x1(dst_ptr + 3 * dst_stride, d3, 0);
- } else {
- store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
- }
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- height -= 4;
- } while (height >= 4);
-
- if (height) {
- assert(height < 4);
-
- do {
- s0 = vld1q_u8(src_ptr);
- t0 = convolve8_4_sdot(s0, x_filter, correction, range_limit,
- permute_tbl);
- // We halved the convolution filter values so -1 from the right shift.
- d0 = vshrn_n_s32(t0, ROUND0_BITS - 1);
-
- if (w == 2) {
- store_s16_2x1(dst_ptr, d0, 0);
- } else {
- vst1_s16(dst_ptr, d0);
- }
-
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- height--;
- } while (height > 0);
- }
- } else {
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- uint8x16_t s0, s1, s2, s3;
- int16x8_t d0, d1, d2, d3;
-
- do {
- assert(height >= 4);
-
- const uint8_t *s = src_ptr;
- int16_t *d = dst_ptr;
- int width = w;
-
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_horiz_8_sdot(s0, x_filter, correction, range_limit,
- permute_tbl);
- d1 = convolve8_horiz_8_sdot(s1, x_filter, correction, range_limit,
- permute_tbl);
- d2 = convolve8_horiz_8_sdot(s2, x_filter, correction, range_limit,
- permute_tbl);
- d3 = convolve8_horiz_8_sdot(s3, x_filter, correction, range_limit,
- permute_tbl);
-
- store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- height -= 4;
- } while (height >= 4);
-
- if (height) {
- assert(height < 4);
-
- do {
- const uint8_t *s = src_ptr;
- int16_t *d = dst_ptr;
- int width = w;
-
- do {
- s0 = vld1q_u8(s);
- d0 = convolve8_8_sdot(s0, x_filter, correction, range_limit,
- permute_tbl, vdupq_n_s16(0));
- // We halved the convolution filter values so -1 from the right shift.
- d0 = vshrq_n_s16(d0, ROUND0_BITS - 1);
- vst1q_s16(d, d0);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
-
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- height--;
- } while (height > 0);
- }
- }
-}
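// The range-limit trick used by the sdot paths above, in equation form: sdot
// multiplies signed 8-bit values, so 128 is subtracted from every unsigned
// pixel (range_limit) and the constant term is restored via "correction":
//   sum(f[k] * s[k]) == sum(f[k] * (s[k] - 128)) + 128 * sum(f[k]).
// With the taps halved, 128 * sum(f[k] / 2) == sum(f[k]) << 6, which is why
// correct_tmp shifts the original filter left by 6 before vaddlvq_s16.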
-
-#else // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
-
-// Horizontal filtering for convolve_2d_sr for width multiple of 8
-// Processes one row at a time
-static INLINE void horiz_filter_w8_single_row(const uint8_t *src_ptr,
- int src_stride, int16_t *dst_ptr,
- const int dst_stride, int width,
- int height,
- const int16x8_t x_filter,
- const int16x8_t horiz_const) {
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
do {
- uint8x8_t t0 = vld1_u8(src_ptr);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
-
- int width_tmp = width;
- const uint8_t *s = src_ptr + 8;
- int16_t *dst_tmp = dst_ptr;
-
- __builtin_prefetch(dst_ptr);
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
do {
- t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- int16x8_t sum = s0;
- s0 = s7;
-
- s1 = vextq_s16(sum, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
- s2 = vextq_s16(sum, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
- s3 = vextq_s16(sum, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
- s4 = vextq_s16(sum, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
- s5 = vextq_s16(sum, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
- s6 = vextq_s16(sum, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
- s7 = vextq_s16(sum, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
-
- int16x8_t res0 = convolve8_horiz_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7,
- x_filter, horiz_const);
-
- vst1q_s16(dst_tmp, res0);
-
- s += 8;
- dst_tmp += 8;
- width_tmp -= 8;
- } while (width_tmp > 0);
+ uint8x16_t t0 = vld1q_u8(s);
+ int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
+ int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
+
+ int16x4_t s0 = vget_low_s16(tt0);
+ int16x4_t s4 = vget_high_s16(tt0);
+ int16x4_t s8 = vget_low_s16(tt1);
+ int16x4_t s12 = vget_high_s16(tt1);
+
+ int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+ int16x4_t s5 = vext_s16(s4, s8, 1); // a5 a6 a7 a8
+ int16x4_t s6 = vext_s16(s4, s8, 2); // a6 a7 a8 a9
+ int16x4_t s7 = vext_s16(s4, s8, 3); // a7 a8 a9 a10
+ int16x4_t s9 = vext_s16(s8, s12, 1); // a9 a10 a11 a12
+ int16x4_t s10 = vext_s16(s8, s12, 2); // a10 a11 a12 a13
+ int16x4_t s11 = vext_s16(s8, s12, 3); // a11 a12 a13 a14
+
+ int16x4_t d0 =
+ convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ x_filter_0_7, x_filter_8_11, horiz_const);
+ vst1_s16(d, d0);
+
+ s += 4;
+ d += 4;
+ width -= 4;
+ } while (width != 0);
src_ptr += src_stride;
dst_ptr += dst_stride;
- height--;
- } while (height > 0);
+ } while (--h != 0);
}
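// The non-rounding-shift shim above, in sketch form: for arithmetic shifts,
// vrshrn_n_s32(x, N) == vshrn_n_s32(x + (1 << (N - 1)), N), so folding the
// bias 1 << (ROUND0_BITS - 1) into horiz_const once per block buys rounding
// behaviour for free. Minimal scalar equivalent (illustrative name only):
static int32_t round_shift_via_bias(int32_t x, int n) {
  return (x + (1 << (n - 1))) >> n;  // == ROUND_POWER_OF_TWO(x, n)
}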
-// Horizontal filtering for convolve_2d_sr for width <= 4
-// Processes one row at a time
-static INLINE void horiz_filter_w4_single_row(const uint8_t *src_ptr,
- int src_stride, int16_t *dst_ptr,
- const int dst_stride, int width,
- int height,
- const int16x8_t x_filter,
- const int16x4_t horiz_const) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
- do {
- const uint8_t *s = src_ptr;
-
- __builtin_prefetch(s);
-
- uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
- int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s0 = vget_low_s16(tt0);
- s4 = vget_high_s16(tt0);
-
- __builtin_prefetch(dst_ptr);
- s += 8;
-
- t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
- s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+static INLINE int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t filter,
+ const int16x4_t horiz_const) {
+ int16x4_t sum = horiz_const;
+ sum = vmla_lane_s16(sum, s0, filter, 0);
+ sum = vmla_lane_s16(sum, s1, filter, 1);
+ sum = vmla_lane_s16(sum, s2, filter, 2);
+ sum = vmla_lane_s16(sum, s3, filter, 3);
- s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
- s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
- s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
- s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8
- s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9
- s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10
+ // We halved the convolution filter values so -1 from the right shift.
+ return vshr_n_s16(sum, ROUND0_BITS - 1);
+}
- int16x4_t d0 = convolve8_horiz_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7,
- x_filter, horiz_const);
+static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t filter,
+ const int16x8_t horiz_const) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
- if (width == 2) {
- store_s16_2x1(dst_ptr, d0, 0);
- } else {
- vst1_s16(dst_ptr, d0);
- }
+ int16x8_t sum = horiz_const;
+ sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
+ sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
+ sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
+ sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
- dst_ptr += dst_stride;
- src_ptr += src_stride;
- height--;
- } while (height > 0);
+  // We halved the convolution filter values so -1 from the right shift
+  // (a sketch of why halving is exact follows this function).
+ return vshrq_n_s16(sum, ROUND0_BITS - 1);
}
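// Why halving the filter taps is exact, in sketch form: AV1 sub-pixel filter
// taps are all even (and sum to 1 << FILTER_BITS), so every product
// s[k] * f[k] is even and halving the taps halves the sum with no remainder;
// shifting by (ROUND0_BITS - 1) instead of ROUND0_BITS restores the scale.
// The reduced magnitudes are also what let convolve8_8_2d_h accumulate in
// int16 without widening. Scalar model (illustrative name; an int32
// accumulator is used here only for clarity):
static int16_t convolve8_halved_model(const int16_t s[8], const int16_t f[8],
                                      int32_t halved_bias, int round0_bits) {
  int32_t sum = halved_bias;  // half of the full-precision bias, still exact
  for (int k = 0; k < 8; ++k) sum += s[k] * (f[k] >> 1);
  return (int16_t)(sum >> (round0_bits - 1));
}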
-static INLINE void convolve_2d_sr_horiz_8tap_neon(
- const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
- int im_h, const int16x8_t x_filter_s16) {
+static INLINE void convolve_2d_sr_horiz_neon(const uint8_t *src, int src_stride,
+ int16_t *im_block, int im_stride,
+ int w, int im_h,
+ const int16_t *x_filter_ptr) {
const int bd = 8;
const uint8_t *src_ptr = src;
int16_t *dst_ptr = im_block;
int dst_stride = im_stride;
-
int height = im_h;
- // Filter values are even, so downshift by 1 to reduce intermediate precision
- // requirements.
- const int16x8_t x_filter = vshrq_n_s16(x_filter_s16, 1);
-
if (w <= 4) {
- // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+ // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
// shifts - which are generally faster than rounding shifts on modern CPUs.
- // The outermost -1 is needed because we halved the filter values.
+ // (The extra -1 is needed because we halved the filter values.)
const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) +
(1 << ((ROUND0_BITS - 1) - 1)));
+    // Blocks with width <= 4 use 4-tap filters; see the note after this
+    // function for how the taps are selected from the 8-tap kernel.
+    // Filter values are even, so halve to reduce intermediate precision
+    // requirements.
+ const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
-#if AOM_ARCH_AARCH64
- do {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
- uint8x8_t t0, t1, t2, t3;
- const uint8_t *s = src_ptr;
-
- assert(height >= 4);
-
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
- s += 7;
+ src_ptr += 2;
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
- s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-
- d0 = convolve8_horiz_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- horiz_const);
- d1 = convolve8_horiz_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
- horiz_const);
- d2 = convolve8_horiz_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
- horiz_const);
- d3 = convolve8_horiz_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
- horiz_const);
-
- transpose_s16_4x4d(&d0, &d1, &d2, &d3);
-
- if (w == 2) {
- store_s16_2x1(dst_ptr + 0 * dst_stride, d0, 0);
- store_s16_2x1(dst_ptr + 1 * dst_stride, d1, 0);
- store_s16_2x1(dst_ptr + 2 * dst_stride, d2, 0);
- store_s16_2x1(dst_ptr + 3 * dst_stride, d3, 0);
- } else {
- store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
- }
+ do {
+ uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- height -= 4;
- } while (height >= 4);
+ int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
- if (height) {
- assert(height < 4);
- horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
- height, x_filter, horiz_const);
- }
+ int16x4_t d0 = convolve4_4_2d_h(s0, s1, s2, s3, x_filter, horiz_const);
-#else // !AOM_ARCH_AARCH64
- horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
- height, x_filter, horiz_const);
-#endif // AOM_ARCH_AARCH64
+ vst1_s16(dst_ptr, d0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
} else {
- // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+ // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
// shifts - which are generally faster than rounding shifts on modern CPUs.
- // The outermost -1 is needed because we halved the filter values.
+ // (The extra -1 is needed because we halved the filter values.)
const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
(1 << ((ROUND0_BITS - 1) - 1)));
+    // Filter values are even, so halve to reduce intermediate precision
+    // requirements.
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
#if AOM_ARCH_AARCH64
-
- for (; height >= 8; height -= 8) {
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
- d0, d1, d2, d3, d4, d5, d6, d7;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
-
+ while (height > 8) {
const uint8_t *s = src_ptr;
int16_t *d = dst_ptr;
int width = w;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
s += 7;
do {
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
- d0 = convolve8_horiz_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- horiz_const);
- d1 = convolve8_horiz_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
- horiz_const);
- d2 = convolve8_horiz_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
- horiz_const);
- d3 = convolve8_horiz_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
- horiz_const);
- d4 = convolve8_horiz_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
- horiz_const);
- d5 = convolve8_horiz_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
- x_filter, horiz_const);
- d6 = convolve8_horiz_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
- x_filter, horiz_const);
- d7 = convolve8_horiz_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
- x_filter, horiz_const);
-
- transpose_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
+ x_filter, horiz_const);
+ int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8,
+ x_filter, horiz_const);
+ int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9,
+ x_filter, horiz_const);
+ int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10,
+ x_filter, horiz_const);
+ int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11,
+ x_filter, horiz_const);
+ int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter, horiz_const);
+ int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter, horiz_const);
+ int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter, horiz_const);
+
+ transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
@@ -3237,996 +1306,361 @@ static INLINE void convolve_2d_sr_horiz_8tap_neon(
s += 8;
d += 8;
width -= 8;
- } while (width > 0);
-
+ } while (width != 0);
src_ptr += 8 * src_stride;
dst_ptr += 8 * dst_stride;
+ height -= 8;
}
+#endif // AOM_ARCH_AARCH64
- for (; height >= 4; height -= 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
- dd0, dd1, dd2, dd3, dd4, dd5, dd6, dd7;
- int16x8_t d0, d1, d2, d3;
- uint8x8_t t0, t1, t2, t3;
-
+ do {
const uint8_t *s = src_ptr;
int16_t *d = dst_ptr;
int width = w;
- load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+ do {
+ uint8x8_t t1 = vld1_u8(s + 8); // a8 a9 a10 a11 a12 a13 a14 a15
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
- s += 7;
+ int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
+ x_filter, horiz_const);
- do {
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
- s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- s11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-
- dd0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter);
- dd1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter);
- dd2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter);
- dd3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter);
- dd4 = convolve8_4x4(s4, s5, s6, s7, s8, s9, s10, s11, x_filter);
- dd5 = convolve8_4x4(s5, s6, s7, s8, s9, s10, s11, s12, x_filter);
- dd6 = convolve8_4x4(s6, s7, s8, s9, s10, s11, s12, s13, x_filter);
- dd7 = convolve8_4x4(s7, s8, s9, s10, s11, s12, s13, s14, x_filter);
-
- transpose_s16_4x8(&dd0, &dd1, &dd2, &dd3, &dd4, &dd5, &dd6, &dd7, &d0,
- &d1, &d2, &d3);
-
- d0 = vaddq_s16(d0, horiz_const);
- d1 = vaddq_s16(d1, horiz_const);
- d2 = vaddq_s16(d2, horiz_const);
- d3 = vaddq_s16(d3, horiz_const);
-
- // We halved the convolution filter values so -1 from the right shift.
- d0 = vshrq_n_s16(d0, ROUND0_BITS - 1);
- d1 = vshrq_n_s16(d1, ROUND0_BITS - 1);
- d2 = vshrq_n_s16(d2, ROUND0_BITS - 1);
- d3 = vshrq_n_s16(d3, ROUND0_BITS - 1);
-
- store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+ vst1q_s16(d, d0);
s0 = s8;
- s1 = s9;
- s2 = s10;
- s3 = s11;
- s4 = s12;
- s5 = s13;
- s6 = s14;
s += 8;
d += 8;
width -= 8;
- } while (width > 0);
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- }
-
- if (height) {
- assert(height < 4);
- horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
- height, x_filter, horiz_const);
- }
-
-#else // !AOM_ARCH_AARCH64
- horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
- height, x_filter, horiz_const);
-#endif // AOM_ARCH_AARCH64
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
}
}
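// Note on the width <= 4 path in convolve_2d_sr_horiz_neon: the 4-tap
// kernels appear to be stored in the middle of the 8-wide filter array
// (outer taps zero), so vld1_s16(x_filter_ptr + 2) picks up taps 2..5 and
// src_ptr += 2 skips the two zero-weighted leading taps. A scalar model of
// one 4-tap output (illustrative name; f4 holds the unhalved taps):
static int16_t convolve4_2d_h_model(const int16_t s[4], const int16_t f4[4],
                                    int32_t halved_bias, int round0_bits) {
  int32_t sum = halved_bias;
  for (int k = 0; k < 4; ++k) sum += s[k] * (f4[k] >> 1);  // halved taps
  return (int16_t)(sum >> (round0_bits - 1));  // reduced shift compensates
}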
-#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE int32x4_t convolve12_vert_4_s32(
- const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
- const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
- const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
- const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
- const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
- int32x4_t sum;
-
- sum = vmull_lane_s16(s0, y_filter_0_3, 0);
- sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
- sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
- sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
- sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
- sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
- sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
- sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
- sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
- sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
- sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
- sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);
-
- return sum;
-}
-
-static INLINE uint8x8_t convolve12_vert_8_s32(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
- const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
- const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
- const int16x8_t sub_const) {
- const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
- const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
- int32x4_t sum0, sum1;
- int16x8_t res;
-
- sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);
-
- sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);
-
- res = vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
- vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
- res = vsubq_s16(res, sub_const);
-
- return vqmovun_s16(res);
-}
-
-static INLINE void convolve_2d_sr_vert_12tap_neon(
- int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
- int h, const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
- const int bd = 8;
- const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));
-
- if (w <= 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
- int32x4_t d0, d1, d2, d3;
- int16x8_t dd01, dd23;
- uint8x8_t d01, d23;
-
- load_s16_4x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
- &s8, &s9, &s10);
- src_ptr += 11 * src_stride;
-
- do {
- load_s16_4x4(src_ptr, src_stride, &s11, &s12, &s13, &s14);
-
- d0 = convolve12_vert_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
- s11, y_filter_0_7, y_filter_8_11);
- d1 = convolve12_vert_4_s32(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
- s12, y_filter_0_7, y_filter_8_11);
- d2 = convolve12_vert_4_s32(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
- s13, y_filter_0_7, y_filter_8_11);
- d3 = convolve12_vert_4_s32(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
- s14, y_filter_0_7, y_filter_8_11);
+void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ if (w == 2 || h == 2) {
+ av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, subpel_x_qn,
+ subpel_y_qn, conv_params);
+ return;
+ }
- dd01 = vcombine_s16(vqrshrn_n_s32(d0, 2 * FILTER_BITS - ROUND0_BITS),
- vqrshrn_n_s32(d1, 2 * FILTER_BITS - ROUND0_BITS));
- dd23 = vcombine_s16(vqrshrn_n_s32(d2, 2 * FILTER_BITS - ROUND0_BITS),
- vqrshrn_n_s32(d3, 2 * FILTER_BITS - ROUND0_BITS));
+ const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+ const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
+ const int im_h = h + clamped_y_taps - 1;
+ const int im_stride = MAX_SB_SIZE;
+ const int vert_offset = clamped_y_taps / 2 - 1;
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
- dd01 = vsubq_s16(dd01, sub_const);
- dd23 = vsubq_s16(dd23, sub_const);
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
- d01 = vqmovun_s16(dd01);
- d23 = vqmovun_s16(dd23);
+ if (filter_params_x->taps > 8) {
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
- if (w == 2) {
- store_u8_2x1(dst_ptr + 0 * dst_stride, d01, 0);
- store_u8_2x1(dst_ptr + 1 * dst_stride, d01, 2);
- if (h != 2) {
- store_u8_2x1(dst_ptr + 2 * dst_stride, d23, 0);
- store_u8_2x1(dst_ptr + 3 * dst_stride, d23, 2);
- }
- } else {
- store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0);
- store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1);
- if (h != 2) {
- store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0);
- store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1);
- }
- }
+ const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
+ const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
+ const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
+ const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- s7 = s11;
- s8 = s12;
- s9 = s13;
- s10 = s14;
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- h -= 4;
- } while (h > 0);
+ convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block, im_stride, w,
+ im_h, x_filter_0_7, x_filter_8_11);
+ convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter_0_7, y_filter_8_11);
} else {
- do {
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
- uint8x8_t d0, d1, d2, d3;
-
- int16_t *s = src_ptr;
- uint8_t *d = dst_ptr;
-
- int height = h;
-
- load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
- &s9, &s10);
- s += 11 * src_stride;
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
- do {
- load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);
-
- d0 = convolve12_vert_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
- s11, y_filter_0_7, y_filter_8_11, sub_const);
- d1 = convolve12_vert_8_s32(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
- s12, y_filter_0_7, y_filter_8_11, sub_const);
- d2 =
- convolve12_vert_8_s32(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
- s13, y_filter_0_7, y_filter_8_11, sub_const);
- d3 = convolve12_vert_8_s32(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
- s13, s14, y_filter_0_7, y_filter_8_11,
- sub_const);
-
- if (h != 2) {
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
- } else {
- store_u8_8x2(d, dst_stride, d0, d1);
- }
+ convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride, w, im_h,
+ x_filter_ptr);
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- s7 = s11;
- s8 = s12;
- s9 = s13;
- s10 = s14;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
- } while (height > 0);
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
- src_ptr += 8;
- dst_ptr += 8;
- w -= 8;
- } while (w > 0);
+ if (clamped_y_taps <= 6) {
+ convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter);
+ } else {
+ convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter);
+ }
}
}
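// Buffer sizing in av1_convolve_2d_sr_neon, worked through: the vertical
// stage consumes clamped_y_taps rows per output row, so the horizontal stage
// must emit im_h = h + clamped_y_taps - 1 intermediate rows, starting
// vert_offset rows above and horiz_offset columns left of the block so the
// filter windows are centred. A hypothetical helper making that concrete:
static int intermediate_rows_needed(int h, int y_taps) {
  const int clamped_y_taps = y_taps < 6 ? 6 : y_taps;  // as in the code above
  return h + clamped_y_taps - 1;  // e.g. h = 16, 8-tap filter -> 23 rows
}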
-static INLINE void convolve_2d_sr_vert_8tap_neon(int16_t *src_ptr,
- int src_stride,
- uint8_t *dst_ptr,
- int dst_stride, int w, int h,
- const int16x8_t y_filter) {
- const int bd = 8;
- const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));
+void av1_convolve_x_sr_intrabc_neon(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ assert(subpel_x_qn == 8);
+ assert(filter_params_x->taps == 2);
+ assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+ (void)filter_params_x;
+ (void)subpel_x_qn;
+ (void)conv_params;
if (w <= 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
- uint8x8_t d01;
-
-#if AOM_ARCH_AARCH64
- int16x4_t s8, s9, s10, d1, d2, d3;
- uint8x8_t d23;
-#endif // AOM_ARCH_AARCH64
-
- int16_t *s = src_ptr;
- uint8_t *d = dst_ptr;
-
- load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- s += 7 * src_stride;
-
do {
-#if AOM_ARCH_AARCH64
- load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
-
- d0 = convolve8_vert_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
- d1 = convolve8_vert_4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
- d2 = convolve8_vert_4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
- d3 = convolve8_vert_4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+ uint8x8_t s0_0 = vld1_u8(src);
+ uint8x8_t s0_1 = vld1_u8(src + 1);
+ uint8x8_t s1_0 = vld1_u8(src + src_stride);
+ uint8x8_t s1_1 = vld1_u8(src + src_stride + 1);
- d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const));
- d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const));
+ uint8x8_t d0 = vrhadd_u8(s0_0, s0_1);
+ uint8x8_t d1 = vrhadd_u8(s1_0, s1_1);
if (w == 2) {
- store_u8_2x1(d + 0 * dst_stride, d01, 0);
- store_u8_2x1(d + 1 * dst_stride, d01, 2);
- if (h != 2) {
- store_u8_2x1(d + 2 * dst_stride, d23, 0);
- store_u8_2x1(d + 3 * dst_stride, d23, 2);
- }
+ store_u8_2x1(dst + 0 * dst_stride, d0, 0);
+ store_u8_2x1(dst + 1 * dst_stride, d1, 0);
} else {
- store_u8_4x1(d + 0 * dst_stride, d01, 0);
- store_u8_4x1(d + 1 * dst_stride, d01, 1);
- if (h != 2) {
- store_u8_4x1(d + 2 * dst_stride, d23, 0);
- store_u8_4x1(d + 3 * dst_stride, d23, 1);
- }
+ store_u8_4x1(dst + 0 * dst_stride, d0, 0);
+ store_u8_4x1(dst + 1 * dst_stride, d1, 0);
}
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- h -= 4;
-#else // !AOM_ARCH_AARCH64
- s7 = vld1_s16(s);
- s += src_stride;
-
- d0 = convolve8_vert_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ src += 2 * src_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t s0_0 = vld1_u8(src);
+ uint8x8_t s0_1 = vld1_u8(src + 1);
+ uint8x8_t s1_0 = vld1_u8(src + src_stride);
+ uint8x8_t s1_1 = vld1_u8(src + src_stride + 1);
- d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const));
+ uint8x8_t d0 = vrhadd_u8(s0_0, s0_1);
+ uint8x8_t d1 = vrhadd_u8(s1_0, s1_1);
- if (w == 2) {
- store_u8_2x1(d, d01, 0);
- } else {
- store_u8_4x1(d, d01, 0);
- }
+ vst1_u8(dst, d0);
+ vst1_u8(dst + dst_stride, d1);
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s5 = s6;
- s6 = s7;
- d += dst_stride;
- h--;
-#endif // AOM_ARCH_AARCH64
- } while (h > 0);
+ src += 2 * src_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
} else {
- // if width is a multiple of 8 & height is a multiple of 4
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint8x8_t d0;
-#if AOM_ARCH_AARCH64
- int16x8_t s8, s9, s10;
- uint8x8_t d1, d2, d3;
-#endif // AOM_ARCH_AARCH64
-
do {
- int height = h;
- int16_t *s = src_ptr;
- uint8_t *d = dst_ptr;
-
- load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- s += 7 * src_stride;
+ const uint8_t *src_ptr = src;
+ uint8_t *dst_ptr = dst;
+ int width = w;
do {
-#if AOM_ARCH_AARCH64
- load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-
- d0 = convolve8_vert_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- sub_const);
- d1 = convolve8_vert_8_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
- sub_const);
- d2 = convolve8_vert_8_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
- sub_const);
- d3 = convolve8_vert_8_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
- sub_const);
-
- if (h != 2) {
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
- } else {
- store_u8_8x2(d, dst_stride, d0, d1);
- }
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t s1 = vld1q_u8(src_ptr + 1);
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
-#else // !AOM_ARCH_AARCH64
- s7 = vld1q_s16(s);
-
- d0 = convolve8_vert_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- sub_const);
-
- vst1_u8(d, d0);
+ uint8x16_t d0 = vrhaddq_u8(s0, s1);
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s5 = s6;
- s6 = s7;
- s += src_stride;
- d += dst_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height > 0);
+ vst1q_u8(dst_ptr, d0);
- src_ptr += 8;
- dst_ptr += 8;
- w -= 8;
- } while (w > 0);
+ src_ptr += 16;
+ dst_ptr += 16;
+ width -= 16;
+ } while (width != 0);
+ src += src_stride;
+ dst += dst_stride;
+ } while (--h != 0);
}
}
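// Why a rounding average suffices in av1_convolve_x_sr_intrabc_neon:
// subpel_x_qn == 8 selects the half-pel position of the 2-tap bilinear
// filter, which (assuming the usual AV1 half-pel kernel { 64, 64 }) gives
//   (64 * a + 64 * b + 64) >> 7 == (a + b + 1) >> 1,
// exactly what vrhadd_u8 computes per lane. Scalar equivalent:
static uint8_t half_pel_model(uint8_t a, uint8_t b) {
  return (uint8_t)(((unsigned)a + b + 1) >> 1);
}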
-static INLINE int16x4_t
-convolve6_vert_4_s32(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x8_t y_filter) {
- const int16x4_t y_filter_lo = vget_low_s16(y_filter);
- const int16x4_t y_filter_hi = vget_high_s16(y_filter);
- int32x4_t sum;
-
- sum = vmull_lane_s16(s0, y_filter_lo, 1);
- sum = vmlal_lane_s16(sum, s1, y_filter_lo, 2);
- sum = vmlal_lane_s16(sum, s2, y_filter_lo, 3);
- sum = vmlal_lane_s16(sum, s3, y_filter_hi, 0);
- sum = vmlal_lane_s16(sum, s4, y_filter_hi, 1);
- sum = vmlal_lane_s16(sum, s5, y_filter_hi, 2);
-
- return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
-}
-
-static INLINE uint8x8_t
-convolve6_vert_8_s32(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t y_filter, const int16x8_t sub_const) {
- const int16x4_t y_filter_lo = vget_low_s16(y_filter);
- const int16x4_t y_filter_hi = vget_high_s16(y_filter);
- int32x4_t sum0, sum1;
- int16x8_t res;
-
- sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 1);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 2);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 3);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_hi, 0);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 1);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 2);
-
- sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 1);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 2);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 3);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_hi, 0);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 1);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 2);
-
- res = vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
- vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
- res = vsubq_s16(res, sub_const);
-
- return vqmovun_s16(res);
-}
-
-static INLINE void convolve_2d_sr_vert_6tap_neon(int16_t *src_ptr,
- int src_stride,
- uint8_t *dst_ptr,
- int dst_stride, int w, int h,
- const int16x8_t y_filter) {
- const int bd = 8;
- const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));
+void av1_convolve_y_sr_intrabc_neon(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_y_qn) {
+ assert(subpel_y_qn == 8);
+ assert(filter_params_y->taps == 2);
+ (void)filter_params_y;
+ (void)subpel_y_qn;
if (w <= 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, d0;
- uint8x8_t d01;
-
-#if AOM_ARCH_AARCH64
- int16x4_t s6, s7, s8, d1, d2, d3;
- uint8x8_t d23;
-#endif // AOM_ARCH_AARCH64
-
- int16_t *s = src_ptr;
- uint8_t *d = dst_ptr;
-
- load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
- s += 5 * src_stride;
-
do {
-#if AOM_ARCH_AARCH64
- load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);
+ uint8x8_t s0 = load_unaligned_u8_4x1(src);
+ uint8x8_t s1 = load_unaligned_u8_4x1(src + src_stride);
+ uint8x8_t s2 = load_unaligned_u8_4x1(src + 2 * src_stride);
- d0 = convolve6_vert_4_s32(s0, s1, s2, s3, s4, s5, y_filter);
- d1 = convolve6_vert_4_s32(s1, s2, s3, s4, s5, s6, y_filter);
- d2 = convolve6_vert_4_s32(s2, s3, s4, s5, s6, s7, y_filter);
- d3 = convolve6_vert_4_s32(s3, s4, s5, s6, s7, s8, y_filter);
-
- d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const));
- d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const));
+ uint8x8_t d0 = vrhadd_u8(s0, s1);
+ uint8x8_t d1 = vrhadd_u8(s1, s2);
if (w == 2) {
- store_u8_2x1(d + 0 * dst_stride, d01, 0);
- store_u8_2x1(d + 1 * dst_stride, d01, 2);
- if (h != 2) {
- store_u8_2x1(d + 2 * dst_stride, d23, 0);
- store_u8_2x1(d + 3 * dst_stride, d23, 2);
- }
+ store_u8_2x1(dst + 0 * dst_stride, d0, 0);
+ store_u8_2x1(dst + 1 * dst_stride, d1, 0);
} else {
- store_u8_4x1(d + 0 * dst_stride, d01, 0);
- store_u8_4x1(d + 1 * dst_stride, d01, 1);
- if (h != 2) {
- store_u8_4x1(d + 2 * dst_stride, d23, 0);
- store_u8_4x1(d + 3 * dst_stride, d23, 1);
- }
+ store_u8_4x1(dst + 0 * dst_stride, d0, 0);
+ store_u8_4x1(dst + 1 * dst_stride, d1, 0);
}
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- h -= 4;
-#else // !AOM_ARCH_AARCH64
- s5 = vld1_s16(s);
+ src += 2 * src_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t s1 = vld1_u8(src + src_stride);
+ uint8x8_t s2 = vld1_u8(src + 2 * src_stride);
- d0 = convolve6_vert_4_s32(s0, s1, s2, s3, s4, s5, y_filter);
- d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const));
+ uint8x8_t d0 = vrhadd_u8(s0, s1);
+ uint8x8_t d1 = vrhadd_u8(s1, s2);
- if (w == 2) {
- store_u8_2x1(d, d01, 0);
- } else {
- store_u8_4x1(d, d01, 0);
- }
+ vst1_u8(dst, d0);
+ vst1_u8(dst + dst_stride, d1);
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s += src_stride;
- d += dst_stride;
- h--;
-#endif // AOM_ARCH_AARCH64
- } while (h > 0);
+ src += 2 * src_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
} else {
- // if width is a multiple of 8 & height is a multiple of 4
- int16x8_t s0, s1, s2, s3, s4, s5;
- uint8x8_t d0;
-#if AOM_ARCH_AARCH64
- int16x8_t s6, s7, s8;
- uint8x8_t d1, d2, d3;
-#endif // AOM_ARCH_AARCH64
-
do {
+ const uint8_t *src_ptr = src;
+ uint8_t *dst_ptr = dst;
int height = h;
- int16_t *s = src_ptr;
- uint8_t *d = dst_ptr;
-
- load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
- s += 5 * src_stride;
do {
-#if AOM_ARCH_AARCH64
- load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
-
- d0 = convolve6_vert_8_s32(s0, s1, s2, s3, s4, s5, y_filter, sub_const);
- d1 = convolve6_vert_8_s32(s1, s2, s3, s4, s5, s6, y_filter, sub_const);
- d2 = convolve6_vert_8_s32(s2, s3, s4, s5, s6, s7, y_filter, sub_const);
- d3 = convolve6_vert_8_s32(s3, s4, s5, s6, s7, s8, y_filter, sub_const);
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t s1 = vld1q_u8(src_ptr + src_stride);
- if (h != 2) {
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
- } else {
- store_u8_8x2(d, dst_stride, d0, d1);
- }
+ uint8x16_t d0 = vrhaddq_u8(s0, s1);
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
-#else // !AOM_ARCH_AARCH64
- s5 = vld1q_s16(s);
+ vst1q_u8(dst_ptr, d0);
- d0 = convolve6_vert_8_s32(s0, s1, s2, s3, s4, s5, y_filter, sub_const);
-
- vst1_u8(d, d0);
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s += src_stride;
- d += dst_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height > 0);
-
- src_ptr += 8;
- dst_ptr += 8;
- w -= 8;
- } while (w > 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ src += 16;
+ dst += 16;
+ w -= 16;
+ } while (w != 0);
}
}
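// The vertical intrabc loops above yield two output rows per iteration from
// three row loads because the middle row feeds both averages:
//   d0[i] = (s0[i] + s1[i] + 1) >> 1;  d1[i] = (s1[i] + s2[i] + 1) >> 1;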
-void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_qn, const int subpel_y_qn,
- ConvolveParams *conv_params) {
+void av1_convolve_2d_sr_intrabc_neon(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn,
+ const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ assert(subpel_x_qn == 8);
+ assert(subpel_y_qn == 8);
+ assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
+ assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+ (void)filter_params_x;
+ (void)subpel_x_qn;
+ (void)filter_params_y;
+ (void)subpel_y_qn;
(void)conv_params;
- const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
- const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
- const int im_h = h + clamped_y_taps - 1;
- const int im_stride = MAX_SB_SIZE;
- const int vert_offset = clamped_y_taps / 2 - 1;
- const int horiz_offset = filter_params_x->taps / 2 - 1;
- const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
- const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
- const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ uint16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+ int im_h = h + 1;
+ int im_stride = w;
+ assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
- if (filter_params_x->taps > 8) {
- DECLARE_ALIGNED(16, int16_t,
- im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ uint16_t *im = im_block;
- const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
- const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
- const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
- const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
+ // Horizontal filter.
+ if (w <= 4) {
+ do {
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t s1 = vld1_u8(src + 1);
- convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block, im_stride, w,
- im_h, x_filter_0_7, x_filter_8_11);
+ uint16x4_t sum = vget_low_u16(vaddl_u8(s0, s1));
- convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h,
- y_filter_0_7, y_filter_8_11);
+      // Safe to store the whole vector; the im buffer is big enough.
+ vst1_u16(im, sum);
+
+ src += src_stride;
+ im += im_stride;
+ } while (--im_h != 0);
} else {
- DECLARE_ALIGNED(16, int16_t,
- im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+ do {
+ const uint8_t *src_ptr = src;
+ uint16_t *im_ptr = im;
+ int width = w;
- const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
- const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+ uint8x8_t s1 = vld1_u8(src_ptr + 1);
- convolve_2d_sr_horiz_8tap_neon(src_ptr, src_stride, im_block, im_stride, w,
- im_h, x_filter);
+ uint16x8_t sum = vaddl_u8(s0, s1);
- if (clamped_y_taps <= 6) {
- convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h,
- y_filter);
- } else {
- convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h,
- y_filter);
- }
- }
-}
+ vst1q_u16(im_ptr, sum);
-static INLINE void scaledconvolve_horiz_w4(
- const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
- const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
- const int x0_q4, const int x_step_q4, const int w, const int h) {
- DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
- int x, y, z;
+ src_ptr += 8;
+ im_ptr += 8;
+ width -= 8;
+ } while (width != 0);
+ src += src_stride;
+ im += im_stride;
+ } while (--im_h != 0);
+ }
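// End-to-end model of the 2D intrabc path: the horizontal stage above stores
// a + b per pixel (scale x2), the vertical stage below adds two such rows
// (scale x4), and vqrshrn_n_u16(..., 2) rounds back to pixel range, giving
// the separable bilinear average. Scalar equivalent (illustrative name):
static uint8_t half_pel_2d_model(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  const unsigned top = (unsigned)a + b;    // horizontal stage, row i
  const unsigned bot = (unsigned)c + d;    // horizontal stage, row i + 1
  return (uint8_t)((top + bot + 2) >> 2);  // vertical stage + rounding
}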
- src -= SUBPEL_TAPS / 2 - 1;
+ im = im_block;
- y = h;
- do {
- int x_q4 = x0_q4;
- x = 0;
+ // Vertical filter.
+ if (w <= 4) {
do {
- // process 4 src_x steps
- for (z = 0; z < 4; ++z) {
- const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
- if (x_q4 & SUBPEL_MASK) {
- const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
- uint8x8_t s[8], d;
- int16x8_t ss[4];
- int16x4_t t[8], tt;
-
- load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
- transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]);
-
- ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
- ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
- ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
- ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
- t[0] = vget_low_s16(ss[0]);
- t[1] = vget_low_s16(ss[1]);
- t[2] = vget_low_s16(ss[2]);
- t[3] = vget_low_s16(ss[3]);
- t[4] = vget_high_s16(ss[0]);
- t[5] = vget_high_s16(ss[1]);
- t[6] = vget_high_s16(ss[2]);
- t[7] = vget_high_s16(ss[3]);
-
- tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
- filters);
- d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
- store_u8_4x1(&temp[4 * z], d, 0);
- } else {
- int i;
- for (i = 0; i < 4; ++i) {
- temp[z * 4 + i] = src_x[i * src_stride + 3];
- }
- }
- x_q4 += x_step_q4;
- }
-
- // transpose the 4x4 filters values back to dst
- {
- const uint8x8x4_t d4 = vld4_u8(temp);
- store_u8_4x1(&dst[x + 0 * dst_stride], d4.val[0], 0);
- store_u8_4x1(&dst[x + 1 * dst_stride], d4.val[1], 0);
- store_u8_4x1(&dst[x + 2 * dst_stride], d4.val[2], 0);
- store_u8_4x1(&dst[x + 3 * dst_stride], d4.val[3], 0);
- }
- x += 4;
- } while (x < w);
-
- src += src_stride * 4;
- dst += dst_stride * 4;
- y -= 4;
- } while (y > 0);
-}
+ uint16x4_t s0 = vld1_u16(im);
+ uint16x4_t s1 = vld1_u16(im + im_stride);
+ uint16x4_t s2 = vld1_u16(im + 2 * im_stride);
-static INLINE void scaledconvolve_horiz_w8(
- const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
- const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
- const int x0_q4, const int x_step_q4, const int w, const int h) {
- DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
- int x, y, z;
- src -= SUBPEL_TAPS / 2 - 1;
+ uint16x4_t sum0 = vadd_u16(s0, s1);
+ uint16x4_t sum1 = vadd_u16(s1, s2);
- // This function processes 8x8 areas. The intermediate height is not always
- // a multiple of 8, so force it to be a multiple of 8 here.
- y = (h + 7) & ~7;
+ uint8x8_t d01 = vqrshrn_n_u16(vcombine_u16(sum0, sum1), 2);
- do {
- int x_q4 = x0_q4;
- x = 0;
- do {
- uint8x8_t d[8];
- // process 8 src_x steps
- for (z = 0; z < 8; ++z) {
- const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-
- if (x_q4 & SUBPEL_MASK) {
- const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
- uint8x8_t s[8];
- load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4],
- &s[5], &s[6], &s[7]);
- transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
- &s[7]);
- d[0] = scale_filter_8(s, filters);
- vst1_u8(&temp[8 * z], d[0]);
- } else {
- int i;
- for (i = 0; i < 8; ++i) {
- temp[z * 8 + i] = src_x[i * src_stride + 3];
- }
- }
- x_q4 += x_step_q4;
+ if (w == 2) {
+ store_u8_2x1(dst + 0 * dst_stride, d01, 0);
+ store_u8_2x1(dst + 1 * dst_stride, d01, 2);
+ } else {
+ store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+ store_u8_4x1(dst + 1 * dst_stride, d01, 1);
}
- // transpose the 8x8 filters values back to dst
- load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
- &d[7]);
- transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
- store_u8_8x8(dst + x, dst_stride, d[0], d[1], d[2], d[3], d[4], d[5],
- d[6], d[7]);
- x += 8;
- } while (x < w);
-
- src += src_stride * 8;
- dst += dst_stride * 8;
- } while (y -= 8);
-}
-
-static INLINE void scaledconvolve_vert_w4(
- const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
- const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
- const int y0_q4, const int y_step_q4, const int w, const int h) {
- int y;
- int y_q4 = y0_q4;
-
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- y = h;
- do {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-
- if (y_q4 & SUBPEL_MASK) {
- const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
- uint8x8_t s[8], d;
- int16x4_t t[8], tt;
-
- load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
- &s[6], &s[7]);
- t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0])));
- t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1])));
- t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2])));
- t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3])));
- t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4])));
- t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5])));
- t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
- t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
-
- tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters);
- d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
- store_u8_4x1(dst, d, 0);
- } else {
- memcpy(dst, &src_y[3 * src_stride], w);
- }
-
- dst += dst_stride;
- y_q4 += y_step_q4;
- } while (--y);
-}
-
-static INLINE void scaledconvolve_vert_w8(
- const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
- const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
- const int y0_q4, const int y_step_q4, const int w, const int h) {
- int y;
- int y_q4 = y0_q4;
-
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- y = h;
- do {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- if (y_q4 & SUBPEL_MASK) {
- const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
- uint8x8_t s[8], d;
- load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
- &s[6], &s[7]);
- d = scale_filter_8(s, filters);
- vst1_u8(dst, d);
- } else {
- memcpy(dst, &src_y[3 * src_stride], w);
- }
- dst += dst_stride;
- y_q4 += y_step_q4;
- } while (--y);
-}
-
-static INLINE void scaledconvolve_vert_w16(
- const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
- const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
- const int y0_q4, const int y_step_q4, const int w, const int h) {
- int x, y;
- int y_q4 = y0_q4;
+ im += 2 * im_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else {
+ do {
+ uint16_t *im_ptr = im;
+ uint8_t *dst_ptr = dst;
+ int height = h;
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- y = h;
- do {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- if (y_q4 & SUBPEL_MASK) {
- x = 0;
do {
- const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
- uint8x16_t ss[8];
- uint8x8_t s[8], d[2];
- load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4],
- &ss[5], &ss[6], &ss[7]);
- s[0] = vget_low_u8(ss[0]);
- s[1] = vget_low_u8(ss[1]);
- s[2] = vget_low_u8(ss[2]);
- s[3] = vget_low_u8(ss[3]);
- s[4] = vget_low_u8(ss[4]);
- s[5] = vget_low_u8(ss[5]);
- s[6] = vget_low_u8(ss[6]);
- s[7] = vget_low_u8(ss[7]);
- d[0] = scale_filter_8(s, filters);
-
- s[0] = vget_high_u8(ss[0]);
- s[1] = vget_high_u8(ss[1]);
- s[2] = vget_high_u8(ss[2]);
- s[3] = vget_high_u8(ss[3]);
- s[4] = vget_high_u8(ss[4]);
- s[5] = vget_high_u8(ss[5]);
- s[6] = vget_high_u8(ss[6]);
- s[7] = vget_high_u8(ss[7]);
- d[1] = scale_filter_8(s, filters);
- vst1q_u8(&dst[x], vcombine_u8(d[0], d[1]));
- src_y += 16;
- x += 16;
- } while (x < w);
- } else {
- memcpy(dst, &src_y[3 * src_stride], w);
- }
- dst += dst_stride;
- y_q4 += y_step_q4;
- } while (--y);
-}
+ uint16x8_t s0 = vld1q_u16(im_ptr);
+ uint16x8_t s1 = vld1q_u16(im_ptr + im_stride);
-void aom_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const InterpKernel *filter,
- int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
- int w, int h) {
- // Note: Fixed size intermediate buffer, temp, places limits on parameters.
- // 2d filtering proceeds in 2 steps:
- // (1) Interpolate horizontally into an intermediate buffer, temp.
- // (2) Interpolate temp vertically to derive the sub-pixel result.
- // Deriving the maximum number of rows in the temp buffer (135):
- // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
- // --Largest block size is 64x64 pixels.
- // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
- // original frame (in 1/16th pixel units).
- // --Must round-up because block may be located at sub-pixel position.
- // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
- // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
- // --Require an additional 8 rows for the horiz_w8 transpose tail.
- // When calling in frame scaling function, the smallest scaling factor is x1/4
- // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
- // big enough.
- DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
- const int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
- assert(w <= 64);
- assert(h <= 64);
- assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
- assert(x_step_q4 <= 64);
-
- if (w >= 8) {
- scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
- intermediate_height);
- } else {
- scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
- intermediate_height);
- }
+ uint16x8_t sum = vaddq_u16(s0, s1);
+ uint8x8_t d0 = vqrshrn_n_u16(sum, 2);
- if (w >= 16) {
- scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, filter, y0_q4, y_step_q4, w, h);
- } else if (w == 8) {
- scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, filter, y0_q4, y_step_q4, w, h);
- } else {
- scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, filter, y0_q4, y_step_q4, w, h);
+ vst1_u8(dst_ptr, d0);
+
+ im_ptr += im_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ im += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
}
}
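+
+// The path above is the 2-tap averaging special case: the horizontal pass
+// stores s[x] + s[x + 1] into the intermediate buffer and the vertical pass
+// adds two such rows before a rounding shift by 2. Illustrative scalar
+// sketch of one output pixel:
+//
+//   int sum = s[x] + s[x + 1] + s[x + stride] + s[x + stride + 1];
+//   dst[x] = (uint8_t)((sum + 2) >> 2);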
diff --git a/av1/common/arm/convolve_neon.h b/av1/common/arm/convolve_neon.h
index 14a6ebe55..6b8edf871 100644
--- a/av1/common/arm/convolve_neon.h
+++ b/av1/common/arm/convolve_neon.h
@@ -15,549 +15,530 @@
#include "config/aom_config.h"
-#define HORIZ_EXTRA_ROWS ((SUBPEL_TAPS + 7) & ~0x07)
-
-static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
- const int16x4_t s2, const int16x4_t s3,
- const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7,
- const int16x8_t filter) {
- const int16x4_t filter_lo = vget_low_s16(filter);
- const int16x4_t filter_hi = vget_high_s16(filter);
- int16x4_t sum;
-
- sum = vmul_lane_s16(s0, filter_lo, 0);
- sum = vmla_lane_s16(sum, s1, filter_lo, 1);
- sum = vmla_lane_s16(sum, s2, filter_lo, 2);
- sum = vmla_lane_s16(sum, s5, filter_hi, 1);
- sum = vmla_lane_s16(sum, s6, filter_hi, 2);
- sum = vmla_lane_s16(sum, s7, filter_hi, 3);
- sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3));
- sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0));
- return sum;
-}
-
-static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
- const int16x8_t s2, const int16x8_t s3,
- const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7,
- const int16x8_t filter) {
- const int16x4_t filter_lo = vget_low_s16(filter);
- const int16x4_t filter_hi = vget_high_s16(filter);
- int16x8_t sum;
-
- sum = vmulq_lane_s16(s0, filter_lo, 0);
- sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
- sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
- sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
- sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
- sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
- sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3));
- sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0));
- return vqrshrun_n_s16(sum, 7);
-}
-
-static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
- const int16x8_t filter) {
- int16x8_t ss[8];
-
- ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
- ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
- ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
- ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
- ss[4] = vreinterpretq_s16_u16(vmovl_u8(s[4]));
- ss[5] = vreinterpretq_s16_u16(vmovl_u8(s[5]));
- ss[6] = vreinterpretq_s16_u16(vmovl_u8(s[6]));
- ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7]));
-
- return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7],
- filter);
-}
-
-static INLINE uint8x8_t wiener_convolve8_vert_4x8(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, int16_t *filter_y, const int bd,
- const int round1_bits) {
- int16x8_t ss0, ss1, ss2;
- int32x4_t sum0, sum1;
- int16x8_t tmp;
- uint8x8_t res;
-
- const int32_t round_const = (1 << (bd + round1_bits - 1));
- const int32x4_t round_bits = vdupq_n_s32(-round1_bits);
- const int32x4_t round_vec = vdupq_n_s32(round_const);
- const int16x4_t filter = vld1_s16(filter_y);
-
- ss0 = vaddq_s16(s0, s6);
- ss1 = vaddq_s16(s1, s5);
- ss2 = vaddq_s16(s2, s4);
-
- sum0 = vmull_lane_s16(vget_low_s16(ss0), filter, 0);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(ss1), filter, 1);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(ss2), filter, 2);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3);
-
- sum1 = vmull_lane_s16(vget_high_s16(ss0), filter, 0);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(ss1), filter, 1);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(ss2), filter, 2);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3);
-
- sum0 = vsubq_s32(sum0, round_vec);
- sum1 = vsubq_s32(sum1, round_vec);
-
- /* right shift & rounding */
- sum0 = vrshlq_s32(sum0, round_bits);
- sum1 = vrshlq_s32(sum1, round_bits);
-
- /* from int32x4_t to uint8x8_t */
- tmp = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
- res = vqmovun_s16(tmp);
-
- return res;
-}
-
-static INLINE uint16x8_t wiener_convolve8_horiz_8x8(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, int16_t *filter_x, const int bd,
- const int round0_bits) {
- int16x8_t sum;
- uint16x8_t res;
- int32x4_t sum_0, sum_1;
- int32x4_t s3_0, s3_1;
- const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1));
- const int32_t round_const_1 = (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1;
-
- /* for the purpose of right shift by { conv_params->round_0 } */
- const int32x4_t round_bits = vdupq_n_s32(-round0_bits);
-
- const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0);
- const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1);
- const int16x4_t filter = vld1_s16(filter_x);
-
- sum = vmulq_lane_s16(s0, filter, 0);
- sum = vmlaq_lane_s16(sum, s1, filter, 1);
- sum = vmlaq_lane_s16(sum, s2, filter, 2);
-
- /* sum from 16x8 to 2 32x4 registers */
- sum_0 = vmovl_s16(vget_low_s16(sum));
- sum_1 = vmovl_s16(vget_high_s16(sum));
-
- /* s[3]*128 -- and filter coef max can be 128
- * then max value possible = 128*128*255 exceeding 16 bit
- */
-
- s3_0 = vmull_lane_s16(vget_low_s16(s3), filter, 3);
- s3_1 = vmull_lane_s16(vget_high_s16(s3), filter, 3);
- sum_0 = vaddq_s32(sum_0, s3_0);
- sum_1 = vaddq_s32(sum_1, s3_1);
-
- /* Add the constant value */
- sum_0 = vaddq_s32(sum_0, round_vec_0);
- sum_1 = vaddq_s32(sum_1, round_vec_0);
-
- /* right shift & rounding & saturating */
- sum_0 = vqrshlq_s32(sum_0, round_bits);
- sum_1 = vqrshlq_s32(sum_1, round_bits);
-
- /* Clipping to max value */
- sum_0 = vminq_s32(sum_0, round_vec_1);
- sum_1 = vminq_s32(sum_1, round_vec_1);
-
- res = vcombine_u16(vqmovun_s32(sum_0), vqmovun_s32(sum_1));
- return res;
-}
-
-static INLINE uint16x4_t wiener_convolve8_horiz_4x8(
- const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, int16_t *filter_x, const int bd,
- const int round0_bits) {
- uint16x4_t res;
- int32x4_t sum_0, s3_0;
- int16x4_t sum, temp0, temp1, temp2;
-
- const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1));
- const int32_t round_const_1 = (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1;
- const int32x4_t round_bits = vdupq_n_s32(-round0_bits);
- const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0);
- const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1);
- const int16x4_t filter = vld1_s16(filter_x);
-
- temp0 = vadd_s16(s0, s6);
- temp1 = vadd_s16(s1, s5);
- temp2 = vadd_s16(s2, s4);
-
- sum = vmul_lane_s16(temp0, filter, 0);
- sum = vmla_lane_s16(sum, temp1, filter, 1);
- sum = vmla_lane_s16(sum, temp2, filter, 2);
- sum_0 = vmovl_s16(sum);
-
- /* s[3]*128 -- and filter coff max can be 128.
- * then max value possible = 128*128*255 Therefore, 32 bits are required to
- * hold the result.
- */
- s3_0 = vmull_lane_s16(s3, filter, 3);
- sum_0 = vaddq_s32(sum_0, s3_0);
-
- sum_0 = vaddq_s32(sum_0, round_vec_0);
- sum_0 = vrshlq_s32(sum_0, round_bits);
-
- sum_0 = vminq_s32(sum_0, round_vec_1);
- res = vqmovun_s32(sum_0);
- return res;
-}
-
-static INLINE int16x8_t convolve8_8x8_s16(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
- const int16x8_t horiz_const, const int16x8_t shift_round_0) {
- const int16x4_t filter_lo = vget_low_s16(filter);
- const int16x4_t filter_hi = vget_high_s16(filter);
- int16x8_t sum;
-
- sum = horiz_const;
- sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
- sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
- sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
- sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
- sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
- sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
- sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
- sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
-
- sum = vqrshlq_s16(sum, shift_round_0);
-
- return sum;
-}
-
-// clang versions < 16 did not include the dotprod feature for Arm architecture
-// versions that should have it by default, e.g., armv8.6-a.
-#if AOM_ARCH_AARCH64 && \
- (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
-
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
- 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
- 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
- 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-};
-
-#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE int16x8_t convolve8_x_8_usdot(uint8x16_t samples,
- const int8x8_t filters,
- const uint8x16x3_t permute_tbl,
- const int32x4_t horiz_const) {
- uint8x16_t permuted_samples[3];
- int32x4_t sum[2];
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
-
- /* First 4 output values. */
- sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], filters, 0);
- sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], filters, 1);
- /* Second 4 output values. */
- sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], filters, 0);
- sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], filters, 1);
-
- return vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1]));
-}
-
-static INLINE int16x8_t convolve8_horiz_8_usdot(uint8x16_t samples,
- const int8x8_t filters,
- const uint8x16x3_t permute_tbl,
- const int32x4_t horiz_const) {
- uint8x16_t permuted_samples[3];
- int32x4_t sum[2];
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
-
- /* First 4 output values. */
- sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], filters, 0);
- sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], filters, 1);
- /* Second 4 output values. */
- sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], filters, 0);
- sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], filters, 1);
-
- /* Narrow and re-pack. */
- // We halved the convolution filter values so -1 from the right shift.
- return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
- vshrn_n_s32(sum[1], ROUND0_BITS - 1));
-}
-
-static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples,
- const int8x8_t filters,
- const uint8x16x2_t permute_tbl,
- const int32x4_t horiz_const) {
- uint8x16_t permuted_samples[2];
- int32x4_t sum;
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+
+static INLINE int32x4_t
+convolve12_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
+ const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
+ const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
- /* First 4 output values. */
- sum = vusdotq_lane_s32(horiz_const, permuted_samples[0], filters, 0);
- sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
+ int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0);
+ sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
+ sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
+ sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
+ sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
+ sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);
- /* Narrowing and packing is performed by the caller. */
return sum;
}
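+
+// The 12-tap vertical pass accumulates in 32 bits (vmull/vmlal): the
+// horizontal pass produces intermediates wider than 8 bits, so a sum of
+// twelve 16-bit products is not guaranteed to fit in int16. Illustrative
+// scalar model of the accumulation:
+//
+//   int32_t acc = 0;
+//   for (int k = 0; k < 12; k++) acc += (int32_t)s[k] * y_filter[k];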
-#elif AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE int16x8_t convolve8_horiz_8_sdot(uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x3_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[3];
- int32x4_t sum[2];
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- sum[0] = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
- sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], filters, 1);
- /* Second 4 output values. */
- sum[1] = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
- sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], filters, 1);
-
- /* Narrow and re-pack. */
- /* We halved the convolution filter values so -1 from the right shift. */
- return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
- vshrn_n_s32(sum[1], ROUND0_BITS - 1));
-}
-
-static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x2_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[2];
- int32x4_t sum;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
- sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1);
+static INLINE uint8x8_t
+convolve12_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
+ const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
+ const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
+ const int16x8_t sub_const) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
- /* Narrowing and packing is performed by the caller. */
- return sum;
+ int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);
+
+ int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);
+
+ int16x8_t res =
+ vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
+ vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
+ res = vsubq_s16(res, sub_const);
+
+ return vqmovun_s16(res);
}
-static INLINE int16x8_t convolve8_8_sdot(uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x3_t permute_tbl,
- const int16x8_t shift_round_0) {
- int8x16_t clamped_samples, permuted_samples[3];
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
- sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
- /* Second 4 output values. */
- sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
- sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
- return vqrshlq_s16(sum, shift_round_0);
+static INLINE void convolve_2d_sr_vert_12tap_neon(
+ int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
+ int h, const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
+ const int bd = 8;
+ const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));
+
+ if (w <= 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ load_s16_4x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
+ &s8, &s9, &s10);
+ src_ptr += 11 * src_stride;
+
+ do {
+ int16x4_t s11, s12, s13, s14;
+ load_s16_4x4(src_ptr, src_stride, &s11, &s12, &s13, &s14);
+
+ int32x4_t d0 = convolve12_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9,
+ s10, s11, y_filter_0_7, y_filter_8_11);
+ int32x4_t d1 = convolve12_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+ s11, s12, y_filter_0_7, y_filter_8_11);
+ int32x4_t d2 = convolve12_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ s12, s13, y_filter_0_7, y_filter_8_11);
+ int32x4_t d3 =
+ convolve12_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+ y_filter_0_7, y_filter_8_11);
+
+ int16x8_t dd01 =
+ vcombine_s16(vqrshrn_n_s32(d0, 2 * FILTER_BITS - ROUND0_BITS),
+ vqrshrn_n_s32(d1, 2 * FILTER_BITS - ROUND0_BITS));
+ int16x8_t dd23 =
+ vcombine_s16(vqrshrn_n_s32(d2, 2 * FILTER_BITS - ROUND0_BITS),
+ vqrshrn_n_s32(d3, 2 * FILTER_BITS - ROUND0_BITS));
+
+ dd01 = vsubq_s16(dd01, sub_const);
+ dd23 = vsubq_s16(dd23, sub_const);
+
+ uint8x8_t d01 = vqmovun_s16(dd01);
+ uint8x8_t d23 = vqmovun_s16(dd23);
+
+ store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0);
+ store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1);
+ store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0);
+ store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s7 = s11;
+ s8 = s12;
+ s9 = s13;
+ s10 = s14;
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+
+ } else {
+ do {
+ int height = h;
+ int16_t *s = src_ptr;
+ uint8_t *d = dst_ptr;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
+ &s9, &s10);
+ s += 11 * src_stride;
+
+ do {
+ int16x8_t s11, s12, s13, s14;
+ load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);
+
+ uint8x8_t d0 =
+ convolve12_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ y_filter_0_7, y_filter_8_11, sub_const);
+ uint8x8_t d1 =
+ convolve12_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+ y_filter_0_7, y_filter_8_11, sub_const);
+ uint8x8_t d2 =
+ convolve12_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+ s13, y_filter_0_7, y_filter_8_11, sub_const);
+ uint8x8_t d3 =
+ convolve12_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+ s14, y_filter_0_7, y_filter_8_11, sub_const);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s7 = s11;
+ s8 = s12;
+ s9 = s13;
+ s10 = s14;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
}
-static INLINE int16x8_t convolve8_x_8_sdot(uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x3_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[3];
- int32x4_t sum[2];
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- sum[0] = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
- sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], filters, 1);
- /* Second 4 output values. */
- sum[1] = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
- sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], filters, 1);
-
- /* Narrow and re-pack. */
- return vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1]));
+static INLINE int16x4_t convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16x8_t y_filter) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+ int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 0);
+ sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
+
+ return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
}
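+
+// The narrowing shift above removes the remaining filter gain: the two
+// passes apply 2 * FILTER_BITS of gain in total and the horizontal pass has
+// already shifted right by ROUND0_BITS, e.g. 2 * 7 - 3 = 11 bits here.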
-#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE int16x4_t convolve8_4x4_s16(
- const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
- const int16x4_t horiz_const, const int16x4_t shift_round_0) {
- const int16x4_t filter_lo = vget_low_s16(filter);
- const int16x4_t filter_hi = vget_high_s16(filter);
- int16x4_t sum;
-
- sum = horiz_const;
- sum = vmla_lane_s16(sum, s0, filter_lo, 0);
- sum = vmla_lane_s16(sum, s1, filter_lo, 1);
- sum = vmla_lane_s16(sum, s2, filter_lo, 2);
- sum = vmla_lane_s16(sum, s3, filter_lo, 3);
- sum = vmla_lane_s16(sum, s4, filter_hi, 0);
- sum = vmla_lane_s16(sum, s5, filter_hi, 1);
- sum = vmla_lane_s16(sum, s6, filter_hi, 2);
- sum = vmla_lane_s16(sum, s7, filter_hi, 3);
-
- sum = vqrshl_s16(sum, shift_round_0);
-
- return sum;
+static INLINE uint8x8_t convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t y_filter,
+ const int16x8_t sub_const) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+ int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3);
+
+ int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3);
+
+ int16x8_t res =
+ vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
+ vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
+ res = vsubq_s16(res, sub_const);
+
+ return vqmovun_s16(res);
}
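+
+// sub_const == 1 << (bd - 1), i.e. 128 for 8-bit, cancels the offset the
+// horizontal pass added to keep the intermediate values non-negative; it is
+// subtracted only after the narrowing shift, before saturating to uint8.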
-static INLINE int16x4_t convolve6_4x4(const int16x4_t s0, const int16x4_t s1,
- const int16x4_t s2, const int16x4_t s3,
- const int16x4_t s4, const int16x4_t s5,
- const int16x8_t y_filter_0_7) {
- const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
- const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
- int16x4_t sum;
-
- // Filter values at indices 0 and 7 are 0.
- sum = vmul_lane_s16(s0, y_filter_0_3, 1);
- sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2);
- sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3);
- sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0);
- sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1);
- sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2);
-
- return sum;
+static INLINE void convolve_2d_sr_vert_8tap_neon(int16_t *src_ptr,
+ int src_stride,
+ uint8_t *dst_ptr,
+ int dst_stride, int w, int h,
+ const int16x8_t y_filter) {
+ const int bd = 8;
+ const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));
+
+ if (w <= 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ src_ptr += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
+
+ int16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ int16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+ int16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+ int16x4_t d3 =
+ convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+
+ uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const));
+ uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const));
+
+ store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0);
+ store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1);
+ store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0);
+ store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x4_t s7 = vld1_s16(src_ptr);
+ int16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ uint8x8_t d01 =
+ vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const));
+
+ store_u8_4x1(dst_ptr, d01, 0);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ h--;
+#endif // AOM_ARCH_AARCH64
+ } while (h != 0);
+ } else {
+ // Width is a multiple of 8 and height is a multiple of 4.
+ do {
+ int height = h;
+ int16_t *s = src_ptr;
+ uint8_t *d = dst_ptr;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint8x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, sub_const);
+ uint8x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, sub_const);
+ uint8x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, sub_const);
+ uint8x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, sub_const);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s7 = vld1q_s16(s);
+ uint8x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, sub_const);
+ vst1_u8(d, d0);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
}
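+
+// On AArch64 the loops above keep the whole window of source rows live in
+// registers and compute four rows per iteration; the Armv7 path computes a
+// single row per iteration to limit register pressure.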
-static INLINE int16x8_t convolve6_8x4(const int16x8_t s0, const int16x8_t s1,
- const int16x8_t s2, const int16x8_t s3,
- const int16x8_t s4, const int16x8_t s5,
- const int16x8_t y_filters) {
- const int16x4_t y_filter_lo = vget_low_s16(y_filters);
- const int16x4_t y_filter_hi = vget_high_s16(y_filters);
- int16x8_t sum;
-
- // Filter values at indices 0 and 7 are 0.
- sum = vmulq_lane_s16(s0, y_filter_lo, 1);
- sum = vmlaq_lane_s16(sum, s1, y_filter_lo, 2);
- sum = vmlaq_lane_s16(sum, s2, y_filter_lo, 3);
- sum = vmlaq_lane_s16(sum, s3, y_filter_hi, 0);
- sum = vmlaq_lane_s16(sum, s4, y_filter_hi, 1);
- sum = vmlaq_lane_s16(sum, s5, y_filter_hi, 2);
-
- return sum;
+static INLINE int16x4_t convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t s4, const int16x4_t s5,
+ const int16x8_t y_filter) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
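+ // Filter values at indices 0 and 7 are 0, so accumulation starts at lane 1.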
+ int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 1);
+ sum = vmlal_lane_s16(sum, s1, y_filter_lo, 2);
+ sum = vmlal_lane_s16(sum, s2, y_filter_lo, 3);
+ sum = vmlal_lane_s16(sum, s3, y_filter_hi, 0);
+ sum = vmlal_lane_s16(sum, s4, y_filter_hi, 1);
+ sum = vmlal_lane_s16(sum, s5, y_filter_hi, 2);
+
+ return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
}
-#if !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
-
-static INLINE int16x4_t convolve8_horiz_4x4_s16(
- const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
- const int16x4_t horiz_const) {
- const int16x4_t filter_lo = vget_low_s16(filter);
- const int16x4_t filter_hi = vget_high_s16(filter);
- int16x4_t sum;
-
- sum = horiz_const;
- sum = vmla_lane_s16(sum, s0, filter_lo, 0);
- sum = vmla_lane_s16(sum, s1, filter_lo, 1);
- sum = vmla_lane_s16(sum, s2, filter_lo, 2);
- sum = vmla_lane_s16(sum, s3, filter_lo, 3);
- sum = vmla_lane_s16(sum, s4, filter_hi, 0);
- sum = vmla_lane_s16(sum, s5, filter_hi, 1);
- sum = vmla_lane_s16(sum, s6, filter_hi, 2);
- sum = vmla_lane_s16(sum, s7, filter_hi, 3);
-
- // We halved the convolution filter values so -1 from the right shift.
- return vshr_n_s16(sum, ROUND0_BITS - 1);
+static INLINE uint8x8_t convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t y_filter,
+ const int16x8_t sub_const) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
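+ // Filter values at indices 0 and 7 are 0, so accumulation starts at lane 1.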
+ int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_hi, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 2);
+
+ int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_hi, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 2);
+
+ int16x8_t res =
+ vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
+ vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
+ res = vsubq_s16(res, sub_const);
+
+ return vqmovun_s16(res);
}
-static INLINE int16x8_t convolve8_horiz_8x8_s16(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
- const int16x8_t horiz_const) {
- const int16x4_t filter_lo = vget_low_s16(filter);
- const int16x4_t filter_hi = vget_high_s16(filter);
- int16x8_t sum;
-
- sum = horiz_const;
- sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
- sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
- sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
- sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
- sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
- sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
- sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
- sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
-
- // We halved the convolution filter values so -1 from the right shift.
- return vshrq_n_s16(sum, ROUND0_BITS - 1);
+static INLINE void convolve_2d_sr_vert_6tap_neon(int16_t *src_ptr,
+ int src_stride,
+ uint8_t *dst_ptr,
+ int dst_stride, int w, int h,
+ const int16x8_t y_filter) {
+ const int bd = 8;
+ const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));
+
+ if (w <= 4) {
+ int16x4_t s0, s1, s2, s3, s4;
+ load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
+ src_ptr += 5 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x4_t s5, s6, s7, s8;
+ load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
+
+ int16x4_t d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter);
+ int16x4_t d1 = convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter);
+ int16x4_t d2 = convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter);
+ int16x4_t d3 = convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter);
+
+ uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const));
+ uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const));
+
+ store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0);
+ store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1);
+ store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0);
+ store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x4_t s5 = vld1_s16(src_ptr);
+ int16x4_t d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter);
+ uint8x8_t d01 =
+ vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const));
+
+ store_u8_4x1(dst_ptr, d01, 0);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ h--;
+#endif // AOM_ARCH_AARCH64
+ } while (h != 0);
+ } else {
+ // Width is a multiple of 8 and height is a multiple of 4.
+ do {
+ int height = h;
+ int16_t *s = src_ptr;
+ uint8_t *d = dst_ptr;
+
+ int16x8_t s0, s1, s2, s3, s4;
+ load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+ s += 5 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x8_t s5, s6, s7, s8;
+ load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+ uint8x8_t d0 =
+ convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, sub_const);
+ uint8x8_t d1 =
+ convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, sub_const);
+ uint8x8_t d2 =
+ convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, sub_const);
+ uint8x8_t d3 =
+ convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, sub_const);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s5 = vld1q_s16(s);
+ uint8x8_t d0 =
+ convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, sub_const);
+ vst1_u8(d, d0);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
}
-#endif // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
-
#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c
new file mode 100644
index 000000000..ba8f7e74e
--- /dev/null
+++ b/av1/common/arm/convolve_neon_dotprod.c
@@ -0,0 +1,797 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
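+
+// Each 16-byte row of the table gathers four overlapping 4-byte windows of
+// the source (starting at offsets n, n + 1, n + 2, n + 3), so one dot-product
+// instruction yields four neighbouring partial convolution sums at once.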
+
+static INLINE int16x4_t convolve12_4_x(uint8x16_t samples,
+ const int8x16_t filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[3];
+ int32x4_t sum;
+
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ // First 4 output values.
+ sum = vdotq_laneq_s32(correction, permuted_samples[0], filter, 0);
+ sum = vdotq_laneq_s32(sum, permuted_samples[1], filter, 1);
+ sum = vdotq_laneq_s32(sum, permuted_samples[2], filter, 2);
+
+ return vqrshrn_n_s32(sum, FILTER_BITS);
+}
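+
+// The clamp-and-correct scheme used above relies on the identity
+//   sum_i f[i] * s[i] == sum_i f[i] * (s[i] - 128) + 128 * sum_i f[i]
+// where s[i] - 128 fits the signed 8-bit dot-product operand range and
+// 128 * sum_i f[i] is folded into 'correction' ahead of time. Illustrative
+// scalar model:
+//
+//   int32_t acc = correction;  // 128 * sum(f) plus the rounding shim.
+//   for (int i = 0; i < 12; i++) acc += f[i] * (s[i] - 128);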
+
+static INLINE uint8x8_t convolve12_8_x(uint8x16_t samples[2],
+ const int8x16_t filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
+ int8x16_t clamped_samples[2], permuted_samples[4];
+ int32x4_t sum[2];
+
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ clamped_samples[0] = vreinterpretq_s8_u8(vsubq_u8(samples[0], range_limit));
+ clamped_samples[1] = vreinterpretq_s8_u8(vsubq_u8(samples[1], range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[2]);
+ // { 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 }
+ permuted_samples[3] = vqtbl1q_s8(clamped_samples[1], permute_tbl.val[2]);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ // First 4 output values.
+ sum[0] = vdotq_laneq_s32(correction, permuted_samples[0], filter, 0);
+ sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[1], filter, 1);
+ sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[2], filter, 2);
+ // Second 4 output values.
+ sum[1] = vdotq_laneq_s32(correction, permuted_samples[1], filter, 0);
+ sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[2], filter, 1);
+ sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[3], filter, 2);
+
+ // Narrow and re-pack.
+ int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum[0], FILTER_BITS),
+ vqrshrn_n_s32(sum[1], FILTER_BITS));
+ return vqmovun_s16(sum_s16);
+}
+
+static INLINE void convolve_x_sr_12tap_neon_dotprod(
+ const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+ int h, const int16_t *x_filter_ptr) {
+ const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr);
+ const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8);
+ const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0));
+ const int8x16_t filter =
+ vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15));
+
+ const int32_t correction_s32 =
+ vaddvq_s32(vaddq_s32(vpaddlq_s16(vshlq_n_s16(filter_0_7, FILTER_BITS)),
+ vpaddlq_s16(vshlq_n_s16(filter_8_15, FILTER_BITS))));
+ // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding right
+ // shift by FILTER_BITS - instead of a first rounding right shift by
+ // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
+ // ROUND0_BITS.
+ int32x4_t correction = vdupq_n_s32(correction_s32 + (1 << (ROUND0_BITS - 1)));
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+ // Special case the following no-op filter because the tap value 128 does
+ // not fit in the signed 8-bit operands of the dot-product instruction:
+ // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
+ if (vgetq_lane_s16(filter_0_7, 5) == 128) {
+ // Undo the horizontal offset in the calling function.
+ src += 5;
+
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
+ do {
+ uint8x8_t d0 = vld1_u8(s);
+ if (w == 4) {
+ store_u8_4x1(d, d0, 0);
+ } else {
+ vst1_u8(d, d0);
+ }
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += src_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ if (w <= 4) {
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x4_t d0 =
+ convolve12_4_x(s0, filter, correction, range_limit, permute_tbl);
+ int16x4_t d1 =
+ convolve12_4_x(s1, filter, correction, range_limit, permute_tbl);
+ int16x4_t d2 =
+ convolve12_4_x(s2, filter, correction, range_limit, permute_tbl);
+ int16x4_t d3 =
+ convolve12_4_x(s3, filter, correction, range_limit, permute_tbl);
+
+ uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1));
+ uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3));
+
+ store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+ store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+ store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+ store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+ dst += 4 * dst_stride;
+ src += 4 * src_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
+ do {
+ uint8x16_t s0[2], s1[2], s2[2], s3[2];
+ load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+ load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
+
+ uint8x8_t d0 =
+ convolve12_8_x(s0, filter, correction, range_limit, permute_tbl);
+ uint8x8_t d1 =
+ convolve12_8_x(s1, filter, correction, range_limit, permute_tbl);
+ uint8x8_t d2 =
+ convolve12_8_x(s2, filter, correction, range_limit, permute_tbl);
+ uint8x8_t d3 =
+ convolve12_8_x(s3, filter, correction, range_limit, permute_tbl);
+
+ store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+ }
+}
+
+static INLINE int16x4_t convolve4_4_x(uint8x16_t samples, const int8x8_t filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16_t permute_tbl) {
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ int8x16_t clamped_samples =
+ vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filter, 0);
+
+ // Packing is performed by the caller.
+ return vmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[3];
+ int32x4_t sum[2];
+
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ // First 4 output values.
+ sum[0] = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
+ sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], filter, 1);
+ // Second 4 output values.
+ sum[1] = vdotq_lane_s32(correction, permuted_samples[1], filter, 0);
+ sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], filter, 1);
+
+ // Narrow and re-pack.
+ int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1]));
+ // We halved the convolution filter values so subtract 1 from the right shift.
+ return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1);
+}
+
+void av1_convolve_x_sr_neon_dotprod(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ if (w == 2 || h == 2) {
+ av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ subpel_x_qn, conv_params);
+ return;
+ }
+
+ const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
+ src -= horiz_offset;
+
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+ if (filter_params_x->taps > 8) {
+ convolve_x_sr_12tap_neon_dotprod(src, src_stride, dst, dst_stride, w, h,
+ x_filter_ptr);
+ return;
+ }
+
+ const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
+ // Dot product constants.
+ const int32_t correction_s32 =
+ vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
+ // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
+ // rounding right shift by FILTER_BITS - instead of a first rounding right
+ // shift by ROUND0_BITS, followed by a second rounding right shift by
+ // FILTER_BITS - ROUND0_BITS.
+ // The outermost -1 is needed because we will halve the filter values.
+ const int32x4_t correction =
+ vdupq_n_s32(correction_s32 + (1 << ((ROUND0_BITS - 1) - 1)));
+ const uint8x16_t range_limit = vdupq_n_u8(128);
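+ // As a worked example, with the usual AV1 values FILTER_BITS == 7 and
+ // ROUND0_BITS == 3, 'correction' carries a shim of 1 << ((3 - 1) - 1) == 2
+ // and the final narrowing is a single rounding right shift by
+ // FILTER_BITS - 1 == 6 applied to the half-scale accumulator.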
+
+ if (w <= 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter =
+ vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+ src += 2;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x4_t d0 =
+ convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl);
+ int16x4_t d1 =
+ convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl);
+ int16x4_t d2 =
+ convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl);
+ int16x4_t d3 =
+ convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl);
+
+ // We halved the convolution filter values so subtract 1 from the right shift.
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+ store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+ store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+ store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+ store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
+
+ do {
+ int width = w;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ uint8x8_t d0 =
+ convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl);
+ uint8x8_t d1 =
+ convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl);
+ uint8x8_t d2 =
+ convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl);
+ uint8x8_t d3 =
+ convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+}
+
+static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples,
+ const int8x16_t filters,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[3];
+ int32x4_t sum;
+
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ // First 4 output values.
+ sum = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0);
+ sum = vdotq_laneq_s32(sum, permuted_samples[1], filters, 1);
+ sum = vdotq_laneq_s32(sum, permuted_samples[2], filters, 2);
+
+ // Narrow and re-pack.
+ return vshrn_n_s32(sum, ROUND0_BITS);
+}
+
+static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2],
+ const int8x16_t filters,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
+ int8x16_t clamped_samples[2], permuted_samples[4];
+ int32x4_t sum[2];
+
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ clamped_samples[0] = vreinterpretq_s8_u8(vsubq_u8(samples[0], range_limit));
+ clamped_samples[1] = vreinterpretq_s8_u8(vsubq_u8(samples[1], range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[2]);
+ // { 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 }
+ permuted_samples[3] = vqtbl1q_s8(clamped_samples[1], permute_tbl.val[2]);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ // First 4 output values.
+ sum[0] = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0);
+ sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1);
+ sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2);
+ // Second 4 output values.
+ sum[1] = vdotq_laneq_s32(correction, permuted_samples[1], filters, 0);
+ sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1);
+ sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2);
+
+ // Narrow and re-pack.
+ return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS),
+ vshrn_n_s32(sum[1], ROUND0_BITS));
+}
+
+static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
+ const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
+ const int16x4_t x_filter_8_11) {
+ const int bd = 8;
+
+ // Special case the following no-op filter as 128 won't fit into the 8-bit
+ // signed dot-product instruction:
+ // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
+ if (vgetq_lane_s16(x_filter_0_7, 5) == 128) {
+ const uint16x8_t horiz_const = vdupq_n_u16((1 << (bd - 1)));
+ // Undo the horizontal offset in the calling function.
+ src_ptr += 5;
+
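+ // For this identity filter each output is simply the input plus the
+ // standard intermediate offset, scaled to match the other paths:
+ // d = (s + (1 << (bd - 1))) << (FILTER_BITS - ROUND0_BITS).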
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x8_t s0 = vld1_u8(s);
+ uint16x8_t d0 = vaddw_u8(horiz_const, s0);
+ d0 = vshlq_n_u16(d0, FILTER_BITS - ROUND0_BITS);
+ // Store 8 elements to avoid additional branches. This is safe even when
+ // the actual block width is < 8 because the intermediate buffer is large
+ // enough to accommodate 128x128 blocks.
+ vst1q_s16(d, vreinterpretq_s16_u16(d0));
+
+ d += 8;
+ s += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+
+ } else {
+ // Narrow filter values to 8-bit.
+ const int16x8x2_t x_filter_s16 = {
+ { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
+ };
+ const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
+ vmovn_s16(x_filter_s16.val[1]));
+
+ // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts
+ // - which are generally faster than rounding shifts on modern CPUs.
+ const int32_t horiz_const =
+ ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
+ // Dot product constants.
+ const int32x4_t correct_tmp =
+ vaddq_s32(vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[0], 7)),
+ vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[1], 7)));
+ const int32x4_t correction =
+ vdupq_n_s32(vaddvq_s32(correct_tmp) + horiz_const);
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+ if (w <= 4) {
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, range_limit,
+ permute_tbl);
+ int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, correction, range_limit,
+ permute_tbl);
+ int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, correction, range_limit,
+ permute_tbl);
+ int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, correction, range_limit,
+ permute_tbl);
+
+ store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+ } while (h > 4);
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, range_limit,
+ permute_tbl);
+ vst1_s16(dst_ptr, d0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+
+ } else {
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0[2], s1[2], s2[2], s3[2];
+ load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+ load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
+
+ int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction,
+ range_limit, permute_tbl);
+ int16x8_t d1 = convolve12_8_2d_h(s1, x_filter, correction,
+ range_limit, permute_tbl);
+ int16x8_t d2 = convolve12_8_2d_h(s2, x_filter, correction,
+ range_limit, permute_tbl);
+ int16x8_t d3 = convolve12_8_2d_h(s3, x_filter, correction,
+ range_limit, permute_tbl);
+
+ store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+ } while (h > 4);
+
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0[2];
+ s0[0] = vld1q_u8(s);
+ s0[1] = vld1q_u8(s + 4);
+ int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction,
+ range_limit, permute_tbl);
+ vst1q_s16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+ }
+ }
+}
+
+static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples,
+ const int8x8_t filters,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16_t permute_tbl) {
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ int8x16_t clamped_samples =
+ vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filters, 0);
+
+ // We halved the convolution filter values so subtract 1 from the right shift.
+ return vshrn_n_s32(sum, ROUND0_BITS - 1);
+}
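+
+// vshrn_n_s32 is a truncating shift; round-to-nearest behaviour comes from
+// the shim the caller folds into 'correction', via the identity
+//   (x + (1 << (n - 1))) >> n == x / 2^n rounded to nearest,
+// with n == ROUND0_BITS - 1 here.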
+
+static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
+ const int8x8_t filters,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[3];
+ int32x4_t sum[2];
+
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ // First 4 output values.
+ sum[0] = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
+ sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], filters, 1);
+ // Second 4 output values.
+ sum[1] = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
+ sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], filters, 1);
+
+ // Narrow and re-pack.
+ // We halved the convolution filter values so subtract 1 from the right shift.
+ return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
+ vshrn_n_s32(sum[1], ROUND0_BITS - 1));
+}
+
+static INLINE void convolve_2d_sr_horiz_neon_dotprod(
+ const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
+ int im_h, const int16_t *x_filter_ptr) {
+ const int bd = 8;
+ // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // The outermost -1 is needed because we halved the filter values.
+ const int32_t horiz_const =
+ ((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1)));
+ // Dot product constants.
+ const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
+ const int32_t correction_s32 =
+ vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
+ const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const);
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+
+ const uint8_t *src_ptr = src;
+ int16_t *dst_ptr = im_block;
+ int dst_stride = im_stride;
+ int height = im_h;
+
+ if (w <= 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter =
+ vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+ src_ptr += 2;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x4_t d0 =
+ convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl);
+ int16x4_t d1 =
+ convolve4_4_2d_h(s1, x_filter, correction, range_limit, permute_tbl);
+ int16x4_t d2 =
+ convolve4_4_2d_h(s2, x_filter, correction, range_limit, permute_tbl);
+ int16x4_t d3 =
+ convolve4_4_2d_h(s3, x_filter, correction, range_limit, permute_tbl);
+
+ store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ int16x4_t d0 =
+ convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl);
+ vst1_s16(dst_ptr, d0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
+
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit,
+ permute_tbl);
+ int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, correction, range_limit,
+ permute_tbl);
+ int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, correction, range_limit,
+ permute_tbl);
+ int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, correction, range_limit,
+ permute_tbl);
+
+ store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0 = vld1q_u8(s);
+ int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit,
+ permute_tbl);
+ vst1q_s16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ }
+}
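+
+// Overall flow of the 2D convolution below: the horizontal pass filters into
+// the 16-bit im_block intermediate buffer, keeping FILTER_BITS - ROUND0_BITS
+// bits of extra precision, and the vertical pass then filters the columns
+// and narrows the result back to 8-bit pixels.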
+
+void av1_convolve_2d_sr_neon_dotprod(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn,
+ const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ if (w == 2 || h == 2) {
+ av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, subpel_x_qn,
+ subpel_y_qn, conv_params);
+ return;
+ }
+
+ const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+ const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
+ const int im_h = h + clamped_y_taps - 1;
+ const int im_stride = MAX_SB_SIZE;
+ const int vert_offset = clamped_y_taps / 2 - 1;
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+ if (filter_params_x->taps > 8) {
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+
+ const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
+ const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
+ const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
+ const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
+
+ convolve_2d_sr_horiz_12tap_neon_dotprod(src_ptr, src_stride, im_block,
+ im_stride, w, im_h, x_filter_0_7,
+ x_filter_8_11);
+
+ convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter_0_7, y_filter_8_11);
+ } else {
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+
+ convolve_2d_sr_horiz_neon_dotprod(src_ptr, src_stride, im_block, im_stride,
+ w, im_h, x_filter_ptr);
+
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+
+ if (clamped_y_taps <= 6) {
+ convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter);
+ } else {
+ convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter);
+ }
+ }
+}
diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c
new file mode 100644
index 000000000..14140cafd
--- /dev/null
+++ b/av1/common/arm/convolve_neon_i8mm.c
@@ -0,0 +1,706 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
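+
+// Each 16-byte row of the table above gathers four overlapping 4-sample
+// windows starting at consecutive offsets, matching the four 32-bit
+// accumulator lanes produced by one dot-product instruction.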
+
+static INLINE int16x4_t convolve12_4_x(uint8x16_t samples,
+ const int8x16_t filter,
+ const uint8x16x3_t permute_tbl,
+ const int32x4_t horiz_const) {
+ uint8x16_t permuted_samples[3];
+ int32x4_t sum;
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+ // First 4 output values.
+ sum = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filter, 0);
+ sum = vusdotq_laneq_s32(sum, permuted_samples[1], filter, 1);
+ sum = vusdotq_laneq_s32(sum, permuted_samples[2], filter, 2);
+
+ return vqrshrn_n_s32(sum, FILTER_BITS);
+}
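+
+// Unlike the Neon dotprod kernels, the i8mm vusdot instructions accept the
+// unsigned samples directly, so no range clamp (and no correction constant)
+// is required; only the rounding shim travels in 'horiz_const'.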
+
+static INLINE uint8x8_t convolve12_8_x(uint8x16_t samples[2],
+ const int8x16_t filter,
+ const uint8x16x3_t permute_tbl,
+ const int32x4_t horiz_const) {
+ uint8x16_t permuted_samples[4];
+ int32x4_t sum[2];
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_u8(samples[0], permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_u8(samples[0], permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_u8(samples[0], permute_tbl.val[2]);
+ // { 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 }
+ permuted_samples[3] = vqtbl1q_u8(samples[1], permute_tbl.val[2]);
+
+ // First 4 output values.
+ sum[0] = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filter, 0);
+ sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[1], filter, 1);
+ sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[2], filter, 2);
+ // Second 4 output values.
+ sum[1] = vusdotq_laneq_s32(horiz_const, permuted_samples[1], filter, 0);
+ sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[2], filter, 1);
+ sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[3], filter, 2);
+
+ // Narrow and re-pack.
+ int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum[0], FILTER_BITS),
+ vqrshrn_n_s32(sum[1], FILTER_BITS));
+ return vqmovun_s16(sum_s16);
+}
+
+static INLINE void convolve_x_sr_12tap_neon_i8mm(const uint8_t *src,
+ int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const int16_t *x_filter_ptr) {
+ const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr);
+ const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8);
+ const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0));
+ const int8x16_t filter =
+ vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15));
+
+ // Special case the following no-op filter as 128 won't fit into the
+ // 8-bit signed dot-product instruction:
+ // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
+ if (vgetq_lane_s16(filter_0_7, 5) == 128) {
+ // Undo the horizontal offset in the calling function.
+ src += 5;
+
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
+ do {
+ uint8x8_t d0 = vld1_u8(s);
+ if (w == 4) {
+ store_u8_4x1(d, d0, 0);
+ } else {
+ vst1_u8(d, d0);
+ }
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += src_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
+ // right shift by FILTER_BITS - instead of a first rounding right shift by
+ // ROUND0_BITS, followed by a second rounding right shift by
+ // FILTER_BITS - ROUND0_BITS.
+ const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1));
+
+ if (w <= 4) {
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl, horiz_const);
+ int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl, horiz_const);
+ int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl, horiz_const);
+ int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl, horiz_const);
+
+ uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1));
+ uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3));
+
+ store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+ store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+ store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+ store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+ dst += 4 * dst_stride;
+ src += 4 * src_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
+ do {
+ uint8x16_t s0[2], s1[2], s2[2], s3[2];
+ load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+ load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
+
+ uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl, horiz_const);
+ uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl, horiz_const);
+ uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl, horiz_const);
+ uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl, horiz_const);
+
+ store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+ }
+}
+
+static INLINE int16x4_t convolve4_4_x(uint8x16_t samples, const int8x8_t filter,
+ const uint8x16_t permute_tbl,
+ const int32x4_t horiz_const) {
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);
+
+ // First 4 output values.
+ int32x4_t sum = vusdotq_lane_s32(horiz_const, permuted_samples, filter, 0);
+
+ // Packing is performed by the caller.
+ return vmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter,
+ const uint8x16x3_t permute_tbl,
+ const int32x4_t horiz_const) {
+ uint8x16_t permuted_samples[3];
+ int32x4_t sum[2];
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+ // First 4 output values.
+ sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], filter, 0);
+ sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], filter, 1);
+ // Second 4 output values.
+ sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], filter, 0);
+ sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], filter, 1);
+
+ int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1]));
+ // We halved the convolution filter values so subtract 1 from the right shift.
+ return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1);
+}
+
+void av1_convolve_x_sr_neon_i8mm(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ if (w == 2 || h == 2) {
+ av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ subpel_x_qn, conv_params);
+ return;
+ }
+
+ const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
+ src -= horiz_offset;
+
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+ if (filter_params_x->taps > 8) {
+ convolve_x_sr_12tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h,
+ x_filter_ptr);
+ return;
+ }
+
+ // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
+ // rounding right shift by FILTER_BITS - instead of a first rounding right
+ // shift by ROUND0_BITS, followed by a second rounding right shift by
+ // FILTER_BITS - ROUND0_BITS.
+ // The outermost -1 is needed because we will halve the filter values.
+ const int32x4_t horiz_const = vdupq_n_s32(1 << ((ROUND0_BITS - 1) - 1));
+
+ if (w <= 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter =
+ vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+ src += 2;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x4_t d0 = convolve4_4_x(s0, x_filter, permute_tbl, horiz_const);
+ int16x4_t d1 = convolve4_4_x(s1, x_filter, permute_tbl, horiz_const);
+ int16x4_t d2 = convolve4_4_x(s2, x_filter, permute_tbl, horiz_const);
+ int16x4_t d3 = convolve4_4_x(s3, x_filter, permute_tbl, horiz_const);
+
+ // We halved the convolution filter values so subtract 1 from the right shift.
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+ store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+ store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+ store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+ store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl, horiz_const);
+ uint8x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl, horiz_const);
+ uint8x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl, horiz_const);
+ uint8x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl, horiz_const);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+}
+
+static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples,
+ const int8x16_t filters,
+ const uint8x16x3_t permute_tbl,
+ int32x4_t horiz_const) {
+ uint8x16_t permuted_samples[3];
+ int32x4_t sum;
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+ // First 4 output values.
+ sum = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0);
+ sum = vusdotq_laneq_s32(sum, permuted_samples[1], filters, 1);
+ sum = vusdotq_laneq_s32(sum, permuted_samples[2], filters, 2);
+
+ // Narrow and re-pack.
+ return vshrn_n_s32(sum, ROUND0_BITS);
+}
+
+static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2],
+ const int8x16_t filters,
+ const uint8x16x3_t permute_tbl,
+ const int32x4_t horiz_const) {
+ uint8x16_t permuted_samples[4];
+ int32x4_t sum[2];
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_u8(samples[0], permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_u8(samples[0], permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_u8(samples[0], permute_tbl.val[2]);
+ // { 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 }
+ permuted_samples[3] = vqtbl1q_u8(samples[1], permute_tbl.val[2]);
+
+ // First 4 output values.
+ sum[0] = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0);
+ sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1);
+ sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2);
+ // Second 4 output values.
+ sum[1] = vusdotq_laneq_s32(horiz_const, permuted_samples[1], filters, 0);
+ sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1);
+ sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2);
+
+ // Narrow and re-pack.
+ return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS),
+ vshrn_n_s32(sum[1], ROUND0_BITS));
+}
+
+static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm(
+ const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
+ const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
+ const int16x4_t x_filter_8_11) {
+ const int bd = 8;
+
+ // Special case the following no-op filter as 128 won't fit into the
+ // 8-bit signed dot-product instruction:
+ // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
+ if (vgetq_lane_s16(x_filter_0_7, 5) == 128) {
+ const uint16x8_t horiz_const = vdupq_n_u16((1 << (bd - 1)));
+ // Undo the horizontal offset in the calling function.
+ src_ptr += 5;
+
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x8_t s0 = vld1_u8(s);
+ uint16x8_t d0 = vaddw_u8(horiz_const, s0);
+ d0 = vshlq_n_u16(d0, FILTER_BITS - ROUND0_BITS);
+ // Store 8 elements to avoid additional branches. This is safe even when
+ // the actual block width is < 8 because the intermediate buffer is large
+ // enough to accommodate 128x128 blocks.
+ vst1q_s16(d, vreinterpretq_s16_u16(d0));
+
+ d += 8;
+ s += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+
+ } else {
+ // Narrow filter values to 8-bit.
+ const int16x8x2_t x_filter_s16 = {
+ { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
+ };
+ const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
+ vmovn_s16(x_filter_s16.val[1]));
+ // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts
+ // - which are generally faster than rounding shifts on modern CPUs.
+ const int32x4_t horiz_const =
+ vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+ if (w <= 4) {
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x4_t d0 =
+ convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
+ int16x4_t d1 =
+ convolve12_4_2d_h(s1, x_filter, permute_tbl, horiz_const);
+ int16x4_t d2 =
+ convolve12_4_2d_h(s2, x_filter, permute_tbl, horiz_const);
+ int16x4_t d3 =
+ convolve12_4_2d_h(s3, x_filter, permute_tbl, horiz_const);
+
+ store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+ } while (h > 4);
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ int16x4_t d0 =
+ convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
+ vst1_s16(dst_ptr, d0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+
+ } else {
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0[2], s1[2], s2[2], s3[2];
+ load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+ load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
+
+ int16x8_t d0 =
+ convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
+ int16x8_t d1 =
+ convolve12_8_2d_h(s1, x_filter, permute_tbl, horiz_const);
+ int16x8_t d2 =
+ convolve12_8_2d_h(s2, x_filter, permute_tbl, horiz_const);
+ int16x8_t d3 =
+ convolve12_8_2d_h(s3, x_filter, permute_tbl, horiz_const);
+
+ store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+ } while (h > 4);
+
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0[2];
+ s0[0] = vld1q_u8(s);
+ s0[1] = vld1q_u8(s + 4);
+ int16x8_t d0 =
+ convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
+ vst1q_s16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+ }
+ }
+}
+
+static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16_t permute_tbl,
+ const int32x4_t horiz_const) {
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);
+
+ // First 4 output values.
+ int32x4_t sum = vusdotq_lane_s32(horiz_const, permuted_samples, filters, 0);
+
+ // We halved the convolution filter values so subtract 1 from the right shift.
+ return vshrn_n_s32(sum, ROUND0_BITS - 1);
+}
+
+static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x3_t permute_tbl,
+ const int32x4_t horiz_const) {
+ uint8x16_t permuted_samples[3];
+ int32x4_t sum[2];
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+ // First 4 output values.
+ sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], filters, 0);
+ sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], filters, 1);
+ // Second 4 output values.
+ sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], filters, 0);
+ sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], filters, 1);
+
+ // Narrow and re-pack.
+ // We halved the convolution filter values so subtract 1 from the right shift.
+ return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
+ vshrn_n_s32(sum[1], ROUND0_BITS - 1));
+}
+
+static INLINE void convolve_2d_sr_horiz_neon_i8mm(
+ const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
+ int im_h, const int16_t *x_filter_ptr) {
+ const int bd = 8;
+ // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // The outermost -1 is needed because we halved the filter values.
+ const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) +
+ (1 << ((ROUND0_BITS - 1) - 1)));
+
+ const uint8_t *src_ptr = src;
+ int16_t *dst_ptr = im_block;
+ int dst_stride = im_stride;
+ int height = im_h;
+
+ if (w <= 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter =
+ vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+ src_ptr += 2;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
+ int16x4_t d1 = convolve4_4_2d_h(s1, x_filter, permute_tbl, horiz_const);
+ int16x4_t d2 = convolve4_4_2d_h(s2, x_filter, permute_tbl, horiz_const);
+ int16x4_t d3 = convolve4_4_2d_h(s3, x_filter, permute_tbl, horiz_const);
+
+ store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
+ vst1_s16(dst_ptr, d0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
+ int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const);
+ int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const);
+ int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const);
+
+ store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0 = vld1q_u8(s);
+ int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
+ vst1q_s16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ }
+}
+
+void av1_convolve_2d_sr_neon_i8mm(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ if (w == 2 || h == 2) {
+ av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, subpel_x_qn,
+ subpel_y_qn, conv_params);
+ return;
+ }
+
+ const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+ const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
+ const int im_h = h + clamped_y_taps - 1;
+ const int im_stride = MAX_SB_SIZE;
+ const int vert_offset = clamped_y_taps / 2 - 1;
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+ if (filter_params_x->taps > 8) {
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+
+ const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
+ const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
+ const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
+ const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
+
+ convolve_2d_sr_horiz_12tap_neon_i8mm(src_ptr, src_stride, im_block,
+ im_stride, w, im_h, x_filter_0_7,
+ x_filter_8_11);
+
+ convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter_0_7, y_filter_8_11);
+ } else {
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+
+ convolve_2d_sr_horiz_neon_i8mm(src_ptr, src_stride, im_block, im_stride, w,
+ im_h, x_filter_ptr);
+
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+
+ if (clamped_y_taps <= 6) {
+ convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter);
+ } else {
+ convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter);
+ }
+ }
+}
diff --git a/av1/common/arm/highbd_compound_convolve_neon.c b/av1/common/arm/highbd_compound_convolve_neon.c
new file mode 100644
index 000000000..dc3f8767e
--- /dev/null
+++ b/av1/common/arm/highbd_compound_convolve_neon.c
@@ -0,0 +1,2031 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/arm/highbd_convolve_neon.h"
+
+#define ROUND_SHIFT (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS)
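+// Assuming the usual AV1 constants (FILTER_BITS == 7, ROUND0_BITS == 3,
+// COMPOUND_ROUND1_BITS == 7), ROUND_SHIFT evaluates to 2 * 7 - 3 - 7 == 4.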
+
+static INLINE void highbd_12_comp_avg_neon(const uint16_t *src_ptr,
+ int src_stride, uint16_t *dst_ptr,
+ int dst_stride, int w, int h,
+ ConvolveParams *conv_params,
+ const int offset, const int bd) {
+ CONV_BUF_TYPE *ref_ptr = conv_params->dst;
+ const int ref_stride = conv_params->dst_stride;
+ const uint16x4_t offset_vec = vdup_n_u16(offset);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ if (w == 4) {
+ do {
+ const uint16x4_t src = vld1_u16(src_ptr);
+ const uint16x4_t ref = vld1_u16(ref_ptr);
+
+ uint16x4_t avg = vhadd_u16(src, ref);
+ int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec));
+
+ uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2);
+ d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
+
+ vst1_u16(dst_ptr, d0_u16);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ int width = w;
+ const uint16_t *src = src_ptr;
+ const uint16_t *ref = ref_ptr;
+ uint16_t *dst = dst_ptr;
+ do {
+ const uint16x8_t s = vld1q_u16(src);
+ const uint16x8_t r = vld1q_u16(ref);
+
+ uint16x8_t avg = vhaddq_u16(s, r);
+ int32x4_t d0_lo =
+ vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec));
+ int32x4_t d0_hi =
+ vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec));
+
+ uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT - 2),
+ vqrshrun_n_s32(d0_hi, ROUND_SHIFT - 2));
+ d0 = vminq_u16(d0, max);
+ vst1q_u16(dst, d0);
+
+ src += 8;
+ ref += 8;
+ dst += 8;
+ width -= 8;
+ } while (width != 0);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+ }
+}
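+
+// In scalar terms, each output produced above is
+//   clip(round_shift(((src + ref) >> 1) - offset, ROUND_SHIFT - 2),
+//        0, (1 << bd) - 1)
+// where vhadd_u16 supplies the truncating (src + ref) >> 1 average.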
+
+static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride,
+ uint16_t *dst_ptr, int dst_stride,
+ int w, int h,
+ ConvolveParams *conv_params,
+ const int offset, const int bd) {
+ CONV_BUF_TYPE *ref_ptr = conv_params->dst;
+ const int ref_stride = conv_params->dst_stride;
+ const uint16x4_t offset_vec = vdup_n_u16(offset);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ if (w == 4) {
+ do {
+ const uint16x4_t src = vld1_u16(src_ptr);
+ const uint16x4_t ref = vld1_u16(ref_ptr);
+
+ uint16x4_t avg = vhadd_u16(src, ref);
+ int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec));
+
+ uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT);
+ d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
+
+ vst1_u16(dst_ptr, d0_u16);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ int width = w;
+ const uint16_t *src = src_ptr;
+ const uint16_t *ref = ref_ptr;
+ uint16_t *dst = dst_ptr;
+ do {
+ const uint16x8_t s = vld1q_u16(src);
+ const uint16x8_t r = vld1q_u16(ref);
+
+ uint16x8_t avg = vhaddq_u16(s, r);
+ int32x4_t d0_lo =
+ vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec));
+ int32x4_t d0_hi =
+ vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec));
+
+ uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT),
+ vqrshrun_n_s32(d0_hi, ROUND_SHIFT));
+ d0 = vminq_u16(d0, max);
+ vst1q_u16(dst, d0);
+
+ src += 8;
+ ref += 8;
+ dst += 8;
+ width -= 8;
+ } while (width != 0);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+ }
+}
+
+static INLINE void highbd_12_dist_wtd_comp_avg_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, ConvolveParams *conv_params, const int offset, const int bd) {
+ CONV_BUF_TYPE *ref_ptr = conv_params->dst;
+ const int ref_stride = conv_params->dst_stride;
+ const uint32x4_t offset_vec = vdupq_n_u32(offset);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+ const uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset);
+ const uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset);
+
+ // Weighted averaging
+ if (w == 4) {
+ do {
+ const uint16x4_t src = vld1_u16(src_ptr);
+ const uint16x4_t ref = vld1_u16(ref_ptr);
+
+ uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset);
+ wtd_avg = vmlal_u16(wtd_avg, src, bck_offset);
+ wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS);
+ int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec));
+
+ uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2);
+ d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
+
+ vst1_u16(dst_ptr, d0_u16);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ int width = w;
+ const uint16_t *src = src_ptr;
+ const uint16_t *ref = ref_ptr;
+ uint16_t *dst = dst_ptr;
+ do {
+ const uint16x8_t s = vld1q_u16(src);
+ const uint16x8_t r = vld1q_u16(ref);
+
+ uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset);
+ wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset);
+ wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS);
+ int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec));
+
+ uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset);
+ wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset);
+ wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS);
+ int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec));
+
+ uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT - 2),
+ vqrshrun_n_s32(d1, ROUND_SHIFT - 2));
+ d01 = vminq_u16(d01, max);
+ vst1q_u16(dst, d01);
+
+ src += 8;
+ ref += 8;
+ dst += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+ }
+}
+
+static INLINE void highbd_dist_wtd_comp_avg_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, ConvolveParams *conv_params, const int offset, const int bd) {
+ CONV_BUF_TYPE *ref_ptr = conv_params->dst;
+ const int ref_stride = conv_params->dst_stride;
+ const uint32x4_t offset_vec = vdupq_n_u32(offset);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+ const uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset);
+ const uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset);
+
+ // Weighted averaging
+ if (w == 4) {
+ do {
+ const uint16x4_t src = vld1_u16(src_ptr);
+ const uint16x4_t ref = vld1_u16(ref_ptr);
+
+ uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset);
+ wtd_avg = vmlal_u16(wtd_avg, src, bck_offset);
+ wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS);
+ int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec));
+
+ uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT);
+ d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
+
+ vst1_u16(dst_ptr, d0_u16);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ int width = w;
+ const uint16_t *src = src_ptr;
+ const uint16_t *ref = ref_ptr;
+ uint16_t *dst = dst_ptr;
+ do {
+ const uint16x8_t s = vld1q_u16(src);
+ const uint16x8_t r = vld1q_u16(ref);
+
+ uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset);
+ wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset);
+ wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS);
+ int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec));
+
+ uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset);
+ wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset);
+ wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS);
+ int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec));
+
+ uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT),
+ vqrshrun_n_s32(d1, ROUND_SHIFT));
+ d01 = vminq_u16(d01, max);
+ vst1q_u16(dst, d01);
+
+ src += 8;
+ ref += 8;
+ dst += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+ }
+}
+
+static INLINE uint16x4_t highbd_12_convolve6_4(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x8_t filter, const int32x4_t offset) {
+ // Values at indices 0 and 7 of y_filter are zero.
+ const int16x4_t filter_0_3 = vget_low_s16(filter);
+ const int16x4_t filter_4_7 = vget_high_s16(filter);
+
+ int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s1, filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s2, filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s3, filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s4, filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s5, filter_4_7, 2);
+
+ return vqshrun_n_s32(sum, ROUND0_BITS + 2);
+}
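+
+// The "+ 2" on these shifts relative to the non-12-bit helpers follows the
+// AV1 convention (assumed here) that 12-bit content raises round_0 by two
+// bits, with round_1 reduced to match, so intermediate values still fit in
+// 16 bits; hence the separate highbd_12_ variants of these helpers.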
+
+static INLINE uint16x4_t
+highbd_convolve6_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x8_t filter, const int32x4_t offset) {
+ // Values at indices 0 and 7 of y_filter are zero.
+ const int16x4_t filter_0_3 = vget_low_s16(filter);
+ const int16x4_t filter_4_7 = vget_high_s16(filter);
+
+ int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s1, filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s2, filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s3, filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s4, filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s5, filter_4_7, 2);
+
+ return vqshrun_n_s32(sum, ROUND0_BITS);
+}
+
+static INLINE uint16x8_t highbd_12_convolve6_8(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t filter, const int32x4_t offset) {
+ // Values at indices 0 and 7 of y_filter are zero.
+ const int16x4_t filter_0_3 = vget_low_s16(filter);
+ const int16x4_t filter_4_7 = vget_high_s16(filter);
+
+ int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2);
+
+ int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2);
+
+ return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2),
+ vqshrun_n_s32(sum1, ROUND0_BITS + 2));
+}
+
+static INLINE uint16x8_t
+highbd_convolve6_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t filter, const int32x4_t offset) {
+ // Values at indices 0 and 7 of y_filter are zero.
+ const int16x4_t filter_0_3 = vget_low_s16(filter);
+ const int16x4_t filter_4_7 = vget_high_s16(filter);
+
+ int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2);
+
+ int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2);
+
+ return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS),
+ vqshrun_n_s32(sum1, ROUND0_BITS));
+}
+
+static INLINE void highbd_12_dist_wtd_convolve_x_6tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *x_filter_ptr, const int offset) {
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+
+ int height = h;
+
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[6], s1[6], s2[6], s3[6];
+ load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5]);
+ load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5]);
+ load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5]);
+ load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5]);
+
+ uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], x_filter, offset_vec);
+ uint16x8_t d1 = highbd_12_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
+ s1[5], x_filter, offset_vec);
+ uint16x8_t d2 = highbd_12_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
+ s2[5], x_filter, offset_vec);
+ uint16x8_t d3 = highbd_12_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
+ s3[5], x_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+}
+
+static INLINE void highbd_dist_wtd_convolve_x_6tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *x_filter_ptr, const int offset) {
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+
+ int height = h;
+
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[6], s1[6], s2[6], s3[6];
+ load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5]);
+ load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5]);
+ load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5]);
+ load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5]);
+
+ uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], x_filter, offset_vec);
+ uint16x8_t d1 = highbd_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
+ s1[5], x_filter, offset_vec);
+ uint16x8_t d2 = highbd_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
+ s2[5], x_filter, offset_vec);
+ uint16x8_t d3 = highbd_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
+ s3[5], x_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+}
+
+static INLINE uint16x4_t highbd_12_convolve8_4(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
+ const int32x4_t offset) {
+ const int16x4_t filter_0_3 = vget_low_s16(filter);
+ const int16x4_t filter_4_7 = vget_high_s16(filter);
+
+ int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 0);
+ sum = vmlal_lane_s16(sum, s1, filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s2, filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s3, filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s4, filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s5, filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s6, filter_4_7, 2);
+ sum = vmlal_lane_s16(sum, s7, filter_4_7, 3);
+
+ return vqshrun_n_s32(sum, ROUND0_BITS + 2);
+}
+
+static INLINE uint16x4_t
+highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16x8_t filter, const int32x4_t offset) {
+ const int16x4_t filter_0_3 = vget_low_s16(filter);
+ const int16x4_t filter_4_7 = vget_high_s16(filter);
+
+ int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 0);
+ sum = vmlal_lane_s16(sum, s1, filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s2, filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s3, filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s4, filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s5, filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s6, filter_4_7, 2);
+ sum = vmlal_lane_s16(sum, s7, filter_4_7, 3);
+
+ return vqshrun_n_s32(sum, ROUND0_BITS);
+}
+
+static INLINE uint16x8_t highbd_12_convolve8_8(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
+ const int32x4_t offset) {
+ const int16x4_t filter_0_3 = vget_low_s16(filter);
+ const int16x4_t filter_4_7 = vget_high_s16(filter);
+
+ int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3);
+
+ int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3);
+
+ return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2),
+ vqshrun_n_s32(sum1, ROUND0_BITS + 2));
+}
+
+static INLINE uint16x8_t
+highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t filter, const int32x4_t offset) {
+ const int16x4_t filter_0_3 = vget_low_s16(filter);
+ const int16x4_t filter_4_7 = vget_high_s16(filter);
+
+ int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3);
+
+ int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3);
+
+ return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS),
+ vqshrun_n_s32(sum1, ROUND0_BITS));
+}
+
+static INLINE uint16x4_t highbd_12_convolve4_4_x(const int16x4_t s[4],
+ const int16x4_t x_filter,
+ const int32x4_t offset) {
+ int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0);
+ sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
+ sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
+ sum = vmlal_lane_s16(sum, s[3], x_filter, 3);
+
+  return vqshrun_n_s32(sum, ROUND0_BITS + 2);
+}
+
+static INLINE uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4],
+ const int16x4_t x_filter,
+ const int32x4_t offset) {
+ int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0);
+ sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
+ sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
+ sum = vmlal_lane_s16(sum, s[3], x_filter, 3);
+
+ return vqshrun_n_s32(sum, ROUND0_BITS);
+}
+
+static INLINE void highbd_12_dist_wtd_convolve_x_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *x_filter_ptr, const int offset) {
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+ if (w == 4) {
+ // 4-tap filters are used for blocks having width == 4.
+ const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
+ const int16_t *s = (const int16_t *)(src_ptr + 2);
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x4_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec);
+ uint16x4_t d1 = highbd_12_convolve4_4_x(s1, x_filter, offset_vec);
+ uint16x4_t d2 = highbd_12_convolve4_4_x(s2, x_filter, offset_vec);
+ uint16x4_t d3 = highbd_12_convolve4_4_x(s3, x_filter, offset_vec);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+ int height = h;
+
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x8_t d0 =
+ highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5],
+ s0[6], s0[7], x_filter, offset_vec);
+ uint16x8_t d1 =
+ highbd_12_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5],
+ s1[6], s1[7], x_filter, offset_vec);
+ uint16x8_t d2 =
+ highbd_12_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5],
+ s2[6], s2[7], x_filter, offset_vec);
+ uint16x8_t d3 =
+ highbd_12_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5],
+ s3[6], s3[7], x_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ }
+}
+
+static INLINE void highbd_dist_wtd_convolve_x_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *x_filter_ptr, const int offset) {
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+ if (w == 4) {
+ // 4-tap filters are used for blocks having width == 4.
+ const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
+ const int16_t *s = (const int16_t *)(src_ptr + 2);
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x4_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec);
+ uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset_vec);
+ uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset_vec);
+ uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset_vec);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+ int height = h;
+
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x8_t d0 =
+ highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6],
+ s0[7], x_filter, offset_vec);
+ uint16x8_t d1 =
+ highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6],
+ s1[7], x_filter, offset_vec);
+ uint16x8_t d2 =
+ highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6],
+ s2[7], x_filter, offset_vec);
+ uint16x8_t d3 =
+ highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6],
+ s3[7], x_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ }
+}
+
+void av1_highbd_dist_wtd_convolve_x_neon(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(16, uint16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
+ int dst16_stride = conv_params->dst_stride;
+ const int im_stride = MAX_SB_SIZE;
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ assert(FILTER_BITS == COMPOUND_ROUND1_BITS);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int offset_avg = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int offset_convolve = (1 << (conv_params->round_0 - 1)) +
+ (1 << (bd + FILTER_BITS)) +
+ (1 << (bd + FILTER_BITS - 1));
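+  // After the kernels shift right by round_0, the two large terms above
+  // reduce to (1 << (offset_bits - round_1)) + (1 << (offset_bits -
+  // round_1 - 1)), i.e. offset_avg, while (1 << (round_0 - 1)) supplies the
+  // rounding for the kernels' truncating shifts.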
+
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
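+  // Centre the 8-tap filter on the output pixel; the 6-tap and 4-tap paths
+  // below compensate with src + 1 and src + 2 respectively.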
+ src -= horiz_offset;
+
+ // horizontal filter
+ if (bd == 12) {
+ if (conv_params->do_average) {
+ if (x_filter_taps <= 6 && w != 4) {
+ highbd_12_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, im_block,
+ im_stride, w, h, x_filter_ptr,
+ offset_convolve);
+ } else {
+ highbd_12_dist_wtd_convolve_x_neon(src, src_stride, im_block, im_stride,
+ w, h, x_filter_ptr, offset_convolve);
+ }
+ if (conv_params->use_dist_wtd_comp_avg) {
+ highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride,
+ w, h, conv_params, offset_avg, bd);
+ } else {
+ highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
+ conv_params, offset_avg, bd);
+ }
+ } else {
+ if (x_filter_taps <= 6 && w != 4) {
+ highbd_12_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, dst16,
+ dst16_stride, w, h,
+ x_filter_ptr, offset_convolve);
+ } else {
+ highbd_12_dist_wtd_convolve_x_neon(src, src_stride, dst16, dst16_stride,
+ w, h, x_filter_ptr, offset_convolve);
+ }
+ }
+ } else {
+ if (conv_params->do_average) {
+ if (x_filter_taps <= 6 && w != 4) {
+ highbd_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, im_block,
+ im_stride, w, h, x_filter_ptr,
+ offset_convolve);
+ } else {
+ highbd_dist_wtd_convolve_x_neon(src, src_stride, im_block, im_stride, w,
+ h, x_filter_ptr, offset_convolve);
+ }
+ if (conv_params->use_dist_wtd_comp_avg) {
+ highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
+ h, conv_params, offset_avg, bd);
+ } else {
+ highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
+ conv_params, offset_avg, bd);
+ }
+ } else {
+ if (x_filter_taps <= 6 && w != 4) {
+ highbd_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, dst16,
+ dst16_stride, w, h, x_filter_ptr,
+ offset_convolve);
+ } else {
+ highbd_dist_wtd_convolve_x_neon(src, src_stride, dst16, dst16_stride, w,
+ h, x_filter_ptr, offset_convolve);
+ }
+ }
+ }
+}
+
+static INLINE void highbd_12_dist_wtd_convolve_y_6tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *y_filter_ptr, const int offset) {
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x4_t s0, s1, s2, s3, s4;
+ load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+ s += 5 * src_stride;
+
+ do {
+ int16x4_t s5, s6, s7, s8;
+ load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x4_t d0 =
+ highbd_12_convolve6_4(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
+ uint16x4_t d1 =
+ highbd_12_convolve6_4(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
+ uint16x4_t d2 =
+ highbd_12_convolve6_4(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
+ uint16x4_t d3 =
+ highbd_12_convolve6_4(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ do {
+ int height = h;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x8_t s0, s1, s2, s3, s4;
+ load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+ s += 5 * src_stride;
+
+ do {
+ int16x8_t s5, s6, s7, s8;
+ load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x8_t d0 =
+ highbd_12_convolve6_8(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
+ uint16x8_t d1 =
+ highbd_12_convolve6_8(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
+ uint16x8_t d2 =
+ highbd_12_convolve6_8(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
+ uint16x8_t d3 =
+ highbd_12_convolve6_8(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+static INLINE void highbd_dist_wtd_convolve_y_6tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *y_filter_ptr, const int offset) {
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x4_t s0, s1, s2, s3, s4;
+ load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+ s += 5 * src_stride;
+
+ do {
+ int16x4_t s5, s6, s7, s8;
+ load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x4_t d0 =
+ highbd_convolve6_4(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
+ uint16x4_t d1 =
+ highbd_convolve6_4(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
+ uint16x4_t d2 =
+ highbd_convolve6_4(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
+ uint16x4_t d3 =
+ highbd_convolve6_4(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ do {
+ int height = h;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x8_t s0, s1, s2, s3, s4;
+ load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+ s += 5 * src_stride;
+
+ do {
+ int16x8_t s5, s6, s7, s8;
+ load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x8_t d0 =
+ highbd_convolve6_8(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
+ uint16x8_t d1 =
+ highbd_convolve6_8(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
+ uint16x8_t d2 =
+ highbd_convolve6_8(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
+ uint16x8_t d3 =
+ highbd_convolve6_8(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+static INLINE void highbd_12_dist_wtd_convolve_y_8tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *y_filter_ptr, const int offset) {
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 = highbd_12_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_vec);
+ uint16x4_t d1 = highbd_12_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, offset_vec);
+ uint16x4_t d2 = highbd_12_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, offset_vec);
+ uint16x4_t d3 = highbd_12_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_vec);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ do {
+ int height = h;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 = highbd_12_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_vec);
+ uint16x8_t d1 = highbd_12_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, offset_vec);
+ uint16x8_t d2 = highbd_12_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, offset_vec);
+ uint16x8_t d3 = highbd_12_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+static INLINE void highbd_dist_wtd_convolve_y_8tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *y_filter_ptr, const int offset) {
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_vec);
+ uint16x4_t d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, offset_vec);
+ uint16x4_t d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, offset_vec);
+ uint16x4_t d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_vec);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ do {
+ int height = h;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_vec);
+ uint16x8_t d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, offset_vec);
+ uint16x8_t d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, offset_vec);
+ uint16x8_t d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+void av1_highbd_dist_wtd_convolve_y_neon(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+ ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(16, uint16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+ int dst16_stride = conv_params->dst_stride;
+ const int im_stride = MAX_SB_SIZE;
+ const int vert_offset = filter_params_y->taps / 2 - 1;
+ assert(FILTER_BITS == COMPOUND_ROUND1_BITS);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset_avg = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_offset_conv = (1 << (conv_params->round_0 - 1)) +
+ (1 << (bd + FILTER_BITS)) +
+ (1 << (bd + FILTER_BITS - 1));
+
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
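+  // Centre the 8-tap filter vertically; the 6-tap path below compensates by
+  // starting one row later (src + src_stride).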
+ src -= vert_offset * src_stride;
+
+ if (bd == 12) {
+ if (conv_params->do_average) {
+ if (y_filter_taps <= 6) {
+ highbd_12_dist_wtd_convolve_y_6tap_neon(
+ src + src_stride, src_stride, im_block, im_stride, w, h,
+ y_filter_ptr, round_offset_conv);
+ } else {
+ highbd_12_dist_wtd_convolve_y_8tap_neon(src, src_stride, im_block,
+ im_stride, w, h, y_filter_ptr,
+ round_offset_conv);
+ }
+ if (conv_params->use_dist_wtd_comp_avg) {
+ highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride,
+ w, h, conv_params, round_offset_avg,
+ bd);
+ } else {
+ highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
+ conv_params, round_offset_avg, bd);
+ }
+ } else {
+ if (y_filter_taps <= 6) {
+ highbd_12_dist_wtd_convolve_y_6tap_neon(
+ src + src_stride, src_stride, dst16, dst16_stride, w, h,
+ y_filter_ptr, round_offset_conv);
+ } else {
+ highbd_12_dist_wtd_convolve_y_8tap_neon(
+ src, src_stride, dst16, dst16_stride, w, h, y_filter_ptr,
+ round_offset_conv);
+ }
+ }
+ } else {
+ if (conv_params->do_average) {
+ if (y_filter_taps <= 6) {
+ highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride,
+ im_block, im_stride, w, h,
+ y_filter_ptr, round_offset_conv);
+ } else {
+ highbd_dist_wtd_convolve_y_8tap_neon(src, src_stride, im_block,
+ im_stride, w, h, y_filter_ptr,
+ round_offset_conv);
+ }
+ if (conv_params->use_dist_wtd_comp_avg) {
+ highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
+ h, conv_params, round_offset_avg, bd);
+ } else {
+ highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
+ conv_params, round_offset_avg, bd);
+ }
+ } else {
+ if (y_filter_taps <= 6) {
+ highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride,
+ dst16, dst16_stride, w, h,
+ y_filter_ptr, round_offset_conv);
+ } else {
+ highbd_dist_wtd_convolve_y_8tap_neon(src, src_stride, dst16,
+ dst16_stride, w, h, y_filter_ptr,
+ round_offset_conv);
+ }
+ }
+ }
+}
+
+static INLINE void highbd_2d_copy_neon(const uint16_t *src_ptr, int src_stride,
+ uint16_t *dst_ptr, int dst_stride, int w,
+ int h, const int round_bits,
+ const int offset) {
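+  // The compound "copy" is not a plain copy: each sample is scaled up to the
+  // compound intermediate precision (src << round_bits) and the compound
+  // offset is added.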
+ if (w <= 4) {
+ const int16x4_t round_shift_s16 = vdup_n_s16(round_bits);
+ const uint16x4_t offset_u16 = vdup_n_u16(offset);
+
+ for (int y = 0; y < h; ++y) {
+ const uint16x4_t s = vld1_u16(src_ptr + y * src_stride);
+ uint16x4_t d = vshl_u16(s, round_shift_s16);
+ d = vadd_u16(d, offset_u16);
+ if (w == 2) {
+ store_u16_2x1(dst_ptr + y * dst_stride, d, 0);
+ } else {
+ vst1_u16(dst_ptr + y * dst_stride, d);
+ }
+ }
+ } else {
+ const int16x8_t round_shift_s16 = vdupq_n_s16(round_bits);
+ const uint16x8_t offset_u16 = vdupq_n_u16(offset);
+
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; x += 8) {
+ const uint16x8_t s = vld1q_u16(src_ptr + y * src_stride + x);
+ uint16x8_t d = vshlq_u16(s, round_shift_s16);
+ d = vaddq_u16(d, offset_u16);
+ vst1q_u16(dst_ptr + y * dst_stride + x, d);
+ }
+ }
+ }
+}
+
+void av1_highbd_dist_wtd_convolve_2d_copy_neon(const uint16_t *src,
+ int src_stride, uint16_t *dst,
+ int dst_stride, int w, int h,
+ ConvolveParams *conv_params,
+ int bd) {
+ DECLARE_ALIGNED(16, uint16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+
+ const int im_stride = MAX_SB_SIZE;
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ int dst16_stride = conv_params->dst_stride;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ assert(round_bits >= 0);
+
+ if (conv_params->do_average) {
+ highbd_2d_copy_neon(src, src_stride, im_block, im_stride, w, h, round_bits,
+ round_offset);
+ } else {
+ highbd_2d_copy_neon(src, src_stride, dst16, dst16_stride, w, h, round_bits,
+ round_offset);
+ }
+
+ if (conv_params->do_average) {
+ if (conv_params->use_dist_wtd_comp_avg) {
+ if (bd == 12) {
+ highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride,
+ w, h, conv_params, round_offset, bd);
+ } else {
+ highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
+ h, conv_params, round_offset, bd);
+ }
+ } else {
+ if (bd == 12) {
+ highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
+ conv_params, round_offset, bd);
+ } else {
+ highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
+ conv_params, round_offset, bd);
+ }
+ }
+ }
+}
+
+static INLINE uint16x4_t highbd_convolve6_4_2d_v(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x8_t y_filter, const int32x4_t offset) {
+ // Values at indices 0 and 7 of y_filter are zero.
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2);
+
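+  // Second-stage rounding shift down to the compound output precision.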
+ return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
+}
+
+static INLINE uint16x8_t highbd_convolve6_8_2d_v(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t y_filter, const int32x4_t offset) {
+ // Values at indices 0 and 7 of y_filter are zero.
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2);
+
+ int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
+ vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
+
+static INLINE void highbd_dist_wtd_convolve_2d_vert_6tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *y_filter_ptr, int offset) {
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x4_t s0, s1, s2, s3, s4;
+ load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+ s += 5 * src_stride;
+
+ do {
+ int16x4_t s5, s6, s7, s8;
+ load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x4_t d0 =
+ highbd_convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
+ uint16x4_t d1 =
+ highbd_convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
+ uint16x4_t d2 =
+ highbd_convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
+ uint16x4_t d3 =
+ highbd_convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ do {
+ int height = h;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x8_t s0, s1, s2, s3, s4;
+ load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+ s += 5 * src_stride;
+
+ do {
+ int16x8_t s5, s6, s7, s8;
+ load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x8_t d0 = highbd_convolve6_8_2d_v(s0, s1, s2, s3, s4, s5,
+ y_filter, offset_vec);
+ uint16x8_t d1 = highbd_convolve6_8_2d_v(s1, s2, s3, s4, s5, s6,
+ y_filter, offset_vec);
+ uint16x8_t d2 = highbd_convolve6_8_2d_v(s2, s3, s4, s5, s6, s7,
+ y_filter, offset_vec);
+ uint16x8_t d3 = highbd_convolve6_8_2d_v(s3, s4, s5, s6, s7, s8,
+ y_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+static INLINE uint16x4_t highbd_convolve8_4_2d_v(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
+ const int32x4_t offset) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 0);
+ sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
+
+ return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_2d_v(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
+ const int32x4_t offset) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
+
+ int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
+ vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
+
+static INLINE void highbd_dist_wtd_convolve_2d_vert_8tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *y_filter_ptr, int offset) {
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+ if (w <= 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 = highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_vec);
+ uint16x4_t d1 = highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, offset_vec);
+ uint16x4_t d2 = highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, offset_vec);
+ uint16x4_t d3 = highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_vec);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ do {
+ int height = h;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 = highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_vec);
+ uint16x8_t d1 = highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, offset_vec);
+ uint16x8_t d2 = highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, offset_vec);
+ uint16x8_t d3 = highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *x_filter_ptr, const int offset) {
+  // The smallest block height is 4, and the horizontal convolution needs to
+  // process (filter_taps / 2 - 1) extra rows for the vertical convolution.
+ assert(h >= 5);
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+
+ int height = h;
+
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[6], s1[6], s2[6], s3[6];
+ load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5]);
+ load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5]);
+ load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5]);
+ load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5]);
+
+ uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], x_filter, offset_vec);
+ uint16x8_t d1 = highbd_12_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
+ s1[5], x_filter, offset_vec);
+ uint16x8_t d2 = highbd_12_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
+ s2[5], x_filter, offset_vec);
+ uint16x8_t d3 = highbd_12_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
+ s3[5], x_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+
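+  // Process the remaining rows (1 to 4 of them; h >= 5 guarantees the loop
+  // above ran) one row at a time.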
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[6];
+ load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);
+
+ uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], x_filter, offset_vec);
+ vst1q_u16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+}
+
+static INLINE void highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *x_filter_ptr, const int offset) {
+  // The smallest block height is 4, and the horizontal convolution needs to
+  // process (filter_taps / 2 - 1) extra rows for the vertical convolution.
+ assert(h >= 5);
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+
+ int height = h;
+
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[6], s1[6], s2[6], s3[6];
+ load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5]);
+ load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5]);
+ load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5]);
+ load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5]);
+
+ uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], x_filter, offset_vec);
+ uint16x8_t d1 = highbd_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
+ s1[5], x_filter, offset_vec);
+ uint16x8_t d2 = highbd_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
+ s2[5], x_filter, offset_vec);
+ uint16x8_t d3 = highbd_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
+ s3[5], x_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[6];
+ load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);
+
+ uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], x_filter, offset_vec);
+ vst1q_u16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+}
+
+static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *x_filter_ptr, const int offset) {
+  // The smallest block height is 4, and the horizontal convolution needs to
+  // process (filter_taps / 2 - 1) extra rows for the vertical convolution.
+ assert(h >= 5);
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+ if (w == 4) {
+ // 4-tap filters are used for blocks having width == 4.
+ const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
+ const int16_t *s = (const int16_t *)(src_ptr + 1);
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x4_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec);
+ uint16x4_t d1 = highbd_12_convolve4_4_x(s1, x_filter, offset_vec);
+ uint16x4_t d2 = highbd_12_convolve4_4_x(s2, x_filter, offset_vec);
+ uint16x4_t d3 = highbd_12_convolve4_4_x(s3, x_filter, offset_vec);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h > 4);
+
+ do {
+ int16x4_t s0[4];
+ load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+
+ uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec);
+ vst1_u16(d, d0);
+
+ s += src_stride;
+ d += dst_stride;
+ } while (--h != 0);
+ } else {
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+ int height = h;
+
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x8_t d0 =
+ highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5],
+ s0[6], s0[7], x_filter, offset_vec);
+ uint16x8_t d1 =
+ highbd_12_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5],
+ s1[6], s1[7], x_filter, offset_vec);
+ uint16x8_t d2 =
+ highbd_12_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5],
+ s2[6], s2[7], x_filter, offset_vec);
+ uint16x8_t d3 =
+ highbd_12_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5],
+ s3[6], s3[7], x_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+
+ uint16x8_t d0 =
+ highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5],
+ s0[6], s0[7], x_filter, offset_vec);
+ vst1q_u16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ }
+}
+
+static INLINE void highbd_dist_wtd_convolve_2d_horiz_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *x_filter_ptr, const int offset) {
+  // The smallest block height is 4, and the horizontal convolution needs to
+  // process (filter_taps / 2 - 1) extra rows for the vertical convolution.
+ assert(h >= 5);
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+ if (w == 4) {
+ // 4-tap filters are used for blocks having width == 4.
+ const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
+ const int16_t *s = (const int16_t *)(src_ptr + 1);
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x4_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec);
+ uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset_vec);
+ uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset_vec);
+ uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset_vec);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h > 4);
+
+ do {
+ int16x4_t s0[4];
+ load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+
+ uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec);
+ vst1_u16(d, d0);
+
+ s += src_stride;
+ d += dst_stride;
+ } while (--h != 0);
+ } else {
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+ int height = h;
+
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x8_t d0 =
+ highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6],
+ s0[7], x_filter, offset_vec);
+ uint16x8_t d1 =
+ highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6],
+ s1[7], x_filter, offset_vec);
+ uint16x8_t d2 =
+ highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6],
+ s2[7], x_filter, offset_vec);
+ uint16x8_t d3 =
+ highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6],
+ s3[7], x_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+
+ uint16x8_t d0 =
+ highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6],
+ s0[7], x_filter, offset_vec);
+ vst1q_u16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ }
+}
+
+void av1_highbd_dist_wtd_convolve_2d_neon(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(16, uint16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint16_t,
+ im_block2[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ int dst16_stride = conv_params->dst_stride;
+ const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
+ const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps;
+ const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+ const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
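+  // Filters shorter than 6 taps are treated as (zero-padded) 6-tap filters
+  // for the intermediate-block geometry below (im_h and the centring
+  // offsets).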
+
+ const int im_h = h + clamped_y_taps - 1;
+ const int im_stride = MAX_SB_SIZE;
+ const int vert_offset = clamped_y_taps / 2 - 1;
+ const int horiz_offset = clamped_x_taps / 2 - 1;
+  // The extra shim of (1 << (conv_params->round_0 - 1)) lets the horizontal
+  // kernels use faster non-rounding shifts.
+ const int round_offset_conv_x =
+ (1 << (bd + FILTER_BITS - 1)) + (1 << (conv_params->round_0 - 1));
+ const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset_conv_y = (1 << y_offset_bits);
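+  // No rounding shim is needed for the vertical pass: its kernels use
+  // rounding shifts (vqrshrun) rather than truncating ones.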
+ const int round_offset_avg =
+ ((1 << (y_offset_bits - conv_params->round_1)) +
+ (1 << (y_offset_bits - conv_params->round_1 - 1)));
+
+ const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+ // horizontal filter
+ if (bd == 12) {
+ if (x_filter_taps <= 6 && w != 4) {
+ highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(
+ src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
+ round_offset_conv_x);
+ } else {
+ highbd_12_dist_wtd_convolve_2d_horiz_neon(
+ src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
+ round_offset_conv_x);
+ }
+ } else {
+ if (x_filter_taps <= 6 && w != 4) {
+ highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
+ src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
+ round_offset_conv_x);
+ } else {
+ highbd_dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block,
+ im_stride, w, im_h, x_filter_ptr,
+ round_offset_conv_x);
+ }
+ }
+
+ // vertical filter
+ if (y_filter_taps <= 6) {
+ if (conv_params->do_average) {
+ highbd_dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, im_block2,
+ im_stride, w, h, y_filter_ptr,
+ round_offset_conv_y);
+ } else {
+ highbd_dist_wtd_convolve_2d_vert_6tap_neon(
+ im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr,
+ round_offset_conv_y);
+ }
+ } else {
+ if (conv_params->do_average) {
+ highbd_dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, im_block2,
+ im_stride, w, h, y_filter_ptr,
+ round_offset_conv_y);
+ } else {
+ highbd_dist_wtd_convolve_2d_vert_8tap_neon(
+ im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr,
+ round_offset_conv_y);
+ }
+ }
+
+  // Do the compound averaging outside the loop; this avoids branching within
+  // the main loop.
+ if (conv_params->do_average) {
+ if (conv_params->use_dist_wtd_comp_avg) {
+ if (bd == 12) {
+ highbd_12_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride,
+ w, h, conv_params, round_offset_avg,
+ bd);
+ } else {
+ highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w,
+ h, conv_params, round_offset_avg, bd);
+ }
+ } else {
+ if (bd == 12) {
+ highbd_12_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
+ conv_params, round_offset_avg, bd);
+ } else {
+ highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
+ conv_params, round_offset_avg, bd);
+ }
+ }
+ }
+}
diff --git a/av1/common/arm/highbd_convolve_horiz_rs_neon.c b/av1/common/arm/highbd_convolve_horiz_rs_neon.c
new file mode 100644
index 000000000..51da025c3
--- /dev/null
+++ b/av1/common/arm/highbd_convolve_horiz_rs_neon.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/arm/highbd_convolve_neon.h"
+
+#define UPSCALE_NORMATIVE_TAPS 8
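+// The normative AV1 super-resolution upscale filter is always 8-tap.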
+
+void av1_highbd_convolve_horiz_rs_neon(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w,
+ int h, const int16_t *x_filters,
+ int x0_qn, int x_step_qn, int bd) {
+ const int horiz_offset = UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+ static const int32_t kIdx[4] = { 0, 1, 2, 3 };
+ const int32x4_t idx = vld1q_s32(kIdx);
+ const int32x4_t subpel_mask = vdupq_n_s32(RS_SCALE_SUBPEL_MASK);
+ const int32x4_t shift_s32 = vdupq_n_s32(-FILTER_BITS);
+ const int32x4_t offset_s32 = vdupq_n_s32(0);
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
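+  // Saturate results to the valid pixel range [0, (1 << bd) - 1].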
+
+ const uint16_t *src_ptr = src - horiz_offset;
+ uint16_t *dst_ptr = dst;
+
+ if (w <= 4) {
+ int height = h;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int x_qn = x0_qn;
+
+      // Load 4 src vectors at a time; they might be the same, but we have to
+      // calculate the indices anyway. Doing it in SIMD and then storing the
+      // indices is faster than having to calculate the expression
+      // &src_ptr[((x_qn + 0*x_step_qn) >> RS_SCALE_SUBPEL_BITS)] 4 times.
+      // Ideally this would be a gather using the indices, but NEON does not
+      // have one, so we have to emulate it.
+ const int32x4_t xqn_idx = vmlaq_n_s32(vdupq_n_s32(x_qn), idx, x_step_qn);
+      // We have to multiply the index by 2 to get the actual byte offset,
+      // since sizeof(uint16_t) == 2.
+ const int32x4_t src_idx =
+ vshlq_n_s32(vshrq_n_s32(xqn_idx, RS_SCALE_SUBPEL_BITS), 1);
+      // Similarly for the filter vector indices, we calculate the filter
+      // indices for 4 columns. First we calculate the indices:
+      //   (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS
+      // Then we calculate the actual pointers, multiplying by
+      // UPSCALE_NORMATIVE_TAPS, and again shift left by 1.
+ const int32x4_t x_filter4_idx = vshlq_n_s32(
+ vshrq_n_s32(vandq_s32(xqn_idx, subpel_mask), RS_SCALE_EXTRA_BITS), 1);
+      // Even though pointers are unsigned 32/64-bit ints, we do signed
+      // addition. The reason is that x_qn can be negative, leading to
+ // negative offsets. Argon test
+ // profile0_core/streams/test10573_11003.obu was failing because of
+ // this.
+#if AOM_ARCH_AARCH64
+ uint64x2_t tmp4[2];
+ tmp4[0] = vreinterpretq_u64_s64(vaddw_s32(
+ vdupq_n_s64((const int64_t)src_ptr), vget_low_s32(src_idx)));
+ tmp4[1] = vreinterpretq_u64_s64(vaddw_s32(
+ vdupq_n_s64((const int64_t)src_ptr), vget_high_s32(src_idx)));
+ int16_t *src4_ptr[4];
+ uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
+ vst1q_u64(tmp_ptr, tmp4[0]);
+ vst1q_u64(tmp_ptr + 2, tmp4[1]);
+
+ // filter vectors
+ tmp4[0] = vreinterpretq_u64_s64(vmlal_s32(
+ vdupq_n_s64((const int64_t)x_filters), vget_low_s32(x_filter4_idx),
+ vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
+ tmp4[1] = vreinterpretq_u64_s64(vmlal_s32(
+ vdupq_n_s64((const int64_t)x_filters), vget_high_s32(x_filter4_idx),
+ vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
+
+ const int16_t *x_filter4_ptr[4];
+ tmp_ptr = (uint64_t *)&x_filter4_ptr;
+ vst1q_u64(tmp_ptr, tmp4[0]);
+ vst1q_u64(tmp_ptr + 2, tmp4[1]);
+#else
+ uint32x4_t tmp4;
+ tmp4 = vreinterpretq_u32_s32(
+ vaddq_s32(vdupq_n_s32((const int32_t)src_ptr), src_idx));
+ int16_t *src4_ptr[4];
+ uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
+ vst1q_u32(tmp_ptr, tmp4);
+
+ // filter vectors
+ tmp4 = vreinterpretq_u32_s32(
+ vmlaq_s32(vdupq_n_s32((const int32_t)x_filters), x_filter4_idx,
+ vdupq_n_s32(UPSCALE_NORMATIVE_TAPS)));
+
+ const int16_t *x_filter4_ptr[4];
+ tmp_ptr = (uint32_t *)&x_filter4_ptr;
+ vst1q_u32(tmp_ptr, tmp4);
+#endif // AOM_ARCH_AARCH64
+ // Load source
+ int16x8_t s0 = vld1q_s16(src4_ptr[0]);
+ int16x8_t s1 = vld1q_s16(src4_ptr[1]);
+ int16x8_t s2 = vld1q_s16(src4_ptr[2]);
+ int16x8_t s3 = vld1q_s16(src4_ptr[3]);
+
+ // Actually load the filters
+ const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
+ const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
+ const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
+ const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);
+
+ // Group low and high parts and transpose
+ int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
+ vget_low_s16(x_filter1),
+ vget_low_s16(x_filter2),
+ vget_low_s16(x_filter3) };
+ int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
+ vget_high_s16(x_filter1),
+ vget_high_s16(x_filter2),
+ vget_high_s16(x_filter3) };
+ transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo);
+ transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi);
+
+      // Run the 2D Scale X convolution
+ uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
+ s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
+
+ d0 = vmin_u16(d0, max);
+
+ if (w == 2) {
+ store_u16_2x1(d + 0 * dst_stride, d0, 0);
+ } else {
+ vst1_u16(d + 0 * dst_stride, d0);
+ }
+
+ src_ptr += src_stride;
+ d += dst_stride;
+ height--;
+ } while (height > 0);
+ } else {
+ int height = h;
+
+ do {
+ int width = w;
+ int x_qn = x0_qn;
+ uint16_t *d = dst_ptr;
+ const uint16_t *s = src_ptr;
+
+ do {
+        // Load 4 src vectors at a time; they might be the same, but we have
+        // to calculate the indices anyway. Doing it in SIMD and then storing
+        // the indices is faster than calculating the expression
+        // &src_ptr[((x_qn + 0*x_step_qn) >> RS_SCALE_SUBPEL_BITS)] 4 times.
+        // Ideally this would be a gather using the indices, but NEON does
+        // not have one, so we have to emulate it.
+ const int32x4_t xqn_idx =
+ vmlaq_n_s32(vdupq_n_s32(x_qn), idx, x_step_qn);
+        // We have to multiply by 2 to get the actual pointer offset, as
+        // sizeof(uint16_t) == 2.
+ const int32x4_t src_idx =
+ vshlq_n_s32(vshrq_n_s32(xqn_idx, RS_SCALE_SUBPEL_BITS), 1);
+
+        // Similarly for the filter vector indices, we calculate the filter
+        // indices for 4 columns. First we calculate the indices:
+        //   (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS
+        // Then we calculate the actual pointers, multiplying by
+        // UPSCALE_NORMATIVE_TAPS, and again shift left by 1 for
+        // sizeof(int16_t).
+ const int32x4_t x_filter4_idx = vshlq_n_s32(
+ vshrq_n_s32(vandq_s32(xqn_idx, subpel_mask), RS_SCALE_EXTRA_BITS),
+ 1);
+        // Even though pointers are unsigned 32/64-bit ints, we do signed
+        // addition. The reason is that x_qn can be negative, leading to
+ // negative offsets. Argon test
+ // profile0_core/streams/test10573_11003.obu was failing because of
+ // this.
+#if AOM_ARCH_AARCH64
+ uint64x2_t tmp4[2];
+ tmp4[0] = vreinterpretq_u64_s64(
+ vaddw_s32(vdupq_n_s64((const int64_t)s), vget_low_s32(src_idx)));
+ tmp4[1] = vreinterpretq_u64_s64(
+ vaddw_s32(vdupq_n_s64((const int64_t)s), vget_high_s32(src_idx)));
+ int16_t *src4_ptr[4];
+ uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
+ vst1q_u64(tmp_ptr, tmp4[0]);
+ vst1q_u64(tmp_ptr + 2, tmp4[1]);
+
+ // filter vectors
+ tmp4[0] = vreinterpretq_u64_s64(vmlal_s32(
+ vdupq_n_s64((const int64_t)x_filters), vget_low_s32(x_filter4_idx),
+ vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
+ tmp4[1] = vreinterpretq_u64_s64(vmlal_s32(
+ vdupq_n_s64((const int64_t)x_filters), vget_high_s32(x_filter4_idx),
+ vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
+
+ const int16_t *x_filter4_ptr[4];
+ tmp_ptr = (uint64_t *)&x_filter4_ptr;
+ vst1q_u64(tmp_ptr, tmp4[0]);
+ vst1q_u64(tmp_ptr + 2, tmp4[1]);
+#else
+ uint32x4_t tmp4;
+ tmp4 = vreinterpretq_u32_s32(
+ vaddq_s32(vdupq_n_s32((const int32_t)s), src_idx));
+ int16_t *src4_ptr[4];
+ uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
+ vst1q_u32(tmp_ptr, tmp4);
+
+ // filter vectors
+ tmp4 = vreinterpretq_u32_s32(
+ vmlaq_s32(vdupq_n_s32((const int32_t)x_filters), x_filter4_idx,
+ vdupq_n_s32(UPSCALE_NORMATIVE_TAPS)));
+
+ const int16_t *x_filter4_ptr[4];
+ tmp_ptr = (uint32_t *)&x_filter4_ptr;
+ vst1q_u32(tmp_ptr, tmp4);
+#endif // AOM_ARCH_AARCH64
+
+ // Load source
+ int16x8_t s0 = vld1q_s16(src4_ptr[0]);
+ int16x8_t s1 = vld1q_s16(src4_ptr[1]);
+ int16x8_t s2 = vld1q_s16(src4_ptr[2]);
+ int16x8_t s3 = vld1q_s16(src4_ptr[3]);
+
+ // Actually load the filters
+ const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
+ const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
+ const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
+ const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);
+
+ // Group low and high parts and transpose
+ int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
+ vget_low_s16(x_filter1),
+ vget_low_s16(x_filter2),
+ vget_low_s16(x_filter3) };
+ int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
+ vget_high_s16(x_filter1),
+ vget_high_s16(x_filter2),
+ vget_high_s16(x_filter3) };
+ transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo);
+ transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi);
+
+ // Run the 2D Scale X convolution
+ uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
+ s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
+
+ d0 = vmin_u16(d0, max);
+ vst1_u16(d, d0);
+
+ x_qn += 4 * x_step_qn;
+ d += 4;
+ width -= 4;
+ } while (width > 0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ height--;
+ } while (height > 0);
+ }
+}
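+
+// The SIMD index arithmetic above emulates a gather. Its scalar equivalent,
+// per group of 4 output columns, is simply (a sketch for illustration only;
+// the helper name is hypothetical):
+static void rs_gather_pointers_sketch(const uint16_t *src,
+                                      const int16_t *x_filters, int x_qn,
+                                      int x_step_qn, const uint16_t *src4[4],
+                                      const int16_t *filt4[4]) {
+  for (int i = 0; i < 4; i++) {
+    const int qn = x_qn + i * x_step_qn;  // may be negative
+    src4[i] = &src[qn >> RS_SCALE_SUBPEL_BITS];
+    const int f = (qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+    filt4[i] = &x_filters[f * UPSCALE_NORMATIVE_TAPS];
+  }
+}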
diff --git a/av1/common/arm/highbd_convolve_neon.c b/av1/common/arm/highbd_convolve_neon.c
index fb18e28ca..3f5ff9eaf 100644
--- a/av1/common/arm/highbd_convolve_neon.c
+++ b/av1/common/arm/highbd_convolve_neon.c
@@ -17,62 +17,87 @@
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/arm/mem_neon.h"
-#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
#include "av1/common/convolve.h"
#include "av1/common/filter.h"
-#include "av1/common/arm/highbd_convolve_neon.h"
+
+static INLINE uint16x4_t
+highbd_convolve6_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x8_t y_filter) {
+ // Values at indices 0 and 7 of y_filter are zero.
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2);
+
+ return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
+}
+
+static INLINE uint16x8_t
+highbd_convolve6_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t y_filter) {
+ // Values at indices 0 and 7 of y_filter are zero.
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2);
+
+ int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
+ vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
static INLINE void highbd_convolve_y_sr_6tap_neon(
const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
int w, int h, const int16_t *y_filter_ptr, const int bd) {
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
- const int32x4_t zero_s32 = vdupq_n_s32(0);
- if (w <= 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
- uint16x4_t d0, d1, d2, d3;
- uint16x8_t d01, d23;
+ if (w == 4) {
const int16_t *s = (const int16_t *)(src_ptr + src_stride);
uint16_t *d = dst_ptr;
+ int16x4_t s0, s1, s2, s3, s4;
load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
s += 5 * src_stride;
do {
+ int16x4_t s5, s6, s7, s8;
load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);
- d0 = highbd_convolve6_4_s32_s16(s0, s1, s2, s3, s4, s5, y_filter_0_7,
- zero_s32);
- d1 = highbd_convolve6_4_s32_s16(s1, s2, s3, s4, s5, s6, y_filter_0_7,
- zero_s32);
- d2 = highbd_convolve6_4_s32_s16(s2, s3, s4, s5, s6, s7, y_filter_0_7,
- zero_s32);
- d3 = highbd_convolve6_4_s32_s16(s3, s4, s5, s6, s7, s8, y_filter_0_7,
- zero_s32);
+ uint16x4_t d0 =
+ highbd_convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter_0_7);
+ uint16x4_t d1 =
+ highbd_convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter_0_7);
+ uint16x4_t d2 =
+ highbd_convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter_0_7);
+ uint16x4_t d3 =
+ highbd_convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter_0_7);
- d01 = vcombine_u16(d0, d1);
- d23 = vcombine_u16(d2, d3);
+ d0 = vmin_u16(d0, vget_low_u16(max));
+ d1 = vmin_u16(d1, vget_low_u16(max));
+ d2 = vmin_u16(d2, vget_low_u16(max));
+ d3 = vmin_u16(d3, vget_low_u16(max));
- d01 = vminq_u16(d01, max);
- d23 = vminq_u16(d23, max);
-
- if (w == 2) {
- store_u16q_2x1(d + 0 * dst_stride, d01, 0);
- store_u16q_2x1(d + 1 * dst_stride, d01, 2);
- if (h != 2) {
- store_u16q_2x1(d + 2 * dst_stride, d23, 0);
- store_u16q_2x1(d + 3 * dst_stride, d23, 2);
- }
- } else {
- vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
- vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
- if (h != 2) {
- vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
- vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
- }
- }
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@@ -82,42 +107,37 @@ static INLINE void highbd_convolve_y_sr_6tap_neon(
s += 4 * src_stride;
d += 4 * dst_stride;
h -= 4;
- } while (h > 0);
+ } while (h != 0);
} else {
- // if width is a multiple of 8 & height is a multiple of 4
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
- uint16x8_t d0, d1, d2, d3;
-
+ // Width is a multiple of 8 and height is a multiple of 4.
do {
int height = h;
const int16_t *s = (const int16_t *)(src_ptr + src_stride);
uint16_t *d = dst_ptr;
+ int16x8_t s0, s1, s2, s3, s4;
load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
s += 5 * src_stride;
do {
+ int16x8_t s5, s6, s7, s8;
load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
- d0 = highbd_convolve6_8_s32_s16(s0, s1, s2, s3, s4, s5, y_filter_0_7,
- zero_s32);
- d1 = highbd_convolve6_8_s32_s16(s1, s2, s3, s4, s5, s6, y_filter_0_7,
- zero_s32);
- d2 = highbd_convolve6_8_s32_s16(s2, s3, s4, s5, s6, s7, y_filter_0_7,
- zero_s32);
- d3 = highbd_convolve6_8_s32_s16(s3, s4, s5, s6, s7, s8, y_filter_0_7,
- zero_s32);
+ uint16x8_t d0 =
+ highbd_convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter_0_7);
+ uint16x8_t d1 =
+ highbd_convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter_0_7);
+ uint16x8_t d2 =
+ highbd_convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter_0_7);
+ uint16x8_t d3 =
+ highbd_convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter_0_7);
d0 = vminq_u16(d0, max);
d1 = vminq_u16(d1, max);
d2 = vminq_u16(d2, max);
d3 = vminq_u16(d3, max);
- if (h == 2) {
- store_u16_8x2(d, dst_stride, d0, d1);
- } else {
- store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
- }
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@@ -127,66 +147,96 @@ static INLINE void highbd_convolve_y_sr_6tap_neon(
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
- } while (height > 0);
+ } while (height != 0);
src_ptr += 8;
dst_ptr += 8;
w -= 8;
- } while (w > 0);
+ } while (w != 0);
}
}
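+
+// Scalar model of the 6-tap helpers above (a sketch for illustration only;
+// saturation of the narrowing step is omitted): a 6-tap kernel is stored
+// in an 8-tap array with taps 0 and 7 equal to zero, so only filter lanes
+// 1..6 are multiplied in.
+static int convolve6_px_sketch(const int16_t *s, const int16_t *filter_0_7) {
+  int sum = 0;
+  for (int k = 1; k <= 6; k++) {  // taps 0 and 7 are zero by construction
+    sum += s[k - 1] * filter_0_7[k];
+  }
+  return ROUND_POWER_OF_TWO(sum, COMPOUND_ROUND1_BITS);
+}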
+static INLINE uint16x4_t highbd_convolve8_4_y(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0);
+ sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
+
+ return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_y(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
+
+ int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
+ vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
+
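+// Per-lane scalar equivalent of the vqrshrun_n_s32(sum, N) narrowing used
+// by these helpers (a sketch for illustration only): a rounding right
+// shift with unsigned saturation, which is why no explicit clamp below
+// zero is needed before the vmin against (1 << bd) - 1.
+static uint16_t qrshrun_s32_sketch(int32_t sum, int n) {
+  const int64_t r = ((int64_t)sum + (1 << (n - 1))) >> n;
+  if (r < 0) return 0;
+  if (r > 65535) return 65535;
+  return (uint16_t)r;
+}
+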
static INLINE void highbd_convolve_y_sr_8tap_neon(
const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
int w, int h, const int16_t *y_filter_ptr, int bd) {
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
- const int32x4_t zero_s32 = vdupq_n_s32(0);
-
- if (w <= 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x4_t d0, d1, d2, d3;
- uint16x8_t d01, d23;
+ if (w == 4) {
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
s += 7 * src_stride;
do {
+ int16x4_t s7, s8, s9, s10;
load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
- d0 = highbd_convolve8_4_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- zero_s32);
- d1 = highbd_convolve8_4_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
- zero_s32);
- d2 = highbd_convolve8_4_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
- zero_s32);
- d3 = highbd_convolve8_4_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
- zero_s32);
+ uint16x4_t d0 =
+ highbd_convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ uint16x4_t d1 =
+ highbd_convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+ uint16x4_t d2 =
+ highbd_convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+ uint16x4_t d3 =
+ highbd_convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
- d01 = vcombine_u16(d0, d1);
- d23 = vcombine_u16(d2, d3);
+ d0 = vmin_u16(d0, vget_low_u16(max));
+ d1 = vmin_u16(d1, vget_low_u16(max));
+ d2 = vmin_u16(d2, vget_low_u16(max));
+ d3 = vmin_u16(d3, vget_low_u16(max));
- d01 = vminq_u16(d01, max);
- d23 = vminq_u16(d23, max);
-
- if (w == 2) {
- store_u16q_2x1(d + 0 * dst_stride, d01, 0);
- store_u16q_2x1(d + 1 * dst_stride, d01, 2);
- if (h != 2) {
- store_u16q_2x1(d + 2 * dst_stride, d23, 0);
- store_u16q_2x1(d + 3 * dst_stride, d23, 2);
- }
- } else {
- vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
- vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
- if (h != 2) {
- vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
- vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
- }
- }
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@@ -198,40 +248,36 @@ static INLINE void highbd_convolve_y_sr_8tap_neon(
s += 4 * src_stride;
d += 4 * dst_stride;
h -= 4;
- } while (h > 0);
+ } while (h != 0);
} else {
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x8_t d0, d1, d2, d3;
do {
int height = h;
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
s += 7 * src_stride;
do {
+ int16x8_t s7, s8, s9, s10;
load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
- d0 = highbd_convolve8_8_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7,
- y_filter, zero_s32);
- d1 = highbd_convolve8_8_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8,
- y_filter, zero_s32);
- d2 = highbd_convolve8_8_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9,
- y_filter, zero_s32);
- d3 = highbd_convolve8_8_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10,
- y_filter, zero_s32);
+ uint16x8_t d0 =
+ highbd_convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ uint16x8_t d1 =
+ highbd_convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+ uint16x8_t d2 =
+ highbd_convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+ uint16x8_t d3 =
+ highbd_convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
d0 = vminq_u16(d0, max);
d1 = vminq_u16(d1, max);
d2 = vminq_u16(d2, max);
d3 = vminq_u16(d3, max);
- if (h == 2) {
- store_u16_8x2(d, dst_stride, d0, d1);
- } else {
- store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
- }
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@@ -243,71 +289,117 @@ static INLINE void highbd_convolve_y_sr_8tap_neon(
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
- } while (height > 0);
+ } while (height != 0);
src_ptr += 8;
dst_ptr += 8;
w -= 8;
- } while (w > 0);
+ } while (w != 0);
}
}
+static INLINE uint16x4_t highbd_convolve12_4_y(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
+ const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
+ const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
+
+ int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0);
+ sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
+ sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
+ sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
+ sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
+ sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);
+
+ return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
+}
+
+static INLINE uint16x8_t highbd_convolve12_8_y(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
+ const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
+ const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
+
+ int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);
+
+ int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
+ vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
+
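+// The 12-tap helpers consume the kernel as three 4-lane vectors because
+// vmlal_lane_s16 can only index lanes 0..3 of an int16x4_t. The scalar
+// computation being vectorised is simply (a sketch for illustration only;
+// saturation omitted):
+static int convolve12_px_sketch(const int16_t *s, const int16_t *filter) {
+  int sum = 0;
+  for (int k = 0; k < 12; k++) sum += s[k] * filter[k];
+  return ROUND_POWER_OF_TWO(sum, COMPOUND_ROUND1_BITS);
+}
+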
static INLINE void highbd_convolve_y_sr_12tap_neon(
const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
int w, int h, const int16_t *y_filter_ptr, int bd) {
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
- const int32x4_t zero_s32 = vdupq_n_s32(0);
-
- if (w <= 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
- uint16x4_t d0, d1, d2, d3;
- uint16x8_t d01, d23;
+ if (w == 4) {
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
&s9, &s10);
s += 11 * src_stride;
do {
+ int16x4_t s11, s12, s13, s14;
load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14);
- d0 = highbd_convolve12_y_4_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9,
- s10, s11, y_filter_0_7, y_filter_8_11,
- zero_s32);
- d1 = highbd_convolve12_y_4_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8, s9,
- s10, s11, s12, y_filter_0_7,
- y_filter_8_11, zero_s32);
- d2 = highbd_convolve12_y_4_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9, s10,
- s11, s12, s13, y_filter_0_7,
- y_filter_8_11, zero_s32);
- d3 = highbd_convolve12_y_4_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10, s11,
- s12, s13, s14, y_filter_0_7,
- y_filter_8_11, zero_s32);
-
- d01 = vcombine_u16(d0, d1);
- d23 = vcombine_u16(d2, d3);
-
- d01 = vminq_u16(d01, max);
- d23 = vminq_u16(d23, max);
-
- if (w == 2) {
- store_u16q_2x1(d + 0 * dst_stride, d01, 0);
- store_u16q_2x1(d + 1 * dst_stride, d01, 2);
- if (h != 2) {
- store_u16q_2x1(d + 2 * dst_stride, d23, 0);
- store_u16q_2x1(d + 3 * dst_stride, d23, 2);
- }
- } else {
- vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
- vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
- if (h != 2) {
- vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
- vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
- }
- }
+ uint16x4_t d0 =
+ highbd_convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+ s11, y_filter_0_7, y_filter_8_11);
+ uint16x4_t d1 =
+ highbd_convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ s12, y_filter_0_7, y_filter_8_11);
+ uint16x4_t d2 =
+ highbd_convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+ s13, y_filter_0_7, y_filter_8_11);
+ uint16x4_t d3 =
+ highbd_convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+ s14, y_filter_0_7, y_filter_8_11);
+
+ d0 = vmin_u16(d0, vget_low_u16(max));
+ d1 = vmin_u16(d1, vget_low_u16(max));
+ d2 = vmin_u16(d2, vget_low_u16(max));
+ d3 = vmin_u16(d3, vget_low_u16(max));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@@ -323,46 +415,41 @@ static INLINE void highbd_convolve_y_sr_12tap_neon(
s += 4 * src_stride;
d += 4 * dst_stride;
h -= 4;
- } while (h > 0);
+ } while (h != 0);
} else {
- uint16x8_t d0, d1, d2, d3;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
-
do {
int height = h;
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
&s9, &s10);
s += 11 * src_stride;
do {
+ int16x8_t s11, s12, s13, s14;
load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);
- d0 = highbd_convolve12_y_8_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7, s8,
- s9, s10, s11, y_filter_0_7,
- y_filter_8_11, zero_s32);
- d1 = highbd_convolve12_y_8_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8, s9,
- s10, s11, s12, y_filter_0_7,
- y_filter_8_11, zero_s32);
- d2 = highbd_convolve12_y_8_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9, s10,
- s11, s12, s13, y_filter_0_7,
- y_filter_8_11, zero_s32);
- d3 = highbd_convolve12_y_8_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10, s11,
- s12, s13, s14, y_filter_0_7,
- y_filter_8_11, zero_s32);
+ uint16x8_t d0 =
+ highbd_convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+ s11, y_filter_0_7, y_filter_8_11);
+ uint16x8_t d1 =
+ highbd_convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ s12, y_filter_0_7, y_filter_8_11);
+ uint16x8_t d2 =
+ highbd_convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+ s13, y_filter_0_7, y_filter_8_11);
+ uint16x8_t d3 =
+ highbd_convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+ s13, s14, y_filter_0_7, y_filter_8_11);
d0 = vminq_u16(d0, max);
d1 = vminq_u16(d1, max);
d2 = vminq_u16(d2, max);
d3 = vminq_u16(d3, max);
- if (h == 2) {
- store_u16_8x2(d, dst_stride, d0, d1);
- } else {
- store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
- }
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@@ -378,12 +465,12 @@ static INLINE void highbd_convolve_y_sr_12tap_neon(
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
- } while (height > 0);
+ } while (height != 0);
src_ptr += 8;
dst_ptr += 8;
w -= 8;
- } while (w > 0);
+ } while (w != 0);
}
}
@@ -391,6 +478,11 @@ void av1_highbd_convolve_y_sr_neon(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h,
const InterpFilterParams *filter_params_y,
const int subpel_y_qn, int bd) {
+ if (w == 2 || h == 2) {
+ av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_y, subpel_y_qn, bd);
+ return;
+ }
const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
const int vert_offset = filter_params_y->taps / 2 - 1;
const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
@@ -413,207 +505,366 @@ void av1_highbd_convolve_y_sr_neon(const uint16_t *src, int src_stride,
y_filter_ptr, bd);
}
-static INLINE void highbd_convolve_x_sr_8tap_neon(
+static INLINE uint16x8_t highbd_convolve6_8_x(const int16x8_t s[6],
+ const int16x8_t x_filter,
+ const int32x4_t offset) {
+  // Values at indices 0 and 7 of x_filter are zero.
+ const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
+ const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
+
+ int32x4_t sum0 = offset;
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 2);
+
+ int32x4_t sum1 = offset;
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 2);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+ vqrshrun_n_s32(sum1, FILTER_BITS));
+}
+
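+// Why the rounding shim used by these horizontal kernels is exact rather
+// than an approximation: the C code rounds twice, by round_0 and then by
+// bits = FILTER_BITS - round_0, and (1 << (bits - 1)) << round_0 ==
+// 1 << (FILTER_BITS - 1), so folding the first rounding constant into the
+// accumulator and letting vqrshrun add the second is bit-identical.
+// Scalar check (a sketch for illustration only):
+static int x_sr_round_twice_sketch(int32_t sum, int round_0) {
+  const int bits = FILTER_BITS - round_0;
+  const int32_t t = (sum + (1 << (round_0 - 1))) >> round_0;
+  return (t + (1 << (bits - 1))) >> bits;
+}
+static int x_sr_round_once_sketch(int32_t sum, int round_0) {
+  const int32_t shim = 1 << (round_0 - 1);  // the `offset` passed in here
+  return (sum + shim + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
+}
+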
+static INLINE void highbd_convolve_x_sr_6tap_neon(
const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
int bd) {
const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
- const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
- const int bits = FILTER_BITS - conv_params->round_0;
- const int16x8_t bits_s16 = vdupq_n_s16(-bits);
- const int32x4_t zero_s32 = vdupq_n_s32(0);
+  // This shim allows us to do only one rounding shift instead of two.
+ const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1));
- if (w <= 4) {
- int16x8_t s0, s1, s2, s3;
- uint16x4_t d0, d1;
- uint16x8_t d01;
+ int height = h;
+ do {
+ int width = w;
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
do {
- load_s16_8x2(s, src_stride, &s0, &s2);
- load_s16_8x2(s + 8, src_stride, &s1, &s3);
+ int16x8_t s0[6], s1[6], s2[6], s3[6];
+ load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5]);
+ load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5]);
+ load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5]);
+ load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5]);
+
+ uint16x8_t d0 = highbd_convolve6_8_x(s0, x_filter, offset);
+ uint16x8_t d1 = highbd_convolve6_8_x(s1, x_filter, offset);
+ uint16x8_t d2 = highbd_convolve6_8_x(s2, x_filter, offset);
+ uint16x8_t d3 = highbd_convolve6_8_x(s3, x_filter, offset);
+
+ d0 = vminq_u16(d0, max);
+ d1 = vminq_u16(d1, max);
+ d2 = vminq_u16(d2, max);
+ d3 = vminq_u16(d3, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
- d0 = highbd_convolve8_horiz4_s32_s16(s0, s1, x_filter, shift_s32,
- zero_s32);
- d1 = highbd_convolve8_horiz4_s32_s16(s2, s3, x_filter, shift_s32,
- zero_s32);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+}
- d01 = vcombine_u16(d0, d1);
- d01 = vqrshlq_u16(d01, bits_s16);
- d01 = vminq_u16(d01, max);
+static INLINE uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4],
+ const int16x4_t x_filter,
+ const int32x4_t offset) {
+ int32x4_t sum = offset;
+ sum = vmlal_lane_s16(sum, s[0], x_filter, 0);
+ sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
+ sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
+ sum = vmlal_lane_s16(sum, s[3], x_filter, 3);
- if (w == 2) {
- store_u16q_2x1(d + 0 * dst_stride, d01, 0);
- store_u16q_2x1(d + 1 * dst_stride, d01, 2);
- } else {
- vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
- vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
- }
+ return vqrshrun_n_s32(sum, FILTER_BITS);
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_x(const int16x8_t s[8],
+ const int16x8_t x_filter,
+ const int32x4_t offset) {
+ const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
+ const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
+
+ int32x4_t sum0 = offset;
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3);
+
+ int32x4_t sum1 = offset;
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+ vqrshrun_n_s32(sum1, FILTER_BITS));
+}
+
+static INLINE void highbd_convolve_x_sr_neon(const uint16_t *src_ptr,
+ int src_stride, uint16_t *dst_ptr,
+ int dst_stride, int w, int h,
+ const int16_t *x_filter_ptr,
+ ConvolveParams *conv_params,
+ int bd) {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+  // This shim allows us to do only one rounding shift instead of two.
+ const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1));
+
+ if (w == 4) {
+ // 4-tap filters are used for blocks having width == 4.
+ const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
+ const int16_t *s = (const int16_t *)(src_ptr + 2);
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x4_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset);
+ uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset);
+ uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset);
+ uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset);
- s += 2 * src_stride;
- d += 2 * dst_stride;
- h -= 2;
- } while (h > 0);
+ d0 = vmin_u16(d0, vget_low_u16(max));
+ d1 = vmin_u16(d1, vget_low_u16(max));
+ d2 = vmin_u16(d2, vget_low_u16(max));
+ d3 = vmin_u16(d3, vget_low_u16(max));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
} else {
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
int height = h;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint16x8_t d0, d1, d2, d3;
+
do {
int width = w;
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
- load_s16_8x4(s, src_stride, &s0, &s2, &s4, &s6);
- s += 8;
-
do {
- load_s16_8x4(s, src_stride, &s1, &s3, &s5, &s7);
-
- d0 = highbd_convolve8_horiz8_s32_s16(s0, s1, x_filter, shift_s32,
- zero_s32);
- d1 = highbd_convolve8_horiz8_s32_s16(s2, s3, x_filter, shift_s32,
- zero_s32);
- d2 = highbd_convolve8_horiz8_s32_s16(s4, s5, x_filter, shift_s32,
- zero_s32);
- d3 = highbd_convolve8_horiz8_s32_s16(s6, s7, x_filter, shift_s32,
- zero_s32);
-
- d0 = vqrshlq_u16(d0, bits_s16);
- d1 = vqrshlq_u16(d1, bits_s16);
- d2 = vqrshlq_u16(d2, bits_s16);
- d3 = vqrshlq_u16(d3, bits_s16);
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x8_t d0 = highbd_convolve8_8_x(s0, x_filter, offset);
+ uint16x8_t d1 = highbd_convolve8_8_x(s1, x_filter, offset);
+ uint16x8_t d2 = highbd_convolve8_8_x(s2, x_filter, offset);
+ uint16x8_t d3 = highbd_convolve8_8_x(s3, x_filter, offset);
d0 = vminq_u16(d0, max);
d1 = vminq_u16(d1, max);
d2 = vminq_u16(d2, max);
d3 = vminq_u16(d3, max);
- if (h == 2) {
- store_u16_8x2(d, dst_stride, d0, d1);
- } else {
- store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
- }
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
- s0 = s1;
- s2 = s3;
- s4 = s5;
- s6 = s7;
s += 8;
d += 8;
width -= 8;
- } while (width > 0);
+ } while (width != 0);
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
height -= 4;
- } while (height > 0);
+ } while (height != 0);
}
}
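+
+// Scalar model of the width == 4 path above (a sketch for illustration
+// only). It assumes AV1's convention of storing the 4-tap kernel in the
+// middle of the 8-tap array, with taps 0, 1, 6 and 7 zero, which is why
+// both the filter and the source pointer are advanced by 2:
+static int convolve4_px_sketch(const uint16_t *s, const int16_t *filter_0_7,
+                               int round_0) {
+  const int16_t *filter = filter_0_7 + 2;  // central 4 taps
+  int sum = 1 << (round_0 - 1);            // rounding shim, as above
+  for (int k = 0; k < 4; k++) sum += s[2 + k] * filter[k];
+  return ROUND_POWER_OF_TWO(sum, FILTER_BITS);
+}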
+static INLINE uint16x4_t highbd_convolve12_4_x(const int16x4_t s[12],
+ const int16x8_t x_filter_0_7,
+ const int16x4_t x_filter_8_11,
+ const int32x4_t offset) {
+ const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
+ const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
+
+ int32x4_t sum = offset;
+ sum = vmlal_lane_s16(sum, s[0], x_filter_0_3, 0);
+ sum = vmlal_lane_s16(sum, s[1], x_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s[2], x_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s[3], x_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s[4], x_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s[5], x_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s[6], x_filter_4_7, 2);
+ sum = vmlal_lane_s16(sum, s[7], x_filter_4_7, 3);
+ sum = vmlal_lane_s16(sum, s[8], x_filter_8_11, 0);
+ sum = vmlal_lane_s16(sum, s[9], x_filter_8_11, 1);
+ sum = vmlal_lane_s16(sum, s[10], x_filter_8_11, 2);
+ sum = vmlal_lane_s16(sum, s[11], x_filter_8_11, 3);
+
+ return vqrshrun_n_s32(sum, FILTER_BITS);
+}
+
+static INLINE uint16x8_t highbd_convolve12_8_x(const int16x8_t s[12],
+ const int16x8_t x_filter_0_7,
+ const int16x4_t x_filter_8_11,
+ const int32x4_t offset) {
+ const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
+ const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
+
+ int32x4_t sum0 = offset;
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[8]), x_filter_8_11, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[9]), x_filter_8_11, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[10]), x_filter_8_11, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[11]), x_filter_8_11, 3);
+
+ int32x4_t sum1 = offset;
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[8]), x_filter_8_11, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[9]), x_filter_8_11, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[10]), x_filter_8_11, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[11]), x_filter_8_11, 3);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+ vqrshrun_n_s32(sum1, FILTER_BITS));
+}
+
static INLINE void highbd_convolve_x_sr_12tap_neon(
const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
int bd) {
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
- const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
- const int bits = FILTER_BITS - conv_params->round_0;
- const int16x8_t bits_s16 = vdupq_n_s16(-bits);
+  // This shim allows us to do only one rounding shift instead of two.
+ const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1));
const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
- const int32x4_t zero_s32 = vdupq_n_s32(0);
-
- if (w <= 4) {
- int16x8_t s0, s1, s2, s3;
- uint16x4_t d0, d1;
- uint16x8_t d01;
+ if (w == 4) {
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
do {
- load_s16_8x2(s, src_stride, &s0, &s2);
- load_s16_8x2(s + 8, src_stride, &s1, &s3);
-
- d0 = highbd_convolve12_horiz4_s32_s16(s0, s1, x_filter_0_7, x_filter_8_11,
- shift_s32, zero_s32);
- d1 = highbd_convolve12_horiz4_s32_s16(s2, s3, x_filter_0_7, x_filter_8_11,
- shift_s32, zero_s32);
-
- d01 = vcombine_u16(d0, d1);
- d01 = vqrshlq_u16(d01, bits_s16);
- d01 = vminq_u16(d01, max);
+ int16x4_t s0[12], s1[12], s2[12], s3[12];
+ load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
+ &s0[11]);
+ load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
+ &s1[11]);
+ load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
+ &s2[11]);
+ load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
+ &s3[11]);
+
+ uint16x4_t d0 =
+ highbd_convolve12_4_x(s0, x_filter_0_7, x_filter_8_11, offset);
+ uint16x4_t d1 =
+ highbd_convolve12_4_x(s1, x_filter_0_7, x_filter_8_11, offset);
+ uint16x4_t d2 =
+ highbd_convolve12_4_x(s2, x_filter_0_7, x_filter_8_11, offset);
+ uint16x4_t d3 =
+ highbd_convolve12_4_x(s3, x_filter_0_7, x_filter_8_11, offset);
+
+ d0 = vmin_u16(d0, vget_low_u16(max));
+ d1 = vmin_u16(d1, vget_low_u16(max));
+ d2 = vmin_u16(d2, vget_low_u16(max));
+ d3 = vmin_u16(d3, vget_low_u16(max));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
- if (w == 2) {
- store_u16q_2x1(d + 0 * dst_stride, d01, 0);
- store_u16q_2x1(d + 1 * dst_stride, d01, 2);
- } else {
- vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
- vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
- }
-
- s += 2 * src_stride;
- d += 2 * dst_stride;
- h -= 2;
- } while (h > 0);
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
} else {
int height = h;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11;
- uint16x8_t d0, d1, d2, d3;
+
do {
int width = w;
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
- load_s16_8x4(s, src_stride, &s0, &s3, &s6, &s9);
- s += 8;
-
do {
- load_s16_8x4(s, src_stride, &s1, &s4, &s7, &s10);
- load_s16_8x4(s + 8, src_stride, &s2, &s5, &s8, &s11);
-
- d0 = highbd_convolve12_horiz8_s32_s16(
- s0, s1, s2, x_filter_0_7, x_filter_8_11, shift_s32, zero_s32);
- d1 = highbd_convolve12_horiz8_s32_s16(
- s3, s4, s5, x_filter_0_7, x_filter_8_11, shift_s32, zero_s32);
- d2 = highbd_convolve12_horiz8_s32_s16(
- s6, s7, s8, x_filter_0_7, x_filter_8_11, shift_s32, zero_s32);
- d3 = highbd_convolve12_horiz8_s32_s16(
- s9, s10, s11, x_filter_0_7, x_filter_8_11, shift_s32, zero_s32);
-
- d0 = vqrshlq_u16(d0, bits_s16);
- d1 = vqrshlq_u16(d1, bits_s16);
- d2 = vqrshlq_u16(d2, bits_s16);
- d3 = vqrshlq_u16(d3, bits_s16);
+ int16x8_t s0[12], s1[12], s2[12], s3[12];
+ load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
+ &s0[11]);
+ load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
+ &s1[11]);
+ load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
+ &s2[11]);
+ load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
+ &s3[11]);
+
+ uint16x8_t d0 =
+ highbd_convolve12_8_x(s0, x_filter_0_7, x_filter_8_11, offset);
+ uint16x8_t d1 =
+ highbd_convolve12_8_x(s1, x_filter_0_7, x_filter_8_11, offset);
+ uint16x8_t d2 =
+ highbd_convolve12_8_x(s2, x_filter_0_7, x_filter_8_11, offset);
+ uint16x8_t d3 =
+ highbd_convolve12_8_x(s3, x_filter_0_7, x_filter_8_11, offset);
d0 = vminq_u16(d0, max);
d1 = vminq_u16(d1, max);
d2 = vminq_u16(d2, max);
d3 = vminq_u16(d3, max);
- if (h == 2) {
- store_u16_8x2(d, dst_stride, d0, d1);
- } else {
- store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
- }
-
- s0 = s1;
- s1 = s2;
- s3 = s4;
- s4 = s5;
- s6 = s7;
- s7 = s8;
- s9 = s10;
- s10 = s11;
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
s += 8;
d += 8;
width -= 8;
- } while (width > 0);
+ } while (width != 0);
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
height -= 4;
- } while (height > 0);
+ } while (height != 0);
}
}
@@ -622,6 +873,11 @@ void av1_highbd_convolve_x_sr_neon(const uint16_t *src, int src_stride,
const InterpFilterParams *filter_params_x,
const int subpel_x_qn,
ConvolveParams *conv_params, int bd) {
+ if (w == 2 || h == 2) {
+ av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, subpel_x_qn, conv_params, bd);
+ return;
+ }
const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
const int horiz_offset = filter_params_x->taps / 2 - 1;
const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
@@ -634,70 +890,250 @@ void av1_highbd_convolve_x_sr_neon(const uint16_t *src, int src_stride,
x_filter_ptr, conv_params, bd);
return;
}
+ if (x_filter_taps <= 6 && w != 4) {
+ highbd_convolve_x_sr_6tap_neon(src + 1, src_stride, dst, dst_stride, w, h,
+ x_filter_ptr, conv_params, bd);
+ return;
+ }
+
+ highbd_convolve_x_sr_neon(src, src_stride, dst, dst_stride, w, h,
+ x_filter_ptr, conv_params, bd);
+}
- highbd_convolve_x_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h,
- x_filter_ptr, conv_params, bd);
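+
+// Why the 6-tap path above starts from src + 1: the 6-tap kernel occupies
+// indices 1..6 of the 8-tap array, so advancing the source by one sample
+// keeps tap k aligned with sample k while reusing the 8-tap horiz_offset.
+// Scalar model (a sketch for illustration only):
+static int convolve6_px_x_sketch(const uint16_t *s_plus_1,
+                                 const int16_t *filter_0_7, int round_0) {
+  int sum = 1 << (round_0 - 1);  // rounding shim, as in the NEON paths
+  for (int k = 1; k <= 6; k++) sum += s_plus_1[k - 1] * filter_0_7[k];
+  return ROUND_POWER_OF_TWO(sum, FILTER_BITS);
+}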
+static INLINE uint16x4_t highbd_convolve6_4_2d_v(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x8_t y_filter, const int32x4_t round_shift,
+ const int32x4_t offset) {
+ // Values at indices 0 and 7 of y_filter are zero.
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2);
+
+ sum = vshlq_s32(sum, round_shift);
+ return vqmovun_s32(sum);
}
-static INLINE void highbd_convolve_2d_y_sr_8tap_neon(
+static INLINE uint16x8_t highbd_convolve6_8_2d_v(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t y_filter, const int32x4_t round_shift,
+ const int32x4_t offset) {
+ // Values at indices 0 and 7 of y_filter are zero.
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2);
+
+ int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2);
+
+ sum0 = vshlq_s32(sum0, round_shift);
+ sum1 = vshlq_s32(sum1, round_shift);
+
+ return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
+}
+
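+// Note the shift style in the 2D vertical helpers: vshlq_s32 with a
+// negative count is a truncating arithmetic right shift (no rounding),
+// unlike the vqrshrun used in the single-pass kernels, so the rounding
+// and intermediate-offset correction terms are expected to be folded into
+// `offset` by the caller. Per-lane scalar model (a sketch for
+// illustration only):
+static uint16_t conv_2d_v_px_sketch(int32_t weighted_sum, int32_t offset,
+                                    int round_1) {
+  const int32_t shifted = (weighted_sum + offset) >> round_1;  // truncating
+  if (shifted < 0) return 0;          // vqmovun_s32 saturates low
+  if (shifted > 65535) return 65535;  // and high
+  return (uint16_t)shifted;
+}
+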
+static INLINE void highbd_convolve_2d_sr_vert_6tap_neon(
const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
- int bd, const int offset, const int correction) {
+ int bd, const int offset) {
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
const int32x4_t offset_s32 = vdupq_n_s32(offset);
const int round1_shift = conv_params->round_1;
const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);
- const int32x4_t correction_s32 = vdupq_n_s32(correction);
- if (w <= 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x4_t d0, d1, d2, d3;
- uint16x8_t d01, d23;
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+ int16x4_t s0, s1, s2, s3, s4;
+ load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+ s += 5 * src_stride;
+
+ do {
+ int16x4_t s5, s6, s7, s8;
+ load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x4_t d0 = highbd_convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter,
+ round1_shift_s32, offset_s32);
+ uint16x4_t d1 = highbd_convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter,
+ round1_shift_s32, offset_s32);
+ uint16x4_t d2 = highbd_convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter,
+ round1_shift_s32, offset_s32);
+ uint16x4_t d3 = highbd_convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter,
+ round1_shift_s32, offset_s32);
+
+ d0 = vmin_u16(d0, vget_low_u16(max));
+ d1 = vmin_u16(d1, vget_low_u16(max));
+ d2 = vmin_u16(d2, vget_low_u16(max));
+ d3 = vmin_u16(d3, vget_low_u16(max));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ do {
+ int height = h;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+ int16x8_t s0, s1, s2, s3, s4;
+ load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+ s += 5 * src_stride;
+
+ do {
+ int16x8_t s5, s6, s7, s8;
+ load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x8_t d0 = highbd_convolve6_8_2d_v(
+ s0, s1, s2, s3, s4, s5, y_filter, round1_shift_s32, offset_s32);
+ uint16x8_t d1 = highbd_convolve6_8_2d_v(
+ s1, s2, s3, s4, s5, s6, y_filter, round1_shift_s32, offset_s32);
+ uint16x8_t d2 = highbd_convolve6_8_2d_v(
+ s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, offset_s32);
+ uint16x8_t d3 = highbd_convolve6_8_2d_v(
+ s3, s4, s5, s6, s7, s8, y_filter, round1_shift_s32, offset_s32);
+
+ d0 = vminq_u16(d0, max);
+ d1 = vminq_u16(d1, max);
+ d2 = vminq_u16(d2, max);
+ d3 = vminq_u16(d3, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+static INLINE uint16x4_t highbd_convolve8_4_2d_v(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
+ const int32x4_t round_shift, const int32x4_t offset) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+ int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_lo, 0);
+ sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
+
+ sum = vshlq_s32(sum, round_shift);
+ return vqmovun_s32(sum);
+}
+static INLINE uint16x8_t highbd_convolve8_8_2d_v(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
+ const int32x4_t round_shift, const int32x4_t offset) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+ int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_lo, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3);
+
+ int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_lo, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3);
+
+ sum0 = vshlq_s32(sum0, round_shift);
+ sum1 = vshlq_s32(sum1, round_shift);
+
+ return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
+}
+
+static INLINE void highbd_convolve_2d_sr_vert_8tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
+ int bd, const int offset) {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ const int32x4_t offset_s32 = vdupq_n_s32(offset);
+ const int round1_shift = conv_params->round_1;
+ const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);
+
+ if (w == 4) {
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
s += 7 * src_stride;
do {
+ int16x4_t s7, s8, s9, s10;
load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
- d0 = highbd_convolve8_4_sr_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7,
- y_filter, round1_shift_s32, offset_s32,
- correction_s32);
- d1 = highbd_convolve8_4_sr_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8,
- y_filter, round1_shift_s32, offset_s32,
- correction_s32);
- d2 = highbd_convolve8_4_sr_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9,
- y_filter, round1_shift_s32, offset_s32,
- correction_s32);
- d3 = highbd_convolve8_4_sr_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10,
- y_filter, round1_shift_s32, offset_s32,
- correction_s32);
-
- d01 = vcombine_u16(d0, d1);
- d23 = vcombine_u16(d2, d3);
-
- d01 = vminq_u16(d01, max);
- d23 = vminq_u16(d23, max);
-
- if (w == 2) {
- store_u16q_2x1(d + 0 * dst_stride, d01, 0);
- store_u16q_2x1(d + 1 * dst_stride, d01, 2);
- if (h != 2) {
- store_u16q_2x1(d + 2 * dst_stride, d23, 0);
- store_u16q_2x1(d + 3 * dst_stride, d23, 2);
- }
- } else {
- vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
- vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
- if (h != 2) {
- vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
- vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
- }
- }
+ uint16x4_t d0 =
+ highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round1_shift_s32, offset_s32);
+ uint16x4_t d1 =
+ highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ round1_shift_s32, offset_s32);
+ uint16x4_t d2 =
+ highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ round1_shift_s32, offset_s32);
+ uint16x4_t d3 =
+ highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ round1_shift_s32, offset_s32);
+
+ d0 = vmin_u16(d0, vget_low_u16(max));
+ d1 = vmin_u16(d1, vget_low_u16(max));
+ d2 = vmin_u16(d2, vget_low_u16(max));
+ d3 = vmin_u16(d3, vget_low_u16(max));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@@ -709,44 +1145,40 @@ static INLINE void highbd_convolve_2d_y_sr_8tap_neon(
s += 4 * src_stride;
d += 4 * dst_stride;
h -= 4;
- } while (h > 0);
+ } while (h != 0);
} else {
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x8_t d0, d1, d2, d3;
do {
int height = h;
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
s += 7 * src_stride;
do {
+ int16x8_t s7, s8, s9, s10;
load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
- d0 = highbd_convolve8_8_sr_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7,
- y_filter, round1_shift_s32,
- offset_s32, correction_s32);
- d1 = highbd_convolve8_8_sr_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8,
- y_filter, round1_shift_s32,
- offset_s32, correction_s32);
- d2 = highbd_convolve8_8_sr_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9,
- y_filter, round1_shift_s32,
- offset_s32, correction_s32);
- d3 = highbd_convolve8_8_sr_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10,
- y_filter, round1_shift_s32,
- offset_s32, correction_s32);
+ uint16x8_t d0 =
+ highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round1_shift_s32, offset_s32);
+ uint16x8_t d1 =
+ highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ round1_shift_s32, offset_s32);
+ uint16x8_t d2 =
+ highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ round1_shift_s32, offset_s32);
+ uint16x8_t d3 =
+ highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ round1_shift_s32, offset_s32);
d0 = vminq_u16(d0, max);
d1 = vminq_u16(d1, max);
d2 = vminq_u16(d2, max);
d3 = vminq_u16(d3, max);
- if (h == 2) {
- store_u16_8x2(d, dst_stride, d0, d1);
- } else {
- store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
- }
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@@ -758,75 +1190,126 @@ static INLINE void highbd_convolve_2d_y_sr_8tap_neon(
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
- } while (height > 0);
+ } while (height != 0);
src_ptr += 8;
dst_ptr += 8;
w -= 8;
- } while (w > 0);
+ } while (w != 0);
}
}

-static INLINE void highbd_convolve_2d_y_sr_12tap_neon(
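+// 12-tap vertical kernels: the same pattern as the 8-tap kernels above,
+// accumulating twelve products per output with the filter split into
+// 8 + 4 coefficient vectors.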
+static INLINE uint16x4_t highbd_convolve12_4_2d_v(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
+ const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
+ const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
+ const int32x4_t round_shift, const int32x4_t offset) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
+
+ int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 0);
+ sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
+ sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
+ sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
+ sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
+ sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);
+
+ sum = vshlq_s32(sum, round_shift);
+ return vqmovun_s32(sum);
+}
+
+static INLINE uint16x8_t highbd_convolve12_8_2d_v(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
+ const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
+ const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
+ const int32x4_t round_shift, const int32x4_t offset) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
+
+ int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);
+
+ int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);
+
+ sum0 = vshlq_s32(sum0, round_shift);
+ sum1 = vshlq_s32(sum1, round_shift);
+
+ return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
+}
+
+static INLINE void highbd_convolve_2d_sr_vert_12tap_neon(
const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
- const int bd, const int offset, const int correction) {
+ const int bd, const int offset) {
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
const int32x4_t offset_s32 = vdupq_n_s32(offset);
const int round1_shift = conv_params->round_1;
const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);
- const int32x4_t correction_s32 = vdupq_n_s32(correction);
-
- if (w <= 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
- uint16x4_t d0, d1, d2, d3;
- uint16x8_t d01, d23;
+ if (w == 4) {
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
&s9, &s10);
s += 11 * src_stride;
do {
+ int16x4_t s11, s12, s13, s14;
load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14);
- d0 = highbd_convolve12_y_4_sr_s32_s16(
+ uint16x4_t d0 = highbd_convolve12_4_2d_v(
s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7,
- y_filter_8_11, round1_shift_s32, offset_s32, correction_s32);
- d1 = highbd_convolve12_y_4_sr_s32_s16(
+ y_filter_8_11, round1_shift_s32, offset_s32);
+ uint16x4_t d1 = highbd_convolve12_4_2d_v(
s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7,
- y_filter_8_11, round1_shift_s32, offset_s32, correction_s32);
- d2 = highbd_convolve12_y_4_sr_s32_s16(
+ y_filter_8_11, round1_shift_s32, offset_s32);
+ uint16x4_t d2 = highbd_convolve12_4_2d_v(
s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7,
- y_filter_8_11, round1_shift_s32, offset_s32, correction_s32);
- d3 = highbd_convolve12_y_4_sr_s32_s16(
+ y_filter_8_11, round1_shift_s32, offset_s32);
+ uint16x4_t d3 = highbd_convolve12_4_2d_v(
s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7,
- y_filter_8_11, round1_shift_s32, offset_s32, correction_s32);
+ y_filter_8_11, round1_shift_s32, offset_s32);
- d01 = vcombine_u16(d0, d1);
- d23 = vcombine_u16(d2, d3);
+ d0 = vmin_u16(d0, vget_low_u16(max));
+ d1 = vmin_u16(d1, vget_low_u16(max));
+ d2 = vmin_u16(d2, vget_low_u16(max));
+ d3 = vmin_u16(d3, vget_low_u16(max));
- d01 = vminq_u16(d01, max);
- d23 = vminq_u16(d23, max);
-
- if (w == 2) {
- store_u16q_2x1(d + 0 * dst_stride, d01, 0);
- store_u16q_2x1(d + 1 * dst_stride, d01, 2);
- if (h != 2) {
- store_u16q_2x1(d + 2 * dst_stride, d23, 0);
- store_u16q_2x1(d + 3 * dst_stride, d23, 2);
- }
- } else {
- vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
- vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
- if (h != 2) {
- vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
- vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
- }
- }
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@@ -842,46 +1325,41 @@ static INLINE void highbd_convolve_2d_y_sr_12tap_neon(
s += 4 * src_stride;
d += 4 * dst_stride;
h -= 4;
- } while (h > 0);
+ } while (h != 0);
} else {
- uint16x8_t d0, d1, d2, d3;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
-
do {
int height = h;
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
&s9, &s10);
s += 11 * src_stride;
do {
+ int16x8_t s11, s12, s13, s14;
load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);
- d0 = highbd_convolve12_y_8_sr_s32_s16(
+ uint16x8_t d0 = highbd_convolve12_8_2d_v(
s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7,
- y_filter_8_11, round1_shift_s32, offset_s32, correction_s32);
- d1 = highbd_convolve12_y_8_sr_s32_s16(
+ y_filter_8_11, round1_shift_s32, offset_s32);
+ uint16x8_t d1 = highbd_convolve12_8_2d_v(
s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7,
- y_filter_8_11, round1_shift_s32, offset_s32, correction_s32);
- d2 = highbd_convolve12_y_8_sr_s32_s16(
+ y_filter_8_11, round1_shift_s32, offset_s32);
+ uint16x8_t d2 = highbd_convolve12_8_2d_v(
s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7,
- y_filter_8_11, round1_shift_s32, offset_s32, correction_s32);
- d3 = highbd_convolve12_y_8_sr_s32_s16(
+ y_filter_8_11, round1_shift_s32, offset_s32);
+ uint16x8_t d3 = highbd_convolve12_8_2d_v(
s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7,
- y_filter_8_11, round1_shift_s32, offset_s32, correction_s32);
+ y_filter_8_11, round1_shift_s32, offset_s32);
d0 = vminq_u16(d0, max);
d1 = vminq_u16(d1, max);
d2 = vminq_u16(d2, max);
d3 = vminq_u16(d3, max);
- if (h == 2) {
- store_u16_8x2(d, dst_stride, d0, d1);
- } else {
- store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
- }
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@@ -897,1485 +1375,746 @@ static INLINE void highbd_convolve_2d_y_sr_12tap_neon(
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
- } while (height > 0);
+ } while (height != 0);
src_ptr += 8;
dst_ptr += 8;
w -= 8;
- } while (w > 0);
+ } while (w != 0);
}
}

-static INLINE void highbd_convolve_x_8tap_neon(
+static INLINE uint16x8_t highbd_convolve6_8_2d_h(const int16x8_t s[6],
+ const int16x8_t x_filter,
+ const int32x4_t shift_s32,
+ const int32x4_t offset) {
+ // Values at indices 0 and 7 of x_filter are zero.
+ const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
+ const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
+
+ int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 2);
+
+ int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 2);
+
+ sum0 = vqrshlq_s32(sum0, shift_s32);
+ sum1 = vqrshlq_s32(sum1, shift_s32);
+
+ return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
+}
+
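+// Horizontal pass for 6-tap (and shorter) filters when w != 4: processes
+// 8 columns at a time, four rows per iteration, then single rows for the
+// tail, since the intermediate height h + filter_taps - 1 is generally
+// not a multiple of four.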
+static INLINE void highbd_convolve_2d_sr_horiz_6tap_neon(
const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
const int offset) {
- const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+ // The smallest block height processed by the SIMD functions is 4, and the
+ // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines
+ // for the vertical convolution.
+ assert(h >= 5);
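+ // e.g. the 2D driver passes im_h = h + clamped_y_taps - 1 as the height
+ // here, which is at least 4 + 6 - 1 = 9, so the 4-row main loop below
+ // always leaves at least one row for the single-row tail loop.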
const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
const int32x4_t offset_s32 = vdupq_n_s32(offset);
- if (w <= 4) {
- int16x8_t s0, s1, s2, s3;
- uint16x4_t d0, d1;
- uint16x8_t d01;
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+ int height = h;
+ do {
+ int width = w;
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
do {
- load_s16_8x2(s, src_stride, &s0, &s2);
- load_s16_8x2(s + 8, src_stride, &s1, &s3);
-
- d0 = highbd_convolve8_horiz4_s32_s16(s0, s1, x_filter, shift_s32,
- offset_s32);
- d1 = highbd_convolve8_horiz4_s32_s16(s2, s3, x_filter, shift_s32,
- offset_s32);
+ int16x8_t s0[6], s1[6], s2[6], s3[6];
+ load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5]);
+ load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5]);
+ load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5]);
+ load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5]);
+
+ uint16x8_t d0 =
+ highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32);
+ uint16x8_t d1 =
+ highbd_convolve6_8_2d_h(s1, x_filter, shift_s32, offset_s32);
+ uint16x8_t d2 =
+ highbd_convolve6_8_2d_h(s2, x_filter, shift_s32, offset_s32);
+ uint16x8_t d3 =
+ highbd_convolve6_8_2d_h(s3, x_filter, shift_s32, offset_s32);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
- d01 = vcombine_u16(d0, d1);
-
- if (w == 2) {
- store_u16q_2x1(d + 0 * dst_stride, d01, 0);
- store_u16q_2x1(d + 1 * dst_stride, d01, 2);
- } else {
- vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
- vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
- }
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
- s += 2 * src_stride;
- d += 2 * dst_stride;
- h -= 2;
- } while (h > 0);
- } else {
- int height = h;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint16x8_t d0, d1, d2, d3;
do {
- int width = w;
- const int16_t *s = (const int16_t *)src_ptr;
- uint16_t *d = dst_ptr;
+ int16x8_t s0[6];
+ load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);
+
+ uint16x8_t d0 =
+ highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32);
+ vst1q_u16(d, d0);
- load_s16_8x4(s, src_stride, &s0, &s2, &s4, &s6);
s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+}
- do {
- load_s16_8x4(s, src_stride, &s1, &s3, &s5, &s7);
-
- d0 = highbd_convolve8_horiz8_s32_s16(s0, s1, x_filter, shift_s32,
- offset_s32);
- d1 = highbd_convolve8_horiz8_s32_s16(s2, s3, x_filter, shift_s32,
- offset_s32);
- d2 = highbd_convolve8_horiz8_s32_s16(s4, s5, x_filter, shift_s32,
- offset_s32);
- d3 = highbd_convolve8_horiz8_s32_s16(s6, s7, x_filter, shift_s32,
- offset_s32);
-
- if (h == 2) {
- store_u16_8x2(d, dst_stride, d0, d1);
- } else {
- store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
- }
-
- s0 = s1;
- s2 = s3;
- s4 = s5;
- s6 = s7;
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- height -= 4;
- } while (height > 0);
- }
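+
+// 4-tap horizontal kernel for width-4 blocks (AV1 uses 4-tap interpolation
+// filters for such blocks); note the rounding is done by the saturating
+// rounding shift vqrshlq_s32 rather than being folded into the offset.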
+static INLINE uint16x4_t highbd_convolve4_4_2d_h(const int16x4_t s[4],
+ const int16x4_t x_filter,
+ const int32x4_t shift_s32,
+ const int32x4_t offset) {
+ int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0);
+ sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
+ sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
+ sum = vmlal_lane_s16(sum, s[3], x_filter, 3);
+
+ sum = vqrshlq_s32(sum, shift_s32);
+ return vqmovun_s32(sum);
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_2d_h(const int16x8_t s[8],
+ const int16x8_t x_filter,
+ const int32x4_t shift_s32,
+ const int32x4_t offset) {
+ const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
+ const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
+
+ int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3);
+
+ int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3);
+
+ sum0 = vqrshlq_s32(sum0, shift_s32);
+ sum1 = vqrshlq_s32(sum1, shift_s32);
+
+ return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
}

-static INLINE void highbd_convolve_2d_x_sr_12tap_neon(
+static INLINE void highbd_convolve_2d_sr_horiz_neon(
const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
const int offset) {
+ // The smallest block height processed by the SIMD functions is 4, and the
+ // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines
+ // for the vertical convolution.
+ assert(h >= 5);
const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
- const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
- const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
const int32x4_t offset_s32 = vdupq_n_s32(offset);
- if (w <= 4) {
- int16x8_t s0, s1, s2, s3;
- uint16x4_t d0, d1;
- uint16x8_t d01;
-
- const int16_t *s = (const int16_t *)src_ptr;
+ if (w == 4) {
+ // 4-tap filters are used for blocks having width <= 4.
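+ // Their non-zero coefficients sit at indices 2..5 of the 8-entry kernel,
+ // hence x_filter_ptr + 2; src_ptr was centred for the clamped 6-tap
+ // case, so step one sample forward to re-centre for 4 taps.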
+ const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
+ const int16_t *s = (const int16_t *)(src_ptr + 1);
uint16_t *d = dst_ptr;
do {
- load_s16_8x2(s, src_stride, &s0, &s2);
- load_s16_8x2(s + 8, src_stride, &s1, &s3);
+ int16x4_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x4_t d0 =
+ highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32);
+ uint16x4_t d1 =
+ highbd_convolve4_4_2d_h(s1, x_filter, shift_s32, offset_s32);
+ uint16x4_t d2 =
+ highbd_convolve4_4_2d_h(s2, x_filter, shift_s32, offset_s32);
+ uint16x4_t d3 =
+ highbd_convolve4_4_2d_h(s3, x_filter, shift_s32, offset_s32);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
- d0 = highbd_convolve12_horiz4_s32_s16(s0, s1, x_filter_0_7, x_filter_8_11,
- shift_s32, offset_s32);
- d1 = highbd_convolve12_horiz4_s32_s16(s2, s3, x_filter_0_7, x_filter_8_11,
- shift_s32, offset_s32);
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h > 4);
- d01 = vcombine_u16(d0, d1);
+ do {
+ int16x4_t s0[4];
+ load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
- if (w == 2) {
- store_u16q_2x1(d + 0 * dst_stride, d01, 0);
- store_u16q_2x1(d + 1 * dst_stride, d01, 2);
- } else {
- vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
- vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
- }
+ uint16x4_t d0 =
+ highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32);
+
+ vst1_u16(d, d0);
- s += 2 * src_stride;
- d += 2 * dst_stride;
- h -= 2;
- } while (h > 0);
+ s += src_stride;
+ d += dst_stride;
+ } while (--h != 0);
} else {
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
int height = h;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11;
- uint16x8_t d0, d1, d2, d3;
+
do {
int width = w;
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
- load_s16_8x4(s, src_stride, &s0, &s3, &s6, &s9);
- s += 8;
-
do {
- load_s16_8x4(s, src_stride, &s1, &s4, &s7, &s10);
- load_s16_8x4(s + 8, src_stride, &s2, &s5, &s8, &s11);
-
- d0 = highbd_convolve12_horiz8_s32_s16(
- s0, s1, s2, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
- d1 = highbd_convolve12_horiz8_s32_s16(
- s3, s4, s5, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
- d2 = highbd_convolve12_horiz8_s32_s16(
- s6, s7, s8, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
- d3 = highbd_convolve12_horiz8_s32_s16(
- s9, s10, s11, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
-
- if (h == 2) {
- store_u16_8x2(d, dst_stride, d0, d1);
- } else {
- store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
- }
-
- s0 = s1;
- s1 = s2;
- s3 = s4;
- s4 = s5;
- s6 = s7;
- s7 = s8;
- s9 = s10;
- s10 = s11;
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x8_t d0 =
+ highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32);
+ uint16x8_t d1 =
+ highbd_convolve8_8_2d_h(s1, x_filter, shift_s32, offset_s32);
+ uint16x8_t d2 =
+ highbd_convolve8_8_2d_h(s2, x_filter, shift_s32, offset_s32);
+ uint16x8_t d3 =
+ highbd_convolve8_8_2d_h(s3, x_filter, shift_s32, offset_s32);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
s += 8;
d += 8;
width -= 8;
- } while (width > 0);
+ } while (width != 0);
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
height -= 4;
- } while (height > 0);
- }
-}
-
-void av1_highbd_convolve_2d_sr_neon(const uint16_t *src, int src_stride,
- uint16_t *dst, int dst_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_qn,
- const int subpel_y_qn,
- ConvolveParams *conv_params, int bd) {
- DECLARE_ALIGNED(16, uint16_t,
- im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
- const int im_h = h + filter_params_y->taps - 1;
- const int im_stride = MAX_SB_SIZE;
- const int vert_offset = filter_params_y->taps / 2 - 1;
- const int horiz_offset = filter_params_x->taps / 2 - 1;
- const int x_offset_initial = (1 << (bd + FILTER_BITS - 1));
- const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const int y_offset_initial = (1 << y_offset_bits);
- const int y_offset_correction =
- ((1 << (y_offset_bits - conv_params->round_1)) +
- (1 << (y_offset_bits - conv_params->round_1 - 1)));
-
- const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
-
- const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
- const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_qn & SUBPEL_MASK);
-
- if (filter_params_x->taps > 8) {
- highbd_convolve_2d_x_sr_12tap_neon(src_ptr, src_stride, im_block, im_stride,
- w, im_h, x_filter_ptr, conv_params,
- x_offset_initial);
-
- highbd_convolve_2d_y_sr_12tap_neon(im_block, im_stride, dst, dst_stride, w,
- h, y_filter_ptr, conv_params, bd,
- y_offset_initial, y_offset_correction);
- } else {
- highbd_convolve_x_8tap_neon(src_ptr, src_stride, im_block, im_stride, w,
- im_h, x_filter_ptr, conv_params,
- x_offset_initial);
-
- highbd_convolve_2d_y_sr_8tap_neon(im_block, im_stride, dst, dst_stride, w,
- h, y_filter_ptr, conv_params, bd,
- y_offset_initial, y_offset_correction);
- }
-}
-
-static INLINE void highbd_convolve_2d_x_scale_8tap_neon(
- const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
- int w, int h, const int subpel_x_qn, const int x_step_qn,
- const InterpFilterParams *filter_params, ConvolveParams *conv_params,
- const int offset) {
- const uint32x4_t idx = { 0, 1, 2, 3 };
- const uint32x4_t subpel_mask = vdupq_n_u32(SCALE_SUBPEL_MASK);
- const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
- const int32x4_t offset_s32 = vdupq_n_s32(offset);
-
- if (w <= 4) {
- int height = h;
- int16x8_t s0, s1, s2, s3;
- uint16x4_t d0;
-
- uint16_t *d = dst_ptr;
-
- do {
- int x_qn = subpel_x_qn;
-
- // Load 4 src vectors at a time, they might be the same, but we have to
- // calculate the indices anyway. Doing it in SIMD and then storing the
- // indices is faster than having to calculate the expression
- // &src_ptr[((x_qn + 0*x_step_qn) >> SCALE_SUBPEL_BITS)] 4 times
- // Ideally this should be a gather using the indices, but NEON does not
- // have that, so have to emulate
- const uint32x4_t xqn_idx = vmlaq_n_u32(vdupq_n_u32(x_qn), idx, x_step_qn);
- // We have to multiply x2 to get the actual pointer as sizeof(uint16_t) =
- // 2
- const uint32x4_t src_idx_u32 =
- vshlq_n_u32(vshrq_n_u32(xqn_idx, SCALE_SUBPEL_BITS), 1);
-#if AOM_ARCH_AARCH64
- uint64x2_t src4[2];
- src4[0] = vaddw_u32(vdupq_n_u64((const uint64_t)src_ptr),
- vget_low_u32(src_idx_u32));
- src4[1] = vaddw_u32(vdupq_n_u64((const uint64_t)src_ptr),
- vget_high_u32(src_idx_u32));
- int16_t *src4_ptr[4];
- uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
- vst1q_u64(tmp_ptr, src4[0]);
- vst1q_u64(tmp_ptr + 2, src4[1]);
-#else
- uint32x4_t src4;
- src4 = vaddq_u32(vdupq_n_u32((const uint32_t)src_ptr), src_idx_u32);
- int16_t *src4_ptr[4];
- uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
- vst1q_u32(tmp_ptr, src4);
-#endif // AOM_ARCH_AARCH64
- // Same for the filter vectors
- const int32x4_t filter_idx_s32 = vreinterpretq_s32_u32(
- vshrq_n_u32(vandq_u32(xqn_idx, subpel_mask), SCALE_EXTRA_BITS));
- int32_t x_filter4_idx[4];
- vst1q_s32(x_filter4_idx, filter_idx_s32);
- const int16_t *x_filter4_ptr[4];
-
- // Load source
- s0 = vld1q_s16(src4_ptr[0]);
- s1 = vld1q_s16(src4_ptr[1]);
- s2 = vld1q_s16(src4_ptr[2]);
- s3 = vld1q_s16(src4_ptr[3]);
-
- // We could easily do this using SIMD as well instead of calling the
- // inline function 4 times.
- x_filter4_ptr[0] =
- av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[0]);
- x_filter4_ptr[1] =
- av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[1]);
- x_filter4_ptr[2] =
- av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[2]);
- x_filter4_ptr[3] =
- av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[3]);
-
- // Actually load the filters
- const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
- const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
- const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
- const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);
-
- // Group low and high parts and transpose
- int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
- vget_low_s16(x_filter1),
- vget_low_s16(x_filter2),
- vget_low_s16(x_filter3) };
- int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
- vget_high_s16(x_filter1),
- vget_high_s16(x_filter2),
- vget_high_s16(x_filter3) };
- transpose_u16_4x4((uint16x4_t *)filters_lo);
- transpose_u16_4x4((uint16x4_t *)filters_hi);
-
- // Run the 2D Scale convolution
- d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
- s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
-
- if (w == 2) {
- store_u16_2x1(d + 0 * dst_stride, d0, 0);
- } else {
- vst1_u16(d + 0 * dst_stride, d0);
- }
-
- src_ptr += src_stride;
- d += dst_stride;
- height--;
- } while (height > 0);
- } else {
- int height = h;
- int16x8_t s0, s1, s2, s3;
- uint16x4_t d0;
+ } while (height > 4);
do {
int width = w;
- int x_qn = subpel_x_qn;
+ const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
- const uint16_t *s = src_ptr;
do {
- // Load 4 src vectors at a time, they might be the same, but we have to
- // calculate the indices anyway. Doing it in SIMD and then storing the
- // indices is faster than having to calculate the expression
- // &src_ptr[((x_qn + 0*x_step_qn) >> SCALE_SUBPEL_BITS)] 4 times
- // Ideally this should be a gather using the indices, but NEON does not
- // have that, so have to emulate
- const uint32x4_t xqn_idx =
- vmlaq_n_u32(vdupq_n_u32(x_qn), idx, x_step_qn);
- // We have to multiply x2 to get the actual pointer as sizeof(uint16_t)
- // = 2
- const uint32x4_t src_idx_u32 =
- vshlq_n_u32(vshrq_n_u32(xqn_idx, SCALE_SUBPEL_BITS), 1);
-#if AOM_ARCH_AARCH64
- uint64x2_t src4[2];
- src4[0] = vaddw_u32(vdupq_n_u64((const uint64_t)s),
- vget_low_u32(src_idx_u32));
- src4[1] = vaddw_u32(vdupq_n_u64((const uint64_t)s),
- vget_high_u32(src_idx_u32));
- int16_t *src4_ptr[4];
- uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
- vst1q_u64(tmp_ptr, src4[0]);
- vst1q_u64(tmp_ptr + 2, src4[1]);
-#else
- uint32x4_t src4;
- src4 = vaddq_u32(vdupq_n_u32((const uint32_t)s), src_idx_u32);
- int16_t *src4_ptr[4];
- uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
- vst1q_u32(tmp_ptr, src4);
-#endif // AOM_ARCH_AARCH64
- // Same for the filter vectors
- const int32x4_t filter_idx_s32 = vreinterpretq_s32_u32(
- vshrq_n_u32(vandq_u32(xqn_idx, subpel_mask), SCALE_EXTRA_BITS));
- int32_t x_filter4_idx[4];
- vst1q_s32(x_filter4_idx, filter_idx_s32);
- const int16_t *x_filter4_ptr[4];
-
- // Load source
- s0 = vld1q_s16(src4_ptr[0]);
- s1 = vld1q_s16(src4_ptr[1]);
- s2 = vld1q_s16(src4_ptr[2]);
- s3 = vld1q_s16(src4_ptr[3]);
-
- // We could easily do this using SIMD as well instead of calling the
- // inline function 4 times.
- x_filter4_ptr[0] = av1_get_interp_filter_subpel_kernel(
- filter_params, x_filter4_idx[0]);
- x_filter4_ptr[1] = av1_get_interp_filter_subpel_kernel(
- filter_params, x_filter4_idx[1]);
- x_filter4_ptr[2] = av1_get_interp_filter_subpel_kernel(
- filter_params, x_filter4_idx[2]);
- x_filter4_ptr[3] = av1_get_interp_filter_subpel_kernel(
- filter_params, x_filter4_idx[3]);
-
- // Actually load the filters
- const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
- const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
- const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
- const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);
-
- // Group low and high parts and transpose
- int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
- vget_low_s16(x_filter1),
- vget_low_s16(x_filter2),
- vget_low_s16(x_filter3) };
- int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
- vget_high_s16(x_filter1),
- vget_high_s16(x_filter2),
- vget_high_s16(x_filter3) };
- transpose_u16_4x4((uint16x4_t *)filters_lo);
- transpose_u16_4x4((uint16x4_t *)filters_hi);
-
- // Run the 2D Scale X convolution
- d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
- s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
-
- vst1_u16(d, d0);
-
- x_qn += 4 * x_step_qn;
- d += 4;
- width -= 4;
- } while (width > 0);
-
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- height--;
- } while (height > 0);
- }
-}
-
-static INLINE void highbd_convolve_2d_y_scale_8tap_neon(
- const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
- int w, int h, const int subpel_y_qn, const int y_step_qn,
- const InterpFilterParams *filter_params, const int round1_bits,
- const int offset) {
- const int32x4_t offset_s32 = vdupq_n_s32(1 << offset);
-
- const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_bits);
- if (w <= 4) {
- int height = h;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint16x4_t d0;
-
- uint16_t *d = dst_ptr;
-
- int y_qn = subpel_y_qn;
- do {
- const int16_t *s =
- (const int16_t *)&src_ptr[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
+ int16x8_t s0[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
- load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
- const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
- const int16_t *y_filter_ptr =
- av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx);
- const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
-
- d0 = highbd_convolve8_4_sr_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7,
- y_filter, round1_shift_s32, offset_s32,
- vdupq_n_s32(0));
-
- if (w == 2) {
- store_u16_2x1(d, d0, 0);
- } else {
- vst1_u16(d, d0);
- }
-
- y_qn += y_step_qn;
- d += dst_stride;
- height--;
- } while (height > 0);
- } else {
- int width = w;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint16x8_t d0;
-
- do {
- int height = h;
- int y_qn = subpel_y_qn;
-
- uint16_t *d = dst_ptr;
-
- do {
- const int16_t *s =
- (const int16_t *)&src_ptr[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
- load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
- const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
- const int16_t *y_filter_ptr =
- av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx);
- const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
-
- d0 = highbd_convolve8_8_sr_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7,
- y_filter, round1_shift_s32,
- offset_s32, vdupq_n_s32(0));
+ uint16x8_t d0 =
+ highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32);
vst1q_u16(d, d0);
- y_qn += y_step_qn;
- d += dst_stride;
- height--;
- } while (height > 0);
- src_ptr += 8;
- dst_ptr += 8;
- width -= 8;
- } while (width > 0);
- }
-}
-
-static INLINE void highbd_dist_wtd_comp_avg_neon(
- const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
- int w, int h, ConvolveParams *conv_params, const int round_bits,
- const int offset, const int bd) {
- CONV_BUF_TYPE *dst16 = conv_params->dst;
- const int dst16_stride = conv_params->dst_stride;
- const int32x4_t round_shift_s32 = vdupq_n_s32(-round_bits);
- const int16x4_t offset_s16 = vdup_n_s16(offset);
- const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
- uint16x4_t fwd_offset_u16 = vdup_n_u16(conv_params->fwd_offset);
- uint16x4_t bck_offset_u16 = vdup_n_u16(conv_params->bck_offset);
-
- // Weighted averaging
- if (w <= 4) {
- for (int y = 0; y < h; ++y) {
- const uint16x4_t s = vld1_u16(src_ptr + y * src_stride);
- const uint16x4_t d16 = vld1_u16(dst16 + y * dst16_stride);
- // We use vmull_u16/vmlal_u16 instead of of vmull_s16/vmlal_s16
- // because the latter sign-extend and the values are non-negative.
- // However, d0/d1 are signed-integers and we use vqmovun
- // to do saturated narrowing to unsigned.
- int32x4_t d0 = vreinterpretq_s32_u32(vmull_u16(d16, fwd_offset_u16));
- d0 = vreinterpretq_s32_u32(
- vmlal_u16(vreinterpretq_u32_s32(d0), s, bck_offset_u16));
- d0 = vshrq_n_s32(d0, DIST_PRECISION_BITS);
- // Subtract round offset and convolve round
- d0 = vqrshlq_s32(vsubw_s16(d0, offset_s16), round_shift_s32);
- uint16x4_t d = vqmovun_s32(d0);
- d = vmin_u16(d, vget_low_u16(max));
- if (w == 2) {
- store_u16_2x1(dst_ptr + y * dst_stride, d, 0);
- } else {
- vst1_u16(dst_ptr + y * dst_stride, d);
- }
- }
- } else {
- for (int y = 0; y < h; ++y) {
- for (int x = 0; x < w; x += 8) {
- const uint16x8_t s = vld1q_u16(src_ptr + y * src_stride + x);
- const uint16x8_t d16 = vld1q_u16(dst16 + y * dst16_stride + x);
- // We use vmull_u16/vmlal_u16 instead of of vmull_s16/vmlal_s16
- // because the latter sign-extend and the values are non-negative.
- // However, d0/d1 are signed-integers and we use vqmovun
- // to do saturated narrowing to unsigned.
- int32x4_t d0 =
- vreinterpretq_s32_u32(vmull_u16(vget_low_u16(d16), fwd_offset_u16));
- int32x4_t d1 = vreinterpretq_s32_u32(
- vmull_u16(vget_high_u16(d16), fwd_offset_u16));
- d0 = vreinterpretq_s32_u32(vmlal_u16(vreinterpretq_u32_s32(d0),
- vget_low_u16(s), bck_offset_u16));
- d1 = vreinterpretq_s32_u32(vmlal_u16(vreinterpretq_u32_s32(d1),
- vget_high_u16(s), bck_offset_u16));
- d0 = vshrq_n_s32(d0, DIST_PRECISION_BITS);
- d1 = vshrq_n_s32(d1, DIST_PRECISION_BITS);
- d0 = vqrshlq_s32(vsubw_s16(d0, offset_s16), round_shift_s32);
- d1 = vqrshlq_s32(vsubw_s16(d1, offset_s16), round_shift_s32);
- uint16x8_t d01 = vcombine_u16(vqmovun_s32(d0), vqmovun_s32(d1));
- d01 = vminq_u16(d01, max);
- vst1q_u16(dst_ptr + y * dst_stride + x, d01);
- }
- }
- }
-}
-
-static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride,
- uint16_t *dst_ptr, int dst_stride,
- int w, int h,
- ConvolveParams *conv_params,
- const int round_bits, const int offset,
- const int bd) {
- CONV_BUF_TYPE *dst16 = conv_params->dst;
- const int dst16_stride = conv_params->dst_stride;
- const int32x4_t round_shift_s32 = vdupq_n_s32(-round_bits);
- const int16x4_t offset_s16 = vdup_n_s16(offset);
- const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
-
- if (w <= 4) {
- for (int y = 0; y < h; ++y) {
- const uint16x4_t s = vld1_u16(src_ptr + y * src_stride);
- const uint16x4_t d16 = vld1_u16(dst16 + y * dst16_stride);
- int32x4_t s_s32 = vreinterpretq_s32_u32(vmovl_u16(s));
- int32x4_t d16_s32 = vreinterpretq_s32_u32(vmovl_u16(d16));
- int32x4_t d0 = vhaddq_s32(s_s32, d16_s32);
- d0 = vsubw_s16(d0, offset_s16);
- d0 = vqrshlq_s32(d0, round_shift_s32);
- uint16x4_t d = vqmovun_s32(d0);
- d = vmin_u16(d, vget_low_u16(max));
- if (w == 2) {
- store_u16_2x1(dst_ptr + y * dst_stride, d, 0);
- } else {
- vst1_u16(dst_ptr + y * dst_stride, d);
- }
- }
- } else {
- for (int y = 0; y < h; ++y) {
- for (int x = 0; x < w; x += 8) {
- const uint16x8_t s = vld1q_u16(src_ptr + y * src_stride + x);
- const uint16x8_t d16 = vld1q_u16(dst16 + y * dst16_stride + x);
- int32x4_t s_lo = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(s)));
- int32x4_t s_hi = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(s)));
- int32x4_t d16_lo = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(d16)));
- int32x4_t d16_hi = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(d16)));
- int32x4_t d0 = vhaddq_s32(s_lo, d16_lo);
- int32x4_t d1 = vhaddq_s32(s_hi, d16_hi);
- d0 = vsubw_s16(d0, offset_s16);
- d1 = vsubw_s16(d1, offset_s16);
- d0 = vqrshlq_s32(d0, round_shift_s32);
- d1 = vqrshlq_s32(d1, round_shift_s32);
- uint16x8_t d01 = vcombine_u16(vqmovun_s32(d0), vqmovun_s32(d1));
- d01 = vminq_u16(d01, max);
- vst1q_u16(dst_ptr + y * dst_stride + x, d01);
- }
- }
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
}
}

-static INLINE void highbd_convolve_correct_offset_neon(
- const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
- int w, int h, const int round_bits, const int offset, const int bd) {
- const int32x4_t round_shift_s32 = vdupq_n_s32(-round_bits);
- const int16x4_t offset_s16 = vdup_n_s16(offset);
- const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
-
- if (w <= 4) {
- for (int y = 0; y < h; ++y) {
- const int16x4_t s = vld1_s16((const int16_t *)src_ptr + y * src_stride);
- const int32x4_t d0 =
- vqrshlq_s32(vsubl_s16(s, offset_s16), round_shift_s32);
- uint16x4_t d = vqmovun_s32(d0);
- d = vmin_u16(d, vget_low_u16(max));
- if (w == 2) {
- store_u16_2x1(dst_ptr + y * dst_stride, d, 0);
- } else {
- vst1_u16(dst_ptr + y * dst_stride, d);
- }
- }
- } else {
- for (int y = 0; y < h; ++y) {
- for (int x = 0; x < w; x += 8) {
- // Subtract round offset and convolve round
- const int16x8_t s =
- vld1q_s16((const int16_t *)src_ptr + y * src_stride + x);
- const int32x4_t d0 = vqrshlq_s32(vsubl_s16(vget_low_s16(s), offset_s16),
- round_shift_s32);
- const int32x4_t d1 = vqrshlq_s32(
- vsubl_s16(vget_high_s16(s), offset_s16), round_shift_s32);
- uint16x8_t d01 = vcombine_u16(vqmovun_s32(d0), vqmovun_s32(d1));
- d01 = vminq_u16(d01, max);
- vst1q_u16(dst_ptr + y * dst_stride + x, d01);
- }
- }
- }
+static INLINE uint16x4_t highbd_convolve12_4_2d_h(const int16x4_t s[12],
+ const int16x8_t x_filter_0_7,
+ const int16x4_t x_filter_8_11,
+ const int32x4_t shift_s32,
+ const int32x4_t offset) {
+ const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
+ const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
+
+ int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter_0_3, 0);
+ sum = vmlal_lane_s16(sum, s[1], x_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s[2], x_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s[3], x_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s[4], x_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s[5], x_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s[6], x_filter_4_7, 2);
+ sum = vmlal_lane_s16(sum, s[7], x_filter_4_7, 3);
+ sum = vmlal_lane_s16(sum, s[8], x_filter_8_11, 0);
+ sum = vmlal_lane_s16(sum, s[9], x_filter_8_11, 1);
+ sum = vmlal_lane_s16(sum, s[10], x_filter_8_11, 2);
+ sum = vmlal_lane_s16(sum, s[11], x_filter_8_11, 3);
+
+ sum = vqrshlq_s32(sum, shift_s32);
+ return vqmovun_s32(sum);
}

-void av1_highbd_convolve_2d_scale_neon(
- const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
- int h, const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y, const int subpel_x_qn,
- const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
- ConvolveParams *conv_params, int bd) {
- uint16_t *im_block = (uint16_t *)aom_memalign(
- 16, 2 * sizeof(uint16_t) * MAX_SB_SIZE * (MAX_SB_SIZE + MAX_FILTER_TAP));
- if (!im_block) return;
- uint16_t *im_block2 = (uint16_t *)aom_memalign(
- 16, 2 * sizeof(uint16_t) * MAX_SB_SIZE * (MAX_SB_SIZE + MAX_FILTER_TAP));
- if (!im_block2) {
- aom_free(im_block); // free the first block and return.
- return;
- }
-
- int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
- filter_params_y->taps;
- const int im_stride = MAX_SB_SIZE;
- const int bits =
- FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
- assert(bits >= 0);
-
- const int vert_offset = filter_params_y->taps / 2 - 1;
- const int horiz_offset = filter_params_x->taps / 2 - 1;
- const int x_offset_bits = (1 << (bd + FILTER_BITS - 1));
- const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const int y_offset_correction =
- ((1 << (y_offset_bits - conv_params->round_1)) +
- (1 << (y_offset_bits - conv_params->round_1 - 1)));
-
- CONV_BUF_TYPE *dst16 = conv_params->dst;
- const int dst16_stride = conv_params->dst_stride;
-
- const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
-
- highbd_convolve_2d_x_scale_8tap_neon(
- src_ptr, src_stride, im_block, im_stride, w, im_h, subpel_x_qn, x_step_qn,
- filter_params_x, conv_params, x_offset_bits);
- if (conv_params->is_compound && !conv_params->do_average) {
- highbd_convolve_2d_y_scale_8tap_neon(
- im_block, im_stride, dst16, dst16_stride, w, h, subpel_y_qn, y_step_qn,
- filter_params_y, conv_params->round_1, y_offset_bits);
- } else {
- highbd_convolve_2d_y_scale_8tap_neon(
- im_block, im_stride, im_block2, im_stride, w, h, subpel_y_qn, y_step_qn,
- filter_params_y, conv_params->round_1, y_offset_bits);
- }
-
- // Do the compound averaging outside the loop, avoids branching within the
- // main loop
- if (conv_params->is_compound) {
- if (conv_params->do_average) {
- if (conv_params->use_dist_wtd_comp_avg) {
- highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w,
- h, conv_params, bits, y_offset_correction,
- bd);
- } else {
- highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
- conv_params, bits, y_offset_correction, bd);
- }
- }
- } else {
- highbd_convolve_correct_offset_neon(im_block2, im_stride, dst, dst_stride,
- w, h, bits, y_offset_correction, bd);
- }
- aom_free(im_block);
- aom_free(im_block2);
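+// 8-lane counterpart of highbd_convolve12_4_2d_h above: the 12-tap dot
+// product applied to the low and high halves of each input vector.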
+static INLINE uint16x8_t highbd_convolve12_8_2d_h(const int16x8_t s[12],
+ const int16x8_t x_filter_0_7,
+ const int16x4_t x_filter_8_11,
+ const int32x4_t shift_s32,
+ const int32x4_t offset) {
+ const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
+ const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
+
+ int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[8]), x_filter_8_11, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[9]), x_filter_8_11, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[10]), x_filter_8_11, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[11]), x_filter_8_11, 3);
+
+ int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[8]), x_filter_8_11, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[9]), x_filter_8_11, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[10]), x_filter_8_11, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[11]), x_filter_8_11, 3);
+
+ sum0 = vqrshlq_s32(sum0, shift_s32);
+ sum1 = vqrshlq_s32(sum1, shift_s32);
+
+ return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
}

-static INLINE void highbd_convolve_dist_wtd_x_8tap_neon(
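+// Horizontal pass for 12-tap filters: the same 4-row / 8-column tiling as
+// the other horizontal passes, with the filter split into 8 + 4
+// coefficient vectors.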
+static INLINE void highbd_convolve_2d_sr_horiz_12tap_neon(
const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
const int offset) {
- const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+ // The smallest block height processed by the SIMD functions is 4, and the
+ // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines
+ // for the vertical convolution.
+ assert(h >= 5);
const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
- const int weight_bits = FILTER_BITS - conv_params->round_1;
- const int32x4_t zero_s32 = vdupq_n_s32(0);
- const int32x4_t weight_s32 = vdupq_n_s32(1 << weight_bits);
+ const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
+ const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
const int32x4_t offset_s32 = vdupq_n_s32(offset);
- if (w <= 4) {
- int16x8_t s0, s1, s2, s3;
- uint16x4_t d0, d1;
- uint16x8_t d01;
-
+ if (w == 4) {
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
do {
- load_s16_8x2(s, src_stride, &s0, &s2);
- load_s16_8x2(s + 8, src_stride, &s1, &s3);
+ int16x4_t s0[12], s1[12], s2[12], s3[12];
+ load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
+ &s0[11]);
+ load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
+ &s1[11]);
+ load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
+ &s2[11]);
+ load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
+ &s3[11]);
+
+ uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11,
+ shift_s32, offset_s32);
+ uint16x4_t d1 = highbd_convolve12_4_2d_h(s1, x_filter_0_7, x_filter_8_11,
+ shift_s32, offset_s32);
+ uint16x4_t d2 = highbd_convolve12_4_2d_h(s2, x_filter_0_7, x_filter_8_11,
+ shift_s32, offset_s32);
+ uint16x4_t d3 = highbd_convolve12_4_2d_h(s3, x_filter_0_7, x_filter_8_11,
+ shift_s32, offset_s32);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
- d0 = highbd_convolve8_wtd_horiz4_s32_s16(
- s0, s1, x_filter, shift_s32, zero_s32, weight_s32, offset_s32);
- d1 = highbd_convolve8_wtd_horiz4_s32_s16(
- s2, s3, x_filter, shift_s32, zero_s32, weight_s32, offset_s32);
- d01 = vcombine_u16(d0, d1);
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h > 4);
- if (w == 2) {
- store_u16q_2x1(d + 0 * dst_stride, d01, 0);
- store_u16q_2x1(d + 1 * dst_stride, d01, 2);
- } else {
- vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
- vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
- }
+ do {
+ int16x4_t s0[12];
+ load_s16_4x12(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5],
+ &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], &s0[11]);
- s += 2 * src_stride;
- d += 2 * dst_stride;
- h -= 2;
- } while (h > 0);
+ uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11,
+ shift_s32, offset_s32);
+
+ vst1_u16(d, d0);
+
+ s += src_stride;
+ d += dst_stride;
+ } while (--h != 0);
} else {
int height = h;
- int16x8_t s0, s1, s2, s3;
- uint16x8_t d0, d1;
do {
int width = w;
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
- load_s16_8x2(s, src_stride, &s0, &s2);
- s += 8;
-
do {
- load_s16_8x2(s, src_stride, &s1, &s3);
+ int16x8_t s0[12], s1[12], s2[12], s3[12];
+ load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
+ &s0[11]);
+ load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
+ &s1[11]);
+ load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
+ &s2[11]);
+ load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
+ &s3[11]);
+
+ uint16x8_t d0 = highbd_convolve12_8_2d_h(
+ s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
+ uint16x8_t d1 = highbd_convolve12_8_2d_h(
+ s1, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
+ uint16x8_t d2 = highbd_convolve12_8_2d_h(
+ s2, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
+ uint16x8_t d3 = highbd_convolve12_8_2d_h(
+ s3, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
- d0 = highbd_convolve8_wtd_horiz8_s32_s16(
- s0, s1, x_filter, shift_s32, zero_s32, weight_s32, offset_s32);
- d1 = highbd_convolve8_wtd_horiz8_s32_s16(
- s2, s3, x_filter, shift_s32, zero_s32, weight_s32, offset_s32);
-
- store_u16_8x2(d, dst_stride, d0, d1);
-
- s0 = s1;
- s2 = s3;
s += 8;
d += 8;
width -= 8;
- } while (width > 0);
- src_ptr += 2 * src_stride;
- dst_ptr += 2 * dst_stride;
- height -= 2;
- } while (height > 0);
- }
-}
-
-static INLINE void highbd_convolve_dist_wtd_y_8tap_neon(
- const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
- int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
- const int offset) {
- const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
- const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
- const int weight_bits = FILTER_BITS - conv_params->round_1;
- const int32x4_t zero_s32 = vdupq_n_s32(0);
- const int32x4_t weight_s32 = vdupq_n_s32(1 << weight_bits);
- const int32x4_t offset_s32 = vdupq_n_s32(offset);
-
- if (w <= 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x4_t d0, d1;
- uint16x8_t d01;
-
- const int16_t *s = (const int16_t *)src_ptr;
- uint16_t *d = dst_ptr;
-
- load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- s += 7 * src_stride;
-
- do {
- load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
-
- d0 = highbd_convolve8_wtd_4_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7,
- y_filter, shift_s32, zero_s32,
- weight_s32, offset_s32);
- d1 = highbd_convolve8_wtd_4_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8,
- y_filter, shift_s32, zero_s32,
- weight_s32, offset_s32);
- d01 = vcombine_u16(d0, d1);
-
- if (w == 2) {
- store_u16q_2x1(d + 0 * dst_stride, d01, 0);
- store_u16q_2x1(d + 1 * dst_stride, d01, 2);
- } else {
- vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
- vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
- }
-
- s0 = s2;
- s1 = s3;
- s2 = s4;
- s3 = s5;
- s4 = s6;
- s5 = s7;
- s6 = s8;
- s += 2 * src_stride;
- d += 2 * dst_stride;
- h -= 2;
- } while (h > 0);
- } else {
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
- uint16x8_t d0, d1;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
do {
- int height = h;
+ int width = w;
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
- load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- s += 7 * src_stride;
-
do {
- load_s16_8x2(s, src_stride, &s7, &s8);
-
- d0 = highbd_convolve8_wtd_8_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7,
- y_filter, shift_s32, zero_s32,
- weight_s32, offset_s32);
- d1 = highbd_convolve8_wtd_8_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8,
- y_filter, shift_s32, zero_s32,
- weight_s32, offset_s32);
-
- store_u16_8x2(d, dst_stride, d0, d1);
-
- s0 = s2;
- s1 = s3;
- s2 = s4;
- s3 = s5;
- s4 = s6;
- s5 = s7;
- s6 = s8;
- s += 2 * src_stride;
- d += 2 * dst_stride;
- height -= 2;
- } while (height > 0);
- src_ptr += 8;
- dst_ptr += 8;
- w -= 8;
- } while (w > 0);
+ int16x8_t s0[12];
+ load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
+ &s0[11]);
+
+ uint16x8_t d0 = highbd_convolve12_8_2d_h(
+ s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
+ vst1q_u16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
}
}

-void av1_highbd_dist_wtd_convolve_x_neon(
- const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
- int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
- ConvolveParams *conv_params, int bd) {
+void av1_highbd_convolve_2d_sr_neon(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn,
+ const int subpel_y_qn,
+ ConvolveParams *conv_params, int bd) {
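+ // The Neon paths below operate on blocks at least 4 samples wide and
+ // tall, so defer the rare 2-wide / 2-tall blocks to the C implementation
+ // (the previous w == 2 / h == 2 special cases have been removed).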
+ if (w == 2 || h == 2) {
+ av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, subpel_x_qn,
+ subpel_y_qn, conv_params, bd);
+ return;
+ }
DECLARE_ALIGNED(16, uint16_t,
im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
- CONV_BUF_TYPE *dst16 = conv_params->dst;
- int dst16_stride = conv_params->dst_stride;
+ const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
+ const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps;
+
+ const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+ const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
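+ // Filters shorter than six taps appear to be padded out to six (the unused
+ // outer taps being zero), so the paths below only need to cover 6-, 8- and
+ // 12-tap kernels.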
+ const int im_h = h + clamped_y_taps - 1;
const int im_stride = MAX_SB_SIZE;
- const int horiz_offset = filter_params_x->taps / 2 - 1;
- const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1));
- const int round_bits =
- 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
- assert(round_bits >= 0);
+ const int vert_offset = clamped_y_taps / 2 - 1;
+ const int horiz_offset = clamped_x_taps / 2 - 1;
+ const int x_offset_initial = (1 << (bd + FILTER_BITS - 1));
+ const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ // The extra shim of (1 << (conv_params->round_1 - 1)) allows us to do a
+ // simple shift left instead of a rounding saturating shift left.
+ const int y_offset =
+ (1 << (conv_params->round_1 - 1)) - (1 << (y_offset_bits - 1));
+
+ const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
- src -= horiz_offset;
+ if (x_filter_taps > 8) {
+ highbd_convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block,
+ im_stride, w, im_h, x_filter_ptr,
+ conv_params, x_offset_initial);
- // horizontal filter
- if (conv_params->do_average) {
- highbd_convolve_dist_wtd_x_8tap_neon(src, src_stride, im_block, im_stride,
- w, h, x_filter_ptr, conv_params,
- round_offset);
+ highbd_convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride,
+ w, h, y_filter_ptr, conv_params, bd,
+ y_offset);
+ return;
+ }
+ if (x_filter_taps <= 6 && w != 4) {
+ highbd_convolve_2d_sr_horiz_6tap_neon(src_ptr, src_stride, im_block,
+ im_stride, w, im_h, x_filter_ptr,
+ conv_params, x_offset_initial);
} else {
- highbd_convolve_dist_wtd_x_8tap_neon(src, src_stride, dst16, dst16_stride,
- w, h, x_filter_ptr, conv_params,
- round_offset);
+ highbd_convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride,
+ w, im_h, x_filter_ptr, conv_params,
+ x_offset_initial);
}
- if (conv_params->do_average) {
- if (conv_params->use_dist_wtd_comp_avg) {
- highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
- conv_params, round_bits, round_offset, bd);
- } else {
- highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
- conv_params, round_bits, round_offset, bd);
- }
+ if (y_filter_taps <= 6) {
+ highbd_convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride,
+ w, h, y_filter_ptr, conv_params, bd,
+ y_offset);
+ } else {
+ highbd_convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride,
+ w, h, y_filter_ptr, conv_params, bd,
+ y_offset);
}
}
-void av1_highbd_dist_wtd_convolve_y_neon(
+// Filter used is [64, 64].
+void av1_highbd_convolve_x_sr_intrabc_neon(
const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
- int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+ int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
ConvolveParams *conv_params, int bd) {
- DECLARE_ALIGNED(16, uint16_t,
- im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
- CONV_BUF_TYPE *dst16 = conv_params->dst;
- int dst16_stride = conv_params->dst_stride;
- const int im_stride = MAX_SB_SIZE;
- const int vert_offset = filter_params_y->taps / 2 - 1;
- const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1));
- const int round_bits =
- 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
- assert(round_bits >= 0);
-
- const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ assert(subpel_x_qn == 8);
+ assert(filter_params_x->taps == 2);
+ assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+ (void)filter_params_x;
+ (void)subpel_x_qn;
+ (void)conv_params;
+ (void)bd;
- src -= vert_offset * src_stride;
-
- // vertical filter
- if (conv_params->do_average) {
- highbd_convolve_dist_wtd_y_8tap_neon(src, src_stride, im_block, im_stride,
- w, h, y_filter_ptr, conv_params,
- round_offset);
- } else {
- highbd_convolve_dist_wtd_y_8tap_neon(src, src_stride, dst16, dst16_stride,
- w, h, y_filter_ptr, conv_params,
- round_offset);
- }
-
- if (conv_params->do_average) {
- if (conv_params->use_dist_wtd_comp_avg) {
- highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
- conv_params, round_bits, round_offset, bd);
- } else {
- highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
- conv_params, round_bits, round_offset, bd);
- }
- }
-}
-
-static INLINE void highbd_2d_copy_neon(const uint16_t *src_ptr, int src_stride,
- uint16_t *dst_ptr, int dst_stride, int w,
- int h, const int round_bits,
- const int offset) {
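+ // With the [64, 64] kernel and the normative rounding shifts, each output
+ // pixel is the rounded average of two horizontally adjacent inputs:
+ // d = (s0 + s1 + 1) >> 1, i.e. a single vrhadd.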
if (w <= 4) {
- const int16x4_t round_shift_s16 = vdup_n_s16(round_bits);
- const uint16x4_t offset_u16 = vdup_n_u16(offset);
+ do {
+ uint16x4_t s0 = vld1_u16(src);
+ uint16x4_t s1 = vld1_u16(src + 1);
+
+ uint16x4_t d0 = vrhadd_u16(s0, s1);
- for (int y = 0; y < h; ++y) {
- const uint16x4_t s = vld1_u16(src_ptr + y * src_stride);
- uint16x4_t d = vshl_u16(s, round_shift_s16);
- d = vadd_u16(d, offset_u16);
if (w == 2) {
- store_u16_2x1(dst_ptr + y * dst_stride, d, 0);
+ store_u16_2x1(dst, d0, 0);
} else {
- vst1_u16(dst_ptr + y * dst_stride, d);
+ vst1_u16(dst, d0);
}
- }
+
+ src += src_stride;
+ dst += dst_stride;
+ } while (--h != 0);
} else {
- const int16x8_t round_shift_s16 = vdupq_n_s16(round_bits);
- const uint16x8_t offset_u16 = vdupq_n_u16(offset);
-
- for (int y = 0; y < h; ++y) {
- for (int x = 0; x < w; x += 8) {
- const uint16x8_t s = vld1q_u16(src_ptr + y * src_stride + x);
- uint16x8_t d = vshlq_u16(s, round_shift_s16);
- d = vaddq_u16(d, offset_u16);
- vst1q_u16(dst_ptr + y * dst_stride + x, d);
- }
- }
- }
-}
+ do {
+ const uint16_t *src_ptr = src;
+ uint16_t *dst_ptr = dst;
+ int width = w;
-void av1_highbd_dist_wtd_convolve_2d_copy_neon(const uint16_t *src,
- int src_stride, uint16_t *dst,
- int dst_stride, int w, int h,
- ConvolveParams *conv_params,
- int bd) {
- DECLARE_ALIGNED(16, uint16_t,
- im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+ uint16x8_t s1 = vld1q_u16(src_ptr + 1);
- const int im_stride = MAX_SB_SIZE;
- CONV_BUF_TYPE *dst16 = conv_params->dst;
- int dst16_stride = conv_params->dst_stride;
- const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1));
- const int round_bits =
- 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
- assert(round_bits >= 0);
-
- if (conv_params->do_average) {
- highbd_2d_copy_neon(src, src_stride, im_block, im_stride, w, h, round_bits,
- round_offset);
- } else {
- highbd_2d_copy_neon(src, src_stride, dst16, dst16_stride, w, h, round_bits,
- round_offset);
- }
+ uint16x8_t d0 = vrhaddq_u16(s0, s1);
- if (conv_params->do_average) {
- if (conv_params->use_dist_wtd_comp_avg) {
- highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
- conv_params, round_bits, round_offset, bd);
- } else {
- highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
- conv_params, round_bits, round_offset, bd);
- }
+ vst1q_u16(dst_ptr, d0);
+
+ src_ptr += 8;
+ dst_ptr += 8;
+ width -= 8;
+ } while (width != 0);
+ src += src_stride;
+ dst += dst_stride;
+ } while (--h != 0);
}
}
-static INLINE void highbd_convolve_y_8tap_neon(
- const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
- int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
- int offset) {
- const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
- const int32x4_t offset_s32 = vdupq_n_s32(offset);
- const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_1);
+// Filter used is [64, 64].
+void av1_highbd_convolve_y_sr_intrabc_neon(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+ int bd) {
+ assert(subpel_y_qn == 8);
+ assert(filter_params_y->taps == 2);
+ (void)filter_params_y;
+ (void)subpel_y_qn;
+ (void)bd;
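+ // As in the horizontal case, the [64, 64] kernel reduces to a rounded
+ // halving add, here over vertically adjacent rows: d = (s0 + s1 + 1) >> 1.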
if (w <= 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x4_t d0, d1, d2, d3;
- uint16x8_t d01, d23;
-
- const int16_t *s = (const int16_t *)src_ptr;
- uint16_t *d = dst_ptr;
-
- load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- s += 7 * src_stride;
-
do {
- load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+ uint16x4_t s0 = vld1_u16(src);
+ uint16x4_t s1 = vld1_u16(src + src_stride);
- d0 = highbd_convolve8_sr_4_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7,
- y_filter, shift_s32, offset_s32);
- d1 = highbd_convolve8_sr_4_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8,
- y_filter, shift_s32, offset_s32);
- d2 = highbd_convolve8_sr_4_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9,
- y_filter, shift_s32, offset_s32);
- d3 = highbd_convolve8_sr_4_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10,
- y_filter, shift_s32, offset_s32);
-
- d01 = vcombine_u16(d0, d1);
- d23 = vcombine_u16(d2, d3);
+ uint16x4_t d0 = vrhadd_u16(s0, s1);
if (w == 2) {
- store_u16q_2x1(d + 0 * dst_stride, d01, 0);
- store_u16q_2x1(d + 1 * dst_stride, d01, 2);
- if (h != 2) {
- store_u16q_2x1(d + 2 * dst_stride, d23, 0);
- store_u16q_2x1(d + 3 * dst_stride, d23, 2);
- }
+ store_u16_2x1(dst, d0, 0);
} else {
- vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
- vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
- if (h != 2) {
- vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
- vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
- }
+ vst1_u16(dst, d0);
}
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- h -= 4;
- } while (h > 0);
+ src += src_stride;
+ dst += dst_stride;
+ } while (--h != 0);
} else {
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x8_t d0, d1, d2, d3;
do {
+ const uint16_t *src_ptr = src;
+ uint16_t *dst_ptr = dst;
int height = h;
- const int16_t *s = (const int16_t *)src_ptr;
- uint16_t *d = dst_ptr;
-
- load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- s += 7 * src_stride;
do {
- load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+ uint16x8_t s1 = vld1q_u16(src_ptr + src_stride);
- d0 = highbd_convolve8_8_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7,
- y_filter, offset_s32);
- d1 = highbd_convolve8_8_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8,
- y_filter, offset_s32);
- d2 = highbd_convolve8_8_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9,
- y_filter, offset_s32);
- d3 = highbd_convolve8_8_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10,
- y_filter, offset_s32);
-
- if (h == 2) {
- store_u16_8x2(d, dst_stride, d0, d1);
- } else {
- store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
- }
+ uint16x8_t d0 = vrhaddq_u16(s0, s1);
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
- } while (height > 0);
- src_ptr += 8;
- dst_ptr += 8;
+ vst1q_u16(dst_ptr, d0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ src += 8;
+ dst += 8;
w -= 8;
- } while (w > 0);
+ } while (w != 0);
}
}
-void av1_highbd_dist_wtd_convolve_2d_neon(
+// Both horizontal and vertical passes use the same 2-tap filter: [64, 64].
+void av1_highbd_convolve_2d_sr_intrabc_neon(
const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
int h, const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y, const int subpel_x_qn,
const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+ assert(subpel_x_qn == 8);
+ assert(subpel_y_qn == 8);
+ assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
+ assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+ assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
+ (void)filter_params_x;
+ (void)subpel_x_qn;
+ (void)filter_params_y;
+ (void)subpel_y_qn;
+ (void)conv_params;
+ (void)bd;
+
DECLARE_ALIGNED(16, uint16_t,
- im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
- DECLARE_ALIGNED(16, uint16_t,
- im_block2[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ int im_h = h + 1;
+ int im_stride = MAX_SB_SIZE;
- CONV_BUF_TYPE *dst16 = conv_params->dst;
- int dst16_stride = conv_params->dst_stride;
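+ // The horizontal pass below keeps the raw pairwise sums; the vertical pass
+ // then computes (h0 + h1 + 2) >> 2 via two halving adds and the +1 shim,
+ // i.e. the rounded average of the four contributing pixels.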
+ uint16x8_t vert_offset = vdupq_n_u16(1);
- const int im_h = h + filter_params_y->taps - 1;
- const int im_stride = MAX_SB_SIZE;
- const int vert_offset = filter_params_y->taps / 2 - 1;
- const int horiz_offset = filter_params_x->taps / 2 - 1;
- const int round_bits =
- 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
- const int x_offset_initial = (1 << (bd + FILTER_BITS - 1));
- const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const int y_offset_initial = (1 << y_offset_bits);
- const int y_offset_correction =
- ((1 << (y_offset_bits - conv_params->round_1)) +
- (1 << (y_offset_bits - conv_params->round_1 - 1)));
+ uint16_t *im = im_block;
- const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+ // Horizontal filter.
+ if (w <= 4) {
+ do {
+ uint16x4_t s0 = vld1_u16(src);
+ uint16x4_t s1 = vld1_u16(src + 1);
- const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
- const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ uint16x4_t d0 = vadd_u16(s0, s1);
+
+ // Safe to store the whole vector; the im buffer is big enough.
+ vst1_u16(im, d0);
- // horizontal filter
- highbd_convolve_x_8tap_neon(src_ptr, src_stride, im_block, im_stride, w, im_h,
- x_filter_ptr, conv_params, x_offset_initial);
- // vertical filter
- if (conv_params->do_average) {
- highbd_convolve_y_8tap_neon(im_block, im_stride, im_block2, im_stride, w, h,
- y_filter_ptr, conv_params, y_offset_initial);
+ src += src_stride;
+ im += im_stride;
+ } while (--im_h != 0);
} else {
- highbd_convolve_y_8tap_neon(im_block, im_stride, dst16, dst16_stride, w, h,
- y_filter_ptr, conv_params, y_offset_initial);
- }
+ do {
+ const uint16_t *src_ptr = src;
+ uint16_t *im_ptr = im;
+ int width = w;
- // Do the compound averaging outside the loop, avoids branching within the
- // main loop
- if (conv_params->do_average) {
- if (conv_params->use_dist_wtd_comp_avg) {
- highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
- conv_params, round_bits,
- y_offset_correction, bd);
- } else {
- highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
- conv_params, round_bits, y_offset_correction, bd);
- }
- }
-}
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+ uint16x8_t s1 = vld1q_u16(src_ptr + 1);
-#define UPSCALE_NORMATIVE_TAPS 8
+ uint16x8_t d0 = vaddq_u16(s0, s1);
-void av1_highbd_convolve_horiz_rs_neon(const uint16_t *src, int src_stride,
- uint16_t *dst, int dst_stride, int w,
- int h, const int16_t *x_filters,
- int x0_qn, int x_step_qn, int bd) {
- const int horiz_offset = UPSCALE_NORMATIVE_TAPS / 2 - 1;
+ vst1q_u16(im_ptr, d0);
- const int32x4_t idx = { 0, 1, 2, 3 };
- const int32x4_t subpel_mask = vdupq_n_s32(RS_SCALE_SUBPEL_MASK);
- const int32x4_t shift_s32 = vdupq_n_s32(-FILTER_BITS);
- const int32x4_t offset_s32 = vdupq_n_s32(0);
- const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ src_ptr += 8;
+ im_ptr += 8;
+ width -= 8;
+ } while (width != 0);
+ src += src_stride;
+ im += im_stride;
+ } while (--im_h != 0);
+ }
- const uint16_t *src_ptr = src - horiz_offset;
- uint16_t *dst_ptr = dst;
+ im = im_block;
+ // Vertical filter.
if (w <= 4) {
- int height = h;
- int16x8_t s0, s1, s2, s3;
- uint16x4_t d0;
-
- uint16_t *d = dst_ptr;
do {
- int x_qn = x0_qn;
-
- // Load 4 src vectors at a time, they might be the same, but we have to
- // calculate the indices anyway. Doing it in SIMD and then storing the
- // indices is faster than having to calculate the expression
- // &src_ptr[((x_qn + 0*x_step_qn) >> RS_SCALE_SUBPEL_BITS)] 4 times
- // Ideally this should be a gather using the indices, but NEON does not
- // have that, so have to emulate
- const int32x4_t xqn_idx = vmlaq_n_s32(vdupq_n_s32(x_qn), idx, x_step_qn);
- // We have to multiply x2 to get the actual pointer as sizeof(uint16_t) =
- // 2
- const int32x4_t src_idx =
- vshlq_n_s32(vshrq_n_s32(xqn_idx, RS_SCALE_SUBPEL_BITS), 1);
- // Similarly for the filter vector indices, we calculate the filter
- // indices for 4 columns. First we calculate the indices:
- // x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS
- // Then we calculate the actual pointers, multiplying with
- // UPSCALE_UPSCALE_NORMATIVE_TAPS
- // again shift left by 1
- const int32x4_t x_filter4_idx = vshlq_n_s32(
- vshrq_n_s32(vandq_s32(xqn_idx, subpel_mask), RS_SCALE_EXTRA_BITS), 1);
- // Even though pointers are unsigned 32/64-bit ints we do signed
- // addition The reason for this is that x_qn can be negative, leading to
- // negative offsets. Argon test
- // profile0_core/streams/test10573_11003.obu was failing because of
- // this.
-#if AOM_ARCH_AARCH64
- uint64x2_t tmp4[2];
- tmp4[0] = vreinterpretq_u64_s64(vaddw_s32(
- vdupq_n_s64((const int64_t)src_ptr), vget_low_s32(src_idx)));
- tmp4[1] = vreinterpretq_u64_s64(vaddw_s32(
- vdupq_n_s64((const int64_t)src_ptr), vget_high_s32(src_idx)));
- int16_t *src4_ptr[4];
- uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
- vst1q_u64(tmp_ptr, tmp4[0]);
- vst1q_u64(tmp_ptr + 2, tmp4[1]);
-
- // filter vectors
- tmp4[0] = vreinterpretq_u64_s64(vmlal_s32(
- vdupq_n_s64((const int64_t)x_filters), vget_low_s32(x_filter4_idx),
- vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
- tmp4[1] = vreinterpretq_u64_s64(vmlal_s32(
- vdupq_n_s64((const int64_t)x_filters), vget_high_s32(x_filter4_idx),
- vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
-
- const int16_t *x_filter4_ptr[4];
- tmp_ptr = (uint64_t *)&x_filter4_ptr;
- vst1q_u64(tmp_ptr, tmp4[0]);
- vst1q_u64(tmp_ptr + 2, tmp4[1]);
-#else
- uint32x4_t tmp4;
- tmp4 = vreinterpretq_u32_s32(
- vaddq_s32(vdupq_n_s32((const int32_t)src_ptr), src_idx));
- int16_t *src4_ptr[4];
- uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
- vst1q_u32(tmp_ptr, tmp4);
- // filter vectors
- tmp4 = vreinterpretq_u32_s32(
- vmlaq_s32(vdupq_n_s32((const int32_t)x_filters), x_filter4_idx,
- vdupq_n_s32(UPSCALE_NORMATIVE_TAPS)));
-
- const int16_t *x_filter4_ptr[4];
- tmp_ptr = (uint32_t *)&x_filter4_ptr;
- vst1q_u32(tmp_ptr, tmp4);
-#endif // AOM_ARCH_AARCH64
- // Load source
- s0 = vld1q_s16(src4_ptr[0]);
- s1 = vld1q_s16(src4_ptr[1]);
- s2 = vld1q_s16(src4_ptr[2]);
- s3 = vld1q_s16(src4_ptr[3]);
-
- // Actually load the filters
- const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
- const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
- const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
- const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);
-
- // Group low and high parts and transpose
- int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
- vget_low_s16(x_filter1),
- vget_low_s16(x_filter2),
- vget_low_s16(x_filter3) };
- int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
- vget_high_s16(x_filter1),
- vget_high_s16(x_filter2),
- vget_high_s16(x_filter3) };
- transpose_u16_4x4((uint16x4_t *)filters_lo);
- transpose_u16_4x4((uint16x4_t *)filters_hi);
-
- // Run the 2D Scale convolution
- d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
- s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
-
- d0 = vmin_u16(d0, max);
+ uint16x4_t s0 = vld1_u16(im);
+ uint16x4_t s1 = vld1_u16(im + im_stride);
+
+ uint16x4_t d0 = vhadd_u16(s0, s1);
+ d0 = vhadd_u16(d0, vget_low_u16(vert_offset));
if (w == 2) {
- store_u16_2x1(d + 0 * dst_stride, d0, 0);
+ store_u16_2x1(dst, d0, 0);
} else {
- vst1_u16(d + 0 * dst_stride, d0);
+ vst1_u16(dst, d0);
}
- src_ptr += src_stride;
- d += dst_stride;
- height--;
- } while (height > 0);
+ im += im_stride;
+ dst += dst_stride;
+ } while (--h != 0);
} else {
- int height = h;
- int16x8_t s0, s1, s2, s3;
- uint16x4_t d0;
-
do {
- int width = w;
- int x_qn = x0_qn;
- uint16_t *d = dst_ptr;
- const uint16_t *s = src_ptr;
+ uint16_t *im_ptr = im;
+ uint16_t *dst_ptr = dst;
+ int height = h;
do {
- // Load 4 src vectors at a time, they might be the same, but we have to
- // calculate the indices anyway. Doing it in SIMD and then storing the
- // indices is faster than having to calculate the expression
- // &src_ptr[((x_qn + 0*x_step_qn) >> RS_SCALE_SUBPEL_BITS)] 4 times
- // Ideally this should be a gather using the indices, but NEON does not
- // have that, so have to emulate
- const int32x4_t xqn_idx =
- vmlaq_n_s32(vdupq_n_s32(x_qn), idx, x_step_qn);
- // We have to multiply x2 to get the actual pointer as sizeof(uint16_t)
- // = 2
- const int32x4_t src_idx =
- vshlq_n_s32(vshrq_n_s32(xqn_idx, RS_SCALE_SUBPEL_BITS), 1);
-
- // Similarly for the filter vector indices, we calculate the filter
- // indices for 4 columns. First we calculate the indices:
- // x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS
- // Then we calculate the actual pointers, multiplying with
- // UPSCALE_UPSCALE_NORMATIVE_TAPS
- // again shift left by 1
- const int32x4_t x_filter4_idx = vshlq_n_s32(
- vshrq_n_s32(vandq_s32(xqn_idx, subpel_mask), RS_SCALE_EXTRA_BITS),
- 1);
- // Even though pointers are unsigned 32/64-bit ints we do signed
- // addition The reason for this is that x_qn can be negative, leading to
- // negative offsets. Argon test
- // profile0_core/streams/test10573_11003.obu was failing because of
- // this.
-#if AOM_ARCH_AARCH64
- uint64x2_t tmp4[2];
- tmp4[0] = vreinterpretq_u64_s64(
- vaddw_s32(vdupq_n_s64((const int64_t)s), vget_low_s32(src_idx)));
- tmp4[1] = vreinterpretq_u64_s64(
- vaddw_s32(vdupq_n_s64((const int64_t)s), vget_high_s32(src_idx)));
- int16_t *src4_ptr[4];
- uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
- vst1q_u64(tmp_ptr, tmp4[0]);
- vst1q_u64(tmp_ptr + 2, tmp4[1]);
-
- // filter vectors
- tmp4[0] = vreinterpretq_u64_s64(vmlal_s32(
- vdupq_n_s64((const int64_t)x_filters), vget_low_s32(x_filter4_idx),
- vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
- tmp4[1] = vreinterpretq_u64_s64(vmlal_s32(
- vdupq_n_s64((const int64_t)x_filters), vget_high_s32(x_filter4_idx),
- vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
-
- const int16_t *x_filter4_ptr[4];
- tmp_ptr = (uint64_t *)&x_filter4_ptr;
- vst1q_u64(tmp_ptr, tmp4[0]);
- vst1q_u64(tmp_ptr + 2, tmp4[1]);
-#else
- uint32x4_t tmp4;
- tmp4 = vreinterpretq_u32_s32(
- vaddq_s32(vdupq_n_s32((const int32_t)s), src_idx));
- int16_t *src4_ptr[4];
- uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
- vst1q_u32(tmp_ptr, tmp4);
- // filter vectors
- tmp4 = vreinterpretq_u32_s32(
- vmlaq_s32(vdupq_n_s32((const int32_t)x_filters), x_filter4_idx,
- vdupq_n_s32(UPSCALE_NORMATIVE_TAPS)));
-
- const int16_t *x_filter4_ptr[4];
- tmp_ptr = (uint32_t *)&x_filter4_ptr;
- vst1q_u32(tmp_ptr, tmp4);
-#endif // AOM_ARCH_AARCH64
-
- // Load source
- s0 = vld1q_s16(src4_ptr[0]);
- s1 = vld1q_s16(src4_ptr[1]);
- s2 = vld1q_s16(src4_ptr[2]);
- s3 = vld1q_s16(src4_ptr[3]);
-
- // Actually load the filters
- const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
- const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
- const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
- const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);
-
- // Group low and high parts and transpose
- int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
- vget_low_s16(x_filter1),
- vget_low_s16(x_filter2),
- vget_low_s16(x_filter3) };
- int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
- vget_high_s16(x_filter1),
- vget_high_s16(x_filter2),
- vget_high_s16(x_filter3) };
- transpose_u16_4x4((uint16x4_t *)filters_lo);
- transpose_u16_4x4((uint16x4_t *)filters_hi);
-
- // Run the 2D Scale X convolution
- d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
- s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
-
- d0 = vmin_u16(d0, max);
- vst1_u16(d, d0);
-
- x_qn += 4 * x_step_qn;
- d += 4;
- width -= 4;
- } while (width > 0);
+ uint16x8_t s0 = vld1q_u16(im_ptr);
+ uint16x8_t s1 = vld1q_u16(im_ptr + im_stride);
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- height--;
- } while (height > 0);
+ uint16x8_t d0 = vhaddq_u16(s0, s1);
+ d0 = vhaddq_u16(d0, vert_offset);
+
+ vst1q_u16(dst_ptr, d0);
+
+ im_ptr += im_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ im += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
}
}
diff --git a/av1/common/arm/highbd_convolve_neon.h b/av1/common/arm/highbd_convolve_neon.h
index f9d028fd5..08b2bda4e 100644
--- a/av1/common/arm/highbd_convolve_neon.h
+++ b/av1/common/arm/highbd_convolve_neon.h
@@ -14,68 +14,9 @@
#include <arm_neon.h>
-static INLINE int32x4_t highbd_convolve6_4_s32(
- const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x8_t y_filter, const int32x4_t offset) {
- const int16x4_t y_filter_lo = vget_low_s16(y_filter);
- const int16x4_t y_filter_hi = vget_high_s16(y_filter);
-
- int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_lo, 1);
- sum = vmlal_lane_s16(sum, s1, y_filter_lo, 2);
- sum = vmlal_lane_s16(sum, s2, y_filter_lo, 3);
- sum = vmlal_lane_s16(sum, s3, y_filter_hi, 0);
- sum = vmlal_lane_s16(sum, s4, y_filter_hi, 1);
- sum = vmlal_lane_s16(sum, s5, y_filter_hi, 2);
-
- return sum;
-}
-
-static INLINE uint16x4_t highbd_convolve6_4_s32_s16(
- const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x8_t y_filter, const int32x4_t offset) {
- int32x4_t sum =
- highbd_convolve6_4_s32(s0, s1, s2, s3, s4, s5, y_filter, offset);
-
- return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
-}
-
-static INLINE void highbd_convolve6_8_s32(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t y_filter, const int32x4_t offset, int32x4_t *sum0,
- int32x4_t *sum1) {
- const int16x4_t y_filter_lo = vget_low_s16(y_filter);
- const int16x4_t y_filter_hi = vget_high_s16(y_filter);
-
- *sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_lo, 1);
- *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s1), y_filter_lo, 2);
- *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s2), y_filter_lo, 3);
- *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s3), y_filter_hi, 0);
- *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s4), y_filter_hi, 1);
- *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s5), y_filter_hi, 2);
-
- *sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_lo, 1);
- *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s1), y_filter_lo, 2);
- *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s2), y_filter_lo, 3);
- *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s3), y_filter_hi, 0);
- *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s4), y_filter_hi, 1);
- *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s5), y_filter_hi, 2);
-}
-
-static INLINE uint16x8_t highbd_convolve6_8_s32_s16(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t y_filter, const int32x4_t offset) {
- int32x4_t sum0;
- int32x4_t sum1;
- highbd_convolve6_8_s32(s0, s1, s2, s3, s4, s5, y_filter, offset, &sum0,
- &sum1);
-
- return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
- vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
-}
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "av1/common/convolve.h"
static INLINE int32x4_t highbd_convolve8_4_s32(
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
@@ -97,18 +38,7 @@ static INLINE int32x4_t highbd_convolve8_4_s32(
return sum;
}
-static INLINE uint16x4_t highbd_convolve8_4_s32_s16(
- const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
- const int32x4_t offset) {
- int32x4_t sum =
- highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset);
-
- return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
-}
-
-static INLINE uint16x4_t highbd_convolve8_sr_4_s32_s16(
+static INLINE uint16x4_t highbd_convolve8_4_sr_s32_s16(
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
@@ -120,23 +50,8 @@ static INLINE uint16x4_t highbd_convolve8_sr_4_s32_s16(
return vqmovun_s32(sum);
}
-static INLINE uint16x4_t highbd_convolve8_wtd_4_s32_s16(
- const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
- const int32x4_t shift_s32, const int32x4_t offset, const int32x4_t weight,
- const int32x4_t offset2) {
- int32x4_t sum =
- highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset);
-
- sum = vqrshlq_s32(sum, shift_s32);
- sum = vmlaq_s32(offset2, sum, weight);
-
- return vqmovun_s32(sum);
-}
-
// Like above but also perform round shifting and subtract correction term
-static INLINE uint16x4_t highbd_convolve8_4_sr_s32_s16(
+static INLINE uint16x4_t highbd_convolve8_4_srsub_s32_s16(
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
@@ -176,41 +91,8 @@ static INLINE void highbd_convolve8_8_s32(
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s7), y_filter_hi, 3);
}
-static INLINE uint16x8_t highbd_convolve8_8_s32_s16(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
- const int32x4_t offset) {
- int32x4_t sum0;
- int32x4_t sum1;
- highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset,
- &sum0, &sum1);
-
- return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
- vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
-}
-
-static INLINE uint16x8_t highbd_convolve8_wtd_8_s32_s16(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
- const int32x4_t shift_s32, const int32x4_t offset, const int32x4_t weight,
- const int32x4_t offset2) {
- int32x4_t sum0;
- int32x4_t sum1;
- highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset,
- &sum0, &sum1);
-
- sum0 = vqrshlq_s32(sum0, shift_s32);
- sum1 = vqrshlq_s32(sum1, shift_s32);
- sum0 = vmlaq_s32(offset2, sum0, weight);
- sum1 = vmlaq_s32(offset2, sum1, weight);
-
- return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
-}
-
// Like above but also perform round shifting and subtract correction term
-static INLINE uint16x8_t highbd_convolve8_8_sr_s32_s16(
+static INLINE uint16x8_t highbd_convolve8_8_srsub_s32_s16(
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
@@ -227,290 +109,6 @@ static INLINE uint16x8_t highbd_convolve8_8_sr_s32_s16(
return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
}
-static INLINE int32x4_t highbd_convolve12_y_4_s32(
- const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
- const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
- const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
- const int32x4_t offset) {
- const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
- const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
-
- int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 0);
- sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
- sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
- sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
- sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
- sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
- sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
- sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
- sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
- sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
- sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
- sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);
-
- return sum;
-}
-
-static INLINE uint16x4_t highbd_convolve12_y_4_s32_s16(
- const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
- const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
- const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
- const int32x4_t offset) {
- int32x4_t sum =
- highbd_convolve12_y_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
- s11, y_filter_0_7, y_filter_8_11, offset);
-
- return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
-}
-
-// Like above but also perform round shifting and subtract correction term
-static INLINE uint16x4_t highbd_convolve12_y_4_sr_s32_s16(
- const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
- const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
- const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
- const int32x4_t round_shift, const int32x4_t offset,
- const int32x4_t correction) {
- int32x4_t sum =
- highbd_convolve12_y_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
- s11, y_filter_0_7, y_filter_8_11, offset);
-
- sum = vsubq_s32(vqrshlq_s32(sum, round_shift), correction);
- return vqmovun_s32(sum);
-}
-
-static INLINE void highbd_convolve12_y_8_s32(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
- const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
- const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
- const int32x4_t offset, int32x4_t *sum0, int32x4_t *sum1) {
- const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
- const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
-
- *sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 0);
- *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s1), y_filter_0_3, 1);
- *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s2), y_filter_0_3, 2);
- *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s3), y_filter_0_3, 3);
- *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s4), y_filter_4_7, 0);
- *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s5), y_filter_4_7, 1);
- *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s6), y_filter_4_7, 2);
- *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s7), y_filter_4_7, 3);
- *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s8), y_filter_8_11, 0);
- *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s9), y_filter_8_11, 1);
- *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s10), y_filter_8_11, 2);
- *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s11), y_filter_8_11, 3);
-
- *sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 0);
- *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s1), y_filter_0_3, 1);
- *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s2), y_filter_0_3, 2);
- *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s3), y_filter_0_3, 3);
- *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s4), y_filter_4_7, 0);
- *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s5), y_filter_4_7, 1);
- *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s6), y_filter_4_7, 2);
- *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s7), y_filter_4_7, 3);
- *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s8), y_filter_8_11, 0);
- *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s9), y_filter_8_11, 1);
- *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s10), y_filter_8_11, 2);
- *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s11), y_filter_8_11, 3);
-}
-
-static INLINE uint16x8_t highbd_convolve12_y_8_s32_s16(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
- const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
- const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
- const int32x4_t offset) {
- int32x4_t sum0;
- int32x4_t sum1;
- highbd_convolve12_y_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
- y_filter_0_7, y_filter_8_11, offset, &sum0, &sum1);
-
- return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
- vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
-}
-
-// Like above but also perform round shifting and subtract correction term
-static INLINE uint16x8_t highbd_convolve12_y_8_sr_s32_s16(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
- const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
- const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
- const int32x4_t round_shift, const int32x4_t offset,
- const int32x4_t correction) {
- int32x4_t sum0;
- int32x4_t sum1;
- highbd_convolve12_y_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
- y_filter_0_7, y_filter_8_11, offset, &sum0, &sum1);
-
- sum0 = vsubq_s32(vqrshlq_s32(sum0, round_shift), correction);
- sum1 = vsubq_s32(vqrshlq_s32(sum1, round_shift), correction);
-
- return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
-}
-
-static INLINE int32x4_t highbd_convolve8_horiz4_s32(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7,
- const int32x4_t offset) {
- const int16x8_t s2 = vextq_s16(s0, s1, 1);
- const int16x8_t s3 = vextq_s16(s0, s1, 2);
- const int16x8_t s4 = vextq_s16(s0, s1, 3);
- const int16x4_t s0_lo = vget_low_s16(s0);
- const int16x4_t s1_lo = vget_low_s16(s2);
- const int16x4_t s2_lo = vget_low_s16(s3);
- const int16x4_t s3_lo = vget_low_s16(s4);
- const int16x4_t s4_lo = vget_high_s16(s0);
- const int16x4_t s5_lo = vget_high_s16(s2);
- const int16x4_t s6_lo = vget_high_s16(s3);
- const int16x4_t s7_lo = vget_high_s16(s4);
-
- return highbd_convolve8_4_s32(s0_lo, s1_lo, s2_lo, s3_lo, s4_lo, s5_lo, s6_lo,
- s7_lo, x_filter_0_7, offset);
-}
-
-static INLINE uint16x4_t highbd_convolve8_horiz4_s32_s16(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7,
- const int32x4_t shift_s32, const int32x4_t offset) {
- int32x4_t sum = highbd_convolve8_horiz4_s32(s0, s1, x_filter_0_7, offset);
-
- sum = vqrshlq_s32(sum, shift_s32);
- return vqmovun_s32(sum);
-}
-
-static INLINE uint16x4_t highbd_convolve8_wtd_horiz4_s32_s16(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7,
- const int32x4_t shift_s32, const int32x4_t offset, const int32x4_t weight,
- const int32x4_t offset2) {
- int32x4_t sum = highbd_convolve8_horiz4_s32(s0, s1, x_filter_0_7, offset);
-
- sum = vqrshlq_s32(sum, shift_s32);
- sum = vmlaq_s32(offset2, sum, weight);
- return vqmovun_s32(sum);
-}
-
-static INLINE void highbd_convolve8_horiz8_s32(
- const int16x8_t s0, const int16x8_t s0_hi, const int16x8_t x_filter_0_7,
- const int32x4_t offset, int32x4_t *sum0, int32x4_t *sum1) {
- const int16x8_t s1 = vextq_s16(s0, s0_hi, 1);
- const int16x8_t s2 = vextq_s16(s0, s0_hi, 2);
- const int16x8_t s3 = vextq_s16(s0, s0_hi, 3);
- const int16x8_t s4 = vextq_s16(s0, s0_hi, 4);
- const int16x8_t s5 = vextq_s16(s0, s0_hi, 5);
- const int16x8_t s6 = vextq_s16(s0, s0_hi, 6);
- const int16x8_t s7 = vextq_s16(s0, s0_hi, 7);
-
- highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_0_7, offset,
- sum0, sum1);
-}
-
-static INLINE uint16x8_t highbd_convolve8_horiz8_s32_s16(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7,
- const int32x4_t shift_s32, const int32x4_t offset) {
- int32x4_t sum0, sum1;
- highbd_convolve8_horiz8_s32(s0, s1, x_filter_0_7, offset, &sum0, &sum1);
-
- sum0 = vqrshlq_s32(sum0, shift_s32);
- sum1 = vqrshlq_s32(sum1, shift_s32);
-
- return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
-}
-
-static INLINE uint16x8_t highbd_convolve8_wtd_horiz8_s32_s16(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7,
- const int32x4_t shift_s32, const int32x4_t offset, const int32x4_t weight,
- const int32x4_t offset2) {
- int32x4_t sum0, sum1;
- highbd_convolve8_horiz8_s32(s0, s1, x_filter_0_7, offset, &sum0, &sum1);
-
- sum0 = vqrshlq_s32(sum0, shift_s32);
- sum1 = vqrshlq_s32(sum1, shift_s32);
- sum0 = vmlaq_s32(offset2, sum0, weight);
- sum1 = vmlaq_s32(offset2, sum1, weight);
-
- return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
-}
-
-static INLINE int32x4_t highbd_convolve12_horiz4_s32(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7,
- const int16x4_t x_filter_8_11, const int32x4_t offset) {
- const int16x8_t s2 = vextq_s16(s0, s1, 1);
- const int16x8_t s3 = vextq_s16(s0, s1, 2);
- const int16x8_t s4 = vextq_s16(s0, s1, 3);
- const int16x8_t s5 = vextq_s16(s0, s1, 4);
- const int16x8_t s6 = vextq_s16(s0, s1, 5);
- const int16x8_t s7 = vextq_s16(s0, s1, 6);
- const int16x8_t s8 = vextq_s16(s0, s1, 7);
- const int16x4_t s0_lo = vget_low_s16(s0);
- const int16x4_t s1_lo = vget_low_s16(s2);
- const int16x4_t s2_lo = vget_low_s16(s3);
- const int16x4_t s3_lo = vget_low_s16(s4);
- const int16x4_t s4_lo = vget_high_s16(s0);
- const int16x4_t s5_lo = vget_high_s16(s2);
- const int16x4_t s6_lo = vget_high_s16(s3);
- const int16x4_t s7_lo = vget_high_s16(s4);
- const int16x4_t s8_lo = vget_high_s16(s5);
- const int16x4_t s9_lo = vget_high_s16(s6);
- const int16x4_t s10_lo = vget_high_s16(s7);
- const int16x4_t s11_lo = vget_high_s16(s8);
-
- return highbd_convolve12_y_4_s32(s0_lo, s1_lo, s2_lo, s3_lo, s4_lo, s5_lo,
- s6_lo, s7_lo, s8_lo, s9_lo, s10_lo, s11_lo,
- x_filter_0_7, x_filter_8_11, offset);
-}
-
-static INLINE uint16x4_t highbd_convolve12_horiz4_s32_s16(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7,
- const int16x4_t x_filter_8_11, const int32x4_t shift_s32,
- const int32x4_t offset) {
- int32x4_t sum =
- highbd_convolve12_horiz4_s32(s0, s1, x_filter_0_7, x_filter_8_11, offset);
-
- sum = vqrshlq_s32(sum, shift_s32);
- return vqmovun_s32(sum);
-}
-
-static INLINE void highbd_convolve12_horiz8_s32(
- const int16x8_t s0_0, const int16x8_t s0_1, const int16x8_t s0_2,
- const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11,
- const int32x4_t offset, int32x4_t *sum0, int32x4_t *sum1) {
- const int16x8_t s1 = vextq_s16(s0_0, s0_1, 1);
- const int16x8_t s2 = vextq_s16(s0_0, s0_1, 2);
- const int16x8_t s3 = vextq_s16(s0_0, s0_1, 3);
- const int16x8_t s4 = vextq_s16(s0_0, s0_1, 4);
- const int16x8_t s5 = vextq_s16(s0_0, s0_1, 5);
- const int16x8_t s6 = vextq_s16(s0_0, s0_1, 6);
- const int16x8_t s7 = vextq_s16(s0_0, s0_1, 7);
- const int16x8_t s8 = s0_1;
- const int16x8_t s9 = vextq_s16(s0_1, s0_2, 1);
- const int16x8_t s10 = vextq_s16(s0_1, s0_2, 2);
- const int16x8_t s11 = vextq_s16(s0_1, s0_2, 3);
-
- highbd_convolve12_y_8_s32(s0_0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
- x_filter_0_7, x_filter_8_11, offset, sum0, sum1);
-}
-
-static INLINE uint16x8_t highbd_convolve12_horiz8_s32_s16(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11,
- const int32x4_t shift_s32, const int32x4_t offset) {
- int32x4_t sum0, sum1;
- highbd_convolve12_horiz8_s32(s0, s1, s2, x_filter_0_7, x_filter_8_11, offset,
- &sum0, &sum1);
-
- sum0 = vqrshlq_s32(sum0, shift_s32);
- sum1 = vqrshlq_s32(sum1, shift_s32);
-
- return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
-}
-
static INLINE int32x4_t highbd_convolve8_2d_scale_horiz4x8_s32(
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t s3, const int16x4_t *filters_lo,
@@ -520,8 +118,8 @@ static INLINE int32x4_t highbd_convolve8_2d_scale_horiz4x8_s32(
int16x4_t s_hi[] = { vget_high_s16(s0), vget_high_s16(s1), vget_high_s16(s2),
vget_high_s16(s3) };
- transpose_u16_4x4((uint16x4_t *)s_lo);
- transpose_u16_4x4((uint16x4_t *)s_hi);
+ transpose_array_inplace_u16_4x4((uint16x4_t *)s_lo);
+ transpose_array_inplace_u16_4x4((uint16x4_t *)s_hi);
int32x4_t sum = vmlal_s16(offset, s_lo[0], filters_lo[0]);
sum = vmlal_s16(sum, s_lo[1], filters_lo[1]);
diff --git a/av1/common/arm/highbd_convolve_scale_neon.c b/av1/common/arm/highbd_convolve_scale_neon.c
new file mode 100644
index 000000000..eee5a1ca9
--- /dev/null
+++ b/av1/common/arm/highbd_convolve_scale_neon.c
@@ -0,0 +1,552 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/arm/highbd_convolve_neon.h"
+
+static INLINE void highbd_dist_wtd_comp_avg_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, ConvolveParams *conv_params, const int round_bits,
+ const int offset, const int bd) {
+ CONV_BUF_TYPE *ref_ptr = conv_params->dst;
+ const int ref_stride = conv_params->dst_stride;
+ const int32x4_t round_shift = vdupq_n_s32(-round_bits);
+ const uint32x4_t offset_vec = vdupq_n_u32(offset);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+ uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset);
+ uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset);
+
+ // Weighted averaging
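+ // In scalar terms, each output is roughly:
+ //   avg = (fwd_offset * ref + bck_offset * src) >> DIST_PRECISION_BITS;
+ //   dst = clamp(rounding_shift(avg - offset, round_bits), 0, (1 << bd) - 1);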
+ if (w <= 4) {
+ do {
+ const uint16x4_t src = vld1_u16(src_ptr);
+ const uint16x4_t ref = vld1_u16(ref_ptr);
+
+ uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset);
+ wtd_avg = vmlal_u16(wtd_avg, src, bck_offset);
+ wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS);
+ int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec));
+ d0 = vqrshlq_s32(d0, round_shift);
+
+ uint16x4_t d0_u16 = vqmovun_s32(d0);
+ d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
+
+ if (w == 2) {
+ store_u16_2x1(dst_ptr, d0_u16, 0);
+ } else {
+ vst1_u16(dst_ptr, d0_u16);
+ }
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ int width = w;
+ const uint16_t *src = src_ptr;
+ const uint16_t *ref = ref_ptr;
+ uint16_t *dst = dst_ptr;
+ do {
+ const uint16x8_t s = vld1q_u16(src);
+ const uint16x8_t r = vld1q_u16(ref);
+
+ uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset);
+ wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset);
+ wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS);
+ int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec));
+ d0 = vqrshlq_s32(d0, round_shift);
+
+ uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset);
+ wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset);
+ wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS);
+ int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec));
+ d1 = vqrshlq_s32(d1, round_shift);
+
+ uint16x8_t d01 = vcombine_u16(vqmovun_s32(d0), vqmovun_s32(d1));
+ d01 = vminq_u16(d01, max);
+ vst1q_u16(dst, d01);
+
+ src += 8;
+ ref += 8;
+ dst += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+ }
+}
+
+static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride,
+ uint16_t *dst_ptr, int dst_stride,
+ int w, int h,
+ ConvolveParams *conv_params,
+ const int round_bits, const int offset,
+ const int bd) {
+ CONV_BUF_TYPE *ref_ptr = conv_params->dst;
+ const int ref_stride = conv_params->dst_stride;
+ const int32x4_t round_shift = vdupq_n_s32(-round_bits);
+ const uint16x4_t offset_vec = vdup_n_u16(offset);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
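+ // In scalar terms, each output is roughly:
+ //   avg = (src + ref) >> 1;  // halving add
+ //   dst = clamp(rounding_shift(avg - offset, round_bits), 0, (1 << bd) - 1);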
+ if (w <= 4) {
+ do {
+ const uint16x4_t src = vld1_u16(src_ptr);
+ const uint16x4_t ref = vld1_u16(ref_ptr);
+
+ uint16x4_t avg = vhadd_u16(src, ref);
+ int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec));
+ d0 = vqrshlq_s32(d0, round_shift);
+
+ uint16x4_t d0_u16 = vqmovun_s32(d0);
+ d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
+
+ if (w == 2) {
+ store_u16_2x1(dst_ptr, d0_u16, 0);
+ } else {
+ vst1_u16(dst_ptr, d0_u16);
+ }
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ int width = w;
+ const uint16_t *src = src_ptr;
+ const uint16_t *ref = ref_ptr;
+ uint16_t *dst = dst_ptr;
+ do {
+ const uint16x8_t s = vld1q_u16(src);
+ const uint16x8_t r = vld1q_u16(ref);
+
+ uint16x8_t avg = vhaddq_u16(s, r);
+ int32x4_t d0_lo =
+ vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec));
+ int32x4_t d0_hi =
+ vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec));
+ d0_lo = vqrshlq_s32(d0_lo, round_shift);
+ d0_hi = vqrshlq_s32(d0_hi, round_shift);
+
+ uint16x8_t d0 = vcombine_u16(vqmovun_s32(d0_lo), vqmovun_s32(d0_hi));
+ d0 = vminq_u16(d0, max);
+ vst1q_u16(dst, d0);
+
+ src += 8;
+ ref += 8;
+ dst += 8;
+ width -= 8;
+ } while (width != 0);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+ }
+}
+
+static INLINE void highbd_convolve_2d_x_scale_8tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int subpel_x_qn, const int x_step_qn,
+ const InterpFilterParams *filter_params, ConvolveParams *conv_params,
+ const int offset) {
+ static const uint32_t kIdx[4] = { 0, 1, 2, 3 };
+ const uint32x4_t idx = vld1q_u32(kIdx);
+ const uint32x4_t subpel_mask = vdupq_n_u32(SCALE_SUBPEL_MASK);
+ const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
+ const int32x4_t offset_s32 = vdupq_n_s32(offset);
+
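+ // For output pixel i, the scalar equivalent is roughly:
+ //   x = subpel_x_qn + i * x_step_qn;
+ //   src_i = &src_ptr[x >> SCALE_SUBPEL_BITS];
+ //   filter_i = kernel for phase (x & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ // The loops below vectorize this index arithmetic four pixels at a time.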
+ if (w <= 4) {
+ int height = h;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int x_qn = subpel_x_qn;
+
+ // Load 4 src vectors at a time; they might be the same, but we have to
+ // calculate the indices anyway. Doing it in SIMD and then storing the
+ // indices is faster than evaluating the expression
+ // &src_ptr[((x_qn + 0 * x_step_qn) >> SCALE_SUBPEL_BITS)] 4 times.
+ // Ideally this would be a gather using the indices, but NEON does not
+ // have one, so we have to emulate it.
+ const uint32x4_t xqn_idx = vmlaq_n_u32(vdupq_n_u32(x_qn), idx, x_step_qn);
+ // Multiply the index by 2 to convert it into a byte offset, since
+ // sizeof(uint16_t) == 2.
+ const uint32x4_t src_idx_u32 =
+ vshlq_n_u32(vshrq_n_u32(xqn_idx, SCALE_SUBPEL_BITS), 1);
+#if AOM_ARCH_AARCH64
+ uint64x2_t src4[2];
+ src4[0] = vaddw_u32(vdupq_n_u64((const uint64_t)src_ptr),
+ vget_low_u32(src_idx_u32));
+ src4[1] = vaddw_u32(vdupq_n_u64((const uint64_t)src_ptr),
+ vget_high_u32(src_idx_u32));
+ int16_t *src4_ptr[4];
+ uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
+ vst1q_u64(tmp_ptr, src4[0]);
+ vst1q_u64(tmp_ptr + 2, src4[1]);
+#else
+ uint32x4_t src4;
+ src4 = vaddq_u32(vdupq_n_u32((const uint32_t)src_ptr), src_idx_u32);
+ int16_t *src4_ptr[4];
+ uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
+ vst1q_u32(tmp_ptr, src4);
+#endif // AOM_ARCH_AARCH64
+ // Same for the filter vectors
+ const int32x4_t filter_idx_s32 = vreinterpretq_s32_u32(
+ vshrq_n_u32(vandq_u32(xqn_idx, subpel_mask), SCALE_EXTRA_BITS));
+ int32_t x_filter4_idx[4];
+ vst1q_s32(x_filter4_idx, filter_idx_s32);
+ const int16_t *x_filter4_ptr[4];
+
+ // Load source
+ int16x8_t s0 = vld1q_s16(src4_ptr[0]);
+ int16x8_t s1 = vld1q_s16(src4_ptr[1]);
+ int16x8_t s2 = vld1q_s16(src4_ptr[2]);
+ int16x8_t s3 = vld1q_s16(src4_ptr[3]);
+
+ // We could easily do this using SIMD as well instead of calling the
+ // inline function 4 times.
+ x_filter4_ptr[0] =
+ av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[0]);
+ x_filter4_ptr[1] =
+ av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[1]);
+ x_filter4_ptr[2] =
+ av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[2]);
+ x_filter4_ptr[3] =
+ av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[3]);
+
+ // Actually load the filters
+ const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
+ const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
+ const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
+ const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);
+
+ // Group low and high parts and transpose
+ int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
+ vget_low_s16(x_filter1),
+ vget_low_s16(x_filter2),
+ vget_low_s16(x_filter3) };
+ int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
+ vget_high_s16(x_filter1),
+ vget_high_s16(x_filter2),
+ vget_high_s16(x_filter3) };
+ transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo);
+ transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi);
+
+ // Run the 2D Scale convolution
+ uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
+ s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
+
+ if (w == 2) {
+ store_u16_2x1(d + 0 * dst_stride, d0, 0);
+ } else {
+ vst1_u16(d + 0 * dst_stride, d0);
+ }
+
+ src_ptr += src_stride;
+ d += dst_stride;
+ height--;
+ } while (height > 0);
+ } else {
+ int height = h;
+
+ do {
+ int width = w;
+ int x_qn = subpel_x_qn;
+ uint16_t *d = dst_ptr;
+ const uint16_t *s = src_ptr;
+
+ do {
+ // Load 4 src vectors at a time; they might be the same, but we have to
+ // calculate the indices anyway. Doing it in SIMD and then storing the
+ // indices is faster than evaluating the expression
+ // &src_ptr[((x_qn + 0 * x_step_qn) >> SCALE_SUBPEL_BITS)] 4 times.
+ // Ideally this would be a gather using the indices, but NEON does not
+ // have one, so we have to emulate it.
+ const uint32x4_t xqn_idx =
+ vmlaq_n_u32(vdupq_n_u32(x_qn), idx, x_step_qn);
+ // Multiply the index by 2 to convert it into a byte offset, since
+ // sizeof(uint16_t) == 2.
+ const uint32x4_t src_idx_u32 =
+ vshlq_n_u32(vshrq_n_u32(xqn_idx, SCALE_SUBPEL_BITS), 1);
+#if AOM_ARCH_AARCH64
+ uint64x2_t src4[2];
+ src4[0] = vaddw_u32(vdupq_n_u64((const uint64_t)s),
+ vget_low_u32(src_idx_u32));
+ src4[1] = vaddw_u32(vdupq_n_u64((const uint64_t)s),
+ vget_high_u32(src_idx_u32));
+ int16_t *src4_ptr[4];
+ uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
+ vst1q_u64(tmp_ptr, src4[0]);
+ vst1q_u64(tmp_ptr + 2, src4[1]);
+#else
+ uint32x4_t src4;
+ src4 = vaddq_u32(vdupq_n_u32((const uint32_t)s), src_idx_u32);
+ int16_t *src4_ptr[4];
+ uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
+ vst1q_u32(tmp_ptr, src4);
+#endif // AOM_ARCH_AARCH64
+ // Same for the filter vectors
+ const int32x4_t filter_idx_s32 = vreinterpretq_s32_u32(
+ vshrq_n_u32(vandq_u32(xqn_idx, subpel_mask), SCALE_EXTRA_BITS));
+ int32_t x_filter4_idx[4];
+ vst1q_s32(x_filter4_idx, filter_idx_s32);
+ const int16_t *x_filter4_ptr[4];
+
+ // Load source
+ int16x8_t s0 = vld1q_s16(src4_ptr[0]);
+ int16x8_t s1 = vld1q_s16(src4_ptr[1]);
+ int16x8_t s2 = vld1q_s16(src4_ptr[2]);
+ int16x8_t s3 = vld1q_s16(src4_ptr[3]);
+
+ // We could easily do this using SIMD as well instead of calling the
+ // inline function 4 times.
+ x_filter4_ptr[0] = av1_get_interp_filter_subpel_kernel(
+ filter_params, x_filter4_idx[0]);
+ x_filter4_ptr[1] = av1_get_interp_filter_subpel_kernel(
+ filter_params, x_filter4_idx[1]);
+ x_filter4_ptr[2] = av1_get_interp_filter_subpel_kernel(
+ filter_params, x_filter4_idx[2]);
+ x_filter4_ptr[3] = av1_get_interp_filter_subpel_kernel(
+ filter_params, x_filter4_idx[3]);
+
+ // Actually load the filters
+ const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
+ const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
+ const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
+ const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);
+
+ // Group low and high parts and transpose
+ int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
+ vget_low_s16(x_filter1),
+ vget_low_s16(x_filter2),
+ vget_low_s16(x_filter3) };
+ int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
+ vget_high_s16(x_filter1),
+ vget_high_s16(x_filter2),
+ vget_high_s16(x_filter3) };
+ transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo);
+ transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi);
+
+        // Run the 2D scale X convolution.
+ uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
+ s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
+
+ vst1_u16(d, d0);
+
+ x_qn += 4 * x_step_qn;
+ d += 4;
+ width -= 4;
+ } while (width > 0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ height--;
+ } while (height > 0);
+ }
+}
+
+static INLINE void highbd_convolve_2d_y_scale_8tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int subpel_y_qn, const int y_step_qn,
+ const InterpFilterParams *filter_params, const int round1_bits,
+ const int offset) {
+ const int32x4_t offset_s32 = vdupq_n_s32(1 << offset);
+
+ const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_bits);
+ if (w <= 4) {
+ int height = h;
+ uint16_t *d = dst_ptr;
+ int y_qn = subpel_y_qn;
+
+ do {
+ const int16_t *s =
+ (const int16_t *)&src_ptr[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+ load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ const int16_t *y_filter_ptr =
+ av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx);
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+
+ uint16x4_t d0 = highbd_convolve8_4_srsub_s32_s16(
+ s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32,
+ offset_s32, vdupq_n_s32(0));
+
+ if (w == 2) {
+ store_u16_2x1(d, d0, 0);
+ } else {
+ vst1_u16(d, d0);
+ }
+
+ y_qn += y_step_qn;
+ d += dst_stride;
+ height--;
+ } while (height > 0);
+ } else {
+ int width = w;
+
+ do {
+ int height = h;
+ int y_qn = subpel_y_qn;
+
+ uint16_t *d = dst_ptr;
+
+ do {
+ const int16_t *s =
+ (const int16_t *)&src_ptr[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ const int16_t *y_filter_ptr =
+ av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx);
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+
+ uint16x8_t d0 = highbd_convolve8_8_srsub_s32_s16(
+ s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32,
+ offset_s32, vdupq_n_s32(0));
+ vst1q_u16(d, d0);
+
+ y_qn += y_step_qn;
+ d += dst_stride;
+ height--;
+ } while (height > 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ width -= 8;
+ } while (width > 0);
+ }
+}
+
+static INLINE void highbd_convolve_correct_offset_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int round_bits, const int offset, const int bd) {
+ const int32x4_t round_shift_s32 = vdupq_n_s32(-round_bits);
+ const int16x4_t offset_s16 = vdup_n_s16(offset);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ if (w <= 4) {
+ for (int y = 0; y < h; ++y) {
+ const int16x4_t s = vld1_s16((const int16_t *)src_ptr + y * src_stride);
+ const int32x4_t d0 =
+ vqrshlq_s32(vsubl_s16(s, offset_s16), round_shift_s32);
+ uint16x4_t d = vqmovun_s32(d0);
+ d = vmin_u16(d, vget_low_u16(max));
+ if (w == 2) {
+ store_u16_2x1(dst_ptr + y * dst_stride, d, 0);
+ } else {
+ vst1_u16(dst_ptr + y * dst_stride, d);
+ }
+ }
+ } else {
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; x += 8) {
+        // Subtract the round offset and apply the rounding shift.
+ const int16x8_t s =
+ vld1q_s16((const int16_t *)src_ptr + y * src_stride + x);
+ const int32x4_t d0 = vqrshlq_s32(vsubl_s16(vget_low_s16(s), offset_s16),
+ round_shift_s32);
+ const int32x4_t d1 = vqrshlq_s32(
+ vsubl_s16(vget_high_s16(s), offset_s16), round_shift_s32);
+ uint16x8_t d01 = vcombine_u16(vqmovun_s32(d0), vqmovun_s32(d1));
+ d01 = vminq_u16(d01, max);
+ vst1q_u16(dst_ptr + y * dst_stride + x, d01);
+ }
+ }
+ }
+}
+
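+// A rough scalar model of the correction above (for illustration only):
+//   tmp = src[y * src_stride + x] - offset;
+//   dst[y * dst_stride + x] =
+//       clamp((tmp + (1 << (round_bits - 1))) >> round_bits, 0, (1 << bd) - 1);
+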
+void av1_highbd_convolve_2d_scale_neon(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
+ ConvolveParams *conv_params, int bd) {
+ uint16_t *im_block = (uint16_t *)aom_memalign(
+ 16, 2 * sizeof(uint16_t) * MAX_SB_SIZE * (MAX_SB_SIZE + MAX_FILTER_TAP));
+ if (!im_block) return;
+ uint16_t *im_block2 = (uint16_t *)aom_memalign(
+ 16, 2 * sizeof(uint16_t) * MAX_SB_SIZE * (MAX_SB_SIZE + MAX_FILTER_TAP));
+ if (!im_block2) {
+    aom_free(im_block);  // Free the first block and return.
+ return;
+ }
+
+ int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
+ filter_params_y->taps;
+ const int im_stride = MAX_SB_SIZE;
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ assert(bits >= 0);
+
+ const int vert_offset = filter_params_y->taps / 2 - 1;
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const int x_offset_bits = (1 << (bd + FILTER_BITS - 1));
+ const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int y_offset_correction =
+ ((1 << (y_offset_bits - conv_params->round_1)) +
+ (1 << (y_offset_bits - conv_params->round_1 - 1)));
+
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ const int dst16_stride = conv_params->dst_stride;
+
+ const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+
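+  // Two-pass pipeline: scale horizontally into im_block, then vertically into
+  // either the compound buffer (dst16) or im_block2, and finally run the
+  // averaging / offset-correction stage below.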
+ highbd_convolve_2d_x_scale_8tap_neon(
+ src_ptr, src_stride, im_block, im_stride, w, im_h, subpel_x_qn, x_step_qn,
+ filter_params_x, conv_params, x_offset_bits);
+ if (conv_params->is_compound && !conv_params->do_average) {
+ highbd_convolve_2d_y_scale_8tap_neon(
+ im_block, im_stride, dst16, dst16_stride, w, h, subpel_y_qn, y_step_qn,
+ filter_params_y, conv_params->round_1, y_offset_bits);
+ } else {
+ highbd_convolve_2d_y_scale_8tap_neon(
+ im_block, im_stride, im_block2, im_stride, w, h, subpel_y_qn, y_step_qn,
+ filter_params_y, conv_params->round_1, y_offset_bits);
+ }
+
+  // Do the compound averaging outside the loop to avoid branching within the
+  // main loop.
+ if (conv_params->is_compound) {
+ if (conv_params->do_average) {
+ if (conv_params->use_dist_wtd_comp_avg) {
+ highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w,
+ h, conv_params, bits, y_offset_correction,
+ bd);
+ } else {
+ highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
+ conv_params, bits, y_offset_correction, bd);
+ }
+ }
+ } else {
+ highbd_convolve_correct_offset_neon(im_block2, im_stride, dst, dst_stride,
+ w, h, bits, y_offset_correction, bd);
+ }
+ aom_free(im_block);
+ aom_free(im_block2);
+}
diff --git a/av1/common/arm/highbd_inv_txfm_neon.c b/av1/common/arm/highbd_inv_txfm_neon.c
index d197fca6c..84bc8fd96 100644
--- a/av1/common/arm/highbd_inv_txfm_neon.c
+++ b/av1/common/arm/highbd_inv_txfm_neon.c
@@ -590,7 +590,7 @@ static void iadst4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
int bd, int out_shift) {
const int32_t *sinpi = sinpi_arr(bit);
const int32x4_t zero = vdupq_n_s32(0);
- int64x2_t rnding = vdupq_n_s64(1 << (bit + 4 - 1));
+ int64x2_t rnding = vdupq_n_s64(1ll << (bit + 4 - 1));
const int32x2_t mul = vdup_n_s32(1 << 4);
int32x4_t t;
int32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
diff --git a/av1/common/arm/highbd_reconinter_neon.c b/av1/common/arm/highbd_reconinter_neon.c
new file mode 100644
index 000000000..573d3c165
--- /dev/null
+++ b/av1/common/arm/highbd_reconinter_neon.c
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <stdbool.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/mem.h"
+#include "config/av1_rtcd.h"
+
+static INLINE void diffwtd_mask_highbd_neon(uint8_t *mask, bool inverse,
+ const uint16_t *src0,
+ int src0_stride,
+ const uint16_t *src1,
+ int src1_stride, int h, int w,
+ const unsigned int bd) {
+ assert(DIFF_FACTOR > 0);
+ uint8x16_t max_alpha = vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA);
+ uint8x16_t mask_base = vdupq_n_u8(38);
+ uint8x16_t mask_diff = vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA - 38);
+
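+  // Scalar model of the mask computation below (illustrative): with
+  //   diff = |src0[x] - src1[x]| >> (DIFF_FACTOR_LOG2 + bd - 8),
+  // the forward mask is min(38 + diff, AOM_BLEND_A64_MAX_ALPHA) and the
+  // inverse mask is its complement, i.e. max(26 - diff, 0).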
+ if (bd == 8) {
+ if (w >= 16) {
+ do {
+ uint8_t *mask_ptr = mask;
+ const uint16_t *src0_ptr = src0;
+ const uint16_t *src1_ptr = src1;
+ int width = w;
+ do {
+ uint16x8_t s0_lo = vld1q_u16(src0_ptr);
+ uint16x8_t s0_hi = vld1q_u16(src0_ptr + 8);
+ uint16x8_t s1_lo = vld1q_u16(src1_ptr);
+ uint16x8_t s1_hi = vld1q_u16(src1_ptr + 8);
+
+ uint16x8_t diff_lo_u16 = vabdq_u16(s0_lo, s1_lo);
+ uint16x8_t diff_hi_u16 = vabdq_u16(s0_hi, s1_hi);
+ uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, DIFF_FACTOR_LOG2);
+ uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, DIFF_FACTOR_LOG2);
+ uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8);
+
+ uint8x16_t m;
+ if (inverse) {
+ m = vqsubq_u8(mask_diff, diff);
+ } else {
+ m = vminq_u8(vaddq_u8(diff, mask_base), max_alpha);
+ }
+
+ vst1q_u8(mask_ptr, m);
+
+ src0_ptr += 16;
+ src1_ptr += 16;
+ mask_ptr += 16;
+ width -= 16;
+ } while (width != 0);
+ mask += w;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8_t *mask_ptr = mask;
+ const uint16_t *src0_ptr = src0;
+ const uint16_t *src1_ptr = src1;
+ int width = w;
+ do {
+ uint16x8_t s0 = vld1q_u16(src0_ptr);
+ uint16x8_t s1 = vld1q_u16(src1_ptr);
+
+ uint16x8_t diff_u16 = vabdq_u16(s0, s1);
+ uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2);
+ uint8x8_t m;
+ if (inverse) {
+ m = vqsub_u8(vget_low_u8(mask_diff), diff_u8);
+ } else {
+ m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)),
+ vget_low_u8(max_alpha));
+ }
+
+ vst1_u8(mask_ptr, m);
+
+ src0_ptr += 8;
+ src1_ptr += 8;
+ mask_ptr += 8;
+ width -= 8;
+ } while (width != 0);
+ mask += w;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ } while (--h != 0);
+ } else if (w == 4) {
+ do {
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t diff_u16 = vabdq_u16(s0, s1);
+ uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2);
+ uint8x8_t m;
+ if (inverse) {
+ m = vqsub_u8(vget_low_u8(mask_diff), diff_u8);
+ } else {
+ m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)),
+ vget_low_u8(max_alpha));
+ }
+
+ store_u8_4x1(mask, m, 0);
+ store_u8_4x1(mask + w, m, 1);
+
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ mask += 2 * w;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if (bd == 10) {
+ if (w >= 16) {
+ do {
+ uint8_t *mask_ptr = mask;
+ const uint16_t *src0_ptr = src0;
+ const uint16_t *src1_ptr = src1;
+ int width = w;
+ do {
+ uint16x8_t s0_lo = vld1q_u16(src0_ptr);
+ uint16x8_t s0_hi = vld1q_u16(src0_ptr + 8);
+ uint16x8_t s1_lo = vld1q_u16(src1_ptr);
+ uint16x8_t s1_hi = vld1q_u16(src1_ptr + 8);
+
+ uint16x8_t diff_lo_u16 = vabdq_u16(s0_lo, s1_lo);
+ uint16x8_t diff_hi_u16 = vabdq_u16(s0_hi, s1_hi);
+ uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, 2 + DIFF_FACTOR_LOG2);
+ uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, 2 + DIFF_FACTOR_LOG2);
+ uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8);
+
+ uint8x16_t m;
+ if (inverse) {
+ m = vqsubq_u8(mask_diff, diff);
+ } else {
+ m = vminq_u8(vaddq_u8(diff, mask_base), max_alpha);
+ }
+
+ vst1q_u8(mask_ptr, m);
+
+ src0_ptr += 16;
+ src1_ptr += 16;
+ mask_ptr += 16;
+ width -= 16;
+ } while (width != 0);
+ mask += w;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8_t *mask_ptr = mask;
+ const uint16_t *src0_ptr = src0;
+ const uint16_t *src1_ptr = src1;
+ int width = w;
+ do {
+ uint16x8_t s0 = vld1q_u16(src0_ptr);
+ uint16x8_t s1 = vld1q_u16(src1_ptr);
+
+ uint16x8_t diff_u16 = vabdq_u16(s0, s1);
+ uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 2 + DIFF_FACTOR_LOG2);
+ uint8x8_t m;
+ if (inverse) {
+ m = vqsub_u8(vget_low_u8(mask_diff), diff_u8);
+ } else {
+ m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)),
+ vget_low_u8(max_alpha));
+ }
+
+ vst1_u8(mask_ptr, m);
+
+ src0_ptr += 8;
+ src1_ptr += 8;
+ mask_ptr += 8;
+ width -= 8;
+ } while (width != 0);
+ mask += w;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ } while (--h != 0);
+ } else if (w == 4) {
+ do {
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t diff_u16 = vabdq_u16(s0, s1);
+ uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 2 + DIFF_FACTOR_LOG2);
+ uint8x8_t m;
+ if (inverse) {
+ m = vqsub_u8(vget_low_u8(mask_diff), diff_u8);
+ } else {
+ m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)),
+ vget_low_u8(max_alpha));
+ }
+
+ store_u8_4x1(mask, m, 0);
+ store_u8_4x1(mask + w, m, 1);
+
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ mask += 2 * w;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else {
+ assert(bd == 12);
+ if (w >= 16) {
+ do {
+ uint8_t *mask_ptr = mask;
+ const uint16_t *src0_ptr = src0;
+ const uint16_t *src1_ptr = src1;
+ int width = w;
+ do {
+ uint16x8_t s0_lo = vld1q_u16(src0_ptr);
+ uint16x8_t s0_hi = vld1q_u16(src0_ptr + 8);
+ uint16x8_t s1_lo = vld1q_u16(src1_ptr);
+ uint16x8_t s1_hi = vld1q_u16(src1_ptr + 8);
+
+ uint16x8_t diff_lo_u16 = vabdq_u16(s0_lo, s1_lo);
+ uint16x8_t diff_hi_u16 = vabdq_u16(s0_hi, s1_hi);
+ uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, 4 + DIFF_FACTOR_LOG2);
+ uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, 4 + DIFF_FACTOR_LOG2);
+ uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8);
+
+ uint8x16_t m;
+ if (inverse) {
+ m = vqsubq_u8(mask_diff, diff);
+ } else {
+ m = vminq_u8(vaddq_u8(diff, mask_base), max_alpha);
+ }
+
+ vst1q_u8(mask_ptr, m);
+
+ src0_ptr += 16;
+ src1_ptr += 16;
+ mask_ptr += 16;
+ width -= 16;
+ } while (width != 0);
+ mask += w;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8_t *mask_ptr = mask;
+ const uint16_t *src0_ptr = src0;
+ const uint16_t *src1_ptr = src1;
+ int width = w;
+ do {
+ uint16x8_t s0 = vld1q_u16(src0_ptr);
+ uint16x8_t s1 = vld1q_u16(src1_ptr);
+
+ uint16x8_t diff_u16 = vabdq_u16(s0, s1);
+ uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 4 + DIFF_FACTOR_LOG2);
+ uint8x8_t m;
+ if (inverse) {
+ m = vqsub_u8(vget_low_u8(mask_diff), diff_u8);
+ } else {
+ m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)),
+ vget_low_u8(max_alpha));
+ }
+
+ vst1_u8(mask_ptr, m);
+
+ src0_ptr += 8;
+ src1_ptr += 8;
+ mask_ptr += 8;
+ width -= 8;
+ } while (width != 0);
+ mask += w;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ } while (--h != 0);
+ } else if (w == 4) {
+ do {
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t diff_u16 = vabdq_u16(s0, s1);
+ uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 4 + DIFF_FACTOR_LOG2);
+ uint8x8_t m;
+ if (inverse) {
+ m = vqsub_u8(vget_low_u8(mask_diff), diff_u8);
+ } else {
+ m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)),
+ vget_low_u8(max_alpha));
+ }
+
+ store_u8_4x1(mask, m, 0);
+ store_u8_4x1(mask + w, m, 1);
+
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ mask += 2 * w;
+ h -= 2;
+ } while (h != 0);
+ }
+ }
+}
+
+void av1_build_compound_diffwtd_mask_highbd_neon(
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
+ int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
+ int bd) {
+ assert(h % 4 == 0);
+ assert(w % 4 == 0);
+ assert(mask_type == DIFFWTD_38_INV || mask_type == DIFFWTD_38);
+
+ if (mask_type == DIFFWTD_38) {
+ diffwtd_mask_highbd_neon(mask, /*inverse=*/false, CONVERT_TO_SHORTPTR(src0),
+ src0_stride, CONVERT_TO_SHORTPTR(src1),
+ src1_stride, h, w, bd);
+ } else { // mask_type == DIFFWTD_38_INV
+ diffwtd_mask_highbd_neon(mask, /*inverse=*/true, CONVERT_TO_SHORTPTR(src0),
+ src0_stride, CONVERT_TO_SHORTPTR(src1),
+ src1_stride, h, w, bd);
+ }
+}
diff --git a/av1/common/arm/highbd_reconintra_neon.c b/av1/common/arm/highbd_reconintra_neon.c
new file mode 100644
index 000000000..170491b50
--- /dev/null
+++ b/av1/common/arm/highbd_reconintra_neon.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+
+#define MAX_UPSAMPLE_SZ 16
+
+void av1_highbd_filter_intra_edge_neon(uint16_t *p, int sz, int strength) {
+ if (!strength) return;
+ assert(sz >= 0 && sz <= 129);
+
+ DECLARE_ALIGNED(16, static const uint16_t,
+ idx[8]) = { 0, 1, 2, 3, 4, 5, 6, 7 };
+ const uint16x8_t index = vld1q_u16(idx);
+
+ uint16_t edge[160]; // Max value of sz + enough padding for vector accesses.
+ memcpy(edge + 1, p, sz * sizeof(*p));
+
+ // Populate extra space appropriately.
+ edge[0] = edge[1];
+ edge[sz + 1] = edge[sz];
+ edge[sz + 2] = edge[sz];
+
+ // Don't overwrite first pixel.
+ uint16_t *dst = p + 1;
+ sz--;
+
+ if (strength == 1) { // Filter: {4, 8, 4}.
+ const uint16_t *src = edge + 1;
+
+ while (sz >= 8) {
+ uint16x8_t s0 = vld1q_u16(src);
+ uint16x8_t s1 = vld1q_u16(src + 1);
+ uint16x8_t s2 = vld1q_u16(src + 2);
+
+ // Make use of the identity:
+ // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2
+ uint16x8_t t0 = vaddq_u16(s0, s2);
+ uint16x8_t t1 = vaddq_u16(s1, s1);
+ uint16x8_t sum = vaddq_u16(t0, t1);
+ uint16x8_t res = vrshrq_n_u16(sum, 2);
+
+ vst1q_u16(dst, res);
+
+ src += 8;
+ dst += 8;
+ sz -= 8;
+ }
+
+ if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values.
+ uint16x8_t s0 = vld1q_u16(src);
+ uint16x8_t s1 = vld1q_u16(src + 1);
+ uint16x8_t s2 = vld1q_u16(src + 2);
+
+ // Make use of the identity:
+ // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2
+ uint16x8_t t0 = vaddq_u16(s0, s2);
+ uint16x8_t t1 = vaddq_u16(s1, s1);
+ uint16x8_t sum = vaddq_u16(t0, t1);
+ uint16x8_t res = vrshrq_n_u16(sum, 2);
+
+ // Mask off out-of-bounds indices.
+ uint16x8_t current_dst = vld1q_u16(dst);
+ uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index);
+ res = vbslq_u16(mask, res, current_dst);
+
+ vst1q_u16(dst, res);
+ }
+ } else if (strength == 2) { // Filter: {5, 6, 5}.
+ const uint16_t *src = edge + 1;
+
+ const uint16x8x3_t filter = { { vdupq_n_u16(5), vdupq_n_u16(6),
+ vdupq_n_u16(5) } };
+ while (sz >= 8) {
+ uint16x8_t s0 = vld1q_u16(src);
+ uint16x8_t s1 = vld1q_u16(src + 1);
+ uint16x8_t s2 = vld1q_u16(src + 2);
+
+ uint16x8_t accum = vmulq_u16(s0, filter.val[0]);
+ accum = vmlaq_u16(accum, s1, filter.val[1]);
+ accum = vmlaq_u16(accum, s2, filter.val[2]);
+ uint16x8_t res = vrshrq_n_u16(accum, 4);
+
+ vst1q_u16(dst, res);
+
+ src += 8;
+ dst += 8;
+ sz -= 8;
+ }
+
+ if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values.
+ uint16x8_t s0 = vld1q_u16(src);
+ uint16x8_t s1 = vld1q_u16(src + 1);
+ uint16x8_t s2 = vld1q_u16(src + 2);
+
+ uint16x8_t accum = vmulq_u16(s0, filter.val[0]);
+ accum = vmlaq_u16(accum, s1, filter.val[1]);
+ accum = vmlaq_u16(accum, s2, filter.val[2]);
+ uint16x8_t res = vrshrq_n_u16(accum, 4);
+
+ // Mask off out-of-bounds indices.
+ uint16x8_t current_dst = vld1q_u16(dst);
+ uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index);
+ res = vbslq_u16(mask, res, current_dst);
+
+ vst1q_u16(dst, res);
+ }
+  } else {  // Filter: {2, 4, 4, 4, 2}.
+ const uint16_t *src = edge;
+
+ while (sz >= 8) {
+ uint16x8_t s0 = vld1q_u16(src);
+ uint16x8_t s1 = vld1q_u16(src + 1);
+ uint16x8_t s2 = vld1q_u16(src + 2);
+ uint16x8_t s3 = vld1q_u16(src + 3);
+ uint16x8_t s4 = vld1q_u16(src + 4);
+
+ // Make use of the identity:
+ // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3
+ uint16x8_t t0 = vaddq_u16(s0, s4);
+ uint16x8_t t1 = vaddq_u16(s1, s2);
+ t1 = vaddq_u16(t1, s3);
+ t1 = vaddq_u16(t1, t1);
+ uint16x8_t sum = vaddq_u16(t0, t1);
+ uint16x8_t res = vrshrq_n_u16(sum, 3);
+
+ vst1q_u16(dst, res);
+
+ src += 8;
+ dst += 8;
+ sz -= 8;
+ }
+
+ if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values.
+ uint16x8_t s0 = vld1q_u16(src);
+ uint16x8_t s1 = vld1q_u16(src + 1);
+ uint16x8_t s2 = vld1q_u16(src + 2);
+ uint16x8_t s3 = vld1q_u16(src + 3);
+ uint16x8_t s4 = vld1q_u16(src + 4);
+
+ // Make use of the identity:
+ // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3
+ uint16x8_t t0 = vaddq_u16(s0, s4);
+ uint16x8_t t1 = vaddq_u16(s1, s2);
+ t1 = vaddq_u16(t1, s3);
+ t1 = vaddq_u16(t1, t1);
+ uint16x8_t sum = vaddq_u16(t0, t1);
+ uint16x8_t res = vrshrq_n_u16(sum, 3);
+
+ // Mask off out-of-bounds indices.
+ uint16x8_t current_dst = vld1q_u16(dst);
+ uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index);
+ res = vbslq_u16(mask, res, current_dst);
+
+ vst1q_u16(dst, res);
+ }
+ }
+}
+
+void av1_highbd_upsample_intra_edge_neon(uint16_t *p, int sz, int bd) {
+ if (!sz) return;
+
+ assert(sz <= MAX_UPSAMPLE_SZ);
+
+ uint16_t edge[MAX_UPSAMPLE_SZ + 3];
+ const uint16_t *src = edge;
+
+ // Copy p[-1..(sz-1)] and pad out both ends.
+ edge[0] = p[-1];
+ edge[1] = p[-1];
+ memcpy(edge + 2, p, sz * 2);
+ edge[sz + 2] = p[sz - 1];
+ p[-2] = p[-1];
+
+ uint16x8_t pixel_val_max = vdupq_n_u16((1 << bd) - 1);
+
+ uint16_t *dst = p - 1;
+
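+  // Both branches below apply the same 2x upsampling filter; a scalar sketch:
+  //   out[2*i]     = clamp((-in[i-1] + 9 * in[i] + 9 * in[i+1] - in[i+2] + 8)
+  //                            >> 4,
+  //                        0, (1 << bd) - 1);
+  //   out[2*i + 1] = in[i+1];
+  // The bd == 12 path widens to 32 bits to avoid overflowing uint16_t.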
+ if (bd == 12) {
+ do {
+ uint16x8_t s0 = vld1q_u16(src);
+ uint16x8_t s1 = vld1q_u16(src + 1);
+ uint16x8_t s2 = vld1q_u16(src + 2);
+ uint16x8_t s3 = vld1q_u16(src + 3);
+
+ uint16x8_t t0 = vaddq_u16(s1, s2);
+ uint16x8_t t1 = vaddq_u16(s0, s3);
+ uint32x4_t acc0 = vmull_n_u16(vget_low_u16(t0), 9);
+ acc0 = vqsubq_u32(acc0, vmovl_u16(vget_low_u16(t1)));
+ uint32x4_t acc1 = vmull_n_u16(vget_high_u16(t0), 9);
+ acc1 = vqsubq_u32(acc1, vmovl_u16(vget_high_u16(t1)));
+
+ uint16x8x2_t res;
+ res.val[0] = vcombine_u16(vrshrn_n_u32(acc0, 4), vrshrn_n_u32(acc1, 4));
+ // Clamp pixel values at bitdepth maximum.
+ res.val[0] = vminq_u16(res.val[0], pixel_val_max);
+ res.val[1] = s2;
+
+ vst2q_u16(dst, res);
+
+ src += 8;
+ dst += 16;
+ sz -= 8;
+ } while (sz > 0);
+ } else { // Bit depth is 8 or 10.
+ do {
+ uint16x8_t s0 = vld1q_u16(src);
+ uint16x8_t s1 = vld1q_u16(src + 1);
+ uint16x8_t s2 = vld1q_u16(src + 2);
+ uint16x8_t s3 = vld1q_u16(src + 3);
+
+ uint16x8_t t0 = vaddq_u16(s0, s3);
+ uint16x8_t t1 = vaddq_u16(s1, s2);
+ t1 = vmulq_n_u16(t1, 9);
+ t1 = vqsubq_u16(t1, t0);
+
+ uint16x8x2_t res;
+ res.val[0] = vrshrq_n_u16(t1, 4);
+ // Clamp pixel values at bitdepth maximum.
+ res.val[0] = vminq_u16(res.val[0], pixel_val_max);
+ res.val[1] = s2;
+
+ vst2q_u16(dst, res);
+
+ src += 8;
+ dst += 16;
+ sz -= 8;
+ } while (sz > 0);
+ }
+}
diff --git a/av1/common/arm/highbd_warp_plane_neon.c b/av1/common/arm/highbd_warp_plane_neon.c
new file mode 100644
index 000000000..0729df6e3
--- /dev/null
+++ b/av1/common/arm/highbd_warp_plane_neon.c
@@ -0,0 +1,560 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <stdbool.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/scale.h"
+#include "av1/common/warped_motion.h"
+#include "config/av1_rtcd.h"
+
+static INLINE int16x8_t load_filters_1(int ofs) {
+ const int ofs0 = ROUND_POWER_OF_TWO(ofs, WARPEDDIFF_PREC_BITS);
+
+ const int16_t *base =
+ (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8;
+ return vld1q_s16(base + ofs0 * 8);
+}
+
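+// The av1_warped_filter table covers both negative and positive subpel
+// offsets; biasing the base pointer by WARPEDPIXEL_PREC_SHIFTS entries
+// (8 taps each) lets the rounded, possibly negative, offset index the
+// table directly.
+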
+static INLINE void load_filters_4(int16x8_t out[], int ofs, int stride) {
+ const int ofs0 = ROUND_POWER_OF_TWO(ofs + stride * 0, WARPEDDIFF_PREC_BITS);
+ const int ofs1 = ROUND_POWER_OF_TWO(ofs + stride * 1, WARPEDDIFF_PREC_BITS);
+ const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS);
+ const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS);
+
+ const int16_t *base =
+ (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8;
+ out[0] = vld1q_s16(base + ofs0 * 8);
+ out[1] = vld1q_s16(base + ofs1 * 8);
+ out[2] = vld1q_s16(base + ofs2 * 8);
+ out[3] = vld1q_s16(base + ofs3 * 8);
+}
+
+static INLINE void load_filters_8(int16x8_t out[], int ofs, int stride) {
+ const int ofs0 = ROUND_POWER_OF_TWO(ofs + stride * 0, WARPEDDIFF_PREC_BITS);
+ const int ofs1 = ROUND_POWER_OF_TWO(ofs + stride * 1, WARPEDDIFF_PREC_BITS);
+ const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS);
+ const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS);
+ const int ofs4 = ROUND_POWER_OF_TWO(ofs + stride * 4, WARPEDDIFF_PREC_BITS);
+ const int ofs5 = ROUND_POWER_OF_TWO(ofs + stride * 5, WARPEDDIFF_PREC_BITS);
+ const int ofs6 = ROUND_POWER_OF_TWO(ofs + stride * 6, WARPEDDIFF_PREC_BITS);
+ const int ofs7 = ROUND_POWER_OF_TWO(ofs + stride * 7, WARPEDDIFF_PREC_BITS);
+
+ const int16_t *base =
+ (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8;
+ out[0] = vld1q_s16(base + ofs0 * 8);
+ out[1] = vld1q_s16(base + ofs1 * 8);
+ out[2] = vld1q_s16(base + ofs2 * 8);
+ out[3] = vld1q_s16(base + ofs3 * 8);
+ out[4] = vld1q_s16(base + ofs4 * 8);
+ out[5] = vld1q_s16(base + ofs5 * 8);
+ out[6] = vld1q_s16(base + ofs6 * 8);
+ out[7] = vld1q_s16(base + ofs7 * 8);
+}
+
+static INLINE int16x8_t warp_affine_horizontal_step_4x1_f4_neon(
+ int bd, int sx, int alpha, uint16x8x2_t in) {
+ int16x8_t f[4];
+ load_filters_4(f, sx, alpha);
+
+ int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 0);
+ int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 1);
+ int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 2);
+ int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 3);
+
+ int32x4_t m0 = vmull_s16(vget_low_s16(f[0]), vget_low_s16(rv0));
+ m0 = vmlal_s16(m0, vget_high_s16(f[0]), vget_high_s16(rv0));
+ int32x4_t m1 = vmull_s16(vget_low_s16(f[1]), vget_low_s16(rv1));
+ m1 = vmlal_s16(m1, vget_high_s16(f[1]), vget_high_s16(rv1));
+ int32x4_t m2 = vmull_s16(vget_low_s16(f[2]), vget_low_s16(rv2));
+ m2 = vmlal_s16(m2, vget_high_s16(f[2]), vget_high_s16(rv2));
+ int32x4_t m3 = vmull_s16(vget_low_s16(f[3]), vget_low_s16(rv3));
+ m3 = vmlal_s16(m3, vget_high_s16(f[3]), vget_high_s16(rv3));
+
+ int32x4_t m0123[] = { m0, m1, m2, m3 };
+
+ const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+
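+  // horizontal_add_4d_s32x4 reduces each of the four vectors to a single
+  // lane-wise sum, yielding one filtered sample per output lane.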
+ int32x4_t res = horizontal_add_4d_s32x4(m0123);
+ res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz));
+ res = vrshlq_s32(res, vdupq_n_s32(-round0));
+ return vcombine_s16(vmovn_s32(res), vdup_n_s16(0));
+}
+
+static INLINE int16x8_t warp_affine_horizontal_step_8x1_f8_neon(
+ int bd, int sx, int alpha, uint16x8x2_t in) {
+ const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+
+ int16x8_t f[8];
+ load_filters_8(f, sx, alpha);
+
+ int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 0);
+ int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 1);
+ int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 2);
+ int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 3);
+ int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 4);
+ int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 5);
+ int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 6);
+ int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 7);
+
+ int32x4_t m0 = vmull_s16(vget_low_s16(f[0]), vget_low_s16(rv0));
+ m0 = vmlal_s16(m0, vget_high_s16(f[0]), vget_high_s16(rv0));
+ int32x4_t m1 = vmull_s16(vget_low_s16(f[1]), vget_low_s16(rv1));
+ m1 = vmlal_s16(m1, vget_high_s16(f[1]), vget_high_s16(rv1));
+ int32x4_t m2 = vmull_s16(vget_low_s16(f[2]), vget_low_s16(rv2));
+ m2 = vmlal_s16(m2, vget_high_s16(f[2]), vget_high_s16(rv2));
+ int32x4_t m3 = vmull_s16(vget_low_s16(f[3]), vget_low_s16(rv3));
+ m3 = vmlal_s16(m3, vget_high_s16(f[3]), vget_high_s16(rv3));
+ int32x4_t m4 = vmull_s16(vget_low_s16(f[4]), vget_low_s16(rv4));
+ m4 = vmlal_s16(m4, vget_high_s16(f[4]), vget_high_s16(rv4));
+ int32x4_t m5 = vmull_s16(vget_low_s16(f[5]), vget_low_s16(rv5));
+ m5 = vmlal_s16(m5, vget_high_s16(f[5]), vget_high_s16(rv5));
+ int32x4_t m6 = vmull_s16(vget_low_s16(f[6]), vget_low_s16(rv6));
+ m6 = vmlal_s16(m6, vget_high_s16(f[6]), vget_high_s16(rv6));
+ int32x4_t m7 = vmull_s16(vget_low_s16(f[7]), vget_low_s16(rv7));
+ m7 = vmlal_s16(m7, vget_high_s16(f[7]), vget_high_s16(rv7));
+
+ int32x4_t m0123[] = { m0, m1, m2, m3 };
+ int32x4_t m4567[] = { m4, m5, m6, m7 };
+
+ int32x4_t res0 = horizontal_add_4d_s32x4(m0123);
+ int32x4_t res1 = horizontal_add_4d_s32x4(m4567);
+ res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz));
+ res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz));
+ res0 = vrshlq_s32(res0, vdupq_n_s32(-round0));
+ res1 = vrshlq_s32(res1, vdupq_n_s32(-round0));
+ return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
+}
+
+static INLINE void warp_affine_horizontal_neon(const uint16_t *ref, int width,
+ int height, int stride,
+ int p_width, int16_t alpha,
+ int16_t beta, int iy4, int sx4,
+ int ix4, int16x8_t tmp[],
+ int bd) {
+ const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
+
+ if (ix4 <= -7) {
+ for (int k = 0; k < 15; ++k) {
+ int iy = clamp(iy4 + k - 7, 0, height - 1);
+ int32_t dup_val = (1 << (bd + FILTER_BITS - round0 - 1)) +
+ ref[iy * stride] * (1 << (FILTER_BITS - round0));
+ tmp[k] = vdupq_n_s16(dup_val);
+ }
+ return;
+ } else if (ix4 >= width + 6) {
+ for (int k = 0; k < 15; ++k) {
+ int iy = clamp(iy4 + k - 7, 0, height - 1);
+ int32_t dup_val =
+ (1 << (bd + FILTER_BITS - round0 - 1)) +
+ ref[iy * stride + (width - 1)] * (1 << (FILTER_BITS - round0));
+ tmp[k] = vdupq_n_s16(dup_val);
+ }
+ return;
+ }
+
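+  // General case: load 15 rows of 16 pixels each, replacing any pixels that
+  // fall outside [0, width) with the nearest edge pixel before filtering.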
+ for (int k = 0; k < 15; ++k) {
+ const int iy = clamp(iy4 + k - 7, 0, height - 1);
+ uint16x8x2_t in = vld1q_u16_x2(ref + iy * stride + ix4 - 7);
+
+ const int out_of_boundary_left = -(ix4 - 6);
+ const int out_of_boundary_right = (ix4 + 8) - width;
+
+ const uint16_t k0[16] = { 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15 };
+ const uint16x8_t indx0 = vld1q_u16(&k0[0]);
+ const uint16x8_t indx1 = vld1q_u16(&k0[8]);
+
+ if (out_of_boundary_left >= 0) {
+ uint16x8_t cmp_vec = vdupq_n_u16(out_of_boundary_left);
+ uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride]);
+ uint16x8_t mask0 = vcleq_u16(indx0, cmp_vec);
+ uint16x8_t mask1 = vcleq_u16(indx1, cmp_vec);
+ in.val[0] = vbslq_u16(mask0, vec_dup, in.val[0]);
+ in.val[1] = vbslq_u16(mask1, vec_dup, in.val[1]);
+ }
+ if (out_of_boundary_right >= 0) {
+ uint16x8_t cmp_vec = vdupq_n_u16(15 - out_of_boundary_right);
+ uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride + width - 1]);
+ uint16x8_t mask0 = vcgeq_u16(indx0, cmp_vec);
+ uint16x8_t mask1 = vcgeq_u16(indx1, cmp_vec);
+ in.val[0] = vbslq_u16(mask0, vec_dup, in.val[0]);
+ in.val[1] = vbslq_u16(mask1, vec_dup, in.val[1]);
+ }
+
+ const int sx = sx4 + beta * (k - 3);
+ if (p_width == 4) {
+ tmp[k] = warp_affine_horizontal_step_4x1_f4_neon(bd, sx, alpha, in);
+ } else {
+ tmp[k] = warp_affine_horizontal_step_8x1_f8_neon(bd, sx, alpha, in);
+ }
+ }
+}
+
+static INLINE uint16x4_t clip_pixel_highbd_vec(int32x4_t val, int bd) {
+ const int limit = (1 << bd) - 1;
+ return vqmovun_s32(vminq_s32(val, vdupq_n_s32(limit)));
+}
+
+static INLINE int32x4_t
+warp_affine_vertical_filter_4x1_f1_neon(const int16x8_t *tmp, int sy) {
+ const int16x8_t f = load_filters_1(sy);
+ const int16x4_t f0123 = vget_low_s16(f);
+ const int16x4_t f4567 = vget_high_s16(f);
+
+ int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3);
+ return m0123;
+}
+
+static INLINE int32x4x2_t
+warp_affine_vertical_filter_8x1_f1_neon(const int16x8_t *tmp, int sy) {
+ const int16x8_t f = load_filters_1(sy);
+ const int16x4_t f0123 = vget_low_s16(f);
+ const int16x4_t f4567 = vget_high_s16(f);
+
+ int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3);
+
+ int32x4_t m4567 = vmull_lane_s16(vget_high_s16(tmp[0]), f0123, 0);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[1]), f0123, 1);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[2]), f0123, 2);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[3]), f0123, 3);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[4]), f4567, 0);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[5]), f4567, 1);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[6]), f4567, 2);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[7]), f4567, 3);
+ return (int32x4x2_t){ { m0123, m4567 } };
+}
+
+static INLINE int32x4_t warp_affine_vertical_filter_4x1_f4_neon(
+ const int16x8_t *tmp, int sy, int gamma) {
+ int16x8_t s0, s1, s2, s3;
+ transpose_elems_s16_4x8(
+ vget_low_s16(tmp[0]), vget_low_s16(tmp[1]), vget_low_s16(tmp[2]),
+ vget_low_s16(tmp[3]), vget_low_s16(tmp[4]), vget_low_s16(tmp[5]),
+ vget_low_s16(tmp[6]), vget_low_s16(tmp[7]), &s0, &s1, &s2, &s3);
+
+ int16x8_t f[4];
+ load_filters_4(f, sy, gamma);
+
+ int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0]));
+ m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0]));
+ int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1]));
+ m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1]));
+ int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2]));
+ m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2]));
+ int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3]));
+ m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3]));
+
+ int32x4_t m0123[] = { m0, m1, m2, m3 };
+ return horizontal_add_4d_s32x4(m0123);
+}
+
+static INLINE int32x4x2_t warp_affine_vertical_filter_8x1_f8_neon(
+ const int16x8_t *tmp, int sy, int gamma) {
+ int16x8_t s0 = tmp[0];
+ int16x8_t s1 = tmp[1];
+ int16x8_t s2 = tmp[2];
+ int16x8_t s3 = tmp[3];
+ int16x8_t s4 = tmp[4];
+ int16x8_t s5 = tmp[5];
+ int16x8_t s6 = tmp[6];
+ int16x8_t s7 = tmp[7];
+ transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ int16x8_t f[8];
+ load_filters_8(f, sy, gamma);
+
+ int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0]));
+ m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0]));
+ int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1]));
+ m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1]));
+ int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2]));
+ m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2]));
+ int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3]));
+ m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3]));
+ int32x4_t m4 = vmull_s16(vget_low_s16(s4), vget_low_s16(f[4]));
+ m4 = vmlal_s16(m4, vget_high_s16(s4), vget_high_s16(f[4]));
+ int32x4_t m5 = vmull_s16(vget_low_s16(s5), vget_low_s16(f[5]));
+ m5 = vmlal_s16(m5, vget_high_s16(s5), vget_high_s16(f[5]));
+ int32x4_t m6 = vmull_s16(vget_low_s16(s6), vget_low_s16(f[6]));
+ m6 = vmlal_s16(m6, vget_high_s16(s6), vget_high_s16(f[6]));
+ int32x4_t m7 = vmull_s16(vget_low_s16(s7), vget_low_s16(f[7]));
+ m7 = vmlal_s16(m7, vget_high_s16(s7), vget_high_s16(f[7]));
+
+ int32x4_t m0123[] = { m0, m1, m2, m3 };
+ int32x4_t m4567[] = { m4, m5, m6, m7 };
+
+ int32x4x2_t ret;
+ ret.val[0] = horizontal_add_4d_s32x4(m0123);
+ ret.val[1] = horizontal_add_4d_s32x4(m4567);
+ return ret;
+}
+
+static INLINE void warp_affine_vertical_step_4x1_f4_neon(
+ uint16_t *pred, int p_stride, int bd, uint16_t *dst, int dst_stride,
+ bool is_compound, bool do_average, bool use_dist_wtd_comp_avg, int fwd,
+ int bwd, int16_t gamma, const int16x8_t *tmp, int i, int sy, int j) {
+ int32x4_t sum0 =
+ gamma == 0 ? warp_affine_vertical_filter_4x1_f1_neon(tmp, sy)
+ : warp_affine_vertical_filter_4x1_f4_neon(tmp, sy, gamma);
+
+ const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - round0;
+
+ sum0 = vaddq_s32(sum0, vdupq_n_s32(1 << offset_bits_vert));
+
+ uint16_t *dst16 = &pred[i * p_stride + j];
+
+ if (!is_compound) {
+ const int reduce_bits_vert = 2 * FILTER_BITS - round0;
+ sum0 = vrshlq_s32(sum0, vdupq_n_s32(-reduce_bits_vert));
+
+ const int res_sub_const = (1 << (bd - 1)) + (1 << bd);
+ sum0 = vsubq_s32(sum0, vdupq_n_s32(res_sub_const));
+ uint16x4_t res0 = clip_pixel_highbd_vec(sum0, bd);
+ vst1_u16(dst16, res0);
+ return;
+ }
+
+ sum0 = vrshrq_n_s32(sum0, COMPOUND_ROUND1_BITS);
+
+ uint16_t *p = &dst[i * dst_stride + j];
+
+ if (!do_average) {
+ vst1_u16(p, vqmovun_s32(sum0));
+ return;
+ }
+
+ uint16x4_t p0 = vld1_u16(p);
+ int32x4_t p_vec0 = vreinterpretq_s32_u32(vmovl_u16(p0));
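+  // Distance-weighted compound: (p * fwd + sum * bwd) >> DIST_PRECISION_BITS;
+  // otherwise take the (truncating) average of the two predictions.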
+ if (use_dist_wtd_comp_avg) {
+ p_vec0 = vmulq_n_s32(p_vec0, fwd);
+ p_vec0 = vmlaq_n_s32(p_vec0, sum0, bwd);
+ p_vec0 = vshrq_n_s32(p_vec0, DIST_PRECISION_BITS);
+ } else {
+ p_vec0 = vhaddq_s32(p_vec0, sum0);
+ }
+
+ const int offset_bits = bd + 2 * FILTER_BITS - round0;
+ const int round1 = COMPOUND_ROUND1_BITS;
+ const int res_sub_const =
+ (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1));
+ const int round_bits = 2 * FILTER_BITS - round0 - round1;
+
+ p_vec0 = vsubq_s32(p_vec0, vdupq_n_s32(res_sub_const));
+ p_vec0 = vrshlq_s32(p_vec0, vdupq_n_s32(-round_bits));
+ uint16x4_t res0 = clip_pixel_highbd_vec(p_vec0, bd);
+ vst1_u16(dst16, res0);
+}
+
+static INLINE void warp_affine_vertical_step_8x1_f8_neon(
+ uint16_t *pred, int p_stride, int bd, uint16_t *dst, int dst_stride,
+ bool is_compound, bool do_average, bool use_dist_wtd_comp_avg, int fwd,
+ int bwd, int16_t gamma, const int16x8_t *tmp, int i, int sy, int j) {
+ int32x4x2_t sums =
+ gamma == 0 ? warp_affine_vertical_filter_8x1_f1_neon(tmp, sy)
+ : warp_affine_vertical_filter_8x1_f8_neon(tmp, sy, gamma);
+ int32x4_t sum0 = sums.val[0];
+ int32x4_t sum1 = sums.val[1];
+
+ const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - round0;
+
+ sum0 = vaddq_s32(sum0, vdupq_n_s32(1 << offset_bits_vert));
+ sum1 = vaddq_s32(sum1, vdupq_n_s32(1 << offset_bits_vert));
+
+ uint16_t *dst16 = &pred[i * p_stride + j];
+
+ if (!is_compound) {
+ const int reduce_bits_vert = 2 * FILTER_BITS - round0;
+ sum0 = vrshlq_s32(sum0, vdupq_n_s32(-reduce_bits_vert));
+ sum1 = vrshlq_s32(sum1, vdupq_n_s32(-reduce_bits_vert));
+
+ const int res_sub_const = (1 << (bd - 1)) + (1 << bd);
+ sum0 = vsubq_s32(sum0, vdupq_n_s32(res_sub_const));
+ sum1 = vsubq_s32(sum1, vdupq_n_s32(res_sub_const));
+ uint16x4_t res0 = clip_pixel_highbd_vec(sum0, bd);
+ uint16x4_t res1 = clip_pixel_highbd_vec(sum1, bd);
+ vst1_u16(dst16, res0);
+ vst1_u16(dst16 + 4, res1);
+ return;
+ }
+
+ sum0 = vrshrq_n_s32(sum0, COMPOUND_ROUND1_BITS);
+ sum1 = vrshrq_n_s32(sum1, COMPOUND_ROUND1_BITS);
+
+ uint16_t *p = &dst[i * dst_stride + j];
+
+ if (!do_average) {
+ vst1_u16(p, vqmovun_s32(sum0));
+ vst1_u16(p + 4, vqmovun_s32(sum1));
+ return;
+ }
+
+ uint16x8_t p0 = vld1q_u16(p);
+ int32x4_t p_vec0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(p0)));
+ int32x4_t p_vec1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(p0)));
+ if (use_dist_wtd_comp_avg) {
+ p_vec0 = vmulq_n_s32(p_vec0, fwd);
+ p_vec1 = vmulq_n_s32(p_vec1, fwd);
+ p_vec0 = vmlaq_n_s32(p_vec0, sum0, bwd);
+ p_vec1 = vmlaq_n_s32(p_vec1, sum1, bwd);
+ p_vec0 = vshrq_n_s32(p_vec0, DIST_PRECISION_BITS);
+ p_vec1 = vshrq_n_s32(p_vec1, DIST_PRECISION_BITS);
+ } else {
+ p_vec0 = vhaddq_s32(p_vec0, sum0);
+ p_vec1 = vhaddq_s32(p_vec1, sum1);
+ }
+
+ const int offset_bits = bd + 2 * FILTER_BITS - round0;
+ const int round1 = COMPOUND_ROUND1_BITS;
+ const int res_sub_const =
+ (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1));
+ const int round_bits = 2 * FILTER_BITS - round0 - round1;
+
+ p_vec0 = vsubq_s32(p_vec0, vdupq_n_s32(res_sub_const));
+ p_vec1 = vsubq_s32(p_vec1, vdupq_n_s32(res_sub_const));
+
+ p_vec0 = vrshlq_s32(p_vec0, vdupq_n_s32(-round_bits));
+ p_vec1 = vrshlq_s32(p_vec1, vdupq_n_s32(-round_bits));
+ uint16x4_t res0 = clip_pixel_highbd_vec(p_vec0, bd);
+ uint16x4_t res1 = clip_pixel_highbd_vec(p_vec1, bd);
+ vst1_u16(dst16, res0);
+ vst1_u16(dst16 + 4, res1);
+}
+
+static INLINE void warp_affine_vertical_neon(
+ uint16_t *pred, int p_width, int p_height, int p_stride, int bd,
+ uint16_t *dst, int dst_stride, bool is_compound, bool do_average,
+ bool use_dist_wtd_comp_avg, int fwd, int bwd, int16_t gamma, int16_t delta,
+ const int16x8_t *tmp, int i, int sy4, int j) {
+ int limit_height = p_height > 4 ? 8 : 4;
+
+ if (p_width > 4) {
+ // p_width == 8
+ for (int k = 0; k < limit_height; ++k) {
+ int sy = sy4 + delta * k;
+ warp_affine_vertical_step_8x1_f8_neon(
+ pred, p_stride, bd, dst, dst_stride, is_compound, do_average,
+ use_dist_wtd_comp_avg, fwd, bwd, gamma, tmp + k, i + k, sy, j);
+ }
+ } else {
+ // p_width == 4
+ for (int k = 0; k < limit_height; ++k) {
+ int sy = sy4 + delta * k;
+ warp_affine_vertical_step_4x1_f4_neon(
+ pred, p_stride, bd, dst, dst_stride, is_compound, do_average,
+ use_dist_wtd_comp_avg, fwd, bwd, gamma, tmp + k, i + k, sy, j);
+ }
+ }
+}
+
+void av1_highbd_warp_affine_neon(const int32_t *mat, const uint16_t *ref,
+ int width, int height, int stride,
+ uint16_t *pred, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y, int bd,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ uint16_t *const dst = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ const bool is_compound = conv_params->is_compound;
+ const bool do_average = conv_params->do_average;
+ const bool use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int fwd = conv_params->fwd_offset;
+ const int bwd = conv_params->bck_offset;
+
+ assert(IMPLIES(is_compound, dst != NULL));
+
+ for (int i = 0; i < p_height; i += 8) {
+ for (int j = 0; j < p_width; j += 8) {
+ // Calculate the center of this 8x8 block,
+ // project to luma coordinates (if in a subsampled chroma plane),
+ // apply the affine transformation,
+      // then convert back to the original coordinates (if necessary).
+ const int32_t src_x = (j + 4 + p_col) << subsampling_x;
+ const int32_t src_y = (i + 4 + p_row) << subsampling_y;
+ const int64_t dst_x =
+ (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+ const int64_t dst_y =
+ (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+ const int64_t x4 = dst_x >> subsampling_x;
+ const int64_t y4 = dst_y >> subsampling_y;
+
+ const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ sx4 += alpha * (-4) + beta * (-4);
+ sy4 += gamma * (-4) + delta * (-4);
+
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+      // Each horizontal filter result is formed by the sum of up to eight
+      // multiplications by filter values, followed by a shift. Although both
+      // the inputs and filters are loaded as int16, the input data is at most
+      // bd bits and the filters are at most 8 bits each. Additionally, since
+      // we know all possible filter values, we know that the sum of absolute
+      // filter values will fit in at most 9 bits. With this in mind we can
+      // conclude that the sum of each filter application will fit in bd + 9
+      // bits. The shift following the summation is ROUND0_BITS (which is 3),
+      // plus 2 for 12-bit input, which gives a final storage requirement of:
+      // bd == 8: ( 8 + 9) - 3 => 14 bits
+      // bd == 10: (10 + 9) - 3 => 16 bits
+      // bd == 12: (12 + 9) - 5 => 16 bits
+      // So it is safe to use int16x8_t as the intermediate storage type here.
+ int16x8_t tmp[15];
+
+ warp_affine_horizontal_neon(ref, width, height, stride, p_width, alpha,
+ beta, iy4, sx4, ix4, tmp, bd);
+ warp_affine_vertical_neon(pred, p_width, p_height, p_stride, bd, dst,
+ dst_stride, is_compound, do_average,
+ use_dist_wtd_comp_avg, fwd, bwd, gamma, delta,
+ tmp, i, sy4, j);
+ }
+ }
+}
diff --git a/av1/common/arm/highbd_wiener_convolve_neon.c b/av1/common/arm/highbd_wiener_convolve_neon.c
new file mode 100644
index 000000000..a6bd6d38e
--- /dev/null
+++ b/av1/common/arm/highbd_wiener_convolve_neon.c
@@ -0,0 +1,403 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/convolve.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#define HBD_WIENER_5TAP_HORIZ(name, shift) \
+ static INLINE uint16x8_t name##_wiener_convolve5_8_2d_h( \
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \
+ const int16x8_t s3, const int16x8_t s4, const int16x4_t x_filter, \
+ const int32x4_t round_vec, const uint16x8_t im_max_val) { \
+ /* Wiener filter is symmetric so add mirrored source elements. */ \
+ int16x8_t s04 = vaddq_s16(s0, s4); \
+ int16x8_t s13 = vaddq_s16(s1, s3); \
+ \
+ /* x_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.) */ \
+ int32x4_t sum_lo = \
+ vmlal_lane_s16(round_vec, vget_low_s16(s04), x_filter, 1); \
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), x_filter, 2); \
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), x_filter, 3); \
+ \
+ int32x4_t sum_hi = \
+ vmlal_lane_s16(round_vec, vget_high_s16(s04), x_filter, 1); \
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), x_filter, 2); \
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), x_filter, 3); \
+ \
+ uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \
+ uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \
+ \
+ return vminq_u16(vcombine_u16(res_lo, res_hi), im_max_val); \
+ } \
+ \
+ static INLINE void name##_convolve_add_src_5tap_horiz( \
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, \
+ const int32x4_t round_vec, const uint16x8_t im_max_val) { \
+ do { \
+ const int16_t *s = (int16_t *)src_ptr; \
+ uint16_t *d = dst_ptr; \
+ int width = w; \
+ \
+ do { \
+ int16x8_t s0, s1, s2, s3, s4; \
+ load_s16_8x5(s, 1, &s0, &s1, &s2, &s3, &s4); \
+ \
+ uint16x8_t d0 = name##_wiener_convolve5_8_2d_h( \
+ s0, s1, s2, s3, s4, x_filter, round_vec, im_max_val); \
+ \
+ vst1q_u16(d, d0); \
+ \
+ s += 8; \
+ d += 8; \
+ width -= 8; \
+ } while (width != 0); \
+ src_ptr += src_stride; \
+ dst_ptr += dst_stride; \
+ } while (--h != 0); \
+ }
+
+HBD_WIENER_5TAP_HORIZ(highbd, WIENER_ROUND0_BITS)
+HBD_WIENER_5TAP_HORIZ(highbd_12, WIENER_ROUND0_BITS + 2)
+
+#undef HBD_WIENER_5TAP_HORIZ
+
+#define HBD_WIENER_7TAP_HORIZ(name, shift) \
+ static INLINE uint16x8_t name##_wiener_convolve7_8_2d_h( \
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, \
+ const int16x8_t s6, const int16x4_t x_filter, const int32x4_t round_vec, \
+ const uint16x8_t im_max_val) { \
+ /* Wiener filter is symmetric so add mirrored source elements. */ \
+ int16x8_t s06 = vaddq_s16(s0, s6); \
+ int16x8_t s15 = vaddq_s16(s1, s5); \
+ int16x8_t s24 = vaddq_s16(s2, s4); \
+ \
+ int32x4_t sum_lo = \
+ vmlal_lane_s16(round_vec, vget_low_s16(s06), x_filter, 0); \
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), x_filter, 1); \
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), x_filter, 2); \
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), x_filter, 3); \
+ \
+ int32x4_t sum_hi = \
+ vmlal_lane_s16(round_vec, vget_high_s16(s06), x_filter, 0); \
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), x_filter, 1); \
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), x_filter, 2); \
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), x_filter, 3); \
+ \
+ uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \
+ uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \
+ \
+ return vminq_u16(vcombine_u16(res_lo, res_hi), im_max_val); \
+ } \
+ \
+ static INLINE void name##_convolve_add_src_7tap_horiz( \
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, \
+ const int32x4_t round_vec, const uint16x8_t im_max_val) { \
+ do { \
+ const int16_t *s = (int16_t *)src_ptr; \
+ uint16_t *d = dst_ptr; \
+ int width = w; \
+ \
+ do { \
+ int16x8_t s0, s1, s2, s3, s4, s5, s6; \
+ load_s16_8x7(s, 1, &s0, &s1, &s2, &s3, &s4, &s5, &s6); \
+ \
+ uint16x8_t d0 = name##_wiener_convolve7_8_2d_h( \
+ s0, s1, s2, s3, s4, s5, s6, x_filter, round_vec, im_max_val); \
+ \
+ vst1q_u16(d, d0); \
+ \
+ s += 8; \
+ d += 8; \
+ width -= 8; \
+ } while (width != 0); \
+ src_ptr += src_stride; \
+ dst_ptr += dst_stride; \
+ } while (--h != 0); \
+ }
+
+HBD_WIENER_7TAP_HORIZ(highbd, WIENER_ROUND0_BITS)
+HBD_WIENER_7TAP_HORIZ(highbd_12, WIENER_ROUND0_BITS + 2)
+
+#undef HBD_WIENER_7TAP_HORIZ
+
+#define HBD_WIENER_5TAP_VERT(name, shift) \
+ static INLINE uint16x8_t name##_wiener_convolve5_8_2d_v( \
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \
+ const int16x8_t s3, const int16x8_t s4, const int16x4_t y_filter, \
+ const int32x4_t round_vec, const uint16x8_t res_max_val) { \
+ const int32x2_t y_filter_lo = vget_low_s32(vmovl_s16(y_filter)); \
+ const int32x2_t y_filter_hi = vget_high_s32(vmovl_s16(y_filter)); \
+ /* Wiener filter is symmetric so add mirrored source elements. */ \
+ int32x4_t s04_lo = vaddl_s16(vget_low_s16(s0), vget_low_s16(s4)); \
+ int32x4_t s13_lo = vaddl_s16(vget_low_s16(s1), vget_low_s16(s3)); \
+ \
+ /* y_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.) */ \
+ int32x4_t sum_lo = vmlaq_lane_s32(round_vec, s04_lo, y_filter_lo, 1); \
+ sum_lo = vmlaq_lane_s32(sum_lo, s13_lo, y_filter_hi, 0); \
+ sum_lo = \
+ vmlaq_lane_s32(sum_lo, vmovl_s16(vget_low_s16(s2)), y_filter_hi, 1); \
+ \
+ int32x4_t s04_hi = vaddl_s16(vget_high_s16(s0), vget_high_s16(s4)); \
+ int32x4_t s13_hi = vaddl_s16(vget_high_s16(s1), vget_high_s16(s3)); \
+ \
+ int32x4_t sum_hi = vmlaq_lane_s32(round_vec, s04_hi, y_filter_lo, 1); \
+ sum_hi = vmlaq_lane_s32(sum_hi, s13_hi, y_filter_hi, 0); \
+ sum_hi = \
+ vmlaq_lane_s32(sum_hi, vmovl_s16(vget_high_s16(s2)), y_filter_hi, 1); \
+ \
+ uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \
+ uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \
+ \
+ return vminq_u16(vcombine_u16(res_lo, res_hi), res_max_val); \
+ } \
+ \
+ static INLINE void name##_convolve_add_src_5tap_vert( \
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, \
+ const int32x4_t round_vec, const uint16x8_t res_max_val) { \
+ do { \
+ const int16_t *s = (int16_t *)src_ptr; \
+ uint16_t *d = dst_ptr; \
+ int height = h; \
+ \
+ while (height > 3) { \
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; \
+ load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); \
+ \
+ uint16x8_t d0 = name##_wiener_convolve5_8_2d_v( \
+ s0, s1, s2, s3, s4, y_filter, round_vec, res_max_val); \
+ uint16x8_t d1 = name##_wiener_convolve5_8_2d_v( \
+ s1, s2, s3, s4, s5, y_filter, round_vec, res_max_val); \
+ uint16x8_t d2 = name##_wiener_convolve5_8_2d_v( \
+ s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val); \
+ uint16x8_t d3 = name##_wiener_convolve5_8_2d_v( \
+ s3, s4, s5, s6, s7, y_filter, round_vec, res_max_val); \
+ \
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3); \
+ \
+ s += 4 * src_stride; \
+ d += 4 * dst_stride; \
+ height -= 4; \
+ } \
+ \
+ while (height-- != 0) { \
+ int16x8_t s0, s1, s2, s3, s4; \
+ load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); \
+ \
+ uint16x8_t d0 = name##_wiener_convolve5_8_2d_v( \
+ s0, s1, s2, s3, s4, y_filter, round_vec, res_max_val); \
+ \
+ vst1q_u16(d, d0); \
+ \
+ s += src_stride; \
+ d += dst_stride; \
+ } \
+ \
+ src_ptr += 8; \
+ dst_ptr += 8; \
+ w -= 8; \
+ } while (w != 0); \
+ }
+
+HBD_WIENER_5TAP_VERT(highbd, 2 * FILTER_BITS - WIENER_ROUND0_BITS)
+HBD_WIENER_5TAP_VERT(highbd_12, 2 * FILTER_BITS - WIENER_ROUND0_BITS - 2)
+
+#undef HBD_WIENER_5TAP_VERT
+
+#define HBD_WIENER_7TAP_VERT(name, shift) \
+ static INLINE uint16x8_t name##_wiener_convolve7_8_2d_v( \
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, \
+ const int16x8_t s6, const int16x4_t y_filter, const int32x4_t round_vec, \
+ const uint16x8_t res_max_val) { \
+ const int32x2_t y_filter_lo = vget_low_s32(vmovl_s16(y_filter)); \
+ const int32x2_t y_filter_hi = vget_high_s32(vmovl_s16(y_filter)); \
+ /* Wiener filter is symmetric so add mirrored source elements. */ \
+ int32x4_t s06_lo = vaddl_s16(vget_low_s16(s0), vget_low_s16(s6)); \
+ int32x4_t s15_lo = vaddl_s16(vget_low_s16(s1), vget_low_s16(s5)); \
+ int32x4_t s24_lo = vaddl_s16(vget_low_s16(s2), vget_low_s16(s4)); \
+ \
+ int32x4_t sum_lo = vmlaq_lane_s32(round_vec, s06_lo, y_filter_lo, 0); \
+ sum_lo = vmlaq_lane_s32(sum_lo, s15_lo, y_filter_lo, 1); \
+ sum_lo = vmlaq_lane_s32(sum_lo, s24_lo, y_filter_hi, 0); \
+ sum_lo = \
+ vmlaq_lane_s32(sum_lo, vmovl_s16(vget_low_s16(s3)), y_filter_hi, 1); \
+ \
+ int32x4_t s06_hi = vaddl_s16(vget_high_s16(s0), vget_high_s16(s6)); \
+ int32x4_t s15_hi = vaddl_s16(vget_high_s16(s1), vget_high_s16(s5)); \
+ int32x4_t s24_hi = vaddl_s16(vget_high_s16(s2), vget_high_s16(s4)); \
+ \
+ int32x4_t sum_hi = vmlaq_lane_s32(round_vec, s06_hi, y_filter_lo, 0); \
+ sum_hi = vmlaq_lane_s32(sum_hi, s15_hi, y_filter_lo, 1); \
+ sum_hi = vmlaq_lane_s32(sum_hi, s24_hi, y_filter_hi, 0); \
+ sum_hi = \
+ vmlaq_lane_s32(sum_hi, vmovl_s16(vget_high_s16(s3)), y_filter_hi, 1); \
+ \
+ uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \
+ uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \
+ \
+ return vminq_u16(vcombine_u16(res_lo, res_hi), res_max_val); \
+ } \
+ \
+ static INLINE void name##_convolve_add_src_7tap_vert( \
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, \
+ const int32x4_t round_vec, const uint16x8_t res_max_val) { \
+ do { \
+      const int16_t *s = (const int16_t *)src_ptr;                        \
+ uint16_t *d = dst_ptr; \
+ int height = h; \
+ \
+ while (height > 3) { \
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9; \
+ load_s16_8x10(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, \
+ &s8, &s9); \
+ \
+ uint16x8_t d0 = name##_wiener_convolve7_8_2d_v( \
+ s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val); \
+ uint16x8_t d1 = name##_wiener_convolve7_8_2d_v( \
+ s1, s2, s3, s4, s5, s6, s7, y_filter, round_vec, res_max_val); \
+ uint16x8_t d2 = name##_wiener_convolve7_8_2d_v( \
+ s2, s3, s4, s5, s6, s7, s8, y_filter, round_vec, res_max_val); \
+ uint16x8_t d3 = name##_wiener_convolve7_8_2d_v( \
+ s3, s4, s5, s6, s7, s8, s9, y_filter, round_vec, res_max_val); \
+ \
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3); \
+ \
+ s += 4 * src_stride; \
+ d += 4 * dst_stride; \
+ height -= 4; \
+ } \
+ \
+ while (height-- != 0) { \
+ int16x8_t s0, s1, s2, s3, s4, s5, s6; \
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); \
+ \
+ uint16x8_t d0 = name##_wiener_convolve7_8_2d_v( \
+ s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val); \
+ \
+ vst1q_u16(d, d0); \
+ \
+ s += src_stride; \
+ d += dst_stride; \
+ } \
+ \
+ src_ptr += 8; \
+ dst_ptr += 8; \
+ w -= 8; \
+ } while (w != 0); \
+ }
+
+HBD_WIENER_7TAP_VERT(highbd, 2 * FILTER_BITS - WIENER_ROUND0_BITS)
+HBD_WIENER_7TAP_VERT(highbd_12, 2 * FILTER_BITS - WIENER_ROUND0_BITS - 2)
+
+#undef HBD_WIENER_7TAP_VERT
+
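+// A minimal scalar model of the symmetric-filter trick used by the kernels
+// above: the Wiener filter is even-symmetric, so mirrored source samples are
+// summed first and a 7-tap filter costs only four multiplies per output.
+// Sketch only (the helper name is hypothetical, and the block is not
+// compiled):
+#if 0
+static int32_t wiener7_symmetric_scalar(const int16_t *s, const int16_t *f,
+                                        int32_t round) {
+  return round + f[0] * (s[0] + s[6]) + f[1] * (s[1] + s[5]) +
+         f[2] * (s[2] + s[4]) + f[3] * s[3];
+}
+#endif
+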
+static AOM_INLINE int get_wiener_filter_taps(const int16_t *filter) {
+ assert(filter[7] == 0);
+ if (filter[0] == 0 && filter[6] == 0) {
+ return WIENER_WIN_REDUCED;
+ }
+ return WIENER_WIN;
+}
+
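+// Illustrative sketch (hypothetical tap values, not compiled) of how the
+// probe above classifies filters. AV1 Wiener filters always store a zero in
+// tap 7, and a reduced filter also zeroes the outermost pair of taps.
+#if 0
+static void wiener_taps_example(void) {
+  static const int16_t full_taps[8] = { 3, -7, 19, 98, 19, -7, 3, 0 };
+  static const int16_t reduced_taps[8] = { 0, -9, 21, 104, 21, -9, 0, 0 };
+  assert(get_wiener_filter_taps(full_taps) == WIENER_WIN);  // 7 taps.
+  assert(get_wiener_filter_taps(reduced_taps) == WIENER_WIN_REDUCED);  // 5.
+}
+#endif
+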
+void av1_highbd_wiener_convolve_add_src_neon(
+ const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
+ ptrdiff_t dst_stride, const int16_t *x_filter, int x_step_q4,
+ const int16_t *y_filter, int y_step_q4, int w, int h,
+ const WienerConvolveParams *conv_params, int bd) {
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ assert(w % 8 == 0);
+ assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
+ assert(x_step_q4 == 16 && y_step_q4 == 16);
+ assert(x_filter[7] == 0 && y_filter[7] == 0);
+
+ DECLARE_ALIGNED(16, uint16_t,
+ im_block[(MAX_SB_SIZE + WIENER_WIN - 1) * MAX_SB_SIZE]);
+
+ const int x_filter_taps = get_wiener_filter_taps(x_filter);
+ const int y_filter_taps = get_wiener_filter_taps(y_filter);
+ int16x4_t x_filter_s16 = vld1_s16(x_filter);
+ int16x4_t y_filter_s16 = vld1_s16(y_filter);
+  // Add 128 to tap 3, the centre tap. This folds the "add source" term of
+  // the Wiener filter into the filter itself: a unit weight at
+  // FILTER_BITS == 7 precision is 1 << 7 == 128.
+ x_filter_s16 = vadd_s16(x_filter_s16, vcreate_s16(128ULL << 48));
+ y_filter_s16 = vadd_s16(y_filter_s16, vcreate_s16(128ULL << 48));
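+  // Worked example (hypothetical half filter { 3, -7, 19, 98 }): after the
+  // addition the centre tap is 98 + 128 == 226, so a plain convolution also
+  // contributes src[centre] << FILTER_BITS, i.e. the "add src" term.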
+
+ const int im_stride = MAX_SB_SIZE;
+ const int im_h = h + y_filter_taps - 1;
+ const int horiz_offset = x_filter_taps / 2;
+ const int vert_offset = (y_filter_taps / 2) * (int)src_stride;
+
+ const int extraprec_clamp_limit =
+ WIENER_CLAMP_LIMIT(conv_params->round_0, bd);
+ const uint16x8_t im_max_val = vdupq_n_u16(extraprec_clamp_limit - 1);
+ const int32x4_t horiz_round_vec = vdupq_n_s32(1 << (bd + FILTER_BITS - 1));
+
+ const uint16x8_t res_max_val = vdupq_n_u16((1 << bd) - 1);
+ const int32x4_t vert_round_vec =
+ vdupq_n_s32(-(1 << (bd + conv_params->round_1 - 1)));
+
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+ if (bd == 12) {
+ if (x_filter_taps == WIENER_WIN_REDUCED) {
+ highbd_12_convolve_add_src_5tap_horiz(
+ src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+ im_h, x_filter_s16, horiz_round_vec, im_max_val);
+ } else {
+ highbd_12_convolve_add_src_7tap_horiz(
+ src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+ im_h, x_filter_s16, horiz_round_vec, im_max_val);
+ }
+
+ if (y_filter_taps == WIENER_WIN_REDUCED) {
+ highbd_12_convolve_add_src_5tap_vert(im_block, im_stride, dst, dst_stride,
+ w, h, y_filter_s16, vert_round_vec,
+ res_max_val);
+ } else {
+ highbd_12_convolve_add_src_7tap_vert(im_block, im_stride, dst, dst_stride,
+ w, h, y_filter_s16, vert_round_vec,
+ res_max_val);
+ }
+
+ } else {
+ if (x_filter_taps == WIENER_WIN_REDUCED) {
+ highbd_convolve_add_src_5tap_horiz(
+ src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+ im_h, x_filter_s16, horiz_round_vec, im_max_val);
+ } else {
+ highbd_convolve_add_src_7tap_horiz(
+ src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+ im_h, x_filter_s16, horiz_round_vec, im_max_val);
+ }
+
+ if (y_filter_taps == WIENER_WIN_REDUCED) {
+ highbd_convolve_add_src_5tap_vert(im_block, im_stride, dst, dst_stride, w,
+ h, y_filter_s16, vert_round_vec,
+ res_max_val);
+ } else {
+ highbd_convolve_add_src_7tap_vert(im_block, im_stride, dst, dst_stride, w,
+ h, y_filter_s16, vert_round_vec,
+ res_max_val);
+ }
+ }
+}
diff --git a/av1/common/arm/jnt_convolve_neon.c b/av1/common/arm/jnt_convolve_neon.c
deleted file mode 100644
index 564f7c23b..000000000
--- a/av1/common/arm/jnt_convolve_neon.c
+++ /dev/null
@@ -1,5336 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "config/aom_config.h"
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/arm/mem_neon.h"
-#include "aom_dsp/arm/transpose_neon.h"
-#include "aom_ports/mem.h"
-#include "av1/common/common.h"
-#include "av1/common/arm/convolve_neon.h"
-
-#if !AOM_ARCH_AARCH64
-static INLINE void compute_dist_wtd_avg_4x1(uint16x4_t dd0, uint16x4_t d0,
- const uint16_t fwd_offset,
- const uint16_t bck_offset,
- const int16x4_t round_offset,
- uint8x8_t *d0_u8) {
- uint32x4_t blend0 = vmull_n_u16(dd0, fwd_offset);
- blend0 = vmlal_n_u16(blend0, d0, bck_offset);
-
- uint16x4_t avg0 = vshrn_n_u32(blend0, DIST_PRECISION_BITS);
-
- int16x4_t dst0 = vsub_s16(vreinterpret_s16_u16(avg0), round_offset);
-
- int16x8_t dst0q = vcombine_s16(dst0, vdup_n_s16(0));
-
- *d0_u8 = vqrshrun_n_s16(dst0q, FILTER_BITS - ROUND0_BITS);
-}
-
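-// Worked example (assuming libaom's DIST_PRECISION_BITS == 4, with weight
-// pairs that sum to 16): for e.g. fwd_offset == 9 and bck_offset == 7 the
-// blend above computes
-//   avg = (dd * 9 + d * 7) >> 4;
-// after which the compound round offset is subtracted and the result is
-// narrowed back to 8 bits with a saturating rounding shift.
-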
-static INLINE void compute_basic_avg_4x1(uint16x4_t dd0, uint16x4_t d0,
- const int16x4_t round_offset,
- uint8x8_t *d0_u8) {
- uint16x4_t avg0 = vhadd_u16(dd0, d0);
-
- int16x4_t dst0 = vsub_s16(vreinterpret_s16_u16(avg0), round_offset);
-
- int16x8_t dst0q = vcombine_s16(dst0, vdup_n_s16(0));
-
- *d0_u8 = vqrshrun_n_s16(dst0q, FILTER_BITS - ROUND0_BITS);
-}
-
-static INLINE void compute_dist_wtd_avg_8x1(uint16x8_t dd0, uint16x8_t d0,
- const uint16_t fwd_offset,
- const uint16_t bck_offset,
- const int16x8_t round_offset,
- uint8x8_t *d0_u8) {
- uint32x4_t blend0_lo = vmull_n_u16(vget_low_u16(dd0), fwd_offset);
- blend0_lo = vmlal_n_u16(blend0_lo, vget_low_u16(d0), bck_offset);
- uint32x4_t blend0_hi = vmull_n_u16(vget_high_u16(dd0), fwd_offset);
- blend0_hi = vmlal_n_u16(blend0_hi, vget_high_u16(d0), bck_offset);
-
- uint16x8_t avg0 = vcombine_u16(vshrn_n_u32(blend0_lo, DIST_PRECISION_BITS),
- vshrn_n_u32(blend0_hi, DIST_PRECISION_BITS));
-
- int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
-
- *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
-}
-
-static INLINE void compute_basic_avg_8x1(uint16x8_t dd0, uint16x8_t d0,
- const int16x8_t round_offset,
- uint8x8_t *d0_u8) {
- uint16x8_t avg0 = vhaddq_u16(dd0, d0);
-
- int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
-
- *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
-}
-
-#endif // !AOM_ARCH_AARCH64
-
-static INLINE void compute_dist_wtd_avg_4x4(
- uint16x4_t dd0, uint16x4_t dd1, uint16x4_t dd2, uint16x4_t dd3,
- uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3,
- const uint16_t fwd_offset, const uint16_t bck_offset,
- const int16x8_t round_offset, uint8x8_t *d01_u8, uint8x8_t *d23_u8) {
- uint32x4_t blend0 = vmull_n_u16(dd0, fwd_offset);
- blend0 = vmlal_n_u16(blend0, d0, bck_offset);
- uint32x4_t blend1 = vmull_n_u16(dd1, fwd_offset);
- blend1 = vmlal_n_u16(blend1, d1, bck_offset);
- uint32x4_t blend2 = vmull_n_u16(dd2, fwd_offset);
- blend2 = vmlal_n_u16(blend2, d2, bck_offset);
- uint32x4_t blend3 = vmull_n_u16(dd3, fwd_offset);
- blend3 = vmlal_n_u16(blend3, d3, bck_offset);
-
- uint16x4_t avg0 = vshrn_n_u32(blend0, DIST_PRECISION_BITS);
- uint16x4_t avg1 = vshrn_n_u32(blend1, DIST_PRECISION_BITS);
- uint16x4_t avg2 = vshrn_n_u32(blend2, DIST_PRECISION_BITS);
- uint16x4_t avg3 = vshrn_n_u32(blend3, DIST_PRECISION_BITS);
-
- int16x8_t dst_01 = vreinterpretq_s16_u16(vcombine_u16(avg0, avg1));
- int16x8_t dst_23 = vreinterpretq_s16_u16(vcombine_u16(avg2, avg3));
-
- dst_01 = vsubq_s16(dst_01, round_offset);
- dst_23 = vsubq_s16(dst_23, round_offset);
-
- *d01_u8 = vqrshrun_n_s16(dst_01, FILTER_BITS - ROUND0_BITS);
- *d23_u8 = vqrshrun_n_s16(dst_23, FILTER_BITS - ROUND0_BITS);
-}
-
-static INLINE void compute_basic_avg_4x4(uint16x4_t dd0, uint16x4_t dd1,
- uint16x4_t dd2, uint16x4_t dd3,
- uint16x4_t d0, uint16x4_t d1,
- uint16x4_t d2, uint16x4_t d3,
- const int16x8_t round_offset,
- uint8x8_t *d01_u8, uint8x8_t *d23_u8) {
- uint16x4_t avg0 = vhadd_u16(dd0, d0);
- uint16x4_t avg1 = vhadd_u16(dd1, d1);
- uint16x4_t avg2 = vhadd_u16(dd2, d2);
- uint16x4_t avg3 = vhadd_u16(dd3, d3);
-
- int16x8_t dst_01 = vreinterpretq_s16_u16(vcombine_u16(avg0, avg1));
- int16x8_t dst_23 = vreinterpretq_s16_u16(vcombine_u16(avg2, avg3));
-
- dst_01 = vsubq_s16(dst_01, round_offset);
- dst_23 = vsubq_s16(dst_23, round_offset);
-
- *d01_u8 = vqrshrun_n_s16(dst_01, FILTER_BITS - ROUND0_BITS);
- *d23_u8 = vqrshrun_n_s16(dst_23, FILTER_BITS - ROUND0_BITS);
-}
-
-static INLINE void compute_dist_wtd_avg_8x4(
- uint16x8_t dd0, uint16x8_t dd1, uint16x8_t dd2, uint16x8_t dd3,
- uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3,
- const uint16_t fwd_offset, const uint16_t bck_offset,
- const int16x8_t round_offset, uint8x8_t *d0_u8, uint8x8_t *d1_u8,
- uint8x8_t *d2_u8, uint8x8_t *d3_u8) {
- uint32x4_t blend0_lo = vmull_n_u16(vget_low_u16(dd0), fwd_offset);
- blend0_lo = vmlal_n_u16(blend0_lo, vget_low_u16(d0), bck_offset);
- uint32x4_t blend0_hi = vmull_n_u16(vget_high_u16(dd0), fwd_offset);
- blend0_hi = vmlal_n_u16(blend0_hi, vget_high_u16(d0), bck_offset);
-
- uint32x4_t blend1_lo = vmull_n_u16(vget_low_u16(dd1), fwd_offset);
- blend1_lo = vmlal_n_u16(blend1_lo, vget_low_u16(d1), bck_offset);
- uint32x4_t blend1_hi = vmull_n_u16(vget_high_u16(dd1), fwd_offset);
- blend1_hi = vmlal_n_u16(blend1_hi, vget_high_u16(d1), bck_offset);
-
- uint32x4_t blend2_lo = vmull_n_u16(vget_low_u16(dd2), fwd_offset);
- blend2_lo = vmlal_n_u16(blend2_lo, vget_low_u16(d2), bck_offset);
- uint32x4_t blend2_hi = vmull_n_u16(vget_high_u16(dd2), fwd_offset);
- blend2_hi = vmlal_n_u16(blend2_hi, vget_high_u16(d2), bck_offset);
-
- uint32x4_t blend3_lo = vmull_n_u16(vget_low_u16(dd3), fwd_offset);
- blend3_lo = vmlal_n_u16(blend3_lo, vget_low_u16(d3), bck_offset);
- uint32x4_t blend3_hi = vmull_n_u16(vget_high_u16(dd3), fwd_offset);
- blend3_hi = vmlal_n_u16(blend3_hi, vget_high_u16(d3), bck_offset);
-
- uint16x8_t avg0 = vcombine_u16(vshrn_n_u32(blend0_lo, DIST_PRECISION_BITS),
- vshrn_n_u32(blend0_hi, DIST_PRECISION_BITS));
- uint16x8_t avg1 = vcombine_u16(vshrn_n_u32(blend1_lo, DIST_PRECISION_BITS),
- vshrn_n_u32(blend1_hi, DIST_PRECISION_BITS));
- uint16x8_t avg2 = vcombine_u16(vshrn_n_u32(blend2_lo, DIST_PRECISION_BITS),
- vshrn_n_u32(blend2_hi, DIST_PRECISION_BITS));
- uint16x8_t avg3 = vcombine_u16(vshrn_n_u32(blend3_lo, DIST_PRECISION_BITS),
- vshrn_n_u32(blend3_hi, DIST_PRECISION_BITS));
-
- int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
- int16x8_t dst1 = vsubq_s16(vreinterpretq_s16_u16(avg1), round_offset);
- int16x8_t dst2 = vsubq_s16(vreinterpretq_s16_u16(avg2), round_offset);
- int16x8_t dst3 = vsubq_s16(vreinterpretq_s16_u16(avg3), round_offset);
-
- *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
- *d1_u8 = vqrshrun_n_s16(dst1, FILTER_BITS - ROUND0_BITS);
- *d2_u8 = vqrshrun_n_s16(dst2, FILTER_BITS - ROUND0_BITS);
- *d3_u8 = vqrshrun_n_s16(dst3, FILTER_BITS - ROUND0_BITS);
-}
-
-static INLINE void compute_basic_avg_8x4(uint16x8_t dd0, uint16x8_t dd1,
- uint16x8_t dd2, uint16x8_t dd3,
- uint16x8_t d0, uint16x8_t d1,
- uint16x8_t d2, uint16x8_t d3,
- const int16x8_t round_offset,
- uint8x8_t *d0_u8, uint8x8_t *d1_u8,
- uint8x8_t *d2_u8, uint8x8_t *d3_u8) {
-  uint16x8_t avg0 = vhaddq_u16(dd0, d0);
-  uint16x8_t avg1 = vhaddq_u16(dd1, d1);
-  uint16x8_t avg2 = vhaddq_u16(dd2, d2);
-  uint16x8_t avg3 = vhaddq_u16(dd3, d3);
-
- int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
- int16x8_t dst1 = vsubq_s16(vreinterpretq_s16_u16(avg1), round_offset);
- int16x8_t dst2 = vsubq_s16(vreinterpretq_s16_u16(avg2), round_offset);
- int16x8_t dst3 = vsubq_s16(vreinterpretq_s16_u16(avg3), round_offset);
-
- *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
- *d1_u8 = vqrshrun_n_s16(dst1, FILTER_BITS - ROUND0_BITS);
- *d2_u8 = vqrshrun_n_s16(dst2, FILTER_BITS - ROUND0_BITS);
- *d3_u8 = vqrshrun_n_s16(dst3, FILTER_BITS - ROUND0_BITS);
-}
-
-#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE int16x4_t convolve8_4_2d_h(uint8x16_t samples,
- const int8x8_t x_filter,
- const uint8x16x2_t permute_tbl,
- const int32x4_t horiz_const) {
- uint8x16_t permuted_samples[2];
- int32x4_t sum;
-
- // Permute samples ready for dot product.
- // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-
- // First 4 output values.
- sum = vusdotq_lane_s32(horiz_const, permuted_samples[0], x_filter, 0);
- sum = vusdotq_lane_s32(sum, permuted_samples[1], x_filter, 1);
-
- // We halved the convolution filter values so -1 from the right shift.
- return vshrn_n_s32(sum, ROUND0_BITS - 1);
-}
-
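-// A scalar model of the vusdot accumulation above: each 32-bit lane of 'sum'
-// receives the dot product of four unsigned samples with four signed filter
-// taps, so the two permuted vectors together cover all eight taps. Sketch
-// only, for lanes i = 0..3:
-//
-//   sum[i] += s[i + 0] * f[0] + s[i + 1] * f[1] +
-//             s[i + 2] * f[2] + s[i + 3] * f[3];  // permute_tbl.val[0]
-//   sum[i] += s[i + 4] * f[4] + s[i + 5] * f[5] +
-//             s[i + 6] * f[6] + s[i + 7] * f[7];  // permute_tbl.val[1]
-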
-static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
- const int8x8_t x_filter,
- const uint8x16x3_t permute_tbl,
- const int32x4_t horiz_const) {
- uint8x16_t permuted_samples[3];
- int32x4_t sum[2];
-
- // Permute samples ready for dot product.
- // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
- // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
- permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
-
- // First 4 output values.
- sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], x_filter, 0);
- sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
- // Second 4 output values.
- sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], x_filter, 0);
- sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);
-
- // Narrow and re-pack.
- // We halved the convolution filter values so -1 from the right shift.
- return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
- vshrn_n_s32(sum[1], ROUND0_BITS - 1));
-}
-
-static INLINE void dist_wtd_convolve_2d_horiz_8tap_neon(
- const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
- const int16x8_t x_filter_s16, const int im_h, int w) {
- const int bd = 8;
- // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
- // shifts - which are generally faster than rounding shifts on modern CPUs.
- // (The extra -1 is needed because we halved the filter values.)
- const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) +
- (1 << ((ROUND0_BITS - 1) - 1)));
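-
-  // Worked example (assuming bd == 8, FILTER_BITS == 7, ROUND0_BITS == 3):
-  // horiz_const == (1 << 13) + (1 << 1) == 8194. Pre-adding the extra
-  // 1 << 1 makes the plain right shift by (ROUND0_BITS - 1) == 2 behave
-  // like a round-to-nearest shift.
-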
- // Horizontal filter.
- const int8x8_t x_filter = vmovn_s16(x_filter_s16);
-
- const uint8_t *src_ptr = src;
- int16_t *dst_ptr = im_block;
- int dst_stride = im_stride;
- int height = im_h;
-
- if (w == 4) {
- const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- uint8x16_t s0, s1, s2, s3;
- int16x4_t d0, d1, d2, d3;
-
- do {
- load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
- d1 = convolve8_4_2d_h(s1, x_filter, permute_tbl, horiz_const);
- d2 = convolve8_4_2d_h(s2, x_filter, permute_tbl, horiz_const);
- d3 = convolve8_4_2d_h(s3, x_filter, permute_tbl, horiz_const);
-
- store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- height -= 4;
- } while (height > 0);
- } else {
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- uint8x16_t s0, s1, s2, s3;
- int16x8_t d0, d1, d2, d3;
-
- do {
- const uint8_t *s = src_ptr;
- int16_t *d = dst_ptr;
- int width = w;
-
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
- d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const);
- d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const);
- d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const);
-
- store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- height -= 4;
- } while (height > 0);
- }
-}
-
-#elif AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE int16x4_t convolve8_4_2d_h(uint8x16_t samples,
- const int8x8_t x_filter,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x2_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[2];
- int32x4_t sum;
-
- // Clamp sample range to [-128, 127] for 8-bit signed dot product.
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- // Permute samples ready for dot product.
- // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-
- // Accumulate dot product into 'correction' to account for range clamp.
- sum = vdotq_lane_s32(correction, permuted_samples[0], x_filter, 0);
- sum = vdotq_lane_s32(sum, permuted_samples[1], x_filter, 1);
-
- // We halved the convolution filter values so -1 from the right shift.
- return vshrn_n_s32(sum, ROUND0_BITS - 1);
-}
-
-static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
- const int8x8_t x_filter,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x3_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[3];
- int32x4_t sum[2];
-
- // Clamp sample range to [-128, 127] for 8-bit signed dot product.
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
-  // Permute samples ready for dot product.
- // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
- // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
- permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
- // Accumulate dot product into 'correction' to account for range clamp.
- // First 4 output values.
- sum[0] = vdotq_lane_s32(correction, permuted_samples[0], x_filter, 0);
- sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
- // Second 4 output values.
- sum[1] = vdotq_lane_s32(correction, permuted_samples[1], x_filter, 0);
- sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);
-
- // Narrow and re-pack.
- // We halved the convolution filter values so -1 from the right shift.
- return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
- vshrn_n_s32(sum[1], ROUND0_BITS - 1));
-}
-
-static INLINE void dist_wtd_convolve_2d_horiz_8tap_neon(
- const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
- const int16x8_t x_filter_s16, const int im_h, int w) {
- const int bd = 8;
- const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2));
- // Dot product constants and other shims.
- const int32_t correction_s32 = vaddlvq_s16(vshlq_n_s16(x_filter_s16, 7));
- // Fold horiz_const into the dot-product filter correction constant. The
- // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
- // rounding shifts - which are generally faster than rounding shifts on
- // modern CPUs. (The extra -1 is needed because we halved the filter values.)
- const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const +
- (1 << ((ROUND0_BITS - 1) - 1)));
- const uint8x16_t range_limit = vdupq_n_u8(128);
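-
-  // Derivation of the clamp compensation above (illustrative): subtracting
-  // 128 maps samples from [0, 255] onto [-128, 127], and since
-  //   dot(x - 128, f) == dot(x, f) - 128 * sum(f)
-  // pre-adding correction_s32 == 128 * sum(f) (the vaddlvq_s16 of the taps
-  // shifted left by 7) restores the unclamped result exactly.
-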
- // Horizontal filter.
- const int8x8_t x_filter = vmovn_s16(x_filter_s16);
-
- const uint8_t *src_ptr = src;
- int16_t *dst_ptr = im_block;
- int dst_stride = im_stride;
- int height = im_h;
-
- if (w == 4) {
- const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- uint8x16_t s0, s1, s2, s3;
- int16x4_t d0, d1, d2, d3;
-
- do {
- load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl);
- d1 = convolve8_4_2d_h(s1, x_filter, correction, range_limit, permute_tbl);
- d2 = convolve8_4_2d_h(s2, x_filter, correction, range_limit, permute_tbl);
- d3 = convolve8_4_2d_h(s3, x_filter, correction, range_limit, permute_tbl);
-
- store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- height -= 4;
- } while (height > 0);
- } else {
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- uint8x16_t s0, s1, s2, s3;
- int16x8_t d0, d1, d2, d3;
-
- do {
- const uint8_t *s = src_ptr;
- int16_t *d = dst_ptr;
- int width = w;
-
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit,
- permute_tbl);
- d1 = convolve8_8_2d_h(s1, x_filter, correction, range_limit,
- permute_tbl);
- d2 = convolve8_8_2d_h(s2, x_filter, correction, range_limit,
- permute_tbl);
- d3 = convolve8_8_2d_h(s3, x_filter, correction, range_limit,
- permute_tbl);
-
- store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- height -= 4;
- } while (height > 0);
- }
-}
-
-#else // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
-
-static INLINE int16x4_t convolve8_4_2d_h(const int16x4_t s0, const int16x4_t s1,
- const int16x4_t s2, const int16x4_t s3,
- const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7,
- const int16x8_t x_filter,
- const int16x4_t horiz_const) {
- const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
- const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
-
- int16x4_t sum = horiz_const;
- sum = vmla_lane_s16(sum, s0, x_filter_0_3, 0);
- sum = vmla_lane_s16(sum, s1, x_filter_0_3, 1);
- sum = vmla_lane_s16(sum, s2, x_filter_0_3, 2);
- sum = vmla_lane_s16(sum, s3, x_filter_0_3, 3);
- sum = vmla_lane_s16(sum, s4, x_filter_4_7, 0);
- sum = vmla_lane_s16(sum, s5, x_filter_4_7, 1);
- sum = vmla_lane_s16(sum, s6, x_filter_4_7, 2);
- sum = vmla_lane_s16(sum, s7, x_filter_4_7, 3);
-
- // We halved the convolution filter values so -1 from the right shift.
- return vshr_n_s16(sum, ROUND0_BITS - 1);
-}
-
-static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
- const int16x8_t s2, const int16x8_t s3,
- const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7,
- const int16x8_t x_filter,
- const int16x8_t horiz_const) {
- const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
- const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
-
- int16x8_t sum = horiz_const;
- sum = vmlaq_lane_s16(sum, s0, x_filter_0_3, 0);
- sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1);
- sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2);
- sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3);
- sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0);
- sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1);
- sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2);
- sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3);
-
- // We halved the convolution filter values so -1 from the right shift.
- return vshrq_n_s16(sum, ROUND0_BITS - 1);
-}
-
-static INLINE void dist_wtd_convolve_2d_horiz_8tap_neon(
- const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
- const int16x8_t x_filter, const int im_h, int w) {
- const int bd = 8;
-
- const uint8_t *src_ptr = src;
- int16_t *dst_ptr = im_block;
- int dst_stride = im_stride;
- int height = im_h;
-
- if (w == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
- uint8x8_t t0;
-#if AOM_ARCH_AARCH64
- int16x4_t s8, s9, s10, d1, d2, d3;
- uint8x8_t t1, t2, t3;
-#endif // AOM_ARCH_AARCH64
-
- // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
- // shifts - which are generally faster than rounding shifts on modern CPUs.
- // (The extra -1 is needed because we halved the filter values.)
- const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) +
- (1 << ((ROUND0_BITS - 1) - 1)));
- do {
- __builtin_prefetch(src_ptr + 0 * src_stride);
-#if AOM_ARCH_AARCH64
- __builtin_prefetch(src_ptr + 1 * src_stride);
- __builtin_prefetch(src_ptr + 2 * src_stride);
- __builtin_prefetch(src_ptr + 3 * src_stride);
-
- load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
- __builtin_prefetch(dst_ptr + 0 * dst_stride);
- __builtin_prefetch(dst_ptr + 1 * dst_stride);
- __builtin_prefetch(dst_ptr + 2 * dst_stride);
- __builtin_prefetch(dst_ptr + 3 * dst_stride);
-
- load_u8_8x4(src_ptr + 7, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
- s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-
- d0 = convolve8_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- horiz_const);
- d1 = convolve8_4_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
- horiz_const);
- d2 = convolve8_4_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
- horiz_const);
- d3 = convolve8_4_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
- horiz_const);
-
- transpose_s16_4x4d(&d0, &d1, &d2, &d3);
- store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- height -= 4;
-#else // !AOM_ARCH_AARCH64
- t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); // a0 a1 a2 a3
- s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); // a4 a5 a6 a7
-
- __builtin_prefetch(dst_ptr);
-
-      t0 = vld1_u8(src_ptr + 8);  // a8 a9 a10 a11 a12 a13 a14 a15
- s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-
- s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
- s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
- s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
- s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8
- s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9
- s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10
-
- d0 = convolve8_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- horiz_const);
- vst1_s16(dst_ptr, d0);
-
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height > 0);
- } else {
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, d0;
- uint8x8_t t0;
-#if AOM_ARCH_AARCH64
- int16x8_t s9, s10, s11, s12, s13, s14;
- int16x8_t d1, d2, d3, d4, d5, d6, d7;
- uint8x8_t t1, t2, t3, t4, t5, t6, t7;
-#endif // AOM_ARCH_AARCH64
-
- // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
- // shifts - which are generally faster than rounding shifts on modern CPUs.
- // (The extra -1 is needed because we halved the filter values.)
- const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
- (1 << ((ROUND0_BITS - 1) - 1)));
- do {
- const uint8_t *s;
- int16_t *d = dst_ptr;
- int width = w;
-
-#if AOM_ARCH_AARCH64
- __builtin_prefetch(src_ptr + 0 * src_stride);
- __builtin_prefetch(src_ptr + 1 * src_stride);
- __builtin_prefetch(src_ptr + 2 * src_stride);
- __builtin_prefetch(src_ptr + 3 * src_stride);
- __builtin_prefetch(src_ptr + 4 * src_stride);
- __builtin_prefetch(src_ptr + 5 * src_stride);
- __builtin_prefetch(src_ptr + 6 * src_stride);
- __builtin_prefetch(src_ptr + 7 * src_stride);
-
- load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- s = src_ptr + 7;
-
- __builtin_prefetch(dst_ptr + 0 * dst_stride);
- __builtin_prefetch(dst_ptr + 1 * dst_stride);
- __builtin_prefetch(dst_ptr + 2 * dst_stride);
- __builtin_prefetch(dst_ptr + 3 * dst_stride);
- __builtin_prefetch(dst_ptr + 4 * dst_stride);
- __builtin_prefetch(dst_ptr + 5 * dst_stride);
- __builtin_prefetch(dst_ptr + 6 * dst_stride);
- __builtin_prefetch(dst_ptr + 7 * dst_stride);
-
- do {
- load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
- d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- horiz_const);
- d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
- horiz_const);
- d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
- horiz_const);
- d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
- horiz_const);
- d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
- horiz_const);
- d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
- horiz_const);
- d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
- horiz_const);
- d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
- horiz_const);
-
- transpose_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
- store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
-
- s0 = s8;
- s1 = s9;
- s2 = s10;
- s3 = s11;
- s4 = s12;
- s5 = s13;
- s6 = s14;
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- src_ptr += 8 * src_stride;
- dst_ptr += 8 * dst_stride;
- height -= 8;
-#else // !AOM_ARCH_AARCH64
- t0 = vld1_u8(src_ptr);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
-
- s = src_ptr + 8;
- __builtin_prefetch(dst_ptr);
-
- do {
- t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
- s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
-
- s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
- s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
- s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
- s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
- s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
- s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
- s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
-
- d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- horiz_const);
- vst1q_s16(d, d0);
-
- s0 = s8;
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height > 0);
- }
-}
-
-#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE uint16x4_t
-convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x8_t y_filter, const int32x4_t offset_const) {
- const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
- const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
-
- int32x4_t sum = offset_const;
- // Filter values at indices 0 and 7 are 0.
- sum = vmlal_lane_s16(sum, s0, y_filter_0_3, 1);
- sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2);
- sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3);
- sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0);
- sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1);
- sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2);
-
- return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
-}
-
-static INLINE uint16x8_t
-convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t y_filter, const int32x4_t offset_const) {
- const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
- const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
-
- int32x4_t sum0 = offset_const;
- // Filter values at indices 0 and 7 are 0.
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_0_3, 1);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2);
-
- int32x4_t sum1 = offset_const;
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_0_3, 1);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2);
-
- return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
- vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
-}
-
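-// Why six taps suffice here (illustrative, hypothetical filter values):
-// when an 8-tap interpolation filter has zero end taps, e.g.
-//   { 0, 5, -14, 118, 24, -7, 2, 0 },
-// only taps 1..6 contribute, which is why the multiplies above start at
-// filter lane 1 and callers pass six consecutive rows of samples.
-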
-static INLINE void dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
- int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
- ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
- const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
- const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
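-  // Worked example (assuming bd == 8, FILTER_BITS == 7, ROUND0_BITS == 3,
-  // COMPOUND_ROUND1_BITS == 7): offset_bits == 19, so offset_const holds
-  // 1 << 19 and round_offset == (1 << 12) + (1 << 11) == 6144, the bias
-  // removed again once the two compound predictions have been averaged.
-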
- const uint16_t fwd_offset = conv_params->fwd_offset;
- const uint16_t bck_offset = conv_params->bck_offset;
-
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
-
- if (w == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5;
- uint16x4_t dd0, d0;
- uint8x8_t d01_u8;
-#if AOM_ARCH_AARCH64
- int16x4_t s6, s7, s8;
- uint16x4_t dd1, dd2, dd3, d1, d2, d3;
- uint8x8_t d23_u8;
-#endif // AOM_ARCH_AARCH64
-
- load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
- src_ptr += 5 * src_stride;
-
- do {
-#if AOM_ARCH_AARCH64
- load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
-
- d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
- d1 = convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
- d2 = convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
- d3 = convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
-
- load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
- bck_offset, round_offset_vec, &d01_u8, &d23_u8);
-
- store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
- store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
- store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
- store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
- dst8_ptr += 4 * dst8_stride;
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- h -= 4;
-#else // !AOM_ARCH_AARCH64
- s5 = vld1_s16(src_ptr);
-
- d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
-
- dd0 = vld1_u16(dst_ptr);
-
- compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
- vget_low_s16(round_offset_vec), &d01_u8);
-
- store_u8_4x1(dst8_ptr, d01_u8, 0);
- dst8_ptr += dst8_stride;
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- h--;
-#endif // AOM_ARCH_AARCH64
- } while (h != 0);
- } else {
- int16x8_t s0, s1, s2, s3, s4, s5;
- uint16x8_t dd0, d0;
- uint8x8_t d0_u8;
-#if AOM_ARCH_AARCH64
- int16x8_t s6, s7, s8;
- uint16x8_t dd1, dd2, dd3, d1, d2, d3;
- uint8x8_t d1_u8, d2_u8, d3_u8;
-#endif // AOM_ARCH_AARCH64
-
- do {
- int16_t *s = src_ptr;
- CONV_BUF_TYPE *d = dst_ptr;
- uint8_t *d_u8 = dst8_ptr;
- int height = h;
-
- load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
- s += 5 * src_stride;
-
- do {
-#if AOM_ARCH_AARCH64
- load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
-
- d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
- d1 = convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
- d2 = convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
- d3 = convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
-
- load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
- bck_offset, round_offset_vec, &d0_u8, &d1_u8,
- &d2_u8, &d3_u8);
-
- store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
- d_u8 += 4 * dst8_stride;
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
-#else // !AOM_ARCH_AARCH64
- s5 = vld1q_s16(s);
-
- d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
-
- dd0 = vld1q_u16(d);
-
- compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
- round_offset_vec, &d0_u8);
-
- vst1_u8(d_u8, d0_u8);
- d_u8 += dst8_stride;
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s += src_stride;
- d += dst_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- src_ptr += 8;
- dst_ptr += 8;
- dst8_ptr += 8;
- w -= 8;
- } while (w != 0);
- }
-}
-
-static INLINE void dist_wtd_convolve_2d_vert_6tap_avg_neon(
- int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
- ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
- const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
- const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
-
- if (w == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5;
- uint16x4_t dd0, d0;
- uint8x8_t d01_u8;
-#if AOM_ARCH_AARCH64
- int16x4_t s6, s7, s8;
- uint16x4_t dd1, dd2, dd3, d1, d2, d3;
- uint8x8_t d23_u8;
-#endif // AOM_ARCH_AARCH64
-
- load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
- src_ptr += 5 * src_stride;
-
- do {
-#if AOM_ARCH_AARCH64
- load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
-
- d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
- d1 = convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
- d2 = convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
- d3 = convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
-
- load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
- round_offset_vec, &d01_u8, &d23_u8);
-
- store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
- store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
- store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
- store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
- dst8_ptr += 4 * dst8_stride;
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- h -= 4;
-#else // !AOM_ARCH_AARCH64
- s5 = vld1_s16(src_ptr);
-
- d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
-
- dd0 = vld1_u16(dst_ptr);
-
- compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8);
-
- store_u8_4x1(dst8_ptr, d01_u8, 0);
- dst8_ptr += dst8_stride;
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- h--;
-#endif // AOM_ARCH_AARCH64
- } while (h != 0);
- } else {
- int16x8_t s0, s1, s2, s3, s4, s5;
- uint16x8_t dd0, d0;
- uint8x8_t d0_u8;
-#if AOM_ARCH_AARCH64
- int16x8_t s6, s7, s8;
- uint16x8_t dd1, dd2, dd3, d1, d2, d3;
- uint8x8_t d1_u8, d2_u8, d3_u8;
-#endif // AOM_ARCH_AARCH64
-
- do {
- int16_t *s = src_ptr;
- CONV_BUF_TYPE *d = dst_ptr;
- uint8_t *d_u8 = dst8_ptr;
- int height = h;
-
- load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
- s += 5 * src_stride;
-
- do {
-#if AOM_ARCH_AARCH64
- load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
-
- d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
- d1 = convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
- d2 = convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
- d3 = convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
-
- load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
- round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
-
- store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
- d_u8 += 4 * dst8_stride;
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
-#else // !AOM_ARCH_AARCH64
- s5 = vld1q_s16(s);
-
- d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
-
- dd0 = vld1q_u16(d);
-
- compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
-
- vst1_u8(d_u8, d0_u8);
- d_u8 += dst8_stride;
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s += src_stride;
- d += dst_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- src_ptr += 8;
- dst_ptr += 8;
- dst8_ptr += 8;
- w -= 8;
- } while (w != 0);
- }
-}
-
-static INLINE void dist_wtd_convolve_2d_vert_6tap_neon(
- int16_t *src_ptr, const int src_stride, ConvolveParams *conv_params,
- const int16x8_t y_filter, int h, int w) {
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
-
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
-
- if (w == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5;
- uint16x4_t d0;
-#if AOM_ARCH_AARCH64
- int16x4_t s6, s7, s8;
- uint16x4_t d1, d2, d3;
-#endif // AOM_ARCH_AARCH64
-
- load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
- src_ptr += 5 * src_stride;
-
- do {
-#if AOM_ARCH_AARCH64
- load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
-
- d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
- d1 = convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
- d2 = convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
- d3 = convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
-
- store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- h -= 4;
-#else // !AOM_ARCH_AARCH64
- s5 = vld1_s16(src_ptr);
-
- d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
-
- vst1_u16(dst_ptr, d0);
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- h--;
-#endif // AOM_ARCH_AARCH64
- } while (h != 0);
- } else {
- int16x8_t s0, s1, s2, s3, s4, s5;
- uint16x8_t d0;
-#if AOM_ARCH_AARCH64
- int16x8_t s6, s7, s8;
- uint16x8_t d1, d2, d3;
-#endif // AOM_ARCH_AARCH64
-
- do {
- int16_t *s = src_ptr;
- CONV_BUF_TYPE *d = dst_ptr;
- int height = h;
-
- load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
- s += 5 * src_stride;
-
- do {
-#if AOM_ARCH_AARCH64
- load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
-
- d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
- d1 = convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
- d2 = convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
- d3 = convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
-
- store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
-#else // !AOM_ARCH_AARCH64
- s5 = vld1q_s16(s);
-
- d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
-
- vst1q_u16(d, d0);
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s += src_stride;
- d += dst_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- src_ptr += 8;
- dst_ptr += 8;
- w -= 8;
- } while (w != 0);
- }
-}
-
-static INLINE uint16x4_t
-convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7,
- const int16x8_t y_filter, const int32x4_t offset_const) {
- const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
- const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
-
- int32x4_t sum = offset_const;
- sum = vmlal_lane_s16(sum, s0, y_filter_0_3, 0);
- sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
- sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
- sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
- sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
- sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
- sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
- sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
-
- return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
-}
-
-static INLINE uint16x8_t
-convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7,
- const int16x8_t y_filter, const int32x4_t offset_const) {
- const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
- const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
-
- int32x4_t sum0 = offset_const;
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_0_3, 0);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
-
- int32x4_t sum1 = offset_const;
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_0_3, 0);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
-
- return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
- vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
-}
-
-static INLINE void dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
- int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
- ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
- const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
- const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
- const uint16_t fwd_offset = conv_params->fwd_offset;
- const uint16_t bck_offset = conv_params->bck_offset;
-
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
-
- if (w == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint16x4_t dd0, d0;
- uint8x8_t d01_u8;
-#if AOM_ARCH_AARCH64
- int16x4_t s8, s9, s10;
- uint16x4_t dd1, dd2, dd3, d1, d2, d3;
- uint8x8_t d23_u8;
-#endif // AOM_ARCH_AARCH64
-
- load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- src_ptr += 7 * src_stride;
-
- do {
-#if AOM_ARCH_AARCH64
- load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
-
- d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- offset_const);
- d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
- offset_const);
- d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
- offset_const);
- d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
- offset_const);
-
- load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
- bck_offset, round_offset_vec, &d01_u8, &d23_u8);
-
- store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
- store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
- store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
- store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
- dst8_ptr += 4 * dst8_stride;
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- h -= 4;
-#else // !AOM_ARCH_AARCH64
- s7 = vld1_s16(src_ptr);
-
- d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- offset_const);
-
- dd0 = vld1_u16(dst_ptr);
-
- compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
- vget_low_s16(round_offset_vec), &d01_u8);
-
- store_u8_4x1(dst8_ptr, d01_u8, 0);
- dst8_ptr += dst8_stride;
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s5 = s6;
- s6 = s7;
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- h--;
-#endif // AOM_ARCH_AARCH64
- } while (h != 0);
- } else {
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint16x8_t dd0, d0;
- uint8x8_t d0_u8;
-#if AOM_ARCH_AARCH64
- int16x8_t s8, s9, s10;
- uint16x8_t dd1, dd2, dd3, d1, d2, d3;
- uint8x8_t d1_u8, d2_u8, d3_u8;
-#endif // AOM_ARCH_AARCH64
-
- do {
- int16_t *s = src_ptr;
- CONV_BUF_TYPE *d = dst_ptr;
- uint8_t *d_u8 = dst8_ptr;
- int height = h;
-
- load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- s += 7 * src_stride;
-
- do {
-#if AOM_ARCH_AARCH64
- load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-
- d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- offset_const);
- d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
- offset_const);
- d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
- offset_const);
- d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
- offset_const);
-
- load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
- bck_offset, round_offset_vec, &d0_u8, &d1_u8,
- &d2_u8, &d3_u8);
-
- store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
- d_u8 += 4 * dst8_stride;
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
-#else // !AOM_ARCH_AARCH64
- s7 = vld1q_s16(s);
-
- d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- offset_const);
-
- dd0 = vld1q_u16(d);
-
- compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
- round_offset_vec, &d0_u8);
-
- vst1_u8(d_u8, d0_u8);
- d_u8 += dst8_stride;
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s5 = s6;
- s6 = s7;
- s += src_stride;
- d += dst_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- src_ptr += 8;
- dst_ptr += 8;
- dst8_ptr += 8;
- w -= 8;
- } while (w != 0);
- }
-}
-
-static INLINE void dist_wtd_convolve_2d_vert_8tap_avg_neon(
- int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
- ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
- const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
- const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
-
- if (w == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint16x4_t dd0, d0;
- uint8x8_t d01_u8;
-#if AOM_ARCH_AARCH64
- int16x4_t s8, s9, s10;
- uint16x4_t dd1, dd2, dd3, d1, d2, d3;
- uint8x8_t d23_u8;
-#endif // AOM_ARCH_AARCH64
-
- load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- src_ptr += 7 * src_stride;
-
- do {
-#if AOM_ARCH_AARCH64
- load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
-
- d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- offset_const);
- d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
- offset_const);
- d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
- offset_const);
- d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
- offset_const);
-
- load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
- round_offset_vec, &d01_u8, &d23_u8);
-
- store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
- store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
- store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
- store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
- dst8_ptr += 4 * dst8_stride;
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- h -= 4;
-#else // !AOM_ARCH_AARCH64
- s7 = vld1_s16(src_ptr);
-
- d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- offset_const);
-
- dd0 = vld1_u16(dst_ptr);
-
- compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8);
-
- store_u8_4x1(dst8_ptr, d01_u8, 0);
- dst8_ptr += dst8_stride;
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s5 = s6;
- s6 = s7;
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- h--;
-#endif // AOM_ARCH_AARCH64
- } while (h != 0);
- } else {
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint16x8_t dd0, d0;
- uint8x8_t d0_u8;
-#if AOM_ARCH_AARCH64
- int16x8_t s8, s9, s10;
- uint16x8_t dd1, dd2, dd3, d1, d2, d3;
- uint8x8_t d1_u8, d2_u8, d3_u8;
-#endif // AOM_ARCH_AARCH64
-
- do {
- int16_t *s = src_ptr;
- CONV_BUF_TYPE *d = dst_ptr;
- uint8_t *d_u8 = dst8_ptr;
- int height = h;
-
- load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- s += 7 * src_stride;
-
- do {
-#if AOM_ARCH_AARCH64
- load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-
- d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- offset_const);
- d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
- offset_const);
- d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
- offset_const);
- d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
- offset_const);
-
- load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
- round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
-
- store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
- d_u8 += 4 * dst8_stride;
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
-#else // !AOM_ARCH_AARCH64
- s7 = vld1q_s16(s);
-
- d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- offset_const);
-
- dd0 = vld1q_u16(d);
-
- compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
-
- vst1_u8(d_u8, d0_u8);
- d_u8 += dst8_stride;
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s5 = s6;
- s6 = s7;
- s += src_stride;
- d += dst_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- src_ptr += 8;
- dst_ptr += 8;
- dst8_ptr += 8;
- w -= 8;
- } while (w != 0);
- }
-}
-
-static INLINE void dist_wtd_convolve_2d_vert_8tap_neon(
- int16_t *src_ptr, const int src_stride, ConvolveParams *conv_params,
- const int16x8_t y_filter, int h, int w) {
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
-
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
-
- if (w == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint16x4_t d0;
-#if AOM_ARCH_AARCH64
- int16x4_t s8, s9, s10;
- uint16x4_t d1, d2, d3;
-#endif // AOM_ARCH_AARCH64
-
- load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- src_ptr += 7 * src_stride;
-
- do {
-#if AOM_ARCH_AARCH64
- load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
-
- d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- offset_const);
- d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
- offset_const);
- d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
- offset_const);
- d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
- offset_const);
-
- store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- h -= 4;
-#else // !AOM_ARCH_AARCH64
- s7 = vld1_s16(src_ptr);
-
- d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- offset_const);
-
- vst1_u16(dst_ptr, d0);
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s5 = s6;
- s6 = s7;
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- h--;
-#endif // AOM_ARCH_AARCH64
- } while (h != 0);
- } else {
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint16x8_t d0;
-#if AOM_ARCH_AARCH64
- int16x8_t s8, s9, s10;
- uint16x8_t d1, d2, d3;
-#endif // AOM_ARCH_AARCH64
-
- do {
- int16_t *s = src_ptr;
- CONV_BUF_TYPE *d = dst_ptr;
- int height = h;
-
- load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- s += 7 * src_stride;
-
- do {
-#if AOM_ARCH_AARCH64
- load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-
- d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- offset_const);
- d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
- offset_const);
- d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
- offset_const);
- d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
- offset_const);
-
- store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
-#else // !AOM_ARCH_AARCH64
- s7 = vld1q_s16(s);
-
- d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- offset_const);
-
- vst1q_u16(d, d0);
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s5 = s6;
- s6 = s7;
- s += src_stride;
- d += dst_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- src_ptr += 8;
- dst_ptr += 8;
- w -= 8;
- } while (w != 0);
- }
-}
-
-void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride,
- uint8_t *dst8, int dst8_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_qn, const int subpel_y_qn,
- ConvolveParams *conv_params) {
- assert(w % 4 == 0);
- assert(h % 4 == 0);
-
- DECLARE_ALIGNED(16, int16_t,
- im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
-
- const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
- const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
-
- const int im_h = h + filter_params_y->taps - 1;
- const int im_stride = MAX_SB_SIZE;
- const int vert_offset = filter_params_y->taps / 2 - 1;
- const int horiz_offset = filter_params_x->taps / 2 - 1;
- const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
- const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
- const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_qn & SUBPEL_MASK);
-
- // Filter values are even, so downshift by 1 to reduce intermediate precision
- // requirements.
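- // Halving is exact: the taps are all even and sum to 1 << FILTER_BITS, so
- // no precision is lost and the subsequent right shifts are reduced by one
- // to compensate.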
- const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
- const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
-
- dist_wtd_convolve_2d_horiz_8tap_neon(src_ptr, src_stride, im_block, im_stride,
- x_filter, im_h, w);
-
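- // The horizontal pass fills im_block for an 8-tap-high vertical window,
- // so the 6-tap path starts one row down (im_block + im_stride), lining
- // the shorter filter up with taps 1..6 of the padded 8-tap kernel.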
- if (clamped_y_taps == 6) {
- if (conv_params->do_average) {
- if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
- dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
- im_block + im_stride, im_stride, dst8, dst8_stride, conv_params,
- y_filter, h, w);
- } else {
- dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block + im_stride, im_stride,
- dst8, dst8_stride, conv_params,
- y_filter, h, w);
- }
- } else {
- dist_wtd_convolve_2d_vert_6tap_neon(im_block + im_stride, im_stride,
- conv_params, y_filter, h, w);
- }
- } else {
- if (conv_params->do_average) {
- if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
- dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
- im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
- w);
- } else {
- dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8,
- dst8_stride, conv_params,
- y_filter, h, w);
- }
- } else {
- dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params,
- y_filter, h, w);
- }
- }
-}
-
-static INLINE void dist_wtd_convolve_2d_copy_dist_wtd_avg_neon(
- const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
- int h, ConvolveParams *conv_params) {
- assert(w % 4 == 0);
- assert(h % 4 == 0);
-
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
- const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
- const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));
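- // With COMPOUND_ROUND1_BITS == FILTER_BITS, the two identity filter
- // stages of the compound copy reduce to a left shift of
- // FILTER_BITS - ROUND0_BITS bits; vmlal_u8 below applies that shift and
- // adds round_offset in a single step.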
-
- const uint16_t fwd_offset = conv_params->fwd_offset;
- const uint16_t bck_offset = conv_params->bck_offset;
-
- CONV_BUF_TYPE *dst = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
- int height = h;
-
- if (w == 4) {
- uint8x8_t s0, s1, s2, s3, d01, d23;
- uint16x4_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
-
- do {
- load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
- d1 = vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
- d2 = vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
- d3 = vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));
-
- load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_dist_wtd_avg_4x4(
- dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset,
- vreinterpretq_s16_u16(round_offset_vec), &d01, &d23);
-
- store_u8_4x1(dst8 + 0 * dst8_stride, d01, 0);
- store_u8_4x1(dst8 + 1 * dst8_stride, d01, 1);
- store_u8_4x1(dst8 + 2 * dst8_stride, d23, 0);
- store_u8_4x1(dst8 + 3 * dst8_stride, d23, 1);
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- dst8 += 4 * dst8_stride;
- height -= 4;
- } while (height != 0);
- } else {
- uint8x8_t s0, s1, s2, s3, d0_u8, d1_u8, d2_u8, d3_u8;
- uint16x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
-
- do {
- const uint8_t *s = src;
- CONV_BUF_TYPE *d = dst;
- uint8_t *d_u8 = dst8;
- int width = w;
-
- do {
- load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
- d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
- d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
- d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);
-
- load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
- bck_offset,
- vreinterpretq_s16_u16(round_offset_vec),
- &d0_u8, &d1_u8, &d2_u8, &d3_u8);
-
- store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-
- s += 8;
- d += 8;
- d_u8 += 8;
- width -= 8;
- } while (width != 0);
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- dst8 += 4 * dst8_stride;
- height -= 4;
- } while (height != 0);
- }
-}
-
-static INLINE void dist_wtd_convolve_2d_copy_avg_neon(
- const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
- int h, ConvolveParams *conv_params) {
- assert(w % 4 == 0);
- assert(h % 4 == 0);
-
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
- const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
- const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));
-
- CONV_BUF_TYPE *dst = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
- int height = h;
-
- if (w == 4) {
- uint8x8_t s0, s1, s2, s3, d01, d23;
- uint16x4_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
-
- do {
- load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
- d1 = vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
- d2 = vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
- d3 = vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));
-
- load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
- vreinterpretq_s16_u16(round_offset_vec), &d01,
- &d23);
-
- store_u8_4x1(dst8 + 0 * dst8_stride, d01, 0);
- store_u8_4x1(dst8 + 1 * dst8_stride, d01, 1);
- store_u8_4x1(dst8 + 2 * dst8_stride, d23, 0);
- store_u8_4x1(dst8 + 3 * dst8_stride, d23, 1);
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- dst8 += 4 * dst8_stride;
- height -= 4;
- } while (height != 0);
- } else {
- uint8x8_t s0, s1, s2, s3, d0_u8, d1_u8, d2_u8, d3_u8;
- uint16x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
-
- do {
- const uint8_t *s = src;
- CONV_BUF_TYPE *d = dst;
- uint8_t *d_u8 = dst8;
- int width = w;
-
- do {
- load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
- d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
- d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
- d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);
-
- load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
- vreinterpretq_s16_u16(round_offset_vec), &d0_u8,
- &d1_u8, &d2_u8, &d3_u8);
-
- store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-
- s += 8;
- d += 8;
- d_u8 += 8;
- width -= 8;
- } while (width != 0);
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- dst8 += 4 * dst8_stride;
- height -= 4;
- } while (height != 0);
- }
-}
-
-static INLINE void dist_wtd_convolve_2d_copy_neon(const uint8_t *src,
- int src_stride, int w, int h,
- ConvolveParams *conv_params) {
- assert(w % 4 == 0);
- assert(h % 4 == 0);
-
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
- const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
- const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));
-
- CONV_BUF_TYPE *dst = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
- int height = h;
-
- if (w == 4) {
- uint8x8_t s0, s1, s2, s3;
- uint16x4_t d0, d1, d2, d3;
-
- do {
- load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
- d1 = vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
- d2 = vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
- d3 = vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));
-
- store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- height -= 4;
- } while (height != 0);
- } else {
- uint8x8_t s0, s1, s2, s3;
- uint16x8_t d0, d1, d2, d3;
-
- do {
- const uint8_t *s = src;
- CONV_BUF_TYPE *d = dst;
- int width = w;
-
- do {
- load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
- d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
- d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
- d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);
-
- store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- height -= 4;
- } while (height != 0);
- }
-}
-
-void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
- uint8_t *dst8, int dst8_stride, int w,
- int h, ConvolveParams *conv_params) {
- if (conv_params->do_average) {
- if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
- dist_wtd_convolve_2d_copy_dist_wtd_avg_neon(
- src, src_stride, dst8, dst8_stride, w, h, conv_params);
- } else {
- dist_wtd_convolve_2d_copy_avg_neon(src, src_stride, dst8, dst8_stride, w,
- h, conv_params);
- }
- } else {
- dist_wtd_convolve_2d_copy_neon(src, src_stride, w, h, conv_params);
- }
-}
-
-#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
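-// Three flavours of the horizontal (x) kernels follow: this i8mm path
-// (vusdot, unsigned samples), a dot-product path (vdot, with samples biased
-// into the signed range) and a generic multiply-accumulate fallback.
-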
-static INLINE uint16x4_t convolve8_4_x(uint8x16_t samples,
- const int8x8_t x_filter,
- const uint8x16x2_t permute_tbl,
- const int32x4_t round_offset) {
- uint8x16_t permuted_samples[2];
- int32x4_t sum;
-
- // Permute samples ready for dot product.
- // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
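- // The two tables select the four overlapping 4-sample windows for taps
- // 0-3 and 4-7 respectively, so two vusdot accumulations produce four
- // output pixels.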
-
- // First 4 output values.
- sum = vusdotq_lane_s32(round_offset, permuted_samples[0], x_filter, 0);
- sum = vusdotq_lane_s32(sum, permuted_samples[1], x_filter, 1);
-
- // We halved the convolution filter values, so subtract 1 from the right
- // shift.
- return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1));
-}
-
-static INLINE uint16x8_t convolve8_8_x(uint8x16_t samples,
- const int8x8_t x_filter,
- const uint8x16x3_t permute_tbl,
- const int32x4_t round_offset) {
- uint8x16_t permuted_samples[3];
- int32x4_t sum[2];
-
- // Permute samples ready for dot product.
- // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
- // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
- permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
-
- // First 4 output values.
- sum[0] = vusdotq_lane_s32(round_offset, permuted_samples[0], x_filter, 0);
- sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
- // Second 4 output values.
- sum[1] = vusdotq_lane_s32(round_offset, permuted_samples[1], x_filter, 0);
- sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);
-
- // Narrow and re-pack.
- // We halved the convolution filter values, so subtract 1 from the right
- // shift.
- int16x8_t res = vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
- vshrn_n_s32(sum[1], ROUND0_BITS - 1));
- return vreinterpretq_u16_s16(res);
-}
-
-static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon(
- const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
- int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
- ConvolveParams *conv_params) {
- assert(w % 4 == 0);
- assert(h % 4 == 0);
-
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
- const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
- // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
- // shifts, which are generally faster than rounding shifts on modern CPUs.
- // (The extra -1 is needed because we halved the filter values.)
- const int32x4_t round_offset_shim = vdupq_n_s32(
- (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1)));
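- // Since (x + (1 << (n - 1))) >> n == vrshr(x, n), pre-adding the rounding
- // constant here lets convolve8_{4,8}_x use plain vshrn_n_s32 with
- // n == ROUND0_BITS - 1.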
-
- const uint16_t fwd_offset = conv_params->fwd_offset;
- const uint16_t bck_offset = conv_params->bck_offset;
-
- // Horizontal filter.
- const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
- // Filter values are even, so downshift by 1 to reduce intermediate precision
- // requirements.
- const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
-
- const int horiz_offset = filter_params_x->taps / 2 - 1;
- const uint8_t *src_ptr = src - horiz_offset;
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- uint8_t *dst8_ptr = dst8;
- int dst_stride = conv_params->dst_stride;
- int height = h;
-
- if (w == 4) {
- const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-
- do {
- uint8x16_t s0, s1, s2, s3;
- uint16x4_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
- uint8x8_t d01_u8, d23_u8;
-
- load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_4_x(s0, x_filter, permute_tbl, round_offset_shim);
- d1 = convolve8_4_x(s1, x_filter, permute_tbl, round_offset_shim);
- d2 = convolve8_4_x(s2, x_filter, permute_tbl, round_offset_shim);
- d3 = convolve8_4_x(s3, x_filter, permute_tbl, round_offset_shim);
-
- load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
- bck_offset, round_offset_vec, &d01_u8, &d23_u8);
-
- store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
- store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
- store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
- store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- dst8_ptr += 4 * dst8_stride;
- height -= 4;
- } while (height != 0);
- } else {
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-
- do {
- const uint8_t *s = src_ptr;
- CONV_BUF_TYPE *d = dst_ptr;
- uint8_t *d_u8 = dst8_ptr;
- int width = w;
-
- do {
- uint8x16_t s0, s1, s2, s3;
- uint16x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
- uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
-
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim);
- d1 = convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim);
- d2 = convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim);
- d3 = convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim);
-
- load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
- bck_offset, round_offset_vec, &d0_u8, &d1_u8,
- &d2_u8, &d3_u8);
-
- store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-
- s += 8;
- d += 8;
- d_u8 += 8;
- width -= 8;
- } while (width != 0);
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- dst8_ptr += 4 * dst8_stride;
- height -= 4;
- } while (height != 0);
- }
-}
-
-static INLINE void dist_wtd_convolve_x_avg_neon(
- const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
- int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
- ConvolveParams *conv_params) {
- assert(w % 4 == 0);
- assert(h % 4 == 0);
-
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
- const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
- // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
- // shifts, which are generally faster than rounding shifts on modern CPUs.
- // (The extra -1 is needed because we halved the filter values.)
- const int32x4_t round_offset_shim = vdupq_n_s32(
- (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1)));
-
- // Horizontal filter.
- const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
- // Filter values are even, so downshift by 1 to reduce intermediate precision
- // requirements.
- const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
-
- const int horiz_offset = filter_params_x->taps / 2 - 1;
- const uint8_t *src_ptr = src - horiz_offset;
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- uint8_t *dst8_ptr = dst8;
- int dst_stride = conv_params->dst_stride;
- int height = h;
-
- if (w == 4) {
- const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-
- do {
- uint8x16_t s0, s1, s2, s3;
- uint16x4_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
- uint8x8_t d01_u8, d23_u8;
-
- load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_4_x(s0, x_filter, permute_tbl, round_offset_shim);
- d1 = convolve8_4_x(s1, x_filter, permute_tbl, round_offset_shim);
- d2 = convolve8_4_x(s2, x_filter, permute_tbl, round_offset_shim);
- d3 = convolve8_4_x(s3, x_filter, permute_tbl, round_offset_shim);
-
- load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
- round_offset_vec, &d01_u8, &d23_u8);
-
- store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
- store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
- store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
- store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- dst8_ptr += 4 * dst8_stride;
- height -= 4;
- } while (height != 0);
- } else {
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-
- do {
- const uint8_t *s = src_ptr;
- CONV_BUF_TYPE *d = dst_ptr;
- uint8_t *d_u8 = dst8_ptr;
- int width = w;
-
- do {
- uint8x16_t s0, s1, s2, s3;
- uint16x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
- uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
-
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim);
- d1 = convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim);
- d2 = convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim);
- d3 = convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim);
-
- load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
- round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
-
- store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-
- s += 8;
- d += 8;
- d_u8 += 8;
- width -= 8;
- } while (width != 0);
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- dst8_ptr += 4 * dst8_stride;
- height -= 4;
- } while (height != 0);
- }
-}
-
-static INLINE void dist_wtd_convolve_x_neon(
- const uint8_t *src, int src_stride, int w, int h,
- const InterpFilterParams *filter_params_x, const int subpel_x_qn,
- ConvolveParams *conv_params) {
- assert(w % 4 == 0);
- assert(h % 4 == 0);
-
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
- // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
- // shifts, which are generally faster than rounding shifts on modern CPUs.
- // (The extra -1 is needed because we halved the filter values.)
- const int32x4_t round_offset_shim = vdupq_n_s32(
- (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1)));
-
- // Horizontal filter.
- const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
- // Filter values are even, so downshift by 1 to reduce intermediate precision
- // requirements.
- const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
-
- const int horiz_offset = filter_params_x->taps / 2 - 1;
- const uint8_t *src_ptr = src - horiz_offset;
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- int height = h;
-
- if (w == 4) {
- const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-
- do {
- uint8x16_t s0, s1, s2, s3;
- uint16x4_t d0, d1, d2, d3;
-
- load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_4_x(s0, x_filter, permute_tbl, round_offset_shim);
- d1 = convolve8_4_x(s1, x_filter, permute_tbl, round_offset_shim);
- d2 = convolve8_4_x(s2, x_filter, permute_tbl, round_offset_shim);
- d3 = convolve8_4_x(s3, x_filter, permute_tbl, round_offset_shim);
-
- store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- height -= 4;
- } while (height != 0);
- } else {
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-
- do {
- const uint8_t *s = src_ptr;
- CONV_BUF_TYPE *d = dst_ptr;
- int width = w;
-
- do {
- uint8x16_t s0, s1, s2, s3;
- uint16x8_t d0, d1, d2, d3;
-
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim);
- d1 = convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim);
- d2 = convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim);
- d3 = convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim);
-
- store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- height -= 4;
- } while (height != 0);
- }
-}
-
-#elif AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE uint16x4_t convolve8_4_x(uint8x16_t samples,
- const int8x8_t x_filter,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x2_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[2];
- int32x4_t sum;
-
- // Clamp sample range to [-128, 127] for 8-bit signed dot product.
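- // sum((x - 128) * f) == sum(x * f) - 128 * sum(f), so the bias introduced
- // here is cancelled by the precomputed 'correction' accumulator.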
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- // Permute samples ready for dot product.
- // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-
- // Accumulate dot product into 'correction' to account for range clamp.
- sum = vdotq_lane_s32(correction, permuted_samples[0], x_filter, 0);
- sum = vdotq_lane_s32(sum, permuted_samples[1], x_filter, 1);
-
- // We halved the convolution filter values, so subtract 1 from the right
- // shift.
- return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1));
-}
-
-static INLINE uint16x8_t convolve8_8_x(uint8x16_t samples,
- const int8x8_t x_filter,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x3_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[3];
- int32x4_t sum[2];
-
- // Clamp sample range to [-128, 127] for 8-bit signed dot product.
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- // Permute samples ready for dot product.
- // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
- // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
- permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
- // Accumulate dot product into 'correction' to account for range clamp.
- // First 4 output values.
- sum[0] = vdotq_lane_s32(correction, permuted_samples[0], x_filter, 0);
- sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
- // Second 4 output values.
- sum[1] = vdotq_lane_s32(correction, permuted_samples[1], x_filter, 0);
- sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);
-
- // Narrow and re-pack.
- // We halved the convolution filter values, so subtract 1 from the right
- // shift.
- int16x8_t res = vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
- vshrn_n_s32(sum[1], ROUND0_BITS - 1));
- return vreinterpretq_u16_s16(res);
-}
-
-static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon(
- const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
- int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
- ConvolveParams *conv_params) {
- assert(w % 4 == 0);
- assert(h % 4 == 0);
-
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
- const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
- const uint16_t fwd_offset = conv_params->fwd_offset;
- const uint16_t bck_offset = conv_params->bck_offset;
-
- // Horizontal filter.
- const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
- // Filter values are even, so downshift by 1 to reduce intermediate precision
- // requirements.
- const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
-
- // Dot-product constants and other shims.
- const uint8x16_t range_limit = vdupq_n_u8(128);
- const int32_t correction_s32 = vaddlvq_s16(vshll_n_s8(x_filter, 7));
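- // vshll_n_s8(x_filter, 7) widens and multiplies by 128, so correction_s32
- // == 128 * sum(filter values): exactly the bias added by the range clamp.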
- // Fold round_offset into the dot-product filter correction constant. The
- // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
- // rounding shifts, which are generally faster than rounding shifts on
- // modern CPUs. (The extra -1 is needed because we halved the filter values.)
- int32x4_t correction =
- vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
- (1 << ((ROUND0_BITS - 1) - 1)));
-
- const int horiz_offset = filter_params_x->taps / 2 - 1;
- const uint8_t *src_ptr = src - horiz_offset;
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- uint8_t *dst8_ptr = dst8;
- int dst_stride = conv_params->dst_stride;
- int height = h;
-
- if (w == 4) {
- const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-
- do {
- uint8x16_t s0, s1, s2, s3;
- uint16x4_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
- uint8x8_t d01_u8, d23_u8;
-
- load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_4_x(s0, x_filter, correction, range_limit, permute_tbl);
- d1 = convolve8_4_x(s1, x_filter, correction, range_limit, permute_tbl);
- d2 = convolve8_4_x(s2, x_filter, correction, range_limit, permute_tbl);
- d3 = convolve8_4_x(s3, x_filter, correction, range_limit, permute_tbl);
-
- load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
- bck_offset, round_offset_vec, &d01_u8, &d23_u8);
-
- store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
- store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
- store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
- store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- dst8_ptr += 4 * dst8_stride;
- height -= 4;
- } while (height != 0);
- } else {
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-
- do {
- const uint8_t *s = src_ptr;
- CONV_BUF_TYPE *d = dst_ptr;
- uint8_t *d_u8 = dst8_ptr;
- int width = w;
-
- do {
- uint8x16_t s0, s1, s2, s3;
- uint16x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
- uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
-
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl);
- d1 = convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl);
- d2 = convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl);
- d3 = convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl);
-
- load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
- bck_offset, round_offset_vec, &d0_u8, &d1_u8,
- &d2_u8, &d3_u8);
-
- store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-
- s += 8;
- d += 8;
- d_u8 += 8;
- width -= 8;
- } while (width != 0);
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- dst8_ptr += 4 * dst8_stride;
- height -= 4;
- } while (height != 0);
- }
-}
-
-static INLINE void dist_wtd_convolve_x_avg_neon(
- const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
- int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
- ConvolveParams *conv_params) {
- assert(w % 4 == 0);
- assert(h % 4 == 0);
-
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
- const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
- // Horizontal filter.
- const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
- // Filter values are even, so downshift by 1 to reduce intermediate precision
- // requirements.
- const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
-
- // Dot-product constants and other shims.
- const uint8x16_t range_limit = vdupq_n_u8(128);
- const int32_t correction_s32 = vaddlvq_s16(vshll_n_s8(x_filter, 7));
- // Fold round_offset into the dot-product filter correction constant. The
- // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
- // rounding shifts, which are generally faster than rounding shifts on
- // modern CPUs. (The extra -1 is needed because we halved the filter values.)
- int32x4_t correction =
- vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
- (1 << ((ROUND0_BITS - 1) - 1)));
-
- const int horiz_offset = filter_params_x->taps / 2 - 1;
- const uint8_t *src_ptr = src - horiz_offset;
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- uint8_t *dst8_ptr = dst8;
- int dst_stride = conv_params->dst_stride;
- int height = h;
-
- if (w == 4) {
- const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-
- do {
- uint8x16_t s0, s1, s2, s3;
- uint16x4_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
- uint8x8_t d01_u8, d23_u8;
-
- load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_4_x(s0, x_filter, correction, range_limit, permute_tbl);
- d1 = convolve8_4_x(s1, x_filter, correction, range_limit, permute_tbl);
- d2 = convolve8_4_x(s2, x_filter, correction, range_limit, permute_tbl);
- d3 = convolve8_4_x(s3, x_filter, correction, range_limit, permute_tbl);
-
- load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
- round_offset_vec, &d01_u8, &d23_u8);
-
- store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
- store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
- store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
- store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- dst8_ptr += 4 * dst8_stride;
- height -= 4;
- } while (height != 0);
- } else {
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-
- do {
- const uint8_t *s = src_ptr;
- CONV_BUF_TYPE *d = dst_ptr;
- uint8_t *d_u8 = dst8_ptr;
- int width = w;
-
- do {
- uint8x16_t s0, s1, s2, s3;
- uint16x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
- uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
-
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl);
- d1 = convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl);
- d2 = convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl);
- d3 = convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl);
-
- load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
- round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
-
- store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-
- s += 8;
- d += 8;
- d_u8 += 8;
- width -= 8;
- } while (width != 0);
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- dst8_ptr += 4 * dst8_stride;
- height -= 4;
- } while (height != 0);
- }
-}
-
-static INLINE void dist_wtd_convolve_x_neon(
- const uint8_t *src, int src_stride, int w, int h,
- const InterpFilterParams *filter_params_x, const int subpel_x_qn,
- ConvolveParams *conv_params) {
- assert(w % 4 == 0);
- assert(h % 4 == 0);
-
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-
- // Horizontal filter.
- const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
- // Filter values are even, so downshift by 1 to reduce intermediate precision
- // requirements.
- const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
-
- // Dot-product constants and other shims.
- const uint8x16_t range_limit = vdupq_n_u8(128);
- const int32_t correction_s32 = vaddlvq_s16(vshll_n_s8(x_filter, 7));
- // Fold round_offset into the dot-product filter correction constant. The
- // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
- // rounding shifts, which are generally faster than rounding shifts on
- // modern CPUs. (The extra -1 is needed because we halved the filter values.)
- int32x4_t correction =
- vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
- (1 << ((ROUND0_BITS - 1) - 1)));
-
- const int horiz_offset = filter_params_x->taps / 2 - 1;
- const uint8_t *src_ptr = src - horiz_offset;
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- int height = h;
-
- if (w == 4) {
- const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-
- do {
- uint8x16_t s0, s1, s2, s3;
- uint16x4_t d0, d1, d2, d3;
-
- load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_4_x(s0, x_filter, correction, range_limit, permute_tbl);
- d1 = convolve8_4_x(s1, x_filter, correction, range_limit, permute_tbl);
- d2 = convolve8_4_x(s2, x_filter, correction, range_limit, permute_tbl);
- d3 = convolve8_4_x(s3, x_filter, correction, range_limit, permute_tbl);
-
- store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- height -= 4;
- } while (height != 0);
- } else {
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-
- do {
- const uint8_t *s = src_ptr;
- CONV_BUF_TYPE *d = dst_ptr;
- int width = w;
-
- do {
- uint8x16_t s0, s1, s2, s3;
- uint16x8_t d0, d1, d2, d3;
-
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl);
- d1 = convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl);
- d2 = convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl);
- d3 = convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl);
-
- store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- height -= 4;
- } while (height != 0);
- }
-}
-
-#else // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
-
-static INLINE uint16x4_t convolve8_4_x(const int16x4_t s0, const int16x4_t s1,
- const int16x4_t s2, const int16x4_t s3,
- const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7,
- const int16x8_t x_filter,
- const int16x4_t round_offset) {
- const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
- const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
-
- int16x4_t sum = vmul_lane_s16(s0, x_filter_0_3, 0);
- sum = vmla_lane_s16(sum, s1, x_filter_0_3, 1);
- sum = vmla_lane_s16(sum, s2, x_filter_0_3, 2);
- sum = vmla_lane_s16(sum, s3, x_filter_0_3, 3);
- sum = vmla_lane_s16(sum, s4, x_filter_4_7, 0);
- sum = vmla_lane_s16(sum, s5, x_filter_4_7, 1);
- sum = vmla_lane_s16(sum, s6, x_filter_4_7, 2);
- sum = vmla_lane_s16(sum, s7, x_filter_4_7, 3);
-
- // We halved the convolution filter values, so subtract 1 from the right
- // shift.
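- // vrsra_n_s16(a, b, n) == a + ((b + (1 << (n - 1))) >> n): rounding,
- // shifting and the round_offset add in a single instruction.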
- int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
- return vreinterpret_u16_s16(res);
-}
-
-static INLINE uint16x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1,
- const int16x8_t s2, const int16x8_t s3,
- const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7,
- const int16x8_t x_filter,
- const int16x8_t round_offset) {
- const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
- const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
-
- int16x8_t sum = vmulq_lane_s16(s0, x_filter_0_3, 0);
- sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1);
- sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2);
- sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3);
- sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0);
- sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1);
- sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2);
- sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3);
-
- // We halved the convolution filter values, so subtract 1 from the right
- // shift.
- int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
- return vreinterpretq_u16_s16(res);
-}
-
-static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon(
- const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
- int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
- ConvolveParams *conv_params) {
- assert(w % 4 == 0);
- assert(h % 4 == 0);
-
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
- const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
- const uint16_t fwd_offset = conv_params->fwd_offset;
- const uint16_t bck_offset = conv_params->bck_offset;
-
- // Horizontal filter.
- const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
- // Filter values are even, so downshift by 1 to reduce intermediate precision
- // requirements.
- const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
-
- const int horiz_offset = filter_params_x->taps / 2 - 1;
- const uint8_t *src_ptr = src - horiz_offset;
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- uint8_t *dst8_ptr = dst8;
- int dst_stride = conv_params->dst_stride;
- const uint8_t *s;
- uint8_t *d_u8;
- CONV_BUF_TYPE *d;
- int width;
- int height = h;
-
- uint8x8_t t0;
-#if AOM_ARCH_AARCH64
- uint8x8_t t1, t2, t3, t4, t5, t6, t7;
-#endif // AOM_ARCH_AARCH64
-
- if (w == 4 || h == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
- uint16x4_t d0, dd0;
- uint8x8_t d01;
-#if AOM_ARCH_AARCH64
- int16x4_t s9, s10;
- uint16x4_t d1, d2, d3, dd1, dd2, dd3;
- uint8x8_t d23;
-#endif // AOM_ARCH_AARCH64
-
- do {
- d = dst_ptr;
- d_u8 = dst8_ptr;
- width = w;
-
- __builtin_prefetch(src_ptr + 0 * src_stride);
-#if AOM_ARCH_AARCH64
- __builtin_prefetch(src_ptr + 1 * src_stride);
- __builtin_prefetch(src_ptr + 2 * src_stride);
- __builtin_prefetch(src_ptr + 3 * src_stride);
-
- load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
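- // Without the dot-product extensions the horizontal filter runs in the
- // transposed domain: after the transpose each register holds a column, so
- // the vertical-style multiply-accumulate kernel yields four outputs at
- // once; d0-d3 are transposed back after convolution.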
-
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
-
- s = src_ptr + 7;
-
- do {
- load_unaligned_u8_4x4(s, src_stride, &t0, &t1);
- transpose_u8_4x4(&t0, &t1);
-
- s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s9 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s10 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-
- d0 = convolve8_4_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- vget_low_s16(round_offset_vec));
- d1 = convolve8_4_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
- vget_low_s16(round_offset_vec));
- d2 = convolve8_4_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
- vget_low_s16(round_offset_vec));
- d3 = convolve8_4_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
- vget_low_s16(round_offset_vec));
-
- transpose_u16_4x4d(&d0, &d1, &d2, &d3);
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
-
- __builtin_prefetch(d_u8 + 0 * dst8_stride);
- __builtin_prefetch(d_u8 + 1 * dst8_stride);
- __builtin_prefetch(d_u8 + 2 * dst8_stride);
- __builtin_prefetch(d_u8 + 3 * dst8_stride);
-
- load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
- bck_offset, round_offset_vec, &d01, &d23);
-
- store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0);
- store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1);
- store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0);
- store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1);
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- s += 4;
- d += 4;
- d_u8 += 4;
- width -= 4;
- } while (width != 0);
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- dst8_ptr += 4 * dst8_stride;
- height -= 4;
-#else // !AOM_ARCH_AARCH64
- t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-
- __builtin_prefetch(d);
-
- s = src_ptr + 8;
-
- do {
- t0 = vld1_u8(s); // a8 a9 a10 a11
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-
- s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
- s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
- s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
- s5 = vext_s16(s4, s8, 1); // a5 a6 a7 a8
- s6 = vext_s16(s4, s8, 2); // a6 a7 a8 a9
- s7 = vext_s16(s4, s8, 3); // a7 a8 a9 a10
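- // vext extracts a shifted window from a pair of registers, so all eight
- // sample phases come from register moves rather than repeated loads.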
-
- d0 = convolve8_4_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- vget_low_s16(round_offset_vec));
-
- __builtin_prefetch(d);
- __builtin_prefetch(d_u8);
-
- dd0 = vld1_u16(d);
-
- compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
- vget_low_s16(round_offset_vec), &d01);
-
- store_u8_4x1(d_u8, d01, 0);
-
- s0 = s4;
- s4 = s8;
- s += 4;
- d += 4;
- d_u8 += 4;
- width -= 4;
- } while (width != 0);
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- dst8_ptr += dst8_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- } else {
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
- uint16x8_t d0, dd0;
- uint8x8_t d0_u8;
-
- do {
- d = dst_ptr;
- d_u8 = dst8_ptr;
- width = w;
-
-#if AOM_ARCH_AARCH64
- int16x8_t s9, s10, s11, s12, s13, s14;
- uint16x8_t d1, d2, d3, d4, d5, d6, d7, dd1, dd2, dd3, dd4, dd5, dd6, dd7;
- uint8x8_t d1_u8, d2_u8, d3_u8, d4_u8, d5_u8, d6_u8, d7_u8;
-
- __builtin_prefetch(src_ptr + 0 * src_stride);
- __builtin_prefetch(src_ptr + 1 * src_stride);
- __builtin_prefetch(src_ptr + 2 * src_stride);
- __builtin_prefetch(src_ptr + 3 * src_stride);
- __builtin_prefetch(src_ptr + 4 * src_stride);
- __builtin_prefetch(src_ptr + 5 * src_stride);
- __builtin_prefetch(src_ptr + 6 * src_stride);
- __builtin_prefetch(src_ptr + 7 * src_stride);
-
- load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- __builtin_prefetch(dst_ptr + 0 * dst_stride);
- __builtin_prefetch(dst_ptr + 1 * dst_stride);
- __builtin_prefetch(dst_ptr + 2 * dst_stride);
- __builtin_prefetch(dst_ptr + 3 * dst_stride);
- __builtin_prefetch(dst_ptr + 4 * dst_stride);
- __builtin_prefetch(dst_ptr + 5 * dst_stride);
- __builtin_prefetch(dst_ptr + 6 * dst_stride);
- __builtin_prefetch(dst_ptr + 7 * dst_stride);
-
- s = src_ptr + 7;
-
- do {
- load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
- d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- round_offset_vec);
- d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
- round_offset_vec);
- d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
- round_offset_vec);
- d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
- round_offset_vec);
- d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
- round_offset_vec);
- d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
- round_offset_vec);
- d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
- round_offset_vec);
- d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
- round_offset_vec);
-
- transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
-
- load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
- bck_offset, round_offset_vec, &d0_u8, &d1_u8,
- &d2_u8, &d3_u8);
-
- store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-
- load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
-
- compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
- bck_offset, round_offset_vec, &d4_u8, &d5_u8,
- &d6_u8, &d7_u8);
-
- store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8,
- d7_u8);
-
- s0 = s8;
- s1 = s9;
- s2 = s10;
- s3 = s11;
- s4 = s12;
- s5 = s13;
- s6 = s14;
- s += 8;
- d += 8;
- d_u8 += 8;
- width -= 8;
- } while (width != 0);
- src_ptr += 8 * src_stride;
- dst_ptr += 8 * dst_stride;
- dst8_ptr += 8 * dst8_stride;
- height -= 8;
-#else // !AOM_ARCH_AARCH64
- __builtin_prefetch(src_ptr);
-
- t0 = vld1_u8(src_ptr);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
-
- __builtin_prefetch(dst_ptr);
-
- s = src_ptr + 8;
-
- do {
- t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
- s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
-
- s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
- s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
- s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
- s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
- s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
- s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
- s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
-
- d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- round_offset_vec);
-
- dd0 = vld1q_u16(d);
-
- compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
- round_offset_vec, &d0_u8);
-
- vst1_u8(d_u8, d0_u8);
-
- s0 = s8;
- s += 8;
- d += 8;
- d_u8 += 8;
- width -= 8;
- } while (width != 0);
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- dst8_ptr += dst8_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- }
-}
-
-static INLINE void dist_wtd_convolve_x_avg_neon(
- const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
- int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
- ConvolveParams *conv_params) {
- assert(w % 4 == 0);
- assert(h % 4 == 0);
-
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
- const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
- // Horizontal filter.
- const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
- // Filter values are even, so downshift by 1 to reduce intermediate precision
- // requirements.
- const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
-
- const int horiz_offset = filter_params_x->taps / 2 - 1;
- const uint8_t *src_ptr = src - horiz_offset;
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- uint8_t *dst8_ptr = dst8;
- int dst_stride = conv_params->dst_stride;
- const uint8_t *s;
- uint8_t *d_u8;
- CONV_BUF_TYPE *d;
- int width;
- int height = h;
-
- uint8x8_t t0;
-#if AOM_ARCH_AARCH64
- uint8x8_t t1, t2, t3, t4, t5, t6, t7;
-#endif // AOM_ARCH_AARCH64
-
- if (w == 4 || h == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
- uint16x4_t d0, dd0;
- uint8x8_t d01;
-#if AOM_ARCH_AARCH64
- int16x4_t s9, s10;
- uint16x4_t d1, d2, d3, dd1, dd2, dd3;
- uint8x8_t d23;
-#endif // AOM_ARCH_AARCH64
-
- do {
- d = dst_ptr;
- d_u8 = dst8_ptr;
- width = w;
-
- __builtin_prefetch(src_ptr + 0 * src_stride);
-#if AOM_ARCH_AARCH64
- __builtin_prefetch(src_ptr + 1 * src_stride);
- __builtin_prefetch(src_ptr + 2 * src_stride);
- __builtin_prefetch(src_ptr + 3 * src_stride);
-
- load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
-
- s = src_ptr + 7;
-
- do {
- load_unaligned_u8_4x4(s, src_stride, &t0, &t1);
- transpose_u8_4x4(&t0, &t1);
-
- s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s9 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s10 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-
- d0 = convolve8_4_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- vget_low_s16(round_offset_vec));
- d1 = convolve8_4_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
- vget_low_s16(round_offset_vec));
- d2 = convolve8_4_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
- vget_low_s16(round_offset_vec));
- d3 = convolve8_4_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
- vget_low_s16(round_offset_vec));
-
- transpose_u16_4x4d(&d0, &d1, &d2, &d3);
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
-
- __builtin_prefetch(d_u8 + 0 * dst8_stride);
- __builtin_prefetch(d_u8 + 1 * dst8_stride);
- __builtin_prefetch(d_u8 + 2 * dst8_stride);
- __builtin_prefetch(d_u8 + 3 * dst8_stride);
-
- load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
- round_offset_vec, &d01, &d23);
-
- store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0);
- store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1);
- store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0);
- store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1);
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- s += 4;
- d += 4;
- d_u8 += 4;
- width -= 4;
- } while (width != 0);
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- dst8_ptr += 4 * dst8_stride;
- height -= 4;
-#else // !AOM_ARCH_AARCH64
- t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-
- __builtin_prefetch(d);
-
- s = src_ptr + 8;
-
- do {
- t0 = vld1_u8(s); // a8 a9 a10 a11
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-
- s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
- s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
- s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
- s5 = vext_s16(s4, s8, 1); // a5 a6 a7 a8
- s6 = vext_s16(s4, s8, 2); // a6 a7 a8 a9
- s7 = vext_s16(s4, s8, 3); // a7 a8 a9 a10
-
- d0 = convolve8_4_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- vget_low_s16(round_offset_vec));
-
- __builtin_prefetch(d);
- __builtin_prefetch(d_u8);
-
- dd0 = vld1_u16(d);
-
- compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
-
- store_u8_4x1(d_u8, d01, 0);
-
- s0 = s4;
- s4 = s8;
- s += 4;
- d += 4;
- d_u8 += 4;
- width -= 4;
- } while (width != 0);
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- dst8_ptr += dst8_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- } else {
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
- uint16x8_t d0, dd0;
- uint8x8_t d0_u8;
-
- do {
- d = dst_ptr;
- d_u8 = dst8_ptr;
- width = w;
-
-#if AOM_ARCH_AARCH64
- int16x8_t s9, s10, s11, s12, s13, s14;
- uint16x8_t d1, d2, d3, d4, d5, d6, d7, dd1, dd2, dd3, dd4, dd5, dd6, dd7;
- uint8x8_t d1_u8, d2_u8, d3_u8, d4_u8, d5_u8, d6_u8, d7_u8;
-
- __builtin_prefetch(src_ptr + 0 * src_stride);
- __builtin_prefetch(src_ptr + 1 * src_stride);
- __builtin_prefetch(src_ptr + 2 * src_stride);
- __builtin_prefetch(src_ptr + 3 * src_stride);
- __builtin_prefetch(src_ptr + 4 * src_stride);
- __builtin_prefetch(src_ptr + 5 * src_stride);
- __builtin_prefetch(src_ptr + 6 * src_stride);
- __builtin_prefetch(src_ptr + 7 * src_stride);
-
- load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- __builtin_prefetch(dst_ptr + 0 * dst_stride);
- __builtin_prefetch(dst_ptr + 1 * dst_stride);
- __builtin_prefetch(dst_ptr + 2 * dst_stride);
- __builtin_prefetch(dst_ptr + 3 * dst_stride);
- __builtin_prefetch(dst_ptr + 4 * dst_stride);
- __builtin_prefetch(dst_ptr + 5 * dst_stride);
- __builtin_prefetch(dst_ptr + 6 * dst_stride);
- __builtin_prefetch(dst_ptr + 7 * dst_stride);
-
- s = src_ptr + 7;
-
- do {
- load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
- d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- round_offset_vec);
- d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
- round_offset_vec);
- d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
- round_offset_vec);
- d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
- round_offset_vec);
- d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
- round_offset_vec);
- d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
- round_offset_vec);
- d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
- round_offset_vec);
- d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
- round_offset_vec);
-
- transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
-
- load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
- round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
-
- store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-
- load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
-
- compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
- round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
-
- store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8,
- d7_u8);
-
- s0 = s8;
- s1 = s9;
- s2 = s10;
- s3 = s11;
- s4 = s12;
- s5 = s13;
- s6 = s14;
- s += 8;
- d += 8;
- d_u8 += 8;
- width -= 8;
- } while (width != 0);
- src_ptr += 8 * src_stride;
- dst_ptr += 8 * dst_stride;
- dst8_ptr += 8 * dst8_stride;
- height -= 8;
-#else // !AOM_ARCH_AARCH64
- __builtin_prefetch(src_ptr);
-
- t0 = vld1_u8(src_ptr);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
-
- __builtin_prefetch(dst_ptr);
-
- s = src_ptr + 8;
-
- do {
- t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
- s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
-
- s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
- s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
- s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
- s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
- s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
- s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
- s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
-
- d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- round_offset_vec);
-
- dd0 = vld1q_u16(d);
-
- compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
-
- vst1_u8(d_u8, d0_u8);
-
- s0 = s8;
- s += 8;
- d += 8;
- d_u8 += 8;
- width -= 8;
- } while (width != 0);
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- dst8_ptr += dst8_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- }
-}
-
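For reference, the compound rounding offset computed at the top of each of these kernels is a compile-time constant. A minimal scalar sketch of the arithmetic, assuming the standard AV1 values bd = 8, FILTER_BITS = 7, ROUND0_BITS = 3 and COMPOUND_ROUND1_BITS = 7:

    #include <assert.h>

    // offset_bits = 8 + 2 * 7 - 3 = 19, so the offset added to every
    // intermediate result is (1 << 12) + (1 << 11) = 6144 -- the value
    // broadcast into round_offset_vec by each kernel in this file.
    void check_round_offset(void) {
      const int bd = 8, filter_bits = 7, round0_bits = 3;
      const int compound_round1_bits = 7;
      const int offset_bits = bd + 2 * filter_bits - round0_bits;
      const int round_offset = (1 << (offset_bits - compound_round1_bits)) +
                               (1 << (offset_bits - compound_round1_bits - 1));
      assert(offset_bits == 19 && round_offset == 6144);
    }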
-static INLINE void dist_wtd_convolve_x_neon(
- const uint8_t *src, int src_stride, int w, int h,
- const InterpFilterParams *filter_params_x, const int subpel_x_qn,
- ConvolveParams *conv_params) {
- assert(w % 4 == 0);
- assert(h % 4 == 0);
-
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
- const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
- // Horizontal filter.
- const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
- // Filter values are even, so downshift by 1 to reduce intermediate precision
- // requirements.
- const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
-
- const int horiz_offset = filter_params_x->taps / 2 - 1;
- const uint8_t *src_ptr = src - horiz_offset;
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- const uint8_t *s;
- CONV_BUF_TYPE *d;
- int width;
- int height = h;
-
- uint8x8_t t0;
-#if AOM_ARCH_AARCH64
- uint8x8_t t1, t2, t3, t4, t5, t6, t7;
-#endif // AOM_ARCH_AARCH64
-
- if (w == 4 || h == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
- uint16x4_t d0;
-#if AOM_ARCH_AARCH64
- int16x4_t s9, s10;
- uint16x4_t d1, d2, d3;
-#endif // AOM_ARCH_AARCH64
-
- do {
- d = dst_ptr;
- width = w;
-
- __builtin_prefetch(src_ptr + 0 * src_stride);
-#if AOM_ARCH_AARCH64
- __builtin_prefetch(src_ptr + 1 * src_stride);
- __builtin_prefetch(src_ptr + 2 * src_stride);
- __builtin_prefetch(src_ptr + 3 * src_stride);
-
- load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
-
- s = src_ptr + 7;
-
- do {
- load_unaligned_u8_4x4(s, src_stride, &t0, &t1);
- transpose_u8_4x4(&t0, &t1);
-
- s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s9 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s10 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-
- d0 = convolve8_4_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- vget_low_s16(round_offset_vec));
- d1 = convolve8_4_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
- vget_low_s16(round_offset_vec));
- d2 = convolve8_4_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
- vget_low_s16(round_offset_vec));
- d3 = convolve8_4_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
- vget_low_s16(round_offset_vec));
-
- transpose_u16_4x4d(&d0, &d1, &d2, &d3);
-
- store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- s += 4;
- d += 4;
- width -= 4;
- } while (width != 0);
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- height -= 4;
-#else // !AOM_ARCH_AARCH64
- t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-
- __builtin_prefetch(d);
-
- s = src_ptr + 8;
-
- do {
- t0 = vld1_u8(s); // a8 a9 a10 a11 (a12-a15 loaded but unused)
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-
- s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
- s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
- s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
- s5 = vext_s16(s4, s8, 1); // a5 a6 a7 a8
- s6 = vext_s16(s4, s8, 2); // a6 a7 a8 a9
- s7 = vext_s16(s4, s8, 3); // a7 a8 a9 a10
-
- d0 = convolve8_4_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- vget_low_s16(round_offset_vec));
-
- vst1_u16(d, d0);
-
- s0 = s4;
- s4 = s8;
- s += 4;
- d += 4;
- width -= 4;
- } while (width != 0);
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- } else {
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
- uint16x8_t d0;
-
- do {
- d = dst_ptr;
- width = w;
-
-#if AOM_ARCH_AARCH64
- int16x8_t s9, s10, s11, s12, s13, s14;
- uint16x8_t d1, d2, d3, d4, d5, d6, d7;
-
- __builtin_prefetch(src_ptr + 0 * src_stride);
- __builtin_prefetch(src_ptr + 1 * src_stride);
- __builtin_prefetch(src_ptr + 2 * src_stride);
- __builtin_prefetch(src_ptr + 3 * src_stride);
- __builtin_prefetch(src_ptr + 4 * src_stride);
- __builtin_prefetch(src_ptr + 5 * src_stride);
- __builtin_prefetch(src_ptr + 6 * src_stride);
- __builtin_prefetch(src_ptr + 7 * src_stride);
-
- load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- __builtin_prefetch(dst_ptr + 0 * dst_stride);
- __builtin_prefetch(dst_ptr + 1 * dst_stride);
- __builtin_prefetch(dst_ptr + 2 * dst_stride);
- __builtin_prefetch(dst_ptr + 3 * dst_stride);
- __builtin_prefetch(dst_ptr + 4 * dst_stride);
- __builtin_prefetch(dst_ptr + 5 * dst_stride);
- __builtin_prefetch(dst_ptr + 6 * dst_stride);
- __builtin_prefetch(dst_ptr + 7 * dst_stride);
-
- s = src_ptr + 7;
-
- do {
- load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
- d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- round_offset_vec);
- d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
- round_offset_vec);
- d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
- round_offset_vec);
- d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
- round_offset_vec);
- d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
- round_offset_vec);
- d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
- round_offset_vec);
- d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
- round_offset_vec);
- d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
- round_offset_vec);
-
- transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
-
- store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
-
- s0 = s8;
- s1 = s9;
- s2 = s10;
- s3 = s11;
- s4 = s12;
- s5 = s13;
- s6 = s14;
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- src_ptr += 8 * src_stride;
- dst_ptr += 8 * dst_stride;
- height -= 8;
-#else // !AOM_ARCH_AARCH64
- __builtin_prefetch(src_ptr);
-
- t0 = vld1_u8(src_ptr);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
-
- __builtin_prefetch(dst_ptr);
-
- s = src_ptr + 8;
-
- do {
- t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
- s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
-
- s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
- s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
- s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
- s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
- s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
- s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
- s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
-
- d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- round_offset_vec);
-
- vst1q_u16(d, d0);
-
- s0 = s8;
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- }
-}
-
-#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
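The kernels above rely on the trick noted in dist_wtd_convolve_x_neon: every AV1 8-tap filter coefficient is even, so the taps can be halved and the rounding shift reduced by one bit without changing the result, which keeps the 16-bit intermediates from overflowing. A scalar sketch of the identity, assuming only that the shift amount is at least 2:

    #include <assert.h>

    // Rounding-shifting twice the sum by n bits equals rounding-shifting the
    // sum itself by n - 1 bits: (2s + 2^(n-1)) >> n == (s + 2^(n-2)) >> (n-1).
    void halved_filter_identity(int sum, int n) {
      const int full = (2 * sum + (1 << (n - 1))) >> n;
      const int half = (sum + (1 << (n - 2))) >> (n - 1);
      assert(full == half);
    }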
-void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
- uint8_t *dst8, int dst8_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const int subpel_x_qn,
- ConvolveParams *conv_params) {
- if (conv_params->do_average) {
- if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
- dist_wtd_convolve_x_dist_wtd_avg_neon(src, src_stride, dst8, dst8_stride,
- w, h, filter_params_x, subpel_x_qn,
- conv_params);
- } else {
- dist_wtd_convolve_x_avg_neon(src, src_stride, dst8, dst8_stride, w, h,
- filter_params_x, subpel_x_qn, conv_params);
- }
- } else {
- dist_wtd_convolve_x_neon(src, src_stride, w, h, filter_params_x,
- subpel_x_qn, conv_params);
- }
-}
-
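av1_dist_wtd_convolve_x_neon above makes the compound-mode selection explicit: a first pass stores raw 16-bit convolution results into conv_params->dst, while the two averaging paths read that buffer back and emit 8-bit pixels. A condensed sketch of the same selection logic (the names here are illustrative, not from the library):

    typedef enum { STORE_ONLY, BASIC_AVG, DIST_WTD_AVG } compound_mode_t;

    // Mirrors the branch structure of the dispatcher above.
    compound_mode_t select_mode(int do_average, int use_dist_wtd_comp_avg) {
      if (!do_average) return STORE_ONLY;  // first prediction: store only
      return use_dist_wtd_comp_avg ? DIST_WTD_AVG : BASIC_AVG;
    }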
-static INLINE uint16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1,
- const int16x4_t s2, const int16x4_t s3,
- const int16x4_t s4, const int16x4_t s5,
- const int16x8_t y_filter,
- const int16x4_t round_offset) {
- const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
- const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
-
- // Filter values at indices 0 and 7 are 0.
- int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 1);
- sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2);
- sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3);
- sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0);
- sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1);
- sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2);
-
- // The convolution filter values were halved, so reduce the right shift by 1.
- int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
- return vreinterpret_u16_s16(res);
-}
-
-static INLINE uint16x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1,
- const int16x8_t s2, const int16x8_t s3,
- const int16x8_t s4, const int16x8_t s5,
- const int16x8_t y_filter,
- const int16x8_t round_offset) {
- const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
- const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
-
- // Filter values at indices 0 and 7 are 0.
- int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 1);
- sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 2);
- sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 3);
- sum = vmlaq_lane_s16(sum, s3, y_filter_4_7, 0);
- sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 1);
- sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 2);
-
- // The convolution filter values were halved, so reduce the right shift by 1.
- int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
- return vreinterpretq_u16_s16(res);
-}
-
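convolve6_4_y and convolve6_8_y implement a 6-tap special case of the 8-tap vertical filter: as the comments note, taps 0 and 7 are zero for these filters, so only taps 1 through 6 are multiplied in. A scalar sketch of the same computation, assuming the halved-filter convention used throughout this file:

    #include <stdint.h>

    // s points at 6 consecutive rows of one column; y_filter holds the 8
    // halved taps, of which y_filter[0] and y_filter[7] are zero.
    int16_t convolve6_scalar(const int16_t *s, const int16_t *y_filter,
                             int16_t round_offset, int round0_bits) {
      int32_t sum = 0;
      for (int k = 0; k < 6; ++k) sum += s[k] * y_filter[k + 1];
      // Rounding shift by (round0_bits - 1) -- one bit less because the taps
      // were halved -- accumulated onto the offset, as vrsra_n_s16 does.
      return (int16_t)(round_offset +
                       ((sum + (1 << (round0_bits - 2))) >> (round0_bits - 1)));
    }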
-static INLINE void dist_wtd_convolve_y_6tap_dist_wtd_avg_neon(
- const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
- const int dst8_stride, int w, int h, const int16x8_t y_filter,
- ConvolveParams *conv_params) {
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
- const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
- const uint16_t fwd_offset = conv_params->fwd_offset;
- const uint16_t bck_offset = conv_params->bck_offset;
-
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
- int width = w;
-
- if (w == 4 || h == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5;
- uint16x4_t d0, dd0;
- uint8x8_t t0, t1, t2, t3, t4, d01;
-#if AOM_ARCH_AARCH64
- int16x4_t s6, s7, s8;
- uint16x4_t d1, d2, d3, dd1, dd2, dd3;
- uint8x8_t d23;
-#endif // AOM_ARCH_AARCH64
-
- do {
- const uint8_t *s = src_ptr;
- CONV_BUF_TYPE *d = dst_ptr;
- uint8_t *d_u8 = dst8_ptr;
- int height = h;
-
- t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
- t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
- t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
- t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
- t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
-
- s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
- s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
-
- s += 5 * src_stride;
-
- do {
-#if AOM_ARCH_AARCH64
- t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
- t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
- t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
- t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
-
- s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
- d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
- vget_low_s16(round_offset_vec));
- d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
- vget_low_s16(round_offset_vec));
- d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
- vget_low_s16(round_offset_vec));
- d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
- vget_low_s16(round_offset_vec));
-
- load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
- bck_offset, round_offset_vec, &d01, &d23);
-
- store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0);
- store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1);
- store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0);
- store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1);
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- d_u8 += 4 * dst8_stride;
- height -= 4;
-#else // !AOM_ARCH_AARCH64
- t0 = load_unaligned_u8_4x1(s);
- s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-
- d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
- vget_low_s16(round_offset_vec));
-
- dd0 = vld1_u16(d);
-
- compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
- vget_low_s16(round_offset_vec), &d01);
-
- store_u8_4x1(d_u8, d01, 0);
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s += src_stride;
- d += dst_stride;
- d_u8 += dst8_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- src_ptr += 4;
- dst_ptr += 4;
- dst8_ptr += 4;
- width -= 4;
- } while (width != 0);
- } else {
- int16x8_t s0, s1, s2, s3, s4, s5;
- uint16x8_t d0, dd0;
- uint8x8_t d0_u8, t0, t1, t2, t3, t4;
-#if AOM_ARCH_AARCH64
- int16x8_t s6, s7, s8, s9, s10, s11, s12;
- uint16x8_t d1, d2, d3, d4, d5, d6, d7, dd1, dd2, dd3, dd4, dd5, dd6, dd7;
- uint8x8_t d1_u8, d2_u8, d3_u8, d4_u8, d5_u8, d6_u8, d7_u8, t5, t6, t7;
-#endif // AOM_ARCH_AARCH64
-
- do {
- const uint8_t *s = src_ptr + (5 * src_stride);
- CONV_BUF_TYPE *d = dst_ptr;
- uint8_t *d_u8 = dst8_ptr;
- int height = h;
-
- load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
-
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-
- do {
-#if AOM_ARCH_AARCH64
- load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
- s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
- d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
- d1 = convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
- d2 = convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
- d3 = convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
- d4 = convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
- d5 = convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
- d6 =
- convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
- d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
- round_offset_vec);
-
- load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
- bck_offset, round_offset_vec, &d0_u8, &d1_u8,
- &d2_u8, &d3_u8);
-
- store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
- d_u8 += 4 * dst8_stride;
-
- load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
-
- compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
- bck_offset, round_offset_vec, &d4_u8, &d5_u8,
- &d6_u8, &d7_u8);
-
- store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
- d_u8 += 4 * dst8_stride;
-
- s0 = s8;
- s1 = s9;
- s2 = s10;
- s3 = s11;
- s4 = s12;
- s += 8 * src_stride;
- d += 8 * dst_stride;
- height -= 8;
-#else // !AOM_ARCH_AARCH64
- s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-
- d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
-
- dd0 = vld1q_u16(d);
-
- compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
- round_offset_vec, &d0_u8);
-
- vst1_u8(d_u8, d0_u8);
- d_u8 += dst8_stride;
-
- s += src_stride;
- d += dst_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- src_ptr += 8;
- dst_ptr += 8;
- dst8_ptr += 8;
- width -= 8;
- } while (width != 0);
- }
-}
-
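compute_dist_wtd_avg_4x4 and friends combine the stored first prediction with the new result using the forward/backward weights. A hypothetical scalar model, patterned on the generic C reference for dist-weighted compounds (in AV1 the weight pairs sum to 1 << DIST_PRECISION_BITS = 16, and the final shift is 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS = 4):

    #include <stdint.h>

    // dd: first prediction read back from conv_params->dst; d: new result.
    uint8_t dist_wtd_avg_scalar(uint16_t dd, uint16_t d, int fwd_offset,
                                int bck_offset, int round_offset,
                                int round_bits) {
      int32_t avg = (dd * fwd_offset + d * bck_offset) >> 4;  // DIST_PRECISION_BITS
      avg -= round_offset;  // remove the compound offset added by the kernel
      const int32_t res = (avg + (1 << (round_bits - 1))) >> round_bits;
      return (uint8_t)(res < 0 ? 0 : (res > 255 ? 255 : res));  // clip to pixel
    }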
-static INLINE void dist_wtd_convolve_y_6tap_avg_neon(
- const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
- const int dst8_stride, int w, int h, const int16x8_t y_filter,
- ConvolveParams *conv_params) {
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
- const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
- int width = w;
-
- if (w == 4 || h == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5;
- uint16x4_t d0, dd0;
- uint8x8_t t0, t1, t2, t3, t4, d01;
-#if AOM_ARCH_AARCH64
- int16x4_t s6, s7, s8;
- uint16x4_t d1, d2, d3, dd1, dd2, dd3;
- uint8x8_t d23;
-#endif // AOM_ARCH_AARCH64
-
- do {
- const uint8_t *s = src_ptr;
- CONV_BUF_TYPE *d = dst_ptr;
- uint8_t *d_u8 = dst8_ptr;
- int height = h;
-
- t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
- t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
- t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
- t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
- t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
-
- s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
- s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
-
- s += 5 * src_stride;
-
- do {
-#if AOM_ARCH_AARCH64
- t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
- t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
- t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
- t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
-
- s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
- d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
- vget_low_s16(round_offset_vec));
- d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
- vget_low_s16(round_offset_vec));
- d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
- vget_low_s16(round_offset_vec));
- d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
- vget_low_s16(round_offset_vec));
-
- load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
- round_offset_vec, &d01, &d23);
-
- store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0);
- store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1);
- store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0);
- store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1);
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- d_u8 += 4 * dst8_stride;
- height -= 4;
-#else // !AOM_ARCH_AARCH64
- t0 = load_unaligned_u8_4x1(s);
- s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-
- d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
- vget_low_s16(round_offset_vec));
-
- dd0 = vld1_u16(d);
-
- compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
-
- store_u8_4x1(d_u8, d01, 0);
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s += src_stride;
- d += dst_stride;
- d_u8 += dst8_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- src_ptr += 4;
- dst_ptr += 4;
- dst8_ptr += 4;
- width -= 4;
- } while (width != 0);
- } else {
- int16x8_t s0, s1, s2, s3, s4, s5;
- uint16x8_t d0, dd0;
- uint8x8_t d0_u8, t0, t1, t2, t3, t4;
-#if AOM_ARCH_AARCH64
- int16x8_t s6, s7, s8, s9, s10, s11, s12;
- uint16x8_t d1, d2, d3, d4, d5, d6, d7, dd1, dd2, dd3, dd4, dd5, dd6, dd7;
- uint8x8_t d1_u8, d2_u8, d3_u8, d4_u8, d5_u8, d6_u8, d7_u8, t5, t6, t7;
-#endif // AOM_ARCH_AARCH64
-
- do {
- const uint8_t *s = src_ptr + (5 * src_stride);
- CONV_BUF_TYPE *d = dst_ptr;
- uint8_t *d_u8 = dst8_ptr;
- int height = h;
-
- load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
-
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-
- do {
-#if AOM_ARCH_AARCH64
- load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
- s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
- d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
- d1 = convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
- d2 = convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
- d3 = convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
- d4 = convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
- d5 = convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
- d6 =
- convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
- d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
- round_offset_vec);
-
- load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
- round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
-
- store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
- d_u8 += 4 * dst8_stride;
-
- load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
-
- compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
- round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
-
- store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
- d_u8 += 4 * dst8_stride;
-
- s0 = s8;
- s1 = s9;
- s2 = s10;
- s3 = s11;
- s4 = s12;
- s += 8 * src_stride;
- d += 8 * dst_stride;
- height -= 8;
-#else // !AOM_ARCH_AARCH64
- s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-
- d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
-
- dd0 = vld1q_u16(d);
-
- compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
-
- vst1_u8(d_u8, d0_u8);
- d_u8 += dst8_stride;
-
- s += src_stride;
- d += dst_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- src_ptr += 8;
- dst_ptr += 8;
- dst8_ptr += 8;
- width -= 8;
- } while (width != 0);
- }
-}
-
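compute_basic_avg_* is the unweighted counterpart: a plain mean of the two predictions followed by the same offset removal, rounding and clipping. A minimal scalar sketch under the same assumptions as the dist-weighted model above:

    #include <stdint.h>

    uint8_t basic_avg_scalar(uint16_t dd, uint16_t d, int round_offset,
                             int round_bits) {
      int32_t avg = (dd + d) >> 1;  // unweighted average of both predictions
      avg -= round_offset;
      const int32_t res = (avg + (1 << (round_bits - 1))) >> round_bits;
      return (uint8_t)(res < 0 ? 0 : (res > 255 ? 255 : res));
    }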
-static INLINE void dist_wtd_convolve_y_6tap_neon(const uint8_t *src_ptr,
- int src_stride, int w, int h,
- const int16x8_t y_filter,
- ConvolveParams *conv_params) {
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
- const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
- int width = w;
-
- if (w == 4 || h == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5;
- uint16x4_t d0;
- uint8x8_t t0, t1, t2, t3, t4;
-#if AOM_ARCH_AARCH64
- int16x4_t s6, s7, s8;
- uint16x4_t d1, d2, d3;
-#endif // AOM_ARCH_AARCH64
-
- do {
- const uint8_t *s = src_ptr;
- CONV_BUF_TYPE *d = dst_ptr;
- int height = h;
-
- t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
- t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
- t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
- t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
- t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
-
- s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
- s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
-
- s += 5 * src_stride;
-
- do {
-#if AOM_ARCH_AARCH64
- t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
- t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
- t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
- t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
-
- s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
- d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
- vget_low_s16(round_offset_vec));
- d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
- vget_low_s16(round_offset_vec));
- d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
- vget_low_s16(round_offset_vec));
- d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
- vget_low_s16(round_offset_vec));
-
- store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
-#else // !AOM_ARCH_AARCH64
- t0 = load_unaligned_u8_4x1(s);
- s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-
- d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
- vget_low_s16(round_offset_vec));
-
- vst1_u16(d, d0);
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s += src_stride;
- d += dst_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- src_ptr += 4;
- dst_ptr += 4;
- width -= 4;
- } while (width != 0);
- } else {
- int16x8_t s0, s1, s2, s3, s4, s5;
- uint16x8_t d0;
- uint8x8_t t0, t1, t2, t3, t4;
-#if AOM_ARCH_AARCH64
- int16x8_t s6, s7, s8, s9, s10, s11, s12;
- uint16x8_t d1, d2, d3, d4, d5, d6, d7;
- uint8x8_t t5, t6, t7;
-#endif // AOM_ARCH_AARCH64
-
- do {
- const uint8_t *s = src_ptr + (5 * src_stride);
- CONV_BUF_TYPE *d = dst_ptr;
- int height = h;
-
- load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
-
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-
- do {
-#if AOM_ARCH_AARCH64
- load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
- s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
- d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
- d1 = convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
- d2 = convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
- d3 = convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
- d4 = convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
- d5 = convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
- d6 =
- convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
- d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
- round_offset_vec);
-
- store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
-
- s0 = s8;
- s1 = s9;
- s2 = s10;
- s3 = s11;
- s4 = s12;
- s += 8 * src_stride;
- d += 8 * dst_stride;
- height -= 8;
-#else // !AOM_ARCH_AARCH64
- s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-
- d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
-
- vst1q_u16(d, d0);
-
- s += src_stride;
- d += dst_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- src_ptr += 8;
- dst_ptr += 8;
- width -= 8;
- } while (width != 0);
- }
-}
-
-static INLINE uint16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1,
- const int16x4_t s2, const int16x4_t s3,
- const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7,
- const int16x8_t y_filter,
- const int16x4_t round_offset) {
- const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
- const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
-
- int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 0);
- sum = vmla_lane_s16(sum, s1, y_filter_0_3, 1);
- sum = vmla_lane_s16(sum, s2, y_filter_0_3, 2);
- sum = vmla_lane_s16(sum, s3, y_filter_0_3, 3);
- sum = vmla_lane_s16(sum, s4, y_filter_4_7, 0);
- sum = vmla_lane_s16(sum, s5, y_filter_4_7, 1);
- sum = vmla_lane_s16(sum, s6, y_filter_4_7, 2);
- sum = vmla_lane_s16(sum, s7, y_filter_4_7, 3);
-
- // The convolution filter values were halved, so reduce the right shift by 1.
- int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
- return vreinterpret_u16_s16(res);
-}
-
-static INLINE uint16x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1,
- const int16x8_t s2, const int16x8_t s3,
- const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7,
- const int16x8_t y_filter,
- const int16x8_t round_offset) {
- const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
- const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
-
- int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 0);
- sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 1);
- sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 2);
- sum = vmlaq_lane_s16(sum, s3, y_filter_0_3, 3);
- sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 0);
- sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 1);
- sum = vmlaq_lane_s16(sum, s6, y_filter_4_7, 2);
- sum = vmlaq_lane_s16(sum, s7, y_filter_4_7, 3);
-
- // The convolution filter values were halved, so reduce the right shift by 1.
- int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
- return vreinterpretq_u16_s16(res);
-}
-
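Both 8-tap helpers above fold the rounding shift and the compound offset into a single vrsra instruction. A scalar sketch of that final step (the result wraps modulo 2^16, as the non-saturating NEON op does):

    #include <stdint.h>

    // Per-lane equivalent of vrsra_n_s16(acc, sum, n): shift sum right by n
    // with round-to-nearest, then accumulate onto acc.
    int16_t rsra_scalar(int16_t acc, int16_t sum, int n) {
      return (int16_t)(acc + ((sum + (1 << (n - 1))) >> n));
    }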
-static INLINE void dist_wtd_convolve_y_8tap_dist_wtd_avg_neon(
- const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
- const int dst8_stride, int w, int h, const int16x8_t y_filter,
- ConvolveParams *conv_params) {
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
- const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
- const uint16_t fwd_offset = conv_params->fwd_offset;
- const uint16_t bck_offset = conv_params->bck_offset;
-
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
- int width = w;
-
- if (w == 4 || h == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint16x4_t d0, dd0;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01;
-#if AOM_ARCH_AARCH64
- int16x4_t s8, s9, s10;
- uint16x4_t d1, d2, d3, dd1, dd2, dd3;
- uint8x8_t d23;
-#endif // AOM_ARCH_AARCH64
-
- do {
- const uint8_t *s = src_ptr;
- CONV_BUF_TYPE *d = dst_ptr;
- uint8_t *d_u8 = dst8_ptr;
- int height = h;
-
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
-
- t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
- t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
- t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
- t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
- t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
- t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
- t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
-
- s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
- s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
- s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
- s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
-
- s += 7 * src_stride;
-
- do {
-#if AOM_ARCH_AARCH64
- t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
- t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
- t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
- t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
-
- s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
- d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- vget_low_s16(round_offset_vec));
- d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
- vget_low_s16(round_offset_vec));
- d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
- vget_low_s16(round_offset_vec));
- d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
- vget_low_s16(round_offset_vec));
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
-
- __builtin_prefetch(d_u8 + 0 * dst8_stride);
- __builtin_prefetch(d_u8 + 1 * dst8_stride);
- __builtin_prefetch(d_u8 + 2 * dst8_stride);
- __builtin_prefetch(d_u8 + 3 * dst8_stride);
-
- load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
- bck_offset, round_offset_vec, &d01, &d23);
-
- store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0);
- store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1);
- store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0);
- store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1);
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- d_u8 += 4 * dst8_stride;
- height -= 4;
-#else // !AOM_ARCH_AARCH64
- t0 = load_unaligned_u8_4x1(s);
- s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-
- d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- vget_low_s16(round_offset_vec));
-
- __builtin_prefetch(d);
-
- dd0 = vld1_u16(d);
-
- compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
- vget_low_s16(round_offset_vec), &d01);
-
- store_u8_4x1(d_u8, d01, 0);
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s5 = s6;
- s6 = s7;
- s += src_stride;
- d += dst_stride;
- d_u8 += dst8_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- src_ptr += 4;
- dst_ptr += 4;
- dst8_ptr += 4;
- width -= 4;
- } while (width != 0);
- } else {
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint16x8_t d0, dd0;
- uint8x8_t d0_u8, t0, t1, t2, t3, t4, t5, t6;
-#if AOM_ARCH_AARCH64
- int16x8_t s8, s9, s10, s11, s12, s13, s14;
- uint16x8_t d1, d2, d3, d4, d5, d6, d7, dd1, dd2, dd3, dd4, dd5, dd6, dd7;
- uint8x8_t d1_u8, d2_u8, d3_u8, d4_u8, d5_u8, d6_u8, d7_u8, t7;
-#endif // AOM_ARCH_AARCH64
-
- do {
- const uint8_t *s = src_ptr;
- CONV_BUF_TYPE *d = dst_ptr;
- uint8_t *d_u8 = dst8_ptr;
- int height = h;
-
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
- __builtin_prefetch(s + 4 * src_stride);
- __builtin_prefetch(s + 5 * src_stride);
- __builtin_prefetch(s + 6 * src_stride);
- __builtin_prefetch(s + 7 * src_stride);
- load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
-
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- s += 7 * src_stride;
-
- do {
-#if AOM_ARCH_AARCH64
- load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
- __builtin_prefetch(dst_ptr + 0 * dst_stride);
- __builtin_prefetch(dst_ptr + 1 * dst_stride);
- __builtin_prefetch(dst_ptr + 2 * dst_stride);
- __builtin_prefetch(dst_ptr + 3 * dst_stride);
-
- d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- round_offset_vec);
- d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
- round_offset_vec);
- d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
- round_offset_vec);
- d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
- round_offset_vec);
- d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11, y_filter,
- round_offset_vec);
- d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12, y_filter,
- round_offset_vec);
- d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13, y_filter,
- round_offset_vec);
- d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14, y_filter,
- round_offset_vec);
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
-
- load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
- bck_offset, round_offset_vec, &d0_u8, &d1_u8,
- &d2_u8, &d3_u8);
-
- store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
- d_u8 += 4 * dst8_stride;
-
- load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
-
- compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
- bck_offset, round_offset_vec, &d4_u8, &d5_u8,
- &d6_u8, &d7_u8);
-
- store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
- d_u8 += 4 * dst8_stride;
-
- s0 = s8;
- s1 = s9;
- s2 = s10;
- s3 = s11;
- s4 = s12;
- s5 = s13;
- s6 = s14;
- s += 8 * src_stride;
- d += 8 * dst_stride;
- height -= 8;
-#else // !AOM_ARCH_AARCH64
- s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-
- __builtin_prefetch(dst_ptr);
-
- d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- round_offset_vec);
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s5 = s6;
- s6 = s7;
-
- __builtin_prefetch(d);
-
- dd0 = vld1q_u16(d);
-
- compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
- round_offset_vec, &d0_u8);
-
- vst1_u8(d_u8, d0_u8);
- d_u8 += dst8_stride;
-
- s += src_stride;
- d += dst_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- src_ptr += 8;
- dst_ptr += 8;
- dst8_ptr += 8;
- width -= 8;
- } while (width != 0);
- }
-}
-
-static INLINE void dist_wtd_convolve_y_8tap_avg_neon(
- const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
- const int dst8_stride, int w, int h, const int16x8_t y_filter,
- ConvolveParams *conv_params) {
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
- const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
- int width = w;
-
- if (w == 4 || h == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint16x4_t d0, dd0;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01;
-#if AOM_ARCH_AARCH64
- int16x4_t s8, s9, s10;
- uint16x4_t d1, d2, d3, dd1, dd2, dd3;
- uint8x8_t d23;
-#endif // AOM_ARCH_AARCH64
-
- do {
- const uint8_t *s = src_ptr;
- CONV_BUF_TYPE *d = dst_ptr;
- uint8_t *d_u8 = dst8_ptr;
- int height = h;
-
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
-
- t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
- t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
- t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
- t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
- t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
- t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
- t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
-
- s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
- s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
- s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
- s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
-
- s += 7 * src_stride;
-
- do {
-#if AOM_ARCH_AARCH64
- t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
- t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
- t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
- t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
-
- s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
- d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- vget_low_s16(round_offset_vec));
- d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
- vget_low_s16(round_offset_vec));
- d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
- vget_low_s16(round_offset_vec));
- d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
- vget_low_s16(round_offset_vec));
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
-
- __builtin_prefetch(d_u8 + 0 * dst8_stride);
- __builtin_prefetch(d_u8 + 1 * dst8_stride);
- __builtin_prefetch(d_u8 + 2 * dst8_stride);
- __builtin_prefetch(d_u8 + 3 * dst8_stride);
-
- load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
- round_offset_vec, &d01, &d23);
-
- store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0);
- store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1);
- store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0);
- store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1);
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- d_u8 += 4 * dst8_stride;
- height -= 4;
-#else // !AOM_ARCH_AARCH64
- t0 = load_unaligned_u8_4x1(s);
- s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-
- d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- vget_low_s16(round_offset_vec));
-
- __builtin_prefetch(d);
-
- dd0 = vld1_u16(d);
-
- compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
-
- store_u8_4x1(d_u8, d01, 0);
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s5 = s6;
- s6 = s7;
- s += src_stride;
- d += dst_stride;
- d_u8 += dst8_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- src_ptr += 4;
- dst_ptr += 4;
- dst8_ptr += 4;
- width -= 4;
- } while (width != 0);
- } else {
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint16x8_t d0, dd0;
- uint8x8_t d0_u8, t0, t1, t2, t3, t4, t5, t6;
-#if AOM_ARCH_AARCH64
- int16x8_t s8, s9, s10, s11, s12, s13, s14;
- uint16x8_t d1, d2, d3, d4, d5, d6, d7, dd1, dd2, dd3, dd4, dd5, dd6, dd7;
- uint8x8_t d1_u8, d2_u8, d3_u8, d4_u8, d5_u8, d6_u8, d7_u8, t7;
-#endif // AOM_ARCH_AARCH64
-
- do {
- const uint8_t *s = src_ptr;
- CONV_BUF_TYPE *d = dst_ptr;
- uint8_t *d_u8 = dst8_ptr;
- int height = h;
-
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
- __builtin_prefetch(s + 4 * src_stride);
- __builtin_prefetch(s + 5 * src_stride);
- __builtin_prefetch(s + 6 * src_stride);
- __builtin_prefetch(s + 7 * src_stride);
- load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
-
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- s += 7 * src_stride;
-
- do {
-#if AOM_ARCH_AARCH64
- load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
- __builtin_prefetch(dst_ptr + 0 * dst_stride);
- __builtin_prefetch(dst_ptr + 1 * dst_stride);
- __builtin_prefetch(dst_ptr + 2 * dst_stride);
- __builtin_prefetch(dst_ptr + 3 * dst_stride);
-
- d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- round_offset_vec);
- d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
- round_offset_vec);
- d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
- round_offset_vec);
- d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
- round_offset_vec);
- d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11, y_filter,
- round_offset_vec);
- d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12, y_filter,
- round_offset_vec);
- d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13, y_filter,
- round_offset_vec);
- d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14, y_filter,
- round_offset_vec);
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
-
- load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
- compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
- round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
-
- store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
- d_u8 += 4 * dst8_stride;
-
- load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
-
- compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
- round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
-
- store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
- d_u8 += 4 * dst8_stride;
-
- s0 = s8;
- s1 = s9;
- s2 = s10;
- s3 = s11;
- s4 = s12;
- s5 = s13;
- s6 = s14;
- s += 8 * src_stride;
- d += 8 * dst_stride;
- height -= 8;
-#else // !AOM_ARCH_AARCH64
- s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-
- __builtin_prefetch(dst_ptr);
-
- d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- round_offset_vec);
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s5 = s6;
- s6 = s7;
-
- __builtin_prefetch(d);
-
- dd0 = vld1q_u16(d);
-
- compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
-
- vst1_u8(d_u8, d0_u8);
- d_u8 += dst8_stride;
-
- s += src_stride;
- d += dst_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- src_ptr += 8;
- dst_ptr += 8;
- dst8_ptr += 8;
- width -= 8;
- } while (width != 0);
- }
-}
-
-static INLINE void dist_wtd_convolve_y_8tap_neon(const uint8_t *src_ptr,
- int src_stride, int w, int h,
- const int16x8_t y_filter,
- ConvolveParams *conv_params) {
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
- const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
- (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
- const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
- int width = w;
-
- if (w == 4 || h == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint16x4_t d0;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6;
-#if AOM_ARCH_AARCH64
- int16x4_t s8, s9, s10;
- uint16x4_t d1, d2, d3;
-#endif // AOM_ARCH_AARCH64
-
- do {
- const uint8_t *s = src_ptr;
- CONV_BUF_TYPE *d = dst_ptr;
- int height = h;
-
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
-
- t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
- t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
- t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
- t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
- t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
- t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
- t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
-
- s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
- s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
- s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
- s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
-
- s += 7 * src_stride;
-
- do {
-#if AOM_ARCH_AARCH64
- t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
- t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
- t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
- t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
-
- s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
- d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- vget_low_s16(round_offset_vec));
- d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
- vget_low_s16(round_offset_vec));
- d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
- vget_low_s16(round_offset_vec));
- d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
- vget_low_s16(round_offset_vec));
-
- store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
-#else // !AOM_ARCH_AARCH64
- t0 = load_unaligned_u8_4x1(s);
- s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-
- d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- vget_low_s16(round_offset_vec));
-
- vst1_u16(d, d0);
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s5 = s6;
- s6 = s7;
- s += src_stride;
- d += dst_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- src_ptr += 4;
- dst_ptr += 4;
- width -= 4;
- } while (width != 0);
- } else {
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint16x8_t d0;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6;
-#if AOM_ARCH_AARCH64
- int16x8_t s8, s9, s10, s11, s12, s13, s14;
- uint16x8_t d1, d2, d3, d4, d5, d6, d7;
- uint8x8_t t7;
-#endif // AOM_ARCH_AARCH64
-
- do {
- const uint8_t *s = src_ptr;
- CONV_BUF_TYPE *d = dst_ptr;
- int height = h;
-
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
- __builtin_prefetch(s + 4 * src_stride);
- __builtin_prefetch(s + 5 * src_stride);
- __builtin_prefetch(s + 6 * src_stride);
- __builtin_prefetch(s + 7 * src_stride);
- load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
-
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- s += 7 * src_stride;
-
- do {
-#if AOM_ARCH_AARCH64
- load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
- __builtin_prefetch(dst_ptr + 0 * dst_stride);
- __builtin_prefetch(dst_ptr + 1 * dst_stride);
- __builtin_prefetch(dst_ptr + 2 * dst_stride);
- __builtin_prefetch(dst_ptr + 3 * dst_stride);
-
- d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- round_offset_vec);
- d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
- round_offset_vec);
- d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
- round_offset_vec);
- d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
- round_offset_vec);
- d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11, y_filter,
- round_offset_vec);
- d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12, y_filter,
- round_offset_vec);
- d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13, y_filter,
- round_offset_vec);
- d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14, y_filter,
- round_offset_vec);
-
- store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
-
- s0 = s8;
- s1 = s9;
- s2 = s10;
- s3 = s11;
- s4 = s12;
- s5 = s13;
- s6 = s14;
- s += 8 * src_stride;
- d += 8 * dst_stride;
- height -= 8;
-#else // !AOM_ARCH_AARCH64
- s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-
- __builtin_prefetch(dst_ptr);
-
- d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- round_offset_vec);
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s5 = s6;
- s6 = s7;
-
- vst1q_u16(d, d0);
-
- s += src_stride;
- d += dst_stride;
- height--;
-#endif // AOM_ARCH_AARCH64
- } while (height != 0);
- src_ptr += 8;
- dst_ptr += 8;
- width -= 8;
- } while (width != 0);
- }
-}
-
-void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
- uint8_t *dst8, int dst8_stride, int w, int h,
- const InterpFilterParams *filter_params_y,
- const int subpel_y_qn,
- ConvolveParams *conv_params) {
- assert(w % 4 == 0);
- assert(h % 4 == 0);
-
- // Vertical filter.
- const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_qn & SUBPEL_MASK);
- // Filter values are even, so downshift by 1 to reduce intermediate
- // precision requirements.
- const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
-
- const int vert_offset = filter_params_y->taps / 2 - 1;
- const uint8_t *src_ptr = src - (vert_offset * src_stride);
-
- if (get_filter_tap(filter_params_y, subpel_y_qn) <= 6) {
- if (conv_params->do_average) {
- if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
- dist_wtd_convolve_y_6tap_dist_wtd_avg_neon(
- src_ptr + src_stride, src_stride, dst8, dst8_stride, w, h, y_filter,
- conv_params);
- } else {
- dist_wtd_convolve_y_6tap_avg_neon(src_ptr + src_stride, src_stride,
- dst8, dst8_stride, w, h, y_filter,
- conv_params);
- }
- } else {
- dist_wtd_convolve_y_6tap_neon(src_ptr + src_stride, src_stride, w, h,
- y_filter, conv_params);
- }
- } else {
- if (conv_params->do_average) {
- if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
- dist_wtd_convolve_y_8tap_dist_wtd_avg_neon(src_ptr, src_stride, dst8,
- dst8_stride, w, h, y_filter,
- conv_params);
- } else {
- dist_wtd_convolve_y_8tap_avg_neon(src_ptr, src_stride, dst8,
- dst8_stride, w, h, y_filter,
- conv_params);
- }
- } else {
- dist_wtd_convolve_y_8tap_neon(src_ptr, src_stride, w, h, y_filter,
- conv_params);
- }
- }
-}
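
Aside (not part of the patch): the deleted convolve code above, like its
replacement, halves the filter taps up front ("Filter values are even, so
downshift by 1"). Because every tap is even, the halving is exact and the
whole pipeline needs one less bit of intermediate precision; the final
right-shift is reduced by one to compensate. A minimal scalar sketch with
made-up tap values:

#include <assert.h>
#include <stdint.h>

int main(void) {
  const int16_t taps[8] = { -2, 6, -12, 34, 110, -16, 8, -2 };  // all even
  const uint8_t px[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
  int32_t full = 0, halved = 0;
  for (int i = 0; i < 8; i++) {
    full += taps[i] * px[i];
    halved += (taps[i] / 2) * px[i];  // exact because every tap is even
  }
  // The halved accumulator is exactly half the full one, so one fewer bit
  // of intermediate precision is needed before the final shift.
  assert(2 * halved == full);
  return 0;
}
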
diff --git a/av1/common/arm/reconinter_neon.c b/av1/common/arm/reconinter_neon.c
index 3694763d0..2b0274cc6 100644
--- a/av1/common/arm/reconinter_neon.c
+++ b/av1/common/arm/reconinter_neon.c
@@ -12,6 +12,7 @@
#include <arm_neon.h>
#include <assert.h>
+#include <stdbool.h>
#include "aom/aom_integer.h"
#include "aom_dsp/blend.h"
@@ -20,6 +21,93 @@
#include "av1/common/blockd.h"
#include "config/av1_rtcd.h"
+static AOM_INLINE void diffwtd_mask_d16_neon(
+ uint8_t *mask, const bool inverse, const CONV_BUF_TYPE *src0,
+ int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+ ConvolveParams *conv_params, int bd) {
+ const int round =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+ const int16x8_t round_vec = vdupq_n_s16((int16_t)(-round));
+
+ if (w >= 16) {
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0_lo = vld1q_u16(src0 + j);
+ uint16x8_t s1_lo = vld1q_u16(src1 + j);
+ uint16x8_t s0_hi = vld1q_u16(src0 + j + 8);
+ uint16x8_t s1_hi = vld1q_u16(src1 + j + 8);
+
+ uint16x8_t diff_lo_u16 = vrshlq_u16(vabdq_u16(s0_lo, s1_lo), round_vec);
+ uint16x8_t diff_hi_u16 = vrshlq_u16(vabdq_u16(s0_hi, s1_hi), round_vec);
+ uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, DIFF_FACTOR_LOG2);
+ uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, DIFF_FACTOR_LOG2);
+ uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8);
+
+ uint8x16_t m;
+ if (inverse) {
+ m = vqsubq_u8(vdupq_n_u8(64 - 38), diff); // Saturating to 0
+ } else {
+ m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64));
+ }
+
+ vst1q_u8(mask, m);
+
+ mask += 16;
+ j += 16;
+ } while (j < w);
+ src0 += src0_stride;
+ src1 += src1_stride;
+ } while (++i < h);
+ } else if (w == 8) {
+ int i = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src0);
+ uint16x8_t s1 = vld1q_u16(src1);
+
+ uint16x8_t diff_u16 = vrshlq_u16(vabdq_u16(s0, s1), round_vec);
+ uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2);
+ uint8x8_t m;
+ if (inverse) {
+ m = vqsub_u8(vdup_n_u8(64 - 38), diff_u8); // Saturating to 0
+ } else {
+ m = vmin_u8(vadd_u8(diff_u8, vdup_n_u8(38)), vdup_n_u8(64));
+ }
+
+ vst1_u8(mask, m);
+
+ mask += 8;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ } while (++i < h);
+ } else if (w == 4) {
+ int i = 0;
+ do {
+ uint16x8_t s0 =
+ vcombine_u16(vld1_u16(src0), vld1_u16(src0 + src0_stride));
+ uint16x8_t s1 =
+ vcombine_u16(vld1_u16(src1), vld1_u16(src1 + src1_stride));
+
+ uint16x8_t diff_u16 = vrshlq_u16(vabdq_u16(s0, s1), round_vec);
+ uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2);
+ uint8x8_t m;
+ if (inverse) {
+ m = vqsub_u8(vdup_n_u8(64 - 38), diff_u8); // Saturating to 0
+ } else {
+ m = vmin_u8(vadd_u8(diff_u8, vdup_n_u8(38)), vdup_n_u8(64));
+ }
+
+ vst1_u8(mask, m);
+
+ mask += 8;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ i += 2;
+ } while (i < h);
+ }
+}
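
For reference, the mask produced above is clamp(38 + diff, 0, 64), where
diff is |src0 - src1| right-shifted by `round` (with rounding) and then by
DIFF_FACTOR_LOG2; the inverse variant comes from the saturating subtract
from 64 - 38. A scalar sketch (assuming DIFF_FACTOR_LOG2 == 4,
AOM_BLEND_A64_MAX_ALPHA == 64 and round > 0; the helper name is
illustrative, not library API):

#include <stdint.h>
#include <stdlib.h>

static uint8_t diffwtd_mask_scalar(uint16_t a, uint16_t b, int round,
                                   int inverse) {
  // Rounded right shift of |a - b|, as vrshlq_u16(vabdq_u16(a, b), -round).
  int diff = (abs((int)a - (int)b) + (1 << (round - 1))) >> round;
  diff >>= 4;          // DIFF_FACTOR_LOG2, as vshrn_n_u16
  int m = diff + 38;   // DIFFWTD_38 offset
  if (m > 64) m = 64;  // vminq_u8(..., vdupq_n_u8(64))
  return (uint8_t)(inverse ? 64 - m : m);  // 64 - m == vqsubq_u8(26, diff)
}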
+
void av1_build_compound_diffwtd_mask_d16_neon(
uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
@@ -27,60 +115,103 @@ void av1_build_compound_diffwtd_mask_d16_neon(
assert(h >= 4);
assert(w >= 4);
assert((mask_type == DIFFWTD_38_INV) || (mask_type == DIFFWTD_38));
- const int round =
- 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
- uint16x8_t diff_q, tmp0, tmp1;
- uint8x8_t diff_d, diff_select;
- const CONV_BUF_TYPE *src0_1, *src1_1;
- const int16x8_t dup_round = vdupq_n_s16((int16_t)(-round));
- const uint8x8_t dup_38 = vdup_n_u8(38);
- const uint8x8_t dup_64 = vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA);
+
if (mask_type == DIFFWTD_38) {
- diff_select = vdup_n_u8(255);
- } else {
- diff_select = vdup_n_u8(0);
+ diffwtd_mask_d16_neon(mask, /*inverse=*/false, src0, src0_stride, src1,
+ src1_stride, h, w, conv_params, bd);
+ } else { // mask_type == DIFFWTD_38_INV
+ diffwtd_mask_d16_neon(mask, /*inverse=*/true, src0, src0_stride, src1,
+ src1_stride, h, w, conv_params, bd);
}
- if (w >= 8) {
- for (int i = 0; i < h; ++i) {
- src0_1 = src0;
- src1_1 = src1;
- for (int j = 0; j < w; j += 8) {
- __builtin_prefetch(src0_1);
- __builtin_prefetch(src1_1);
- diff_q = vabdq_u16(vld1q_u16(src0_1), vld1q_u16(src1_1));
- diff_q = vrshlq_u16(diff_q, dup_round);
- diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2);
- diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64);
- diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d));
- vst1_u8(mask, diff_d);
- src0_1 += 8;
- src1_1 += 8;
- mask += 8;
- }
+}
+
+static AOM_INLINE void diffwtd_mask_neon(uint8_t *mask, const bool inverse,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ int h, int w) {
+ if (w >= 16) {
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src0 + j);
+ uint8x16_t s1 = vld1q_u8(src1 + j);
+
+ uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2);
+ uint8x16_t m;
+ if (inverse) {
+ m = vqsubq_u8(vdupq_n_u8(64 - 38), diff); // Saturating to 0
+ } else {
+ m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64));
+ }
+
+ vst1q_u8(mask, m);
+
+ mask += 16;
+ j += 16;
+ } while (j < w);
src0 += src0_stride;
src1 += src1_stride;
- }
+ } while (++i < h);
+ } else if (w == 8) {
+ int i = 0;
+ do {
+ uint8x16_t s0 = vcombine_u8(vld1_u8(src0), vld1_u8(src0 + src0_stride));
+      uint8x16_t s1 = vcombine_u8(vld1_u8(src1), vld1_u8(src1 + src1_stride));
+
+ uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2);
+ uint8x16_t m;
+ if (inverse) {
+ m = vqsubq_u8(vdupq_n_u8(64 - 38), diff); // Saturating to 0
+ } else {
+ m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64));
+ }
+
+ vst1q_u8(mask, m);
+
+ mask += 16;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ i += 2;
+ } while (i < h);
} else if (w == 4) {
- for (int i = 0; i < h; i += 2) {
- src0_1 = src0;
- src1_1 = src1;
- __builtin_prefetch(src0_1 + 0 * src0_stride);
- __builtin_prefetch(src0_1 + 1 * src0_stride);
- __builtin_prefetch(src1_1 + 0 * src1_stride);
- __builtin_prefetch(src1_1 + 1 * src1_stride);
- tmp0 = vcombine_u16(vld1_u16(src0_1 + (0 * src0_stride)),
- vld1_u16(src0_1 + (1 * src0_stride)));
- tmp1 = vcombine_u16(vld1_u16(src1_1 + (0 * src1_stride)),
- vld1_u16(src1_1 + (1 * src1_stride)));
- diff_q = vabdq_u16(tmp0, tmp1);
- diff_q = vrshlq_u16(diff_q, dup_round);
- diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2);
- diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64);
- diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d));
- vst1_u8(mask, diff_d);
- src0 += src0_stride * 2;
- src1 += src1_stride * 2;
- mask += w * 2;
- }
+ int i = 0;
+ do {
+ uint8x16_t s0 = load_unaligned_u8q(src0, src0_stride);
+ uint8x16_t s1 = load_unaligned_u8q(src1, src1_stride);
+
+ uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2);
+ uint8x16_t m;
+ if (inverse) {
+ m = vqsubq_u8(vdupq_n_u8(64 - 38), diff); // Saturating to 0
+ } else {
+ m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64));
+ }
+
+ vst1q_u8(mask, m);
+
+ mask += 16;
+ src0 += 4 * src0_stride;
+ src1 += 4 * src1_stride;
+ i += 4;
+ } while (i < h);
+ }
+}
+
+void av1_build_compound_diffwtd_mask_neon(uint8_t *mask,
+ DIFFWTD_MASK_TYPE mask_type,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ int h, int w) {
+ assert(h % 4 == 0);
+ assert(w % 4 == 0);
+ assert(mask_type == DIFFWTD_38_INV || mask_type == DIFFWTD_38);
+
+ if (mask_type == DIFFWTD_38) {
+ diffwtd_mask_neon(mask, /*inverse=*/false, src0, src0_stride, src1,
+ src1_stride, h, w);
+ } else { // mask_type == DIFFWTD_38_INV
+ diffwtd_mask_neon(mask, /*inverse=*/true, src0, src0_stride, src1,
+ src1_stride, h, w);
}
}
diff --git a/av1/common/arm/reconintra_neon.c b/av1/common/arm/reconintra_neon.c
index 8d190fb54..cf488a9ca 100644
--- a/av1/common/arm/reconintra_neon.c
+++ b/av1/common/arm/reconintra_neon.c
@@ -17,6 +17,8 @@
#include "aom/aom_integer.h"
#include "aom_dsp/arm/sum_neon.h"
+#define MAX_UPSAMPLE_SZ 16
+
DECLARE_ALIGNED(16, const int8_t,
av1_filter_intra_taps_neon[FILTER_INTRA_MODES][8][8]) = {
{
@@ -153,3 +155,185 @@ void av1_filter_intra_predictor_neon(uint8_t *dst, ptrdiff_t stride,
dst += stride;
}
}
+
+void av1_filter_intra_edge_neon(uint8_t *p, int sz, int strength) {
+ if (!strength) return;
+ assert(sz >= 0 && sz <= 129);
+
+ uint8_t edge[160]; // Max value of sz + enough padding for vector accesses.
+ memcpy(edge + 1, p, sz * sizeof(*p));
+
+ // Populate extra space appropriately.
+ edge[0] = edge[1];
+ edge[sz + 1] = edge[sz];
+ edge[sz + 2] = edge[sz];
+
+ // Don't overwrite first pixel.
+ uint8_t *dst = p + 1;
+ sz--;
+
+ if (strength == 1) { // Filter: {4, 8, 4}.
+ const uint8_t *src = edge + 1;
+
+ while (sz >= 8) {
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t s1 = vld1_u8(src + 1);
+ uint8x8_t s2 = vld1_u8(src + 2);
+
+ // Make use of the identity:
+ // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2
+ uint16x8_t t0 = vaddl_u8(s0, s2);
+ uint16x8_t t1 = vaddl_u8(s1, s1);
+ uint16x8_t sum = vaddq_u16(t0, t1);
+ uint8x8_t res = vrshrn_n_u16(sum, 2);
+
+ vst1_u8(dst, res);
+
+ src += 8;
+ dst += 8;
+ sz -= 8;
+ }
+
+ if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values.
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t s1 = vld1_u8(src + 1);
+ uint8x8_t s2 = vld1_u8(src + 2);
+
+ uint16x8_t t0 = vaddl_u8(s0, s2);
+ uint16x8_t t1 = vaddl_u8(s1, s1);
+ uint16x8_t sum = vaddq_u16(t0, t1);
+ uint8x8_t res = vrshrn_n_u16(sum, 2);
+
+ // Mask off out-of-bounds indices.
+ uint8x8_t current_dst = vld1_u8(dst);
+ uint8x8_t mask = vcgt_u8(vdup_n_u8(sz), vcreate_u8(0x0706050403020100));
+ res = vbsl_u8(mask, res, current_dst);
+
+ vst1_u8(dst, res);
+ }
+ } else if (strength == 2) { // Filter: {5, 6, 5}.
+ const uint8_t *src = edge + 1;
+
+ const uint8x8x3_t filter = { { vdup_n_u8(5), vdup_n_u8(6), vdup_n_u8(5) } };
+
+ while (sz >= 8) {
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t s1 = vld1_u8(src + 1);
+ uint8x8_t s2 = vld1_u8(src + 2);
+
+ uint16x8_t accum = vmull_u8(s0, filter.val[0]);
+ accum = vmlal_u8(accum, s1, filter.val[1]);
+ accum = vmlal_u8(accum, s2, filter.val[2]);
+ uint8x8_t res = vrshrn_n_u16(accum, 4);
+
+ vst1_u8(dst, res);
+
+ src += 8;
+ dst += 8;
+ sz -= 8;
+ }
+
+ if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values.
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t s1 = vld1_u8(src + 1);
+ uint8x8_t s2 = vld1_u8(src + 2);
+
+ uint16x8_t accum = vmull_u8(s0, filter.val[0]);
+ accum = vmlal_u8(accum, s1, filter.val[1]);
+ accum = vmlal_u8(accum, s2, filter.val[2]);
+ uint8x8_t res = vrshrn_n_u16(accum, 4);
+
+ // Mask off out-of-bounds indices.
+ uint8x8_t current_dst = vld1_u8(dst);
+ uint8x8_t mask = vcgt_u8(vdup_n_u8(sz), vcreate_u8(0x0706050403020100));
+ res = vbsl_u8(mask, res, current_dst);
+
+ vst1_u8(dst, res);
+ }
+ } else { // Filter {2, 4, 4, 4, 2}.
+ const uint8_t *src = edge;
+
+ while (sz >= 8) {
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t s1 = vld1_u8(src + 1);
+ uint8x8_t s2 = vld1_u8(src + 2);
+ uint8x8_t s3 = vld1_u8(src + 3);
+ uint8x8_t s4 = vld1_u8(src + 4);
+
+ // Make use of the identity:
+ // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3
+ uint16x8_t t0 = vaddl_u8(s0, s4);
+ uint16x8_t t1 = vaddl_u8(s1, s2);
+ t1 = vaddw_u8(t1, s3);
+ t1 = vaddq_u16(t1, t1);
+ uint16x8_t sum = vaddq_u16(t0, t1);
+ uint8x8_t res = vrshrn_n_u16(sum, 3);
+
+ vst1_u8(dst, res);
+
+ src += 8;
+ dst += 8;
+ sz -= 8;
+ }
+
+ if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values.
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t s1 = vld1_u8(src + 1);
+ uint8x8_t s2 = vld1_u8(src + 2);
+ uint8x8_t s3 = vld1_u8(src + 3);
+ uint8x8_t s4 = vld1_u8(src + 4);
+
+ uint16x8_t t0 = vaddl_u8(s0, s4);
+ uint16x8_t t1 = vaddl_u8(s1, s2);
+ t1 = vaddw_u8(t1, s3);
+ t1 = vaddq_u16(t1, t1);
+ uint16x8_t sum = vaddq_u16(t0, t1);
+ uint8x8_t res = vrshrn_n_u16(sum, 3);
+
+ // Mask off out-of-bounds indices.
+ uint8x8_t current_dst = vld1_u8(dst);
+ uint8x8_t mask = vcgt_u8(vdup_n_u8(sz), vcreate_u8(0x0706050403020100));
+ res = vbsl_u8(mask, res, current_dst);
+
+ vst1_u8(dst, res);
+ }
+ }
+}
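
The strength-1 and strength-3 paths above rely on scaling identities so the
sums fit in 16 bits with small shifts; a quick exhaustive check of the
strength-1 identity, including the rounding term that vrshrn_n_u16 applies:

#include <assert.h>

int main(void) {
  for (int a = 0; a < 256; ++a)
    for (int b = 0; b < 256; ++b)
      for (int c = 0; c < 256; ++c)
        assert(((4 * a + 8 * b + 4 * c + 8) >> 4) ==
               ((a + 2 * b + c + 2) >> 2));
  return 0;
}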
+
+void av1_upsample_intra_edge_neon(uint8_t *p, int sz) {
+ if (!sz) return;
+
+ assert(sz <= MAX_UPSAMPLE_SZ);
+
+ uint8_t edge[MAX_UPSAMPLE_SZ + 3];
+ const uint8_t *src = edge;
+
+ // Copy p[-1..(sz-1)] and pad out both ends.
+ edge[0] = p[-1];
+ edge[1] = p[-1];
+ memcpy(edge + 2, p, sz);
+ edge[sz + 2] = p[sz - 1];
+ p[-2] = p[-1];
+
+ uint8_t *dst = p - 1;
+
+ do {
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t s1 = vld1_u8(src + 1);
+ uint8x8_t s2 = vld1_u8(src + 2);
+ uint8x8_t s3 = vld1_u8(src + 3);
+
+ int16x8_t t0 = vreinterpretq_s16_u16(vaddl_u8(s0, s3));
+ int16x8_t t1 = vreinterpretq_s16_u16(vaddl_u8(s1, s2));
+ t1 = vmulq_n_s16(t1, 9);
+ t1 = vsubq_s16(t1, t0);
+
+ uint8x8x2_t res = { { vqrshrun_n_s16(t1, 4), s2 } };
+
+ vst2_u8(dst, res);
+
+ src += 8;
+ dst += 16;
+ sz -= 8;
+ } while (sz > 0);
+}
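
The 2x upsample above interleaves the original pixels with half-pel samples
from a (-1, 9, 9, -1)/16 interpolator; vst2_u8 performs the interleave. A
scalar model of one interpolated sample (illustrative helper, not library
API; assumes the usual arithmetic right shift on negatives):

#include <stdint.h>

static uint8_t half_pel(uint8_t s0, uint8_t s1, uint8_t s2, uint8_t s3) {
  int v = -s0 + 9 * s1 + 9 * s2 - s3;  // t1 = 9 * (s1 + s2) - (s0 + s3)
  int r = (v + 8) >> 4;  // rounding shift, as vqrshrun_n_s16(t1, 4)
  if (r < 0) r = 0;      // unsigned saturation (the "q" and "un")
  if (r > 255) r = 255;
  return (uint8_t)r;
}
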
diff --git a/av1/common/arm/resize_neon.c b/av1/common/arm/resize_neon.c
index 5f6d21416..076981bfe 100644
--- a/av1/common/arm/resize_neon.c
+++ b/av1/common/arm/resize_neon.c
@@ -15,10 +15,61 @@
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "av1/common/resize.h"
-#include "av1/common/arm/convolve_neon.h"
#include "config/av1_rtcd.h"
#include "config/aom_scale_rtcd.h"
+static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16x8_t filter) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
+
+ int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0);
+ sum = vmla_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmla_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmla_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmla_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmla_lane_s16(sum, s7, filter_hi, 3);
+ sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3));
+ sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0));
+ return sum;
+}
+
+static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t filter) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
+
+ int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0);
+ sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3));
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0));
+ return vqrshrun_n_s16(sum, 7);
+}
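
Note on convolve8_4/convolve8_8 above: the six outer taps are accumulated
with plain multiply-adds, while the two largest taps (applied to s3 and s4)
are folded in last with saturating adds (vqadd/vqaddq), so a worst-case
16-bit overflow saturates instead of wrapping before vqrshrun_n_s16 narrows
the result.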
+
+static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
+ const int16x8_t filter) {
+ int16x8_t ss0 = vreinterpretq_s16_u16(vmovl_u8(s[0]));
+ int16x8_t ss1 = vreinterpretq_s16_u16(vmovl_u8(s[1]));
+ int16x8_t ss2 = vreinterpretq_s16_u16(vmovl_u8(s[2]));
+ int16x8_t ss3 = vreinterpretq_s16_u16(vmovl_u8(s[3]));
+ int16x8_t ss4 = vreinterpretq_s16_u16(vmovl_u8(s[4]));
+ int16x8_t ss5 = vreinterpretq_s16_u16(vmovl_u8(s[5]));
+ int16x8_t ss6 = vreinterpretq_s16_u16(vmovl_u8(s[6]));
+ int16x8_t ss7 = vreinterpretq_s16_u16(vmovl_u8(s[7]));
+
+ return convolve8_8(ss0, ss1, ss2, ss3, ss4, ss5, ss6, ss7, filter);
+}
+
static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src,
const int src_stride,
uint8_t *dst,
@@ -192,15 +243,16 @@ static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
do {
load_u8_8x8(src + 2, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
&s[6], &s[7]);
- transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
+ transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
x = width_hor;
do {
src += 8;
load_u8_8x8(src, src_stride, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11],
&s[12], &s[13]);
- transpose_u8_8x8(&s[6], &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
- &s[13]);
+ transpose_elems_inplace_u8_8x8(&s[6], &s[7], &s[8], &s[9], &s[10], &s[11],
+ &s[12], &s[13]);
d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70
d[1] = scale_filter_8(&s[2], filters); // 01 11 21 31 41 51 61 71
@@ -210,7 +262,7 @@ static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
// 10 11 12 13 50 51 52 53
// 20 21 22 23 60 61 62 63
// 30 31 32 33 70 71 72 73
- transpose_u8_8x4(&d[0], &d[1], &d[2], &d[3]);
+ transpose_elems_inplace_u8_8x4(&d[0], &d[1], &d[2], &d[3]);
vst1_lane_u32((uint32_t *)(t + 0 * width_hor), vreinterpret_u32_u8(d[0]),
0);
vst1_lane_u32((uint32_t *)(t + 1 * width_hor), vreinterpret_u32_u8(d[1]),
@@ -308,7 +360,8 @@ static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
do {
load_u8_8x8(src + 4, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
&s[6], &s[7]);
- transpose_u8_4x8(&s[0], &s[1], &s[2], &s[3], s[4], s[5], s[6], s[7]);
+ transpose_elems_u8_4x8(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7],
+ &s[0], &s[1], &s[2], &s[3]);
x = width_hor;
do {
@@ -316,8 +369,8 @@ static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
src += 8;
load_u8_8x8(src, src_stride, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9],
&s[10], &s[11]);
- transpose_u8_8x8(&s[4], &s[5], &s[6], &s[7], &s[8], &s[9], &s[10],
- &s[11]);
+ transpose_elems_inplace_u8_8x8(&s[4], &s[5], &s[6], &s[7], &s[8], &s[9],
+ &s[10], &s[11]);
d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70
d[1] = scale_filter_8(&s[4], filters); // 01 11 21 31 41 51 61 71
@@ -453,14 +506,16 @@ static void scale_plane_4_to_3_bilinear(const uint8_t *src,
load_u8_8x8(src, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
&s[6], &s[7]);
src += 1;
- transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
+ transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
x = width_hor;
do {
load_u8_8x8(src, src_stride, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
&s[7], &s[8]);
src += 8;
- transpose_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7], &s[8]);
+ transpose_elems_inplace_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+ &s[7], &s[8]);
// 00 10 20 30 40 50 60 70
// 01 11 21 31 41 51 61 71
@@ -487,7 +542,8 @@ static void scale_plane_4_to_3_bilinear(const uint8_t *src,
// 50 51 52 53 54 55 xx xx
// 60 61 62 63 64 65 xx xx
// 70 71 72 73 74 75 xx xx
- transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+ transpose_elems_inplace_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5],
+ &d[6], &d[7]);
// store 2 extra pixels
vst1_u8(t + 0 * stride_hor, d[0]);
vst1_u8(t + 1 * stride_hor, d[1]);
@@ -586,15 +642,16 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
do {
load_u8_8x8(src + 1, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
&s[6], &s[7]);
- transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
+ transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
x = width_hor;
do {
src += 8;
load_u8_8x8(src, src_stride, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
&s[13], &s[14]);
- transpose_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11], &s[12], &s[13],
- &s[14]);
+ transpose_elems_inplace_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11],
+ &s[12], &s[13], &s[14]);
// 00 10 20 30 40 50 60 70
// 01 11 21 31 41 51 61 71
@@ -619,7 +676,8 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
// 50 51 52 53 54 55 xx xx
// 60 61 62 63 64 65 xx xx
// 70 71 72 73 74 75 xx xx
- transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+ transpose_elems_inplace_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5],
+ &d[6], &d[7]);
// store 2 extra pixels
vst1_u8(t + 0 * stride_hor, d[0]);
vst1_u8(t + 1 * stride_hor, d[1]);
@@ -828,3 +886,293 @@ void av1_resize_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src,
aom_extend_frame_borders(dst, num_planes);
}
}
+
+static INLINE void scaledconvolve_horiz_w4(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
+ const int x0_q4, const int x_step_q4, const int w, const int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
+ int x, y, z;
+
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ y = h;
+ do {
+ int x_q4 = x0_q4;
+ x = 0;
+ do {
+ // process 4 src_x steps
+ for (z = 0; z < 4; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ if (x_q4 & SUBPEL_MASK) {
+ const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
+ uint8x8_t s[8], d;
+ int16x8_t ss[4];
+ int16x4_t t[8], tt;
+
+ load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
+ transpose_elems_inplace_u8_8x4(&s[0], &s[1], &s[2], &s[3]);
+
+ ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
+ ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
+ ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
+ ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
+ t[0] = vget_low_s16(ss[0]);
+ t[1] = vget_low_s16(ss[1]);
+ t[2] = vget_low_s16(ss[2]);
+ t[3] = vget_low_s16(ss[3]);
+ t[4] = vget_high_s16(ss[0]);
+ t[5] = vget_high_s16(ss[1]);
+ t[6] = vget_high_s16(ss[2]);
+ t[7] = vget_high_s16(ss[3]);
+
+ tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
+ filters);
+ d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
+ store_u8_4x1(&temp[4 * z], d, 0);
+ } else {
+ int i;
+ for (i = 0; i < 4; ++i) {
+ temp[z * 4 + i] = src_x[i * src_stride + 3];
+ }
+ }
+ x_q4 += x_step_q4;
+ }
+
+      // transpose the 4x4 filter values back to dst
+ {
+ const uint8x8x4_t d4 = vld4_u8(temp);
+ store_u8_4x1(&dst[x + 0 * dst_stride], d4.val[0], 0);
+ store_u8_4x1(&dst[x + 1 * dst_stride], d4.val[1], 0);
+ store_u8_4x1(&dst[x + 2 * dst_stride], d4.val[2], 0);
+ store_u8_4x1(&dst[x + 3 * dst_stride], d4.val[3], 0);
+ }
+ x += 4;
+ } while (x < w);
+
+ src += src_stride * 4;
+ dst += dst_stride * 4;
+ y -= 4;
+ } while (y > 0);
+}
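
Both horizontal paths step through the source in Q4 fixed point:
x_q4 >> SUBPEL_BITS selects the source column and x_q4 & SUBPEL_MASK selects
one of 16 filter phases, with phase 0 degenerating to a straight copy. A
small standalone illustration (assuming SUBPEL_BITS == 4):

#include <stdio.h>

int main(void) {
  const int x0_q4 = 8, x_step_q4 = 32;  // example: 2:1 downscale, phase 8
  int x_q4 = x0_q4;
  for (int k = 0; k < 4; ++k) {
    printf("dst col %d <- src col %d, filter phase %d\n", k, x_q4 >> 4,
           x_q4 & 15);
    x_q4 += x_step_q4;  // 32/16 = 2 source columns per output column
  }
  return 0;
}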
+
+static INLINE void scaledconvolve_horiz_w8(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
+ const int x0_q4, const int x_step_q4, const int w, const int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+ int x, y, z;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ // This function processes 8x8 areas. The intermediate height is not always
+ // a multiple of 8, so force it to be a multiple of 8 here.
+ y = (h + 7) & ~7;
+
+ do {
+ int x_q4 = x0_q4;
+ x = 0;
+ do {
+ uint8x8_t d[8];
+ // process 8 src_x steps
+ for (z = 0; z < 8; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+
+ if (x_q4 & SUBPEL_MASK) {
+ const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
+ uint8x8_t s[8];
+ load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4],
+ &s[5], &s[6], &s[7]);
+ transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4],
+ &s[5], &s[6], &s[7]);
+ d[0] = scale_filter_8(s, filters);
+ vst1_u8(&temp[8 * z], d[0]);
+ } else {
+ int i;
+ for (i = 0; i < 8; ++i) {
+ temp[z * 8 + i] = src_x[i * src_stride + 3];
+ }
+ }
+ x_q4 += x_step_q4;
+ }
+
+      // transpose the 8x8 filter values back to dst
+ load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
+ &d[7]);
+ transpose_elems_inplace_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5],
+ &d[6], &d[7]);
+ store_u8_8x8(dst + x, dst_stride, d[0], d[1], d[2], d[3], d[4], d[5],
+ d[6], d[7]);
+ x += 8;
+ } while (x < w);
+
+ src += src_stride * 8;
+ dst += dst_stride * 8;
+ } while (y -= 8);
+}
+
+static INLINE void scaledconvolve_vert_w4(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ y = h;
+ do {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+
+ if (y_q4 & SUBPEL_MASK) {
+ const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
+ uint8x8_t s[8], d;
+ int16x4_t t[8], tt;
+
+ load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+ t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0])));
+ t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1])));
+ t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2])));
+ t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3])));
+ t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4])));
+ t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5])));
+ t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
+ t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
+
+ tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters);
+ d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
+ store_u8_4x1(dst, d, 0);
+ } else {
+ memcpy(dst, &src_y[3 * src_stride], w);
+ }
+
+ dst += dst_stride;
+ y_q4 += y_step_q4;
+ } while (--y);
+}
+
+static INLINE void scaledconvolve_vert_w8(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ y = h;
+ do {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ if (y_q4 & SUBPEL_MASK) {
+ const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
+ uint8x8_t s[8], d;
+ load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+ d = scale_filter_8(s, filters);
+ vst1_u8(dst, d);
+ } else {
+ memcpy(dst, &src_y[3 * src_stride], w);
+ }
+ dst += dst_stride;
+ y_q4 += y_step_q4;
+ } while (--y);
+}
+
+static INLINE void scaledconvolve_vert_w16(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
+ int x, y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ y = h;
+ do {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ if (y_q4 & SUBPEL_MASK) {
+ x = 0;
+ do {
+ const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
+ uint8x16_t ss[8];
+ uint8x8_t s[8], d[2];
+ load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4],
+ &ss[5], &ss[6], &ss[7]);
+ s[0] = vget_low_u8(ss[0]);
+ s[1] = vget_low_u8(ss[1]);
+ s[2] = vget_low_u8(ss[2]);
+ s[3] = vget_low_u8(ss[3]);
+ s[4] = vget_low_u8(ss[4]);
+ s[5] = vget_low_u8(ss[5]);
+ s[6] = vget_low_u8(ss[6]);
+ s[7] = vget_low_u8(ss[7]);
+ d[0] = scale_filter_8(s, filters);
+
+ s[0] = vget_high_u8(ss[0]);
+ s[1] = vget_high_u8(ss[1]);
+ s[2] = vget_high_u8(ss[2]);
+ s[3] = vget_high_u8(ss[3]);
+ s[4] = vget_high_u8(ss[4]);
+ s[5] = vget_high_u8(ss[5]);
+ s[6] = vget_high_u8(ss[6]);
+ s[7] = vget_high_u8(ss[7]);
+ d[1] = scale_filter_8(s, filters);
+ vst1q_u8(&dst[x], vcombine_u8(d[0], d[1]));
+ src_y += 16;
+ x += 16;
+ } while (x < w);
+ } else {
+ memcpy(dst, &src_y[3 * src_stride], w);
+ }
+ dst += dst_stride;
+ y_q4 += y_step_q4;
+ } while (--y);
+}
+
+void aom_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+ // --Require an additional 8 rows for the horiz_w8 transpose tail.
+  // When called from the frame scaling function, the smallest scaling factor
+  // is x1/4 ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer
+  // is still big enough.
+ DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+ assert(x_step_q4 <= 64);
+
+ if (w >= 8) {
+ scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+ intermediate_height);
+ } else {
+ scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+ intermediate_height);
+ }
+
+ if (w >= 16) {
+ scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ } else if (w == 8) {
+ scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ } else {
+ scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ }
+}
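
A worked check of the buffer bound derived in the comment above
(SUBPEL_BITS == 4, SUBPEL_TAPS == 8, y0_q4 < 16):

#include <assert.h>

int main(void) {
  const int h = 64, y_step_q4 = 32, y0_q4 = 15;  // worst normative case
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> 4) + 8;  // 126 + 8 = 134
  assert(intermediate_height <= 135);  // fits the (135 + 8)-row temp buffer
  return 0;
}
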
diff --git a/av1/common/arm/selfguided_neon.c b/av1/common/arm/selfguided_neon.c
index d14088ea4..1d3a3cc03 100644
--- a/av1/common/arm/selfguided_neon.c
+++ b/av1/common/arm/selfguided_neon.c
@@ -418,16 +418,16 @@ static INLINE void boxsum2(int16_t *src, const int src_stride, int16_t *dst16,
dst1_32_ptr += 2;
dst2_ptr += 2;
load_s16_4x4(src1_ptr, dst_stride_2, &s1, &s2, &s3, &s4);
- transpose_s16_4x4d(&s1, &s2, &s3, &s4);
+ transpose_elems_inplace_s16_4x4(&s1, &s2, &s3, &s4);
load_s32_4x4(src2_ptr, dst_stride_2, &d1, &d2, &d3, &d4);
- transpose_s32_4x4(&d1, &d2, &d3, &d4);
+ transpose_elems_inplace_s32_4x4(&d1, &d2, &d3, &d4);
do {
src1_ptr += 4;
src2_ptr += 4;
load_s16_4x4(src1_ptr, dst_stride_2, &s5, &s6, &s7, &s8);
- transpose_s16_4x4d(&s5, &s6, &s7, &s8);
+ transpose_elems_inplace_s16_4x4(&s5, &s6, &s7, &s8);
load_s32_4x4(src2_ptr, dst_stride_2, &d5, &d6, &d7, &d8);
- transpose_s32_4x4(&d5, &d6, &d7, &d8);
+ transpose_elems_inplace_s32_4x4(&d5, &d6, &d7, &d8);
q23 = vaddl_s16(s2, s3);
q45 = vaddl_s16(s4, s5);
q67 = vaddl_s16(s6, s7);
@@ -438,7 +438,7 @@ static INLINE void boxsum2(int16_t *src, const int src_stride, int16_t *dst16,
q34567 = vaddq_s32(q4567, vmovl_s16(s3));
q45678 = vaddq_s32(q4567, vmovl_s16(s8));
- transpose_s32_4x4(&q12345, &q23456, &q34567, &q45678);
+ transpose_elems_inplace_s32_4x4(&q12345, &q23456, &q34567, &q45678);
store_s32_4x4(dst1_32_ptr, dst_stride_2, q12345, q23456, q34567,
q45678);
dst1_32_ptr += 4;
@@ -457,7 +457,7 @@ static INLINE void boxsum2(int16_t *src, const int src_stride, int16_t *dst16,
r34567 = vaddq_s32(r4567, d3);
r45678 = vaddq_s32(r4567, d8);
- transpose_s32_4x4(&r12345, &r23456, &r34567, &r45678);
+ transpose_elems_inplace_s32_4x4(&r12345, &r23456, &r34567, &r45678);
store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r23456, r34567, r45678);
dst2_ptr += 4;
d1 = d5;
@@ -844,9 +844,9 @@ static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1,
w = width;
load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d1, &d2, &d3, &d4);
- transpose_s16_4x4d(&d1, &d2, &d3, &d4);
+ transpose_elems_inplace_s16_4x4(&d1, &d2, &d3, &d4);
load_s32_4x4(src2_ptr, dst_stride, &r1, &r2, &r3, &r4);
- transpose_s32_4x4(&r1, &r2, &r3, &r4);
+ transpose_elems_inplace_s32_4x4(&r1, &r2, &r3, &r4);
src1_ptr += 4;
src2_ptr += 4;
@@ -861,9 +861,9 @@ static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1,
do {
load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d5, &d6, &d7, &d8);
- transpose_s16_4x4d(&d5, &d6, &d7, &d8);
+ transpose_elems_inplace_s16_4x4(&d5, &d6, &d7, &d8);
load_s32_4x4(src2_ptr, dst_stride, &r5, &r6, &r7, &r8);
- transpose_s32_4x4(&r5, &r6, &r7, &r8);
+ transpose_elems_inplace_s32_4x4(&r5, &r6, &r7, &r8);
src1_ptr += 4;
src2_ptr += 4;
@@ -873,7 +873,7 @@ static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1,
q567 = vadd_s16(d7, q56);
q78 = vadd_s16(d7, d8);
q678 = vadd_s16(d6, q78);
- transpose_s16_4x4d(&q234, &q345, &q456, &q567);
+ transpose_elems_inplace_s16_4x4(&q234, &q345, &q456, &q567);
store_s16_4x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567);
dst1_ptr += 4;
@@ -887,7 +887,7 @@ static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1,
r567 = vaddq_s32(r7, r56);
r78 = vaddq_s32(r7, r8);
r678 = vaddq_s32(r6, r78);
- transpose_s32_4x4(&r234, &r345, &r456, &r567);
+ transpose_elems_inplace_s32_4x4(&r234, &r345, &r456, &r567);
store_s32_4x4(dst2_ptr, dst_stride, r234, r345, r456, r567);
dst2_ptr += 4;
@@ -1449,11 +1449,11 @@ int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
return 0;
}
-void av1_apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
- int height, int stride, int eps,
- const int *xqd, uint8_t *dst8,
- int dst_stride, int32_t *tmpbuf,
- int bit_depth, int highbd) {
+int av1_apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {
int32_t *flt0 = tmpbuf;
int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
assert(width * height <= RESTORATION_UNITPELS_MAX);
@@ -1591,4 +1591,5 @@ void av1_apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
h--;
} while (h > 0);
}
+ return 0;
}
diff --git a/av1/common/arm/warp_plane_neon.c b/av1/common/arm/warp_plane_neon.c
index b4d314802..472315439 100644
--- a/av1/common/arm/warp_plane_neon.c
+++ b/av1/common/arm/warp_plane_neon.c
@@ -9,463 +9,259 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include <assert.h>
-#include <arm_neon.h>
-#include <memory.h>
-#include <math.h>
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-#include "config/av1_rtcd.h"
-#include "av1/common/warped_motion.h"
-#include "av1/common/scale.h"
-
-/* This is a modified version of 'av1_warped_filter' from warped_motion.c:
- * Each coefficient is stored in 8 bits instead of 16 bits
- * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
-
- This is done in order to avoid overflow: Since the tap with the largest
- coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
- order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
- convolve functions.
-
- Instead, we use the summation order
- ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
- The rearrangement of coefficients in this table is so that we can get the
- coefficients into the correct order more quickly.
-*/
-/* clang-format off */
-DECLARE_ALIGNED(8, static const int8_t,
- filter_8bit_neon[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
-#if WARPEDPIXEL_PREC_BITS == 6
- // [-1, 0)
- { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0},
- { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0},
- { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0},
- { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0},
- { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0},
- { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0},
- { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0},
- { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0},
- { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0},
- { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0},
- { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0},
- { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0},
- { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0},
- { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0},
- { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0},
- { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0},
- { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0},
- { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0},
- { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0},
- { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0},
- { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0},
- { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0},
- { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0},
- { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0},
- { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0},
- { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0},
- { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0},
- { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0},
- { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0},
- { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0},
- { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0},
- { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0},
- // [0, 1)
- { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0},
- { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0},
- { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1},
- {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1},
- {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1},
- {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1},
- {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1},
- {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1},
- {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2},
- {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2},
- {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2},
- {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2},
- {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2},
- {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2},
- {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2},
- {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2},
- {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2},
- {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2},
- {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2},
- {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2},
- {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2},
- {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2},
- {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2},
- {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2},
- {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2},
- {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1},
- {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2},
- {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1},
- {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1},
- {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1},
- { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0},
- { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0},
- // [1, 2)
- { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0},
- { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1},
- { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1},
- { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1},
- { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1},
- { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2},
- { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2},
- { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2},
- { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3},
- { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3},
- { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3},
- { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4},
- { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4},
- { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4},
- { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4},
- { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4},
- { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4},
- { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4},
- { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4},
- { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4},
- { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4},
- { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4},
- { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4},
- { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3},
- { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3},
- { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3},
- { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2},
- { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2},
- { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2},
- { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1},
- { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1},
- { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0},
- // dummy (replicate row index 191)
- { 0, 0, 2, -1, 0, 0, 127, 0},
-
-#else
- // [-1, 0)
- { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0},
- { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0},
- { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0},
- { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0},
- { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0},
- { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0},
- { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0},
- { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0},
- { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0},
- { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0},
- { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0},
- { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0},
- { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0},
- { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0},
- { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0},
- { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0},
- // [0, 1)
- { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0},
- { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1},
- {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1},
- {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1},
- {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2},
- {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2},
- {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2},
- {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2},
- {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2},
- {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2},
- {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2},
- {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2},
- {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2},
- {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1},
- {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1},
- { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0},
- // [1, 2)
- { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, -1, 0, -3, 4, 0},
- { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1},
- { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2},
- { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3},
- { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3},
- { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3},
- { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4},
- { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4},
- { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4},
- { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4},
- { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4},
- { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3},
- { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3},
- { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 0, -7, 120, 2},
- { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1},
- { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1},
- // dummy (replicate row index 95)
- { 0, 0, 4, -3, 0, -1, 127, 1},
-#endif // WARPEDPIXEL_PREC_BITS == 6
-};
-/* clang-format on */
-
-static INLINE void convolve(int32x2x2_t x0, int32x2x2_t x1, uint8x8_t src_0,
- uint8x8_t src_1, int16x4_t *res) {
- int16x8_t coeff_0, coeff_1;
- int16x8_t pix_0, pix_1;
-
- coeff_0 = vcombine_s16(vreinterpret_s16_s32(x0.val[0]),
- vreinterpret_s16_s32(x1.val[0]));
- coeff_1 = vcombine_s16(vreinterpret_s16_s32(x0.val[1]),
- vreinterpret_s16_s32(x1.val[1]));
-
- pix_0 = vreinterpretq_s16_u16(vmovl_u8(src_0));
- pix_0 = vmulq_s16(coeff_0, pix_0);
-
- pix_1 = vreinterpretq_s16_u16(vmovl_u8(src_1));
- pix_0 = vmlaq_s16(pix_0, coeff_1, pix_1);
-
- *res = vpadd_s16(vget_low_s16(pix_0), vget_high_s16(pix_0));
+#include "warp_plane_neon.h"
+
+static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx,
+ int alpha) {
+ const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+
+ // Loading the 8 filter taps
+ int16x8_t f[4];
+ load_filters_4(f, sx, alpha);
+
+ int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in)));
+ int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in)));
+
+ int16x8_t m0 = vmulq_s16(f[0], in16_lo);
+ int16x8_t m1 = vmulq_s16(f[1], vextq_s16(in16_lo, in16_hi, 1));
+ int16x8_t m2 = vmulq_s16(f[2], vextq_s16(in16_lo, in16_hi, 2));
+ int16x8_t m3 = vmulq_s16(f[3], vextq_s16(in16_lo, in16_hi, 3));
+
+ int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2),
+ vpaddlq_s16(m3) };
+
+ int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs);
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+
+ uint16x8_t res =
+ vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0));
+ return vreinterpretq_s16_u16(res);
}
-static INLINE void horizontal_filter_neon(uint8x16_t src_1, uint8x16_t src_2,
- uint8x16_t src_3, uint8x16_t src_4,
- int16x8_t *tmp_dst, int sx, int alpha,
- int k, const int offset_bits_horiz,
- const int reduce_bits_horiz) {
- const uint8x16_t mask = vreinterpretq_u8_u16(vdupq_n_u16(0x00ff));
- const int32x4_t add_const = vdupq_n_s32((int32_t)(1 << offset_bits_horiz));
- const int16x8_t shift = vdupq_n_s16(-(int16_t)reduce_bits_horiz);
-
- int16x8_t f0, f1, f2, f3, f4, f5, f6, f7;
- int32x2x2_t b0, b1;
- uint8x8_t src_1_low, src_2_low, src_3_low, src_4_low, src_5_low, src_6_low;
- int32x4_t tmp_res_low, tmp_res_high;
- uint16x8_t res;
- int16x4_t res_0246_even, res_0246_odd, res_1357_even, res_1357_odd;
-
- uint8x16_t tmp_0 = vandq_u8(src_1, mask);
- uint8x16_t tmp_1 = vandq_u8(src_2, mask);
- uint8x16_t tmp_2 = vandq_u8(src_3, mask);
- uint8x16_t tmp_3 = vandq_u8(src_4, mask);
-
- tmp_2 = vextq_u8(tmp_0, tmp_0, 1);
- tmp_3 = vextq_u8(tmp_1, tmp_1, 1);
-
- src_1 = vaddq_u8(tmp_0, tmp_2);
- src_2 = vaddq_u8(tmp_1, tmp_3);
-
- src_1_low = vget_low_u8(src_1);
- src_2_low = vget_low_u8(src_2);
- src_3_low = vget_low_u8(vextq_u8(src_1, src_1, 4));
- src_4_low = vget_low_u8(vextq_u8(src_2, src_2, 4));
- src_5_low = vget_low_u8(vextq_u8(src_1, src_1, 2));
- src_6_low = vget_low_u8(vextq_u8(src_1, src_1, 6));
+static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx,
+ int alpha) {
+ const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
// Loading the 8 filter taps
- f0 = vmovl_s8(
- vld1_s8(filter_8bit_neon[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]));
- f1 = vmovl_s8(
- vld1_s8(filter_8bit_neon[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]));
- f2 = vmovl_s8(
- vld1_s8(filter_8bit_neon[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]));
- f3 = vmovl_s8(
- vld1_s8(filter_8bit_neon[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]));
- f4 = vmovl_s8(
- vld1_s8(filter_8bit_neon[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]));
- f5 = vmovl_s8(
- vld1_s8(filter_8bit_neon[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]));
- f6 = vmovl_s8(
- vld1_s8(filter_8bit_neon[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]));
- f7 = vmovl_s8(
- vld1_s8(filter_8bit_neon[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]));
-
- b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f0)),
- vreinterpret_s32_s16(vget_low_s16(f2)));
- b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f4)),
- vreinterpret_s32_s16(vget_low_s16(f6)));
- convolve(b0, b1, src_1_low, src_3_low, &res_0246_even);
-
- b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f1)),
- vreinterpret_s32_s16(vget_low_s16(f3)));
- b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f5)),
- vreinterpret_s32_s16(vget_low_s16(f7)));
- convolve(b0, b1, src_2_low, src_4_low, &res_0246_odd);
-
- b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f0)),
- vreinterpret_s32_s16(vget_high_s16(f2)));
- b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f4)),
- vreinterpret_s32_s16(vget_high_s16(f6)));
- convolve(b0, b1, src_2_low, src_4_low, &res_1357_even);
-
- b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f1)),
- vreinterpret_s32_s16(vget_high_s16(f3)));
- b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f5)),
- vreinterpret_s32_s16(vget_high_s16(f7)));
- convolve(b0, b1, src_5_low, src_6_low, &res_1357_odd);
-
- tmp_res_low = vaddl_s16(res_0246_even, res_1357_even);
- tmp_res_high = vaddl_s16(res_0246_odd, res_1357_odd);
+ int16x8_t f[8];
+ load_filters_8(f, sx, alpha);
+
+ int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in)));
+ int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in)));
+
+ int16x8_t m0 = vmulq_s16(f[0], in16_lo);
+ int16x8_t m1 = vmulq_s16(f[1], vextq_s16(in16_lo, in16_hi, 1));
+ int16x8_t m2 = vmulq_s16(f[2], vextq_s16(in16_lo, in16_hi, 2));
+ int16x8_t m3 = vmulq_s16(f[3], vextq_s16(in16_lo, in16_hi, 3));
+ int16x8_t m4 = vmulq_s16(f[4], vextq_s16(in16_lo, in16_hi, 4));
+ int16x8_t m5 = vmulq_s16(f[5], vextq_s16(in16_lo, in16_hi, 5));
+ int16x8_t m6 = vmulq_s16(f[6], vextq_s16(in16_lo, in16_hi, 6));
+ int16x8_t m7 = vmulq_s16(f[7], vextq_s16(in16_lo, in16_hi, 7));
+
+ int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2),
+ vpaddlq_s16(m3) };
+ int32x4_t m4567_pairs[] = { vpaddlq_s16(m4), vpaddlq_s16(m5), vpaddlq_s16(m6),
+ vpaddlq_s16(m7) };
+
+ int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs);
+ int32x4_t tmp_res_high = horizontal_add_4d_s32x4(m4567_pairs);
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+ tmp_res_high = vaddq_s32(tmp_res_high, add_const);
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS),
+ vqrshrun_n_s32(tmp_res_high, ROUND0_BITS));
+ return vreinterpretq_s16_u16(res);
+}
+
+static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) {
+ const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+
+ int16x8_t f_s16 =
+ vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+ int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in)));
+ int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in)));
+
+ int16x8_t m0 = vmulq_s16(f_s16, in16_lo);
+ int16x8_t m1 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 1));
+ int16x8_t m2 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 2));
+ int16x8_t m3 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 3));
+
+ int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2),
+ vpaddlq_s16(m3) };
+
+ int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs);
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+
+ uint16x8_t res =
+ vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0));
+ return vreinterpretq_s16_u16(res);
+}
+
+static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) {
+ const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+
+ int16x8_t f_s16 =
+ vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+ int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in)));
+ int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in)));
+
+ int16x8_t m0 = vmulq_s16(f_s16, in16_lo);
+ int16x8_t m1 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 1));
+ int16x8_t m2 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 2));
+ int16x8_t m3 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 3));
+ int16x8_t m4 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 4));
+ int16x8_t m5 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 5));
+ int16x8_t m6 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 6));
+ int16x8_t m7 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 7));
+
+ int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2),
+ vpaddlq_s16(m3) };
+ int32x4_t m4567_pairs[] = { vpaddlq_s16(m4), vpaddlq_s16(m5), vpaddlq_s16(m6),
+ vpaddlq_s16(m7) };
+
+ int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs);
+ int32x4_t tmp_res_high = horizontal_add_4d_s32x4(m4567_pairs);
tmp_res_low = vaddq_s32(tmp_res_low, add_const);
tmp_res_high = vaddq_s32(tmp_res_high, add_const);
- res = vcombine_u16(vqmovun_s32(tmp_res_low), vqmovun_s32(tmp_res_high));
- res = vqrshlq_u16(res, shift);
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS),
+ vqrshrun_n_s32(tmp_res_high, ROUND0_BITS));
+ return vreinterpretq_s16_u16(res);
+}
+
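+// Vertical fast path for gamma == 0: one 8-tap column filter is applied to
+// the low halves of eight consecutive filtered rows. vmull_lane_s16 /
+// vmlal_lane_s16 broadcast a single tap per step, so no transpose is needed.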
+static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res,
+ int sy) {
+ int16x4_t s0 = vget_low_s16(src[0]);
+ int16x4_t s1 = vget_low_s16(src[1]);
+ int16x4_t s2 = vget_low_s16(src[2]);
+ int16x4_t s3 = vget_low_s16(src[3]);
+ int16x4_t s4 = vget_low_s16(src[4]);
+ int16x4_t s5 = vget_low_s16(src[5]);
+ int16x4_t s6 = vget_low_s16(src[6]);
+ int16x4_t s7 = vget_low_s16(src[7]);
+
+ int16x8_t f =
+ vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+
+ int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0);
+ m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1);
+ m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2);
+ m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3);
+ m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0);
+ m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1);
+ m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2);
+ m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3);
+
+ *res = m0123;
+}
+
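+// General vertical path for 4-wide blocks: each output column gets its own
+// filter (stepped by gamma), so the eight source rows are transposed into
+// per-column vectors, multiplied against their filters, and the four
+// accumulators are reduced with a 4-way horizontal add.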
+static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res,
+ int sy, int gamma) {
+ int16x8_t s0, s1, s2, s3;
+ transpose_elems_s16_4x8(
+ vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]),
+ vget_low_s16(src[3]), vget_low_s16(src[4]), vget_low_s16(src[5]),
+ vget_low_s16(src[6]), vget_low_s16(src[7]), &s0, &s1, &s2, &s3);
+
+ int16x8_t f[4];
+ load_filters_4(f, sy, gamma);
+
+ int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0]));
+ m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0]));
+ int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1]));
+ m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1]));
+ int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2]));
+ m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2]));
+ int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3]));
+ m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3]));
+
+ int32x4_t m0123_pairs[] = { m0, m1, m2, m3 };
+
+ *res = horizontal_add_4d_s32x4(m0123_pairs);
+}
- tmp_dst[k + 7] = vreinterpretq_s16_u16(res);
+static INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
+ int32x4_t *res_low,
+ int32x4_t *res_high, int sy) {
+ int16x8_t s0 = src[0];
+ int16x8_t s1 = src[1];
+ int16x8_t s2 = src[2];
+ int16x8_t s3 = src[3];
+ int16x8_t s4 = src[4];
+ int16x8_t s5 = src[5];
+ int16x8_t s6 = src[6];
+ int16x8_t s7 = src[7];
+
+ int16x8_t f =
+ vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+
+ int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3);
+
+ int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3);
+
+ *res_low = m0123;
+ *res_high = m4567;
}
-static INLINE void vertical_filter_neon(const int16x8_t *src,
- int32x4_t *res_low, int32x4_t *res_high,
- int sy, int gamma) {
- int16x4_t src_0, src_1, fltr_0, fltr_1;
- int32x4_t res_0, res_1;
- int32x2_t res_0_im, res_1_im;
- int32x4_t res_even, res_odd, im_res_0, im_res_1;
-
- int16x8_t f0, f1, f2, f3, f4, f5, f6, f7;
- int16x8x2_t b0, b1, b2, b3;
- int32x4x2_t c0, c1, c2, c3;
- int32x4x2_t d0, d1, d2, d3;
-
- b0 = vtrnq_s16(src[0], src[1]);
- b1 = vtrnq_s16(src[2], src[3]);
- b2 = vtrnq_s16(src[4], src[5]);
- b3 = vtrnq_s16(src[6], src[7]);
-
- c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
- vreinterpretq_s32_s16(b0.val[1]));
- c1 = vtrnq_s32(vreinterpretq_s32_s16(b1.val[0]),
- vreinterpretq_s32_s16(b1.val[1]));
- c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
- vreinterpretq_s32_s16(b2.val[1]));
- c3 = vtrnq_s32(vreinterpretq_s32_s16(b3.val[0]),
- vreinterpretq_s32_s16(b3.val[1]));
-
- f0 = vld1q_s16((int16_t *)(av1_warped_filter +
- ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
- f1 = vld1q_s16((int16_t *)(av1_warped_filter +
- ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
- f2 = vld1q_s16((int16_t *)(av1_warped_filter +
- ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
- f3 = vld1q_s16((int16_t *)(av1_warped_filter +
- ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
- f4 = vld1q_s16((int16_t *)(av1_warped_filter +
- ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
- f5 = vld1q_s16((int16_t *)(av1_warped_filter +
- ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
- f6 = vld1q_s16((int16_t *)(av1_warped_filter +
- ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
- f7 = vld1q_s16((int16_t *)(av1_warped_filter +
- ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
- d0 = vtrnq_s32(vreinterpretq_s32_s16(f0), vreinterpretq_s32_s16(f2));
- d1 = vtrnq_s32(vreinterpretq_s32_s16(f4), vreinterpretq_s32_s16(f6));
- d2 = vtrnq_s32(vreinterpretq_s32_s16(f1), vreinterpretq_s32_s16(f3));
- d3 = vtrnq_s32(vreinterpretq_s32_s16(f5), vreinterpretq_s32_s16(f7));
-
- // row:0,1 even_col:0,2
- src_0 = vget_low_s16(vreinterpretq_s16_s32(c0.val[0]));
- fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[0]));
- res_0 = vmull_s16(src_0, fltr_0);
-
- // row:0,1,2,3 even_col:0,2
- src_0 = vget_low_s16(vreinterpretq_s16_s32(c1.val[0]));
- fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[1]));
- res_0 = vmlal_s16(res_0, src_0, fltr_0);
- res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
-
- // row:0,1 even_col:4,6
- src_1 = vget_low_s16(vreinterpretq_s16_s32(c0.val[1]));
- fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[0]));
- res_1 = vmull_s16(src_1, fltr_1);
-
- // row:0,1,2,3 even_col:4,6
- src_1 = vget_low_s16(vreinterpretq_s16_s32(c1.val[1]));
- fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[1]));
- res_1 = vmlal_s16(res_1, src_1, fltr_1);
- res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
-
- // row:0,1,2,3 even_col:0,2,4,6
- im_res_0 = vcombine_s32(res_0_im, res_1_im);
-
- // row:4,5 even_col:0,2
- src_0 = vget_low_s16(vreinterpretq_s16_s32(c2.val[0]));
- fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[0]));
- res_0 = vmull_s16(src_0, fltr_0);
-
- // row:4,5,6,7 even_col:0,2
- src_0 = vget_low_s16(vreinterpretq_s16_s32(c3.val[0]));
- fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[1]));
- res_0 = vmlal_s16(res_0, src_0, fltr_0);
- res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
-
- // row:4,5 even_col:4,6
- src_1 = vget_low_s16(vreinterpretq_s16_s32(c2.val[1]));
- fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[0]));
- res_1 = vmull_s16(src_1, fltr_1);
-
- // row:4,5,6,7 even_col:4,6
- src_1 = vget_low_s16(vreinterpretq_s16_s32(c3.val[1]));
- fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[1]));
- res_1 = vmlal_s16(res_1, src_1, fltr_1);
- res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
-
- // row:4,5,6,7 even_col:0,2,4,6
- im_res_1 = vcombine_s32(res_0_im, res_1_im);
-
- // row:0-7 even_col:0,2,4,6
- res_even = vaddq_s32(im_res_0, im_res_1);
-
- // row:0,1 odd_col:1,3
- src_0 = vget_high_s16(vreinterpretq_s16_s32(c0.val[0]));
- fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[0]));
- res_0 = vmull_s16(src_0, fltr_0);
-
- // row:0,1,2,3 odd_col:1,3
- src_0 = vget_high_s16(vreinterpretq_s16_s32(c1.val[0]));
- fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[1]));
- res_0 = vmlal_s16(res_0, src_0, fltr_0);
- res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
-
- // row:0,1 odd_col:5,7
- src_1 = vget_high_s16(vreinterpretq_s16_s32(c0.val[1]));
- fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[0]));
- res_1 = vmull_s16(src_1, fltr_1);
-
- // row:0,1,2,3 odd_col:5,7
- src_1 = vget_high_s16(vreinterpretq_s16_s32(c1.val[1]));
- fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[1]));
- res_1 = vmlal_s16(res_1, src_1, fltr_1);
- res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
-
- // row:0,1,2,3 odd_col:1,3,5,7
- im_res_0 = vcombine_s32(res_0_im, res_1_im);
-
- // row:4,5 odd_col:1,3
- src_0 = vget_high_s16(vreinterpretq_s16_s32(c2.val[0]));
- fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[0]));
- res_0 = vmull_s16(src_0, fltr_0);
-
- // row:4,5,6,7 odd_col:1,3
- src_0 = vget_high_s16(vreinterpretq_s16_s32(c3.val[0]));
- fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[1]));
- res_0 = vmlal_s16(res_0, src_0, fltr_0);
- res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
-
- // row:4,5 odd_col:5,7
- src_1 = vget_high_s16(vreinterpretq_s16_s32(c2.val[1]));
- fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[0]));
- res_1 = vmull_s16(src_1, fltr_1);
-
- // row:4,5,6,7 odd_col:5,7
- src_1 = vget_high_s16(vreinterpretq_s16_s32(c3.val[1]));
- fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[1]));
- res_1 = vmlal_s16(res_1, src_1, fltr_1);
- res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
-
- // row:4,5,6,7 odd_col:1,3,5,7
- im_res_1 = vcombine_s32(res_0_im, res_1_im);
-
- // row:0-7 odd_col:1,3,5,7
- res_odd = vaddq_s32(im_res_0, im_res_1);
-
- // reordering as 0 1 2 3 | 4 5 6 7
- c0 = vtrnq_s32(res_even, res_odd);
-
- // Final store
- *res_low = vcombine_s32(vget_low_s32(c0.val[0]), vget_low_s32(c0.val[1]));
- *res_high = vcombine_s32(vget_high_s32(c0.val[0]), vget_high_s32(c0.val[1]));
+static INLINE void vertical_filter_8x1_f8(const int16x8_t *src,
+ int32x4_t *res_low,
+ int32x4_t *res_high, int sy,
+ int gamma) {
+ int16x8_t s0 = src[0];
+ int16x8_t s1 = src[1];
+ int16x8_t s2 = src[2];
+ int16x8_t s3 = src[3];
+ int16x8_t s4 = src[4];
+ int16x8_t s5 = src[5];
+ int16x8_t s6 = src[6];
+ int16x8_t s7 = src[7];
+ transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ int16x8_t f[8];
+ load_filters_8(f, sy, gamma);
+
+ int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0]));
+ m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0]));
+ int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1]));
+ m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1]));
+ int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2]));
+ m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2]));
+ int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3]));
+ m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3]));
+ int32x4_t m4 = vmull_s16(vget_low_s16(s4), vget_low_s16(f[4]));
+ m4 = vmlal_s16(m4, vget_high_s16(s4), vget_high_s16(f[4]));
+ int32x4_t m5 = vmull_s16(vget_low_s16(s5), vget_low_s16(f[5]));
+ m5 = vmlal_s16(m5, vget_high_s16(s5), vget_high_s16(f[5]));
+ int32x4_t m6 = vmull_s16(vget_low_s16(s6), vget_low_s16(f[6]));
+ m6 = vmlal_s16(m6, vget_high_s16(s6), vget_high_s16(f[6]));
+ int32x4_t m7 = vmull_s16(vget_low_s16(s7), vget_low_s16(f[7]));
+ m7 = vmlal_s16(m7, vget_high_s16(s7), vget_high_s16(f[7]));
+
+ int32x4_t m0123_pairs[] = { m0, m1, m2, m3 };
+ int32x4_t m4567_pairs[] = { m4, m5, m6, m7 };
+
+ *res_low = horizontal_add_4d_s32x4(m0123_pairs);
+ *res_high = horizontal_add_4d_s32x4(m4567_pairs);
}
void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width,
@@ -474,242 +270,7 @@ void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width,
int subsampling_x, int subsampling_y,
ConvolveParams *conv_params, int16_t alpha,
int16_t beta, int16_t gamma, int16_t delta) {
- int16x8_t tmp[15];
- const int bd = 8;
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const int32x4_t fwd = vdupq_n_s32((int32_t)w0);
- const int32x4_t bwd = vdupq_n_s32((int32_t)w1);
- const int16x8_t sub_constant = vdupq_n_s16((1 << (bd - 1)) + (1 << bd));
-
- int limit = 0;
- uint8x16_t vec_dup, mask_val;
- int32x4_t res_lo, res_hi;
- int16x8_t result_final;
- uint8x16_t src_1, src_2, src_3, src_4;
- static const uint8_t k0To15[16] = { 0, 1, 2, 3, 4, 5, 6, 7,
- 8, 9, 10, 11, 12, 13, 14, 15 };
- uint8x16_t indx_vec = vld1q_u8(k0To15);
- uint8x16_t cmp_vec;
-
- const int reduce_bits_horiz = conv_params->round_0;
- const int reduce_bits_vert = conv_params->is_compound
- ? conv_params->round_1
- : 2 * FILTER_BITS - reduce_bits_horiz;
- const int32x4_t shift_vert = vdupq_n_s32(-(int32_t)reduce_bits_vert);
- const int offset_bits_horiz = bd + FILTER_BITS - 1;
-
- assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
-
- const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
- int32x4_t add_const_vert = vdupq_n_s32((int32_t)(1 << offset_bits_vert));
- const int round_bits =
- 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
- const int16x4_t round_bits_vec = vdup_n_s16(-(int16_t)round_bits);
- const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const int16x4_t res_sub_const =
- vdup_n_s16(-((1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1))));
- int k;
-
- assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
-
- for (int i = 0; i < p_height; i += 8) {
- for (int j = 0; j < p_width; j += 8) {
- const int32_t src_x = (p_col + j + 4) << subsampling_x;
- const int32_t src_y = (p_row + i + 4) << subsampling_y;
- const int64_t dst_x =
- (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
- const int64_t dst_y =
- (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
- const int64_t x4 = dst_x >> subsampling_x;
- const int64_t y4 = dst_y >> subsampling_y;
-
- int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
- int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
- int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
- int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-
- sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
- (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
- sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
- (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
-
- sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
- sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
- // horizontal
- if (ix4 <= -7) {
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
- int16_t dup_val =
- (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
- ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz));
-
- tmp[k + 7] = vdupq_n_s16(dup_val);
- }
- } else if (ix4 >= width + 6) {
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
- int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
- ref[iy * stride + (width - 1)] *
- (1 << (FILTER_BITS - reduce_bits_horiz));
- tmp[k + 7] = vdupq_n_s16(dup_val);
- }
- } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
- const int out_of_boundary_left = -(ix4 - 6);
- const int out_of_boundary_right = (ix4 + 8) - width;
-
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
- int sx = sx4 + beta * (k + 4);
-
- const uint8_t *src = ref + iy * stride + ix4 - 7;
- src_1 = vld1q_u8(src);
-
- if (out_of_boundary_left >= 0) {
- limit = out_of_boundary_left + 1;
- cmp_vec = vdupq_n_u8(out_of_boundary_left);
- vec_dup = vdupq_n_u8(*(src + limit));
- mask_val = vcleq_u8(indx_vec, cmp_vec);
- src_1 = vbslq_u8(mask_val, vec_dup, src_1);
- }
- if (out_of_boundary_right >= 0) {
- limit = 15 - (out_of_boundary_right + 1);
- cmp_vec = vdupq_n_u8(15 - out_of_boundary_right);
- vec_dup = vdupq_n_u8(*(src + limit));
- mask_val = vcgeq_u8(indx_vec, cmp_vec);
- src_1 = vbslq_u8(mask_val, vec_dup, src_1);
- }
- src_2 = vextq_u8(src_1, src_1, 1);
- src_3 = vextq_u8(src_2, src_2, 1);
- src_4 = vextq_u8(src_3, src_3, 1);
-
- horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k,
- offset_bits_horiz, reduce_bits_horiz);
- }
- } else {
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
- int sx = sx4 + beta * (k + 4);
-
- const uint8_t *src = ref + iy * stride + ix4 - 7;
- src_1 = vld1q_u8(src);
- src_2 = vextq_u8(src_1, src_1, 1);
- src_3 = vextq_u8(src_2, src_2, 1);
- src_4 = vextq_u8(src_3, src_3, 1);
-
- horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k,
- offset_bits_horiz, reduce_bits_horiz);
- }
- }
-
- // vertical
- for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
- int sy = sy4 + delta * (k + 4);
-
- const int16x8_t *v_src = tmp + (k + 4);
-
- vertical_filter_neon(v_src, &res_lo, &res_hi, sy, gamma);
-
- res_lo = vaddq_s32(res_lo, add_const_vert);
- res_hi = vaddq_s32(res_hi, add_const_vert);
-
- if (conv_params->is_compound) {
- uint16_t *const p =
- (uint16_t *)&conv_params
- ->dst[(i + k + 4) * conv_params->dst_stride + j];
-
- res_lo = vrshlq_s32(res_lo, shift_vert);
- if (conv_params->do_average) {
- uint8_t *const dst8 = &pred[(i + k + 4) * p_stride + j];
- uint16x4_t tmp16_lo = vld1_u16(p);
- int32x4_t tmp32_lo = vreinterpretq_s32_u32(vmovl_u16(tmp16_lo));
- int16x4_t tmp16_low;
- if (conv_params->use_dist_wtd_comp_avg) {
- res_lo = vmulq_s32(res_lo, bwd);
- tmp32_lo = vmulq_s32(tmp32_lo, fwd);
- tmp32_lo = vaddq_s32(tmp32_lo, res_lo);
- tmp16_low = vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS);
- } else {
- tmp32_lo = vaddq_s32(tmp32_lo, res_lo);
- tmp16_low = vshrn_n_s32(tmp32_lo, 1);
- }
- int16x4_t res_low = vadd_s16(tmp16_low, res_sub_const);
- res_low = vqrshl_s16(res_low, round_bits_vec);
- int16x8_t final_res_low = vcombine_s16(res_low, res_low);
- uint8x8_t res_8_low = vqmovun_s16(final_res_low);
-
- vst1_lane_u32((uint32_t *)dst8, vreinterpret_u32_u8(res_8_low), 0);
- } else {
- uint16x4_t res_u16_low = vqmovun_s32(res_lo);
- vst1_u16(p, res_u16_low);
- }
- if (p_width > 4) {
- uint16_t *const p4 =
- (uint16_t *)&conv_params
- ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
-
- res_hi = vrshlq_s32(res_hi, shift_vert);
- if (conv_params->do_average) {
- uint8_t *const dst8_4 = &pred[(i + k + 4) * p_stride + j + 4];
-
- uint16x4_t tmp16_hi = vld1_u16(p4);
- int32x4_t tmp32_hi = vreinterpretq_s32_u32(vmovl_u16(tmp16_hi));
- int16x4_t tmp16_high;
- if (conv_params->use_dist_wtd_comp_avg) {
- res_hi = vmulq_s32(res_hi, bwd);
- tmp32_hi = vmulq_s32(tmp32_hi, fwd);
- tmp32_hi = vaddq_s32(tmp32_hi, res_hi);
- tmp16_high = vshrn_n_s32(tmp32_hi, DIST_PRECISION_BITS);
- } else {
- tmp32_hi = vaddq_s32(tmp32_hi, res_hi);
- tmp16_high = vshrn_n_s32(tmp32_hi, 1);
- }
- int16x4_t res_high = vadd_s16(tmp16_high, res_sub_const);
- res_high = vqrshl_s16(res_high, round_bits_vec);
- int16x8_t final_res_high = vcombine_s16(res_high, res_high);
- uint8x8_t res_8_high = vqmovun_s16(final_res_high);
-
- vst1_lane_u32((uint32_t *)dst8_4, vreinterpret_u32_u8(res_8_high),
- 0);
- } else {
- uint16x4_t res_u16_high = vqmovun_s32(res_hi);
- vst1_u16(p4, res_u16_high);
- }
- }
- } else {
- res_lo = vrshlq_s32(res_lo, shift_vert);
- res_hi = vrshlq_s32(res_hi, shift_vert);
-
- result_final = vcombine_s16(vmovn_s32(res_lo), vmovn_s32(res_hi));
- result_final = vsubq_s16(result_final, sub_constant);
-
- uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j];
- uint8x8_t val = vqmovun_s16(result_final);
-
- if (p_width == 4) {
- vst1_lane_u32((uint32_t *)p, vreinterpret_u32_u8(val), 0);
- } else {
- vst1_u8(p, val);
- }
- }
- }
- }
- }
+ av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row,
+ p_width, p_height, p_stride, subsampling_x,
+ subsampling_y, conv_params, alpha, beta, gamma, delta);
}
diff --git a/av1/common/arm/warp_plane_neon.h b/av1/common/arm/warp_plane_neon.h
new file mode 100644
index 000000000..de5e3bde2
--- /dev/null
+++ b/av1/common/arm/warp_plane_neon.h
@@ -0,0 +1,378 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_ARM_WARP_PLANE_NEON_H_
+#define AOM_AV1_COMMON_ARM_WARP_PLANE_NEON_H_
+
+#include <assert.h>
+#include <arm_neon.h>
+#include <memory.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+#include "config/av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+#include "av1/common/scale.h"
+
+static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx,
+ int alpha);
+
+static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx,
+ int alpha);
+
+static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx);
+
+static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx);
+
+static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res,
+ int sy);
+
+static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res,
+ int sy, int gamma);
+
+static INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
+ int32x4_t *res_low,
+ int32x4_t *res_high, int sy);
+
+static INLINE void vertical_filter_8x1_f8(const int16x8_t *src,
+ int32x4_t *res_low,
+ int32x4_t *res_high, int sy,
+ int gamma);
+
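+// Load one 8-tap filter per output pixel from av1_warped_filter. The
+// sub-pixel position advances by `stride` (alpha or gamma) per pixel and is
+// turned into a table index by dropping its WARPEDDIFF_PREC_BITS fractional
+// bits.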
+static INLINE void load_filters_4(int16x8_t out[], int offset, int stride) {
+ out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * stride) >>
+ WARPEDDIFF_PREC_BITS)));
+ out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >>
+ WARPEDDIFF_PREC_BITS)));
+ out[2] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 2 * stride) >>
+ WARPEDDIFF_PREC_BITS)));
+ out[3] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 3 * stride) >>
+ WARPEDDIFF_PREC_BITS)));
+}
+
+static INLINE void load_filters_8(int16x8_t out[], int offset, int stride) {
+ out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * stride) >>
+ WARPEDDIFF_PREC_BITS)));
+ out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >>
+ WARPEDDIFF_PREC_BITS)));
+ out[2] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 2 * stride) >>
+ WARPEDDIFF_PREC_BITS)));
+ out[3] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 3 * stride) >>
+ WARPEDDIFF_PREC_BITS)));
+ out[4] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 4 * stride) >>
+ WARPEDDIFF_PREC_BITS)));
+ out[5] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 5 * stride) >>
+ WARPEDDIFF_PREC_BITS)));
+ out[6] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 6 * stride) >>
+ WARPEDDIFF_PREC_BITS)));
+ out[7] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 7 * stride) >>
+ WARPEDDIFF_PREC_BITS)));
+}
+
+static INLINE int clamp_iy(int iy, int height) {
+ return clamp(iy, 0, height - 1);
+}
+
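+// Horizontal pass for one 8x8 output tile: filters up to 15 input rows (8
+// output rows plus 7 rows of vertical-filter support) into tmp[]. When the
+// whole 16-pixel window lies left or right of the frame the row collapses to
+// a constant; partially out-of-bounds windows replicate the edge pixel with
+// a vbslq_u8 select mask built from indx_vec. The alpha == 0 and beta == 0
+// cases dispatch to the cheaper single-filter (f1) kernels.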
+static INLINE void warp_affine_horizontal(
+ const uint8_t *ref, int width, int height, int stride, int p_width,
+ int p_height, int16_t alpha, int16_t beta, const int64_t x4,
+ const int64_t y4, const int i, int16x8_t tmp[], const uint8x16_t indx_vec) {
+ const int bd = 8;
+ const int reduce_bits_horiz = ROUND0_BITS;
+ const int height_limit = AOMMIN(8, p_height - i) + 7;
+
+ int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
+ int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
+
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+ if (ix4 <= -7) {
+ for (int k = 0; k < height_limit; ++k) {
+ int iy = clamp_iy(iy4 + k - 7, height);
+ int16_t dup_val =
+ (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz));
+ tmp[k] = vdupq_n_s16(dup_val);
+ }
+ return;
+ } else if (ix4 >= width + 6) {
+ for (int k = 0; k < height_limit; ++k) {
+ int iy = clamp_iy(iy4 + k - 7, height);
+ int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride + (width - 1)] *
+ (1 << (FILTER_BITS - reduce_bits_horiz));
+ tmp[k] = vdupq_n_s16(dup_val);
+ }
+ return;
+ }
+
+ uint8x16_t in[15];
+ if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+ const int out_of_boundary_left = -(ix4 - 6);
+ const int out_of_boundary_right = (ix4 + 8) - width;
+
+ for (int k = 0; k < height_limit; ++k) {
+ const int iy = clamp_iy(iy4 + k - 7, height);
+ const uint8_t *src = ref + iy * stride + ix4 - 7;
+ uint8x16_t src_1 = vld1q_u8(src);
+
+ if (out_of_boundary_left >= 0) {
+ int limit = out_of_boundary_left + 1;
+ uint8x16_t cmp_vec = vdupq_n_u8(out_of_boundary_left);
+ uint8x16_t vec_dup = vdupq_n_u8(*(src + limit));
+ uint8x16_t mask_val = vcleq_u8(indx_vec, cmp_vec);
+ src_1 = vbslq_u8(mask_val, vec_dup, src_1);
+ }
+ if (out_of_boundary_right >= 0) {
+ int limit = 15 - (out_of_boundary_right + 1);
+ uint8x16_t cmp_vec = vdupq_n_u8(15 - out_of_boundary_right);
+ uint8x16_t vec_dup = vdupq_n_u8(*(src + limit));
+ uint8x16_t mask_val = vcgeq_u8(indx_vec, cmp_vec);
+ src_1 = vbslq_u8(mask_val, vec_dup, src_1);
+ }
+ in[k] = src_1;
+ }
+ } else {
+ for (int k = 0; k < height_limit; ++k) {
+ const int iy = clamp_iy(iy4 + k - 7, height);
+ const uint8_t *src = ref + iy * stride + ix4 - 7;
+ in[k] = vld1q_u8(src);
+ }
+ }
+
+ if (p_width == 4) {
+ if (beta == 0) {
+ if (alpha == 0) {
+ for (int k = 0; k < height_limit; ++k) {
+ tmp[k] = horizontal_filter_4x1_f1(in[k], sx4);
+ }
+ } else {
+ for (int k = 0; k < height_limit; ++k) {
+ tmp[k] = horizontal_filter_4x1_f4(in[k], sx4, alpha);
+ }
+ }
+ } else {
+ if (alpha == 0) {
+ for (int k = 0; k < height_limit; ++k) {
+ const int sx = sx4 + beta * (k - 3);
+ tmp[k] = horizontal_filter_4x1_f1(in[k], sx);
+ }
+ } else {
+ for (int k = 0; k < height_limit; ++k) {
+ const int sx = sx4 + beta * (k - 3);
+ tmp[k] = horizontal_filter_4x1_f4(in[k], sx, alpha);
+ }
+ }
+ }
+ } else {
+ if (beta == 0) {
+ if (alpha == 0) {
+ for (int k = 0; k < height_limit; ++k) {
+ tmp[k] = horizontal_filter_8x1_f1(in[k], sx4);
+ }
+ } else {
+ for (int k = 0; k < height_limit; ++k) {
+ tmp[k] = horizontal_filter_8x1_f8(in[k], sx4, alpha);
+ }
+ }
+ } else {
+ if (alpha == 0) {
+ for (int k = 0; k < height_limit; ++k) {
+ const int sx = sx4 + beta * (k - 3);
+ tmp[k] = horizontal_filter_8x1_f1(in[k], sx);
+ }
+ } else {
+ for (int k = 0; k < height_limit; ++k) {
+ const int sx = sx4 + beta * (k - 3);
+ tmp[k] = horizontal_filter_8x1_f8(in[k], sx, alpha);
+ }
+ }
+ }
+ }
+}
+
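+// Vertical pass for one 8x8 tile. add_const_vert supplies the second-stage
+// offset and rounding bias; compound blocks shift by COMPOUND_ROUND1_BITS
+// and may average against the intermediate dst buffer (with distance
+// weights fwd/bwd), while non-compound blocks shift by
+// 2 * FILTER_BITS - ROUND0_BITS and subtract sub_constant to return to the
+// 8-bit pixel range.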
+static INLINE void warp_affine_vertical(
+ uint8_t *pred, int p_width, int p_height, int p_stride, int is_compound,
+ uint16_t *dst, int dst_stride, int do_average, int use_dist_wtd_comp_avg,
+ int16_t gamma, int16_t delta, const int64_t y4, const int i, const int j,
+ int16x8_t tmp[], const int fwd, const int bwd) {
+ const int bd = 8;
+ const int reduce_bits_horiz = ROUND0_BITS;
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ int add_const_vert;
+ if (is_compound) {
+ add_const_vert =
+ (1 << offset_bits_vert) + (1 << (COMPOUND_ROUND1_BITS - 1));
+ } else {
+ add_const_vert =
+ (1 << offset_bits_vert) + (1 << (2 * FILTER_BITS - ROUND0_BITS - 1));
+ }
+ const int sub_constant = (1 << (bd - 1)) + (1 << bd);
+
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int res_sub_const =
+ (1 << (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS - 1)) -
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS)) -
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+ if (p_width > 4) {
+ for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+ const int16x8_t *v_src = tmp + (k + 4);
+
+ int32x4_t res_lo, res_hi;
+ if (gamma == 0) {
+ vertical_filter_8x1_f1(v_src, &res_lo, &res_hi, sy);
+ } else {
+ vertical_filter_8x1_f8(v_src, &res_lo, &res_hi, sy, gamma);
+ }
+
+ res_lo = vaddq_s32(res_lo, vdupq_n_s32(add_const_vert));
+ res_hi = vaddq_s32(res_hi, vdupq_n_s32(add_const_vert));
+
+ if (is_compound) {
+ uint16_t *const p = (uint16_t *)&dst[(i + k + 4) * dst_stride + j];
+ int16x8_t res_s16 =
+ vcombine_s16(vshrn_n_s32(res_lo, COMPOUND_ROUND1_BITS),
+ vshrn_n_s32(res_hi, COMPOUND_ROUND1_BITS));
+ if (do_average) {
+ int16x8_t tmp16 = vreinterpretq_s16_u16(vld1q_u16(p));
+ if (use_dist_wtd_comp_avg) {
+ int32x4_t tmp32_lo = vmull_n_s16(vget_low_s16(tmp16), fwd);
+ int32x4_t tmp32_hi = vmull_n_s16(vget_high_s16(tmp16), fwd);
+ tmp32_lo = vmlal_n_s16(tmp32_lo, vget_low_s16(res_s16), bwd);
+ tmp32_hi = vmlal_n_s16(tmp32_hi, vget_high_s16(res_s16), bwd);
+ tmp16 = vcombine_s16(vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS),
+ vshrn_n_s32(tmp32_hi, DIST_PRECISION_BITS));
+ } else {
+ tmp16 = vhaddq_s16(tmp16, res_s16);
+ }
+ int16x8_t res = vaddq_s16(tmp16, vdupq_n_s16(res_sub_const));
+ uint8x8_t res8 = vqshrun_n_s16(
+ res, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);
+ vst1_u8(&pred[(i + k + 4) * p_stride + j], res8);
+ } else {
+ vst1q_u16(p, vreinterpretq_u16_s16(res_s16));
+ }
+ } else {
+ int16x8_t res16 =
+ vcombine_s16(vshrn_n_s32(res_lo, 2 * FILTER_BITS - ROUND0_BITS),
+ vshrn_n_s32(res_hi, 2 * FILTER_BITS - ROUND0_BITS));
+ res16 = vsubq_s16(res16, vdupq_n_s16(sub_constant));
+
+ uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j];
+ vst1_u8(p, vqmovun_s16(res16));
+ }
+ }
+ } else {
+ // p_width == 4
+ for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+ const int16x8_t *v_src = tmp + (k + 4);
+
+ int32x4_t res_lo;
+ if (gamma == 0) {
+ vertical_filter_4x1_f1(v_src, &res_lo, sy);
+ } else {
+ vertical_filter_4x1_f4(v_src, &res_lo, sy, gamma);
+ }
+
+ res_lo = vaddq_s32(res_lo, vdupq_n_s32(add_const_vert));
+
+ if (is_compound) {
+ uint16_t *const p = (uint16_t *)&dst[(i + k + 4) * dst_stride + j];
+
+ int16x4_t res_lo_s16 = vshrn_n_s32(res_lo, COMPOUND_ROUND1_BITS);
+ if (do_average) {
+ uint8_t *const dst8 = &pred[(i + k + 4) * p_stride + j];
+ int16x4_t tmp16_lo = vreinterpret_s16_u16(vld1_u16(p));
+ if (use_dist_wtd_comp_avg) {
+ int32x4_t tmp32_lo = vmull_n_s16(tmp16_lo, fwd);
+ tmp32_lo = vmlal_n_s16(tmp32_lo, res_lo_s16, bwd);
+ tmp16_lo = vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS);
+ } else {
+ tmp16_lo = vhadd_s16(tmp16_lo, res_lo_s16);
+ }
+ int16x4_t res = vadd_s16(tmp16_lo, vdup_n_s16(res_sub_const));
+ uint8x8_t res8 = vqshrun_n_s16(
+ vcombine_s16(res, vdup_n_s16(0)),
+ 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);
+ vst1_lane_u32((uint32_t *)dst8, vreinterpret_u32_u8(res8), 0);
+ } else {
+ uint16x4_t res_u16_low = vreinterpret_u16_s16(res_lo_s16);
+ vst1_u16(p, res_u16_low);
+ }
+ } else {
+ int16x4_t res16 = vshrn_n_s32(res_lo, 2 * FILTER_BITS - ROUND0_BITS);
+ res16 = vsub_s16(res16, vdup_n_s16(sub_constant));
+
+ uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j];
+ uint8x8_t val = vqmovun_s16(vcombine_s16(res16, vdup_n_s16(0)));
+ vst1_lane_u32((uint32_t *)p, vreinterpret_u32_u8(val), 0);
+ }
+ }
+ }
+}
+
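+// Shared 8-bit warp driver: for each 8x8 tile the affine model
+//   dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]
+//   dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]
+// is evaluated once at the tile centre (p_col + j + 4, p_row + i + 4) in
+// WARPEDMODEL_PREC_BITS fixed-point precision; the horizontal pass then
+// fills tmp[] and the vertical pass writes the tile from it.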
+static INLINE void av1_warp_affine_common(
+ const int32_t *mat, const uint8_t *ref, int width, int height, int stride,
+ uint8_t *pred, int p_col, int p_row, int p_width, int p_height,
+ int p_stride, int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma,
+ int16_t delta) {
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const int is_compound = conv_params->is_compound;
+ uint16_t *const dst = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+
+ static const uint8_t k0To15[16] = { 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15 };
+ const uint8x16_t indx_vec = vld1q_u8(k0To15);
+
+ assert(IMPLIES(is_compound, dst != NULL));
+ assert(IMPLIES(do_average, is_compound));
+
+ for (int i = 0; i < p_height; i += 8) {
+ for (int j = 0; j < p_width; j += 8) {
+ const int32_t src_x = (p_col + j + 4) << subsampling_x;
+ const int32_t src_y = (p_row + i + 4) << subsampling_y;
+ const int64_t dst_x =
+ (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+ const int64_t dst_y =
+ (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+
+ const int64_t x4 = dst_x >> subsampling_x;
+ const int64_t y4 = dst_y >> subsampling_y;
+
+ int16x8_t tmp[15];
+ warp_affine_horizontal(ref, width, height, stride, p_width, p_height,
+ alpha, beta, x4, y4, i, tmp, indx_vec);
+ warp_affine_vertical(pred, p_width, p_height, p_stride, is_compound, dst,
+ dst_stride, do_average, use_dist_wtd_comp_avg, gamma,
+ delta, y4, i, j, tmp, w0, w1);
+ }
+ }
+}
+
+#endif // AOM_AV1_COMMON_ARM_WARP_PLANE_NEON_H_
diff --git a/av1/common/arm/warp_plane_neon_i8mm.c b/av1/common/arm/warp_plane_neon_i8mm.c
new file mode 100644
index 000000000..39e3ad99f
--- /dev/null
+++ b/av1/common/arm/warp_plane_neon_i8mm.c
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "warp_plane_neon.h"
+
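+// Each 16-byte row of this table gathers four overlapping 4-byte windows of
+// the source, so a single vusdotq_laneq_s32 computes four neighbouring
+// 4-tap partial sums at once; the three rows cover source bytes 0-6, 4-10
+// and 8-14 respectively.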
+DECLARE_ALIGNED(16, static const uint8_t, usdot_permute_idx[48]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
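+// General i8mm path: the per-pixel filters are narrowed from s16 to s8 with
+// vmovn_s16 (the warped-filter taps are assumed to fit in int8 here) and
+// consumed pairwise by vusdotq_s32, the unsigned-by-signed 8-bit dot
+// product, replacing the widen-multiply-reduce sequence of the plain Neon
+// kernel.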
+static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx,
+ int alpha) {
+ const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+
+  // Load the four per-pixel filters (8 taps each).
+ int16x8_t f[4];
+ load_filters_4(f, sx, alpha);
+
+ int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1]));
+ int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3]));
+
+ uint8x8_t in0 = vget_low_u8(in);
+ uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1));
+ uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2));
+ uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3));
+
+ int32x4_t m01 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in0, in1), f01_u8);
+ int32x4_t m23 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in2, in3), f23_u8);
+
+ int32x4_t tmp_res_low = vpaddq_s32(m01, m23);
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+
+ uint16x8_t res =
+ vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0));
+ return vreinterpretq_s16_u16(res);
+}
+
+static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx,
+ int alpha) {
+ const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+
+  // Load the eight per-pixel filters (8 taps each).
+ int16x8_t f[8];
+ load_filters_8(f, sx, alpha);
+
+ int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1]));
+ int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3]));
+ int8x16_t f45_u8 = vcombine_s8(vmovn_s16(f[4]), vmovn_s16(f[5]));
+ int8x16_t f67_u8 = vcombine_s8(vmovn_s16(f[6]), vmovn_s16(f[7]));
+
+ uint8x8_t in0 = vget_low_u8(in);
+ uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1));
+ uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2));
+ uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3));
+ uint8x8_t in4 = vget_low_u8(vextq_u8(in, in, 4));
+ uint8x8_t in5 = vget_low_u8(vextq_u8(in, in, 5));
+ uint8x8_t in6 = vget_low_u8(vextq_u8(in, in, 6));
+ uint8x8_t in7 = vget_low_u8(vextq_u8(in, in, 7));
+
+ int32x4_t m01 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in0, in1), f01_u8);
+ int32x4_t m23 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in2, in3), f23_u8);
+ int32x4_t m45 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in4, in5), f45_u8);
+ int32x4_t m67 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in6, in7), f67_u8);
+
+ int32x4_t tmp_res_low = vpaddq_s32(m01, m23);
+ int32x4_t tmp_res_high = vpaddq_s32(m45, m67);
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+ tmp_res_high = vaddq_s32(tmp_res_high, add_const);
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS),
+ vqrshrun_n_s32(tmp_res_high, ROUND0_BITS));
+ return vreinterpretq_s16_u16(res);
+}
+
+static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) {
+ const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+
+ int16x8_t f_s16 =
+ vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+ int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16));
+
+ uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]);
+ uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]);
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ uint8x16_t in_0123 = vqtbl1q_u8(in, perm0);
+ uint8x16_t in_4567 = vqtbl1q_u8(in, perm1);
+
+ int32x4_t m0123 = vusdotq_laneq_s32(vdupq_n_s32(0), in_0123, f_s8, 0);
+ m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1);
+
+ int32x4_t tmp_res_low = m0123;
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+
+ uint16x8_t res =
+ vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0));
+ return vreinterpretq_s16_u16(res);
+}
+
+static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) {
+ const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+
+ int16x8_t f_s16 =
+ vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+ int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16));
+
+ uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]);
+ uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]);
+ uint8x16_t perm2 = vld1q_u8(&usdot_permute_idx[32]);
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ uint8x16_t in_0123 = vqtbl1q_u8(in, perm0);
+ uint8x16_t in_4567 = vqtbl1q_u8(in, perm1);
+ uint8x16_t in_89ab = vqtbl1q_u8(in, perm2);
+
+ int32x4_t m0123 = vusdotq_laneq_s32(vdupq_n_s32(0), in_0123, f_s8, 0);
+ m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1);
+
+ int32x4_t m4567 = vusdotq_laneq_s32(vdupq_n_s32(0), in_4567, f_s8, 0);
+ m4567 = vusdotq_laneq_s32(m4567, in_89ab, f_s8, 1);
+
+ int32x4_t tmp_res_low = m0123;
+ int32x4_t tmp_res_high = m4567;
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+ tmp_res_high = vaddq_s32(tmp_res_high, add_const);
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS),
+ vqrshrun_n_s32(tmp_res_high, ROUND0_BITS));
+ return vreinterpretq_s16_u16(res);
+}
+
+static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res,
+ int sy) {
+ int16x4_t s0 = vget_low_s16(src[0]);
+ int16x4_t s1 = vget_low_s16(src[1]);
+ int16x4_t s2 = vget_low_s16(src[2]);
+ int16x4_t s3 = vget_low_s16(src[3]);
+ int16x4_t s4 = vget_low_s16(src[4]);
+ int16x4_t s5 = vget_low_s16(src[5]);
+ int16x4_t s6 = vget_low_s16(src[6]);
+ int16x4_t s7 = vget_low_s16(src[7]);
+
+ int16x8_t f =
+ vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+
+ int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0);
+ m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1);
+ m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2);
+ m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3);
+ m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0);
+ m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1);
+ m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2);
+ m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3);
+
+ *res = m0123;
+}
+
+static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res,
+ int sy, int gamma) {
+ int16x8_t s0, s1, s2, s3;
+ transpose_elems_s16_4x8(
+ vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]),
+ vget_low_s16(src[3]), vget_low_s16(src[4]), vget_low_s16(src[5]),
+ vget_low_s16(src[6]), vget_low_s16(src[7]), &s0, &s1, &s2, &s3);
+
+ int16x8_t f[4];
+ load_filters_4(f, sy, gamma);
+
+ int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0]));
+ m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0]));
+ int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1]));
+ m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1]));
+ int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2]));
+ m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2]));
+ int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3]));
+ m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3]));
+
+ int32x4_t m0123_pairs[] = { m0, m1, m2, m3 };
+
+ *res = horizontal_add_4d_s32x4(m0123_pairs);
+}
+
+static INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
+ int32x4_t *res_low,
+ int32x4_t *res_high, int sy) {
+ int16x8_t s0 = src[0];
+ int16x8_t s1 = src[1];
+ int16x8_t s2 = src[2];
+ int16x8_t s3 = src[3];
+ int16x8_t s4 = src[4];
+ int16x8_t s5 = src[5];
+ int16x8_t s6 = src[6];
+ int16x8_t s7 = src[7];
+
+ int16x8_t f =
+ vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+
+ int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3);
+
+ int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3);
+
+ *res_low = m0123;
+ *res_high = m4567;
+}
+
+static INLINE void vertical_filter_8x1_f8(const int16x8_t *src,
+ int32x4_t *res_low,
+ int32x4_t *res_high, int sy,
+ int gamma) {
+ int16x8_t s0 = src[0];
+ int16x8_t s1 = src[1];
+ int16x8_t s2 = src[2];
+ int16x8_t s3 = src[3];
+ int16x8_t s4 = src[4];
+ int16x8_t s5 = src[5];
+ int16x8_t s6 = src[6];
+ int16x8_t s7 = src[7];
+ transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ int16x8_t f[8];
+ load_filters_8(f, sy, gamma);
+
+ int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0]));
+ m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0]));
+ int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1]));
+ m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1]));
+ int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2]));
+ m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2]));
+ int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3]));
+ m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3]));
+ int32x4_t m4 = vmull_s16(vget_low_s16(s4), vget_low_s16(f[4]));
+ m4 = vmlal_s16(m4, vget_high_s16(s4), vget_high_s16(f[4]));
+ int32x4_t m5 = vmull_s16(vget_low_s16(s5), vget_low_s16(f[5]));
+ m5 = vmlal_s16(m5, vget_high_s16(s5), vget_high_s16(f[5]));
+ int32x4_t m6 = vmull_s16(vget_low_s16(s6), vget_low_s16(f[6]));
+ m6 = vmlal_s16(m6, vget_high_s16(s6), vget_high_s16(f[6]));
+ int32x4_t m7 = vmull_s16(vget_low_s16(s7), vget_low_s16(f[7]));
+ m7 = vmlal_s16(m7, vget_high_s16(s7), vget_high_s16(f[7]));
+
+ int32x4_t m0123_pairs[] = { m0, m1, m2, m3 };
+ int32x4_t m4567_pairs[] = { m4, m5, m6, m7 };
+
+ *res_low = horizontal_add_4d_s32x4(m0123_pairs);
+ *res_high = horizontal_add_4d_s32x4(m4567_pairs);
+}
+
+void av1_warp_affine_neon_i8mm(const int32_t *mat, const uint8_t *ref,
+ int width, int height, int stride, uint8_t *pred,
+ int p_col, int p_row, int p_width, int p_height,
+ int p_stride, int subsampling_x,
+ int subsampling_y, ConvolveParams *conv_params,
+ int16_t alpha, int16_t beta, int16_t gamma,
+ int16_t delta) {
+ av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row,
+ p_width, p_height, p_stride, subsampling_x,
+ subsampling_y, conv_params, alpha, beta, gamma, delta);
+}
diff --git a/av1/common/arm/warp_plane_sve.c b/av1/common/arm/warp_plane_sve.c
new file mode 100644
index 000000000..2a48c5ead
--- /dev/null
+++ b/av1/common/arm/warp_plane_sve.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "warp_plane_neon.h"
+
+#include <arm_neon_sve_bridge.h>
+
+DECLARE_ALIGNED(16, static const uint8_t, usdot_permute_idx[48]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+static INLINE int64x2_t aom_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) {
+ // The 16-bit dot product instructions only exist in SVE and not Neon.
+ // We can get away without rewriting the existing Neon code by making use of
+ // the Neon-SVE bridge intrinsics to reinterpret a Neon vector as a SVE
+ // vector with the high part of the vector being "don't care", and then
+ // operating on that instead.
+ // This is clearly suboptimal in machines with a SVE vector length above
+ // 128-bits as the remainder of the vector is wasted, however this appears to
+ // still be beneficial compared to not using the instruction.
+ return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc),
+ svset_neonq_s16(svundef_s16(), x),
+ svset_neonq_s16(svundef_s16(), y)));
+}
+
+static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx,
+ int alpha) {
+ const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+
+  // Load the four per-pixel filters (8 taps each).
+ int16x8_t f[4];
+ load_filters_4(f, sx, alpha);
+
+ int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1]));
+ int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3]));
+
+ uint8x8_t in0 = vget_low_u8(in);
+ uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1));
+ uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2));
+ uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3));
+
+ int32x4_t m01 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in0, in1), f01_u8);
+ int32x4_t m23 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in2, in3), f23_u8);
+
+ int32x4_t tmp_res_low = vpaddq_s32(m01, m23);
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+
+ uint16x8_t res =
+ vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0));
+ return vreinterpretq_s16_u16(res);
+}
+
+static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx,
+ int alpha) {
+ const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+
+  // Load the eight per-pixel filters (8 taps each).
+ int16x8_t f[8];
+ load_filters_8(f, sx, alpha);
+
+ int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1]));
+ int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3]));
+ int8x16_t f45_u8 = vcombine_s8(vmovn_s16(f[4]), vmovn_s16(f[5]));
+ int8x16_t f67_u8 = vcombine_s8(vmovn_s16(f[6]), vmovn_s16(f[7]));
+
+ uint8x8_t in0 = vget_low_u8(in);
+ uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1));
+ uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2));
+ uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3));
+ uint8x8_t in4 = vget_low_u8(vextq_u8(in, in, 4));
+ uint8x8_t in5 = vget_low_u8(vextq_u8(in, in, 5));
+ uint8x8_t in6 = vget_low_u8(vextq_u8(in, in, 6));
+ uint8x8_t in7 = vget_low_u8(vextq_u8(in, in, 7));
+
+ int32x4_t m01 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in0, in1), f01_u8);
+ int32x4_t m23 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in2, in3), f23_u8);
+ int32x4_t m45 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in4, in5), f45_u8);
+ int32x4_t m67 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in6, in7), f67_u8);
+
+ int32x4_t tmp_res_low = vpaddq_s32(m01, m23);
+ int32x4_t tmp_res_high = vpaddq_s32(m45, m67);
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+ tmp_res_high = vaddq_s32(tmp_res_high, add_const);
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS),
+ vqrshrun_n_s32(tmp_res_high, ROUND0_BITS));
+ return vreinterpretq_s16_u16(res);
+}
+
+static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) {
+ const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+
+ int16x8_t f_s16 =
+ vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+ int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16));
+
+ uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]);
+ uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]);
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ uint8x16_t in_0123 = vqtbl1q_u8(in, perm0);
+ uint8x16_t in_4567 = vqtbl1q_u8(in, perm1);
+
+ int32x4_t m0123 = vusdotq_laneq_s32(vdupq_n_s32(0), in_0123, f_s8, 0);
+ m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1);
+
+ int32x4_t tmp_res_low = m0123;
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+
+ uint16x8_t res =
+ vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0));
+ return vreinterpretq_s16_u16(res);
+}
+
+static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) {
+ const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+
+ int16x8_t f_s16 =
+ vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+ int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16));
+
+ uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]);
+ uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]);
+ uint8x16_t perm2 = vld1q_u8(&usdot_permute_idx[32]);
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ uint8x16_t in_0123 = vqtbl1q_u8(in, perm0);
+ uint8x16_t in_4567 = vqtbl1q_u8(in, perm1);
+ uint8x16_t in_89ab = vqtbl1q_u8(in, perm2);
+
+ int32x4_t m0123 = vusdotq_laneq_s32(vdupq_n_s32(0), in_0123, f_s8, 0);
+ m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1);
+
+ int32x4_t m4567 = vusdotq_laneq_s32(vdupq_n_s32(0), in_4567, f_s8, 0);
+ m4567 = vusdotq_laneq_s32(m4567, in_89ab, f_s8, 1);
+
+ int32x4_t tmp_res_low = m0123;
+ int32x4_t tmp_res_high = m4567;
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+ tmp_res_high = vaddq_s32(tmp_res_high, add_const);
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS),
+ vqrshrun_n_s32(tmp_res_high, ROUND0_BITS));
+ return vreinterpretq_s16_u16(res);
+}
+
+static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res,
+ int sy) {
+ int16x4_t s0 = vget_low_s16(src[0]);
+ int16x4_t s1 = vget_low_s16(src[1]);
+ int16x4_t s2 = vget_low_s16(src[2]);
+ int16x4_t s3 = vget_low_s16(src[3]);
+ int16x4_t s4 = vget_low_s16(src[4]);
+ int16x4_t s5 = vget_low_s16(src[5]);
+ int16x4_t s6 = vget_low_s16(src[6]);
+ int16x4_t s7 = vget_low_s16(src[7]);
+
+ int16x8_t f =
+ vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+
+ int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0);
+ m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1);
+ m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2);
+ m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3);
+ m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0);
+ m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1);
+ m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2);
+ m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3);
+
+ *res = m0123;
+}
+
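+// SVE variant of the general 4-wide vertical filter: each transposed column
+// is reduced with a single 16-bit sdot (via aom_sdotq_s16) into two s64
+// partial lanes; vpaddq_s64 folds the lanes and vmovn_s64 narrows the
+// per-column sums to s32, matching the 32-bit accumulators of the plain
+// Neon version.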
+static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res,
+ int sy, int gamma) {
+ int16x8_t s0, s1, s2, s3;
+ transpose_elems_s16_4x8(
+ vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]),
+ vget_low_s16(src[3]), vget_low_s16(src[4]), vget_low_s16(src[5]),
+ vget_low_s16(src[6]), vget_low_s16(src[7]), &s0, &s1, &s2, &s3);
+
+ int16x8_t f[4];
+ load_filters_4(f, sy, gamma);
+
+ int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]);
+ int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]);
+ int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]);
+ int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]);
+
+ int64x2_t m01 = vpaddq_s64(m0, m1);
+ int64x2_t m23 = vpaddq_s64(m2, m3);
+
+ *res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23));
+}
+
+static INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
+ int32x4_t *res_low,
+ int32x4_t *res_high, int sy) {
+ int16x8_t s0 = src[0];
+ int16x8_t s1 = src[1];
+ int16x8_t s2 = src[2];
+ int16x8_t s3 = src[3];
+ int16x8_t s4 = src[4];
+ int16x8_t s5 = src[5];
+ int16x8_t s6 = src[6];
+ int16x8_t s7 = src[7];
+
+ int16x8_t f =
+ vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+
+ int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3);
+
+ int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3);
+
+ *res_low = m0123;
+ *res_high = m4567;
+}
+
+static INLINE void vertical_filter_8x1_f8(const int16x8_t *src,
+ int32x4_t *res_low,
+ int32x4_t *res_high, int sy,
+ int gamma) {
+ int16x8_t s0 = src[0];
+ int16x8_t s1 = src[1];
+ int16x8_t s2 = src[2];
+ int16x8_t s3 = src[3];
+ int16x8_t s4 = src[4];
+ int16x8_t s5 = src[5];
+ int16x8_t s6 = src[6];
+ int16x8_t s7 = src[7];
+ transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ int16x8_t f[8];
+ load_filters_8(f, sy, gamma);
+
+ int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]);
+ int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]);
+ int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]);
+ int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]);
+ int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), s4, f[4]);
+ int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), s5, f[5]);
+ int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), s6, f[6]);
+ int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), s7, f[7]);
+
+ int64x2_t m01 = vpaddq_s64(m0, m1);
+ int64x2_t m23 = vpaddq_s64(m2, m3);
+ int64x2_t m45 = vpaddq_s64(m4, m5);
+ int64x2_t m67 = vpaddq_s64(m6, m7);
+
+ *res_low = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23));
+ *res_high = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67));
+}
+
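+// av1_warp_affine_common provides the shared warp scaffolding; the
+// SVE-specific gain comes from aom_sdotq_s16 in the vertical f4/f8 filters,
+// which (via the usual Neon/SVE bridge helper) maps onto the SVE SDOT
+// instruction accumulating int16 products into 64-bit lanes.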
+void av1_warp_affine_sve(const int32_t *mat, const uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row,
+ p_width, p_height, p_stride, subsampling_x,
+ subsampling_y, conv_params, alpha, beta, gamma, delta);
+}
diff --git a/av1/common/arm/wiener_convolve_neon.c b/av1/common/arm/wiener_convolve_neon.c
index d7f511d48..6440c16ad 100644
--- a/av1/common/arm/wiener_convolve_neon.c
+++ b/av1/common/arm/wiener_convolve_neon.c
@@ -15,318 +15,334 @@
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
-#include "aom_dsp/txfm_common.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/txfm_common.h"
#include "aom_ports/mem.h"
#include "av1/common/common.h"
-#include "av1/common/arm/convolve_neon.h"
-
-#define HORZ_FILTERING_CORE(t0, t1, t2, t3, t4, t5, t6, res) \
- res0 = vreinterpretq_s16_u16(vaddl_u8(t0, t1)); \
- res1 = vreinterpretq_s16_u16(vaddl_u8(t2, t3)); \
- res2 = vreinterpretq_s16_u16(vaddl_u8(t4, t5)); \
- res3 = vreinterpretq_s16_u16(vmovl_u8(t6)); \
- res = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, bd, \
- conv_params->round_0);
-
-#define PROCESS_ROW_FOR_VERTICAL_FILTER \
- __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride); \
- \
- do { \
- s7 = vld1q_s16(s); \
- s += src_stride; \
- \
- t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp, \
- bd, conv_params->round_1); \
- vst1_u8(d, t0); \
- d += dst_stride; \
- \
- s0 = s1; \
- s1 = s2; \
- s2 = s3; \
- s3 = s4; \
- s4 = s5; \
- s5 = s6; \
- s6 = s7; \
- height--; \
- } while (height > 0);
-
-static INLINE void process_row_for_horz_filtering(
- uint16_t *dst_ptr, int16_t *filter_x, const uint8_t *src_ptr,
- ptrdiff_t src_stride, ptrdiff_t dst_stride, int round0_bits, int w,
- int height, int bd) {
+#include "av1/common/restoration.h"
+
+static INLINE uint16x8_t wiener_convolve5_8_2d_h(
+ const uint8x8_t t0, const uint8x8_t t1, const uint8x8_t t2,
+ const uint8x8_t t3, const uint8x8_t t4, const int16x4_t x_filter,
+ const int32x4_t round_vec, const uint16x8_t im_max_val) {
+ // Since the Wiener filter is symmetric about the middle tap (tap 2), add
+ // mirrored source elements before multiplying by filter coefficients.
+ int16x8_t s04 = vreinterpretq_s16_u16(vaddl_u8(t0, t4));
+ int16x8_t s13 = vreinterpretq_s16_u16(vaddl_u8(t1, t3));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+
+ // x_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.)
+ int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s04), x_filter, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), x_filter, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), x_filter, 3);
+
+ int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s04), x_filter, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), x_filter, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), x_filter, 3);
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum_lo, WIENER_ROUND0_BITS),
+ vqrshrun_n_s32(sum_hi, WIENER_ROUND0_BITS));
+
+ return vminq_u16(res, im_max_val);
+}
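+
+// Writing the zero-padded symmetric filter as {0, f1, f2, f3, f2, f1, 0}
+// (with the add-src 128 already folded into f3), the helper above computes,
+// per output pixel p:
+//   f1*(p[-2] + p[2]) + f2*(p[-1] + p[1]) + f3*p[0] + (1 << (bd+FILTER_BITS-1))
+// i.e. three multiplies per pixel instead of five, ahead of the
+// WIENER_ROUND0_BITS rounding shift and the clamp to im_max_val.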
+
+static INLINE void convolve_add_src_horiz_5tap_neon(
+ const uint8_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter,
+ const int32x4_t round_vec, const uint16x8_t im_max_val) {
do {
- __builtin_prefetch(src_ptr);
+ const uint8_t *s = src_ptr;
+ uint16_t *d = dst_ptr;
+ int width = w;
- uint8x8_t tt0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
+ do {
+ uint8x8_t s0, s1, s2, s3, s4;
+ load_u8_8x5(s, 1, &s0, &s1, &s2, &s3, &s4);
+
+ uint16x8_t d0 = wiener_convolve5_8_2d_h(s0, s1, s2, s3, s4, x_filter,
+ round_vec, im_max_val);
- __builtin_prefetch(dst_ptr);
+ vst1q_u16(d, d0);
- const uint8_t *ss = src_ptr + 8;
- uint16_t *d_tmp = dst_ptr;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+}
+
+static INLINE uint16x8_t wiener_convolve7_8_2d_h(
+ const uint8x8_t t0, const uint8x8_t t1, const uint8x8_t t2,
+ const uint8x8_t t3, const uint8x8_t t4, const uint8x8_t t5,
+ const uint8x8_t t6, const int16x4_t x_filter, const int32x4_t round_vec,
+ const uint16x8_t im_max_val) {
+ // Since the Wiener filter is symmetric about the middle tap (tap 3), add
+ // mirrored source elements before multiplying by filter coefficients.
+ int16x8_t s06 = vreinterpretq_s16_u16(vaddl_u8(t0, t6));
+ int16x8_t s15 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
+ int16x8_t s24 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+ int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s06), x_filter, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), x_filter, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), x_filter, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), x_filter, 3);
+
+ int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s06), x_filter, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), x_filter, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), x_filter, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), x_filter, 3);
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum_lo, WIENER_ROUND0_BITS),
+ vqrshrun_n_s32(sum_hi, WIENER_ROUND0_BITS));
+
+ return vminq_u16(res, im_max_val);
+}
+
+static INLINE void convolve_add_src_horiz_7tap_neon(
+ const uint8_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter,
+ const int32x4_t round_vec, const uint16x8_t im_max_val) {
+ do {
+ const uint8_t *s = src_ptr;
+ uint16_t *d = dst_ptr;
int width = w;
do {
- uint8x8_t tt7 = vld1_u8(ss); // a8 a9 a10 a11 a12 a13 a14 a15
- uint8x8_t ttemp_0 = tt0;
- tt0 = tt7;
-
- uint8x8_t tt1 = vext_u8(ttemp_0, tt7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
- uint8x8_t tt2 = vext_u8(ttemp_0, tt7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
- uint8x8_t tt3 = vext_u8(ttemp_0, tt7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
- uint8x8_t tt4 = vext_u8(ttemp_0, tt7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
- uint8x8_t tt5 = vext_u8(ttemp_0, tt7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
- uint8x8_t tt6 = vext_u8(ttemp_0, tt7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
- tt7 = vext_u8(ttemp_0, tt7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
-
- int16x8_t ttt0 = vreinterpretq_s16_u16(vaddl_u8(ttemp_0, tt6));
- int16x8_t ttt1 = vreinterpretq_s16_u16(vaddl_u8(tt1, tt5));
- int16x8_t ttt2 = vreinterpretq_s16_u16(vaddl_u8(tt2, tt4));
- int16x8_t ttt3 = vreinterpretq_s16_u16(vmovl_u8(tt3));
- uint16x8_t dd0 = wiener_convolve8_horiz_8x8(ttt0, ttt1, ttt2, ttt3,
- filter_x, bd, round0_bits);
-
- vst1q_u16(d_tmp, dd0);
-
- ss += 8;
- d_tmp += 8;
- width -= 8;
- } while (width > 0);
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_u8_8x7(s, 1, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+ uint16x8_t d0 = wiener_convolve7_8_2d_h(s0, s1, s2, s3, s4, s5, s6,
+ x_filter, round_vec, im_max_val);
+ vst1q_u16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
src_ptr += src_stride;
dst_ptr += dst_stride;
- height--;
- } while (height > 0);
+ } while (--h != 0);
}
-/* Wiener filter 2D
- Apply horizontal filter and store in a temporary buffer. When applying
- vertical filter, overwrite the original pixel values.
-*/
-void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h,
- const ConvolveParams *conv_params) {
- uint8_t *d;
- const uint8_t *src_ptr, *s_tmp;
- uint16_t *dst_ptr;
- (void)x_step_q4;
- (void)y_step_q4;
+static INLINE uint8x8_t wiener_convolve5_8_2d_v(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x4_t y_filter,
+ const int32x4_t round_vec) {
+ // Since the Wiener filter is symmetric about the middle tap (tap 2), add
+ // mirrored source elements before multiplying by filter coefficients.
+ int16x8_t s04 = vaddq_s16(s0, s4);
+ int16x8_t s13 = vaddq_s16(s1, s3);
- int height;
- const int bd = 8;
- // Indicates the height needs to be processed during horizontal filtering.
- const int intermediate_height = h + SUBPEL_TAPS - 1;
- const int center_tap = ((SUBPEL_TAPS - 1) / 2);
- int16_t filter_x_tmp[7], filter_y_tmp[7];
+ int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s04), y_filter, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), y_filter, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), y_filter, 3);
- DECLARE_ALIGNED(16, uint16_t,
- temp[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+ int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s04), y_filter, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), y_filter, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), y_filter, 3);
- assert(x_step_q4 == 16 && y_step_q4 == 16);
- assert(!(w % 8));
-
- assert(w <= MAX_SB_SIZE);
- assert(h <= MAX_SB_SIZE);
-
- assert(filter_x[7] == 0);
- assert(filter_y[7] == 0);
-
- /* assumption of horizontal filtering output will not exceed 15 bit.
- ((bd) + 1 + FILTER_BITS - conv_params->round_0) <= 15
- 16 - conv_params->round_0 <= 15 -- (conv_params->round_0) >= 1
- */
- assert((conv_params->round_0) >= 1);
-
- memcpy(&filter_x_tmp[0], filter_x, sizeof(*filter_x) * FILTER_BITS);
- memcpy(&filter_y_tmp[0], filter_y, sizeof(*filter_y) * FILTER_BITS);
-
- filter_x_tmp[3] += (1 << FILTER_BITS);
- filter_y_tmp[3] += (1 << FILTER_BITS);
-
- s_tmp = src - center_tap * src_stride - center_tap;
- dst_ptr = temp;
- src_ptr = s_tmp;
- height = intermediate_height;
-
- // For aarch_64.
-#if AOM_ARCH_AARCH64
- int processed_height = 0;
- uint16_t *d_tmp;
- int width, remaining_height;
- // Start of horizontal filtering.
- if (intermediate_height > 7) {
- uint16x8_t res4, res5, res6, res7, res8, res9, res10, res11;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
- do {
- const uint8_t *s;
-
- __builtin_prefetch(src_ptr + 0 * src_stride);
- __builtin_prefetch(src_ptr + 1 * src_stride);
- __builtin_prefetch(src_ptr + 2 * src_stride);
- __builtin_prefetch(src_ptr + 3 * src_stride);
- __builtin_prefetch(src_ptr + 4 * src_stride);
- __builtin_prefetch(src_ptr + 5 * src_stride);
- __builtin_prefetch(src_ptr + 6 * src_stride);
- __builtin_prefetch(src_ptr + 7 * src_stride);
-
- load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
- s = src_ptr + 7;
- d_tmp = dst_ptr;
- width = w;
-
- __builtin_prefetch(dst_ptr + 0 * dst_stride);
- __builtin_prefetch(dst_ptr + 1 * dst_stride);
- __builtin_prefetch(dst_ptr + 2 * dst_stride);
- __builtin_prefetch(dst_ptr + 3 * dst_stride);
- __builtin_prefetch(dst_ptr + 4 * dst_stride);
- __builtin_prefetch(dst_ptr + 5 * dst_stride);
- __builtin_prefetch(dst_ptr + 6 * dst_stride);
- __builtin_prefetch(dst_ptr + 7 * dst_stride);
-
- do {
- int16x8_t res0, res1, res2, res3;
- uint8x8_t t8, t9, t10, t11, t12, t13, t14;
- load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
- transpose_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
-
- HORZ_FILTERING_CORE(t0, t6, t1, t5, t2, t4, t3, res4)
- HORZ_FILTERING_CORE(t1, t7, t2, t6, t3, t5, t4, res5)
- HORZ_FILTERING_CORE(t2, t8, t3, t7, t4, t6, t5, res6)
- HORZ_FILTERING_CORE(t3, t9, t4, t8, t5, t7, t6, res7)
- HORZ_FILTERING_CORE(t4, t10, t5, t9, t6, t8, t7, res8)
- HORZ_FILTERING_CORE(t5, t11, t6, t10, t7, t9, t8, res9)
- HORZ_FILTERING_CORE(t6, t12, t7, t11, t8, t10, t9, res10)
- HORZ_FILTERING_CORE(t7, t13, t8, t12, t9, t11, t10, res11)
-
- transpose_u16_8x8(&res4, &res5, &res6, &res7, &res8, &res9, &res10,
- &res11);
- store_u16_8x8(d_tmp, MAX_SB_SIZE, res4, res5, res6, res7, res8, res9,
- res10, res11);
-
- t0 = t8;
- t1 = t9;
- t2 = t10;
- t3 = t11;
- t4 = t12;
- t5 = t13;
- t6 = t14;
- s += 8;
- d_tmp += 8;
- width -= 8;
- } while (width > 0);
- src_ptr += 8 * src_stride;
- dst_ptr += 8 * MAX_SB_SIZE;
- height -= 8;
- processed_height += 8;
- } while (height > 7);
- }
+ int16x4_t res_lo = vshrn_n_s32(sum_lo, 2 * FILTER_BITS - WIENER_ROUND0_BITS);
+ int16x4_t res_hi = vshrn_n_s32(sum_hi, 2 * FILTER_BITS - WIENER_ROUND0_BITS);
- // Process the remaining rows for horizontal filtering.
- remaining_height = intermediate_height - processed_height;
- if (remaining_height)
- process_row_for_horz_filtering(dst_ptr, filter_x_tmp, src_ptr, src_stride,
- MAX_SB_SIZE, conv_params->round_0, w, height,
- bd);
-
- // Start of vertical filtering.
- {
- int16_t *src_tmp_ptr, *s;
- uint8_t *dst_tmp_ptr;
- height = h;
- width = w;
- src_tmp_ptr = (int16_t *)temp;
- dst_tmp_ptr = dst;
- src_stride = MAX_SB_SIZE;
+ return vqmovun_s16(vcombine_s16(res_lo, res_hi));
+}
- do {
+static INLINE void convolve_add_src_vert_5tap_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter,
+ const int32x4_t round_vec) {
+ do {
+ const int16_t *s = (int16_t *)src;
+ uint8_t *d = dst;
+ int height = h;
+
+ while (height > 3) {
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint8x8_t t0;
- s = src_tmp_ptr;
- d = dst_tmp_ptr;
+ load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ uint8x8_t d0 =
+ wiener_convolve5_8_2d_v(s0, s1, s2, s3, s4, y_filter, round_vec);
+ uint8x8_t d1 =
+ wiener_convolve5_8_2d_v(s1, s2, s3, s4, s5, y_filter, round_vec);
+ uint8x8_t d2 =
+ wiener_convolve5_8_2d_v(s2, s3, s4, s5, s6, y_filter, round_vec);
+ uint8x8_t d3 =
+ wiener_convolve5_8_2d_v(s3, s4, s5, s6, s7, y_filter, round_vec);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ }
+
+ while (height-- != 0) {
+ int16x8_t s0, s1, s2, s3, s4;
+ load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+
+ uint8x8_t d0 =
+ wiener_convolve5_8_2d_v(s0, s1, s2, s3, s4, y_filter, round_vec);
+
+ vst1_u8(d, d0);
+
+ d += dst_stride;
+ s += src_stride;
+ }
+
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+}
+
+static INLINE uint8x8_t wiener_convolve7_8_2d_v(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x4_t y_filter, const int32x4_t round_vec) {
+ // Since the Wiener filter is symmetric about the middle tap (tap 3), add
+ // mirrored source elements before multiplying by filter coefficients.
+ int16x8_t s06 = vaddq_s16(s0, s6);
+ int16x8_t s15 = vaddq_s16(s1, s5);
+ int16x8_t s24 = vaddq_s16(s2, s4);
+
+ int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s06), y_filter, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), y_filter, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), y_filter, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), y_filter, 3);
+
+ int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s06), y_filter, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), y_filter, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), y_filter, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), y_filter, 3);
+
+ int16x4_t res_lo = vshrn_n_s32(sum_lo, 2 * FILTER_BITS - WIENER_ROUND0_BITS);
+ int16x4_t res_hi = vshrn_n_s32(sum_hi, 2 * FILTER_BITS - WIENER_ROUND0_BITS);
+
+ return vqmovun_s16(vcombine_s16(res_lo, res_hi));
+}
+
+static INLINE void convolve_add_src_vert_7tap_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter,
+ const int32x4_t round_vec) {
+ do {
+ const int16_t *s = (int16_t *)src;
+ uint8_t *d = dst;
+ int height = h;
+
+ while (height > 3) {
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9;
+ load_s16_8x10(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
+ &s9);
+
+ uint8x8_t d0 = wiener_convolve7_8_2d_v(s0, s1, s2, s3, s4, s5, s6,
+ y_filter, round_vec);
+ uint8x8_t d1 = wiener_convolve7_8_2d_v(s1, s2, s3, s4, s5, s6, s7,
+ y_filter, round_vec);
+ uint8x8_t d2 = wiener_convolve7_8_2d_v(s2, s3, s4, s5, s6, s7, s8,
+ y_filter, round_vec);
+ uint8x8_t d3 = wiener_convolve7_8_2d_v(s3, s4, s5, s6, s7, s8, s9,
+ y_filter, round_vec);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ }
+
+ while (height-- != 0) {
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- s += 7 * src_stride;
-
- height = h;
-
- do {
- int16x8_t s8, s9, s10;
- uint8x8_t t1, t2, t3;
- __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
- __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
- __builtin_prefetch(dst_tmp_ptr + 2 * dst_stride);
- __builtin_prefetch(dst_tmp_ptr + 3 * dst_stride);
-
- load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-
- t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp,
- bd, conv_params->round_1);
- t1 = wiener_convolve8_vert_4x8(s1, s2, s3, s4, s5, s6, s7, filter_y_tmp,
- bd, conv_params->round_1);
- t2 = wiener_convolve8_vert_4x8(s2, s3, s4, s5, s6, s7, s8, filter_y_tmp,
- bd, conv_params->round_1);
- t3 = wiener_convolve8_vert_4x8(s3, s4, s5, s6, s7, s8, s9, filter_y_tmp,
- bd, conv_params->round_1);
-
- store_u8_8x4(d, dst_stride, t0, t1, t2, t3);
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
- } while (height > 3);
-
- if (height) {
- PROCESS_ROW_FOR_VERTICAL_FILTER
- }
- src_tmp_ptr += 8;
- dst_tmp_ptr += 8;
- w -= 8;
- } while (w > 0);
+
+ uint8x8_t d0 = wiener_convolve7_8_2d_v(s0, s1, s2, s3, s4, s5, s6,
+ y_filter, round_vec);
+
+ vst1_u8(d, d0);
+
+ d += dst_stride;
+ s += src_stride;
+ }
+
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+}
+
+static AOM_INLINE int get_wiener_filter_taps(const int16_t *filter) {
+ assert(filter[7] == 0);
+ if (filter[0] == 0 && filter[6] == 0) {
+ return WIENER_WIN_REDUCED;
}
-#else
- // Start of horizontal filtering.
- process_row_for_horz_filtering(dst_ptr, filter_x_tmp, src_ptr, src_stride,
- MAX_SB_SIZE, conv_params->round_0, w, height,
- bd);
-
- // Start of vertical filtering.
- {
- int16_t *src_tmp_ptr, *s;
- uint8_t *dst_tmp_ptr;
- src_tmp_ptr = (int16_t *)temp;
- dst_tmp_ptr = dst;
- src_stride = MAX_SB_SIZE;
+ return WIENER_WIN;
+}
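+
+// For example, a (hypothetical) signalled filter { 0, 5, -22, 98, -22, 5, 0, 0 }
+// has zero outer taps and is therefore routed to the reduced 5-tap paths.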
- do {
- uint8x8_t t0;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
- s = src_tmp_ptr;
- d = dst_tmp_ptr;
+// Wiener filter 2D
+// Apply the horizontal filter and store the result in a temporary buffer.
+// Then apply the vertical filter, overwriting the original pixel values.
+void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *x_filter, int x_step_q4,
+ const int16_t *y_filter, int y_step_q4,
+ int w, int h,
+ const WienerConvolveParams *conv_params) {
+ (void)x_step_q4;
+ (void)y_step_q4;
+ (void)conv_params;
- load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- s += 7 * src_stride;
+ assert(w % 8 == 0);
+ assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
+ assert(x_step_q4 == 16 && y_step_q4 == 16);
+ assert(x_filter[7] == 0 && y_filter[7] == 0);
+ // For bd == 8, assert that the horizontal filtering output fits in 15 bits:
+ assert(8 + 1 + FILTER_BITS - conv_params->round_0 <= 15);
+
+ DECLARE_ALIGNED(16, uint16_t,
+ im_block[(MAX_SB_SIZE + WIENER_WIN - 1) * MAX_SB_SIZE]);
+
+ const int x_filter_taps = get_wiener_filter_taps(x_filter);
+ const int y_filter_taps = get_wiener_filter_taps(y_filter);
+ int16x4_t x_filter_s16 = vld1_s16(x_filter);
+ int16x4_t y_filter_s16 = vld1_s16(y_filter);
+ // Add 128 (1 << FILTER_BITS) to tap 3: the signalled Wiener taps have zero
+ // DC gain, so boosting the centre tap implements the filter's "add source"
+ // behaviour.
+ x_filter_s16 = vadd_s16(x_filter_s16, vcreate_s16(128ULL << 48));
+ y_filter_s16 = vadd_s16(y_filter_s16, vcreate_s16(128ULL << 48));
- height = h;
- PROCESS_ROW_FOR_VERTICAL_FILTER
+ const int im_stride = MAX_SB_SIZE;
+ const int im_h = h + y_filter_taps - 1;
+ const int horiz_offset = x_filter_taps / 2;
+ const int vert_offset = (y_filter_taps / 2) * (int)src_stride;
- src_tmp_ptr += 8;
- dst_tmp_ptr += 8;
+ const int bd = 8;
+ const uint16x8_t im_max_val =
+ vdupq_n_u16((1 << (bd + 1 + FILTER_BITS - WIENER_ROUND0_BITS)) - 1);
+ const int32x4_t horiz_round_vec = vdupq_n_s32(1 << (bd + FILTER_BITS - 1));
+
+ const int32x4_t vert_round_vec =
+ vdupq_n_s32((1 << (2 * FILTER_BITS - WIENER_ROUND0_BITS - 1)) -
+ (1 << (bd + (2 * FILTER_BITS - WIENER_ROUND0_BITS) - 1)));
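+ // vert_round_vec folds two corrections into the vertical accumulator: the
+ // first term is the rounding offset for the final shift by
+ // 2 * FILTER_BITS - WIENER_ROUND0_BITS; the second removes the horizontal
+ // stage's offset, which entered the intermediate buffer as
+ // 1 << (bd + FILTER_BITS - 1 - WIENER_ROUND0_BITS) and is scaled by the
+ // vertical DC gain of 1 << FILTER_BITS back up to
+ // 1 << (bd + 2 * FILTER_BITS - WIENER_ROUND0_BITS - 1).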
+
+ if (x_filter_taps == WIENER_WIN_REDUCED) {
+ convolve_add_src_horiz_5tap_neon(src - horiz_offset - vert_offset,
+ src_stride, im_block, im_stride, w, im_h,
+ x_filter_s16, horiz_round_vec, im_max_val);
+ } else {
+ convolve_add_src_horiz_7tap_neon(src - horiz_offset - vert_offset,
+ src_stride, im_block, im_stride, w, im_h,
+ x_filter_s16, horiz_round_vec, im_max_val);
+ }
- w -= 8;
- } while (w > 0);
+ if (y_filter_taps == WIENER_WIN_REDUCED) {
+ convolve_add_src_vert_5tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter_s16, vert_round_vec);
+ } else {
+ convolve_add_src_vert_7tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter_s16, vert_round_vec);
}
-#endif
}
diff --git a/av1/common/av1_loopfilter.h b/av1/common/av1_loopfilter.h
index 78443c798..c9880cf5d 100644
--- a/av1/common/av1_loopfilter.h
+++ b/av1/common/av1_loopfilter.h
@@ -14,6 +14,8 @@
#include "config/aom_config.h"
+#include "aom/internal/aom_codec_internal.h"
+
#include "aom_ports/mem.h"
#include "av1/common/blockd.h"
#include "av1/common/seg_common.h"
@@ -87,6 +89,7 @@ typedef struct LoopFilterWorkerData {
AV1_DEBLOCKING_PARAMETERS params_buf[MAX_MIB_SIZE];
TX_SIZE tx_buf[MAX_MIB_SIZE];
+ struct aom_internal_error_info error_info;
} LFWorkerData;
/*!\endcond */
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 17dcc49a4..38e1da9a1 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -94,11 +94,11 @@ if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_highbd_convolve_horiz_rs/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd";
specialize qw/av1_highbd_convolve_horiz_rs sse4_1 neon/;
- add_proto qw/void av1_highbd_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd";
- specialize qw/av1_highbd_wiener_convolve_add_src ssse3 avx2/;
+ add_proto qw/void av1_highbd_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params, int bd";
+ specialize qw/av1_highbd_wiener_convolve_add_src ssse3 avx2 neon/;
}
-add_proto qw/void av1_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params";
+add_proto qw/void av1_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params";
specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/;
# directional intra predictor functions
@@ -255,11 +255,11 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
# build compound seg mask functions
add_proto qw/void av1_build_compound_diffwtd_mask/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w";
-specialize qw/av1_build_compound_diffwtd_mask sse4_1 avx2/;
+specialize qw/av1_build_compound_diffwtd_mask neon sse4_1 avx2/;
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_build_compound_diffwtd_mask_highbd/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd";
- specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/;
+ specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2 neon/;
}
add_proto qw/void av1_build_compound_diffwtd_mask_d16/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd";
@@ -296,24 +296,24 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
- specialize qw/aom_dist_wtd_comp_avg_upsampled_pred ssse3/;
+ specialize qw/aom_dist_wtd_comp_avg_upsampled_pred ssse3 neon/;
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3,
int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
- specialize qw/aom_highbd_upsampled_pred sse2/;
+ specialize qw/aom_highbd_upsampled_pred sse2 neon/;
add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
- specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
+ specialize qw/aom_highbd_comp_avg_upsampled_pred sse2 neon/;
add_proto qw/void aom_highbd_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
- specialize qw/aom_highbd_dist_wtd_comp_avg_upsampled_pred sse2/;
+ specialize qw/aom_highbd_dist_wtd_comp_avg_upsampled_pred sse2 neon/;
}
# the transform coefficients are held in 32-bit
@@ -396,15 +396,16 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
#
if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
add_proto qw/void av1_apply_temporal_filter/, "const struct yv12_buffer_config *frame_to_filter, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count";
- specialize qw/av1_apply_temporal_filter sse2 avx2 neon/;
+ specialize qw/av1_apply_temporal_filter sse2 avx2 neon neon_dotprod/;
add_proto qw/double av1_estimate_noise_from_single_plane/, "const uint8_t *src, int height, int width, int stride, int edge_thresh";
- specialize qw/av1_estimate_noise_from_single_plane avx2/;
+ specialize qw/av1_estimate_noise_from_single_plane avx2 neon/;
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_highbd_apply_temporal_filter/, "const struct yv12_buffer_config *frame_to_filter, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count";
- specialize qw/av1_highbd_apply_temporal_filter sse2 avx2/;
+ specialize qw/av1_highbd_apply_temporal_filter sse2 avx2 neon/;
add_proto qw/double av1_highbd_estimate_noise_from_single_plane/, "const uint16_t *src, int height, int width, int stride, int bit_depth, int edge_thresh";
+ specialize qw/av1_highbd_estimate_noise_from_single_plane neon/;
}
}
@@ -419,7 +420,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
# ENCODEMB INVOKE
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
- specialize qw/av1_highbd_block_error sse2 avx2/;
+ specialize qw/av1_highbd_block_error sse2 avx2 neon/;
}
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
@@ -438,9 +439,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
specialize qw/av1_wedge_sse_from_residuals sse2 avx2 neon/;
add_proto qw/int8_t av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
- specialize qw/av1_wedge_sign_from_residuals sse2 avx2/;
+ specialize qw/av1_wedge_sign_from_residuals sse2 avx2 neon/;
add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
- specialize qw/av1_wedge_compute_delta_squares sse2 avx2/;
+ specialize qw/av1_wedge_compute_delta_squares sse2 avx2 neon/;
# hash
add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, size_t length";
@@ -448,43 +449,43 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
add_proto qw/void av1_compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats";
- specialize qw/av1_compute_stats sse4_1 avx2/;
- add_proto qw/void av1_calc_proj_params/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
- specialize qw/av1_calc_proj_params sse4_1 avx2/;
- add_proto qw/int64_t av1_lowbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
+ specialize qw/av1_compute_stats sse4_1 avx2 neon/;
+ add_proto qw/void av1_calc_proj_params/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
+ specialize qw/av1_calc_proj_params sse4_1 avx2 neon/;
+ add_proto qw/int64_t av1_lowbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2 neon/;
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
- add_proto qw/void av1_calc_proj_params_high_bd/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
- specialize qw/av1_calc_proj_params_high_bd sse4_1 avx2/;
- add_proto qw/int64_t av1_highbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
+ add_proto qw/void av1_calc_proj_params_high_bd/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
+ specialize qw/av1_calc_proj_params_high_bd sse4_1 avx2 neon/;
+ add_proto qw/int64_t av1_highbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2/;
add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth";
- specialize qw/av1_compute_stats_highbd sse4_1 avx2/;
+ specialize qw/av1_compute_stats_highbd sse4_1 avx2 neon/;
}
}
- add_proto qw/void av1_get_horver_correlation_full/, " const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr";
+ add_proto qw/void av1_get_horver_correlation_full/, "const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr";
specialize qw/av1_get_horver_correlation_full sse4_1 avx2 neon/;
- add_proto qw/void av1_nn_predict/, " const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output";
+ add_proto qw/void av1_nn_predict/, "const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output";
- add_proto qw/void av1_nn_fast_softmax_16/, " const float *input_nodes, float *output";
+ add_proto qw/void av1_nn_fast_softmax_16/, "const float *input_nodes, float *output";
if (aom_config("CONFIG_EXCLUDE_SIMD_MISMATCH") ne "yes") {
- specialize qw/av1_nn_predict sse3 neon/;
+ specialize qw/av1_nn_predict sse3 avx2 neon/;
specialize qw/av1_nn_fast_softmax_16 sse3/;
}
# CNN functions
if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
- add_proto qw/void av1_cnn_activate/, " float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation";
- add_proto qw/void av1_cnn_add/, " float **input, int channels, int width, int height, int stride, const float **add";
- add_proto qw/bool av1_cnn_predict/, " const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct";
- add_proto qw/void av1_cnn_convolve_no_maxpool_padding_valid/, " const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int cstep, int channel_step";
+ add_proto qw/void av1_cnn_activate/, "float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation";
+ add_proto qw/void av1_cnn_add/, "float **input, int channels, int width, int height, int stride, const float **add";
+ add_proto qw/bool av1_cnn_predict/, "const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct";
+ add_proto qw/void av1_cnn_convolve_no_maxpool_padding_valid/, "const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int cstep, int channel_step";
if (aom_config("CONFIG_EXCLUDE_SIMD_MISMATCH") ne "yes") {
specialize qw/av1_cnn_convolve_no_maxpool_padding_valid avx2/;
}
- add_proto qw/void av1_cnn_deconvolve/, " const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride";
+ add_proto qw/void av1_cnn_deconvolve/, "const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride";
add_proto qw/void av1_cnn_batchnorm/, "float **image, int channels, int width, int height, int stride, const float *gamma, const float *beta, const float *mean, const float *std";
}
@@ -540,17 +541,14 @@ if ($opts{config} !~ /libs-x86-win32-vs.*/) {
# WARPED_MOTION / GLOBAL_MOTION functions
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
- specialize qw/av1_highbd_warp_affine sse4_1 avx2/;
+ specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon/;
}
add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
-specialize qw/av1_warp_affine sse4_1 avx2 neon/;
-
-add_proto qw/int64_t av1_calc_frame_error/, "const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride";
-specialize qw/av1_calc_frame_error sse2 avx2/;
+specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/;
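+# Each ISA listed for av1_warp_affine above is expected to provide a matching
+# av1_warp_affine_<isa> function; the generated RTCD dispatcher then selects
+# the best variant available on the target CPU at runtime.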
# LOOP_RESTORATION functions
-add_proto qw/void av1_apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
+add_proto qw/int av1_apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
specialize qw/av1_apply_selfguided_restoration sse4_1 avx2 neon/;
add_proto qw/int av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
@@ -561,16 +559,22 @@ specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/;
# CONVOLVE_ROUND/COMPOUND_ROUND functions
add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_2d_sr_intrabc/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_x_sr_intrabc/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params";
add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn";
+add_proto qw/void av1_convolve_y_sr_intrabc/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn";
add_proto qw/void av1_dist_wtd_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
add_proto qw/void av1_dist_wtd_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, ConvolveParams *conv_params";
add_proto qw/void av1_dist_wtd_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params";
add_proto qw/void av1_dist_wtd_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, ConvolveParams *conv_params";
if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+ add_proto qw/void av1_highbd_convolve_2d_sr_intrabc/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd";
+ add_proto qw/void av1_highbd_convolve_x_sr_intrabc/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd";
add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd";
+ add_proto qw/void av1_highbd_convolve_y_sr_intrabc/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd";
add_proto qw/void av1_highbd_dist_wtd_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
add_proto qw/void av1_highbd_dist_wtd_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd";
add_proto qw/void av1_highbd_dist_wtd_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
@@ -580,13 +584,16 @@ if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params";
- specialize qw/av1_convolve_2d_sr sse2 avx2 neon/;
- specialize qw/av1_convolve_x_sr sse2 avx2 neon/;
+ specialize qw/av1_convolve_2d_sr sse2 avx2 neon neon_dotprod neon_i8mm/;
+ specialize qw/av1_convolve_2d_sr_intrabc neon/;
+ specialize qw/av1_convolve_x_sr sse2 avx2 neon neon_dotprod neon_i8mm/;
+ specialize qw/av1_convolve_x_sr_intrabc neon/;
specialize qw/av1_convolve_y_sr sse2 avx2 neon/;
+ specialize qw/av1_convolve_y_sr_intrabc neon/;
specialize qw/av1_convolve_2d_scale sse4_1/;
- specialize qw/av1_dist_wtd_convolve_2d sse2 ssse3 avx2 neon/;
+ specialize qw/av1_dist_wtd_convolve_2d sse2 ssse3 avx2 neon neon_dotprod neon_i8mm/;
specialize qw/av1_dist_wtd_convolve_2d_copy sse2 avx2 neon/;
- specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon/;
+ specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon neon_dotprod neon_i8mm/;
specialize qw/av1_dist_wtd_convolve_y sse2 avx2 neon/;
if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
specialize qw/av1_highbd_dist_wtd_convolve_2d sse4_1 avx2 neon/;
@@ -594,21 +601,26 @@ if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
specialize qw/av1_highbd_dist_wtd_convolve_y sse4_1 avx2 neon/;
specialize qw/av1_highbd_dist_wtd_convolve_2d_copy sse4_1 avx2 neon/;
specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2 neon/;
+ specialize qw/av1_highbd_convolve_2d_sr_intrabc neon/;
specialize qw/av1_highbd_convolve_x_sr ssse3 avx2 neon/;
+ specialize qw/av1_highbd_convolve_x_sr_intrabc neon/;
specialize qw/av1_highbd_convolve_y_sr ssse3 avx2 neon/;
+ specialize qw/av1_highbd_convolve_y_sr_intrabc neon/;
specialize qw/av1_highbd_convolve_2d_scale sse4_1 neon/;
}
# INTRA_EDGE functions
add_proto qw/void av1_filter_intra_edge/, "uint8_t *p, int sz, int strength";
-specialize qw/av1_filter_intra_edge sse4_1/;
+specialize qw/av1_filter_intra_edge sse4_1 neon/;
add_proto qw/void av1_upsample_intra_edge/, "uint8_t *p, int sz";
-specialize qw/av1_upsample_intra_edge sse4_1/;
+specialize qw/av1_upsample_intra_edge sse4_1 neon/;
-add_proto qw/void av1_filter_intra_edge_high/, "uint16_t *p, int sz, int strength";
-specialize qw/av1_filter_intra_edge_high sse4_1/;
-add_proto qw/void av1_upsample_intra_edge_high/, "uint16_t *p, int sz, int bd";
-specialize qw/av1_upsample_intra_edge_high sse4_1/;
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_highbd_filter_intra_edge/, "uint16_t *p, int sz, int strength";
+ specialize qw/av1_highbd_filter_intra_edge sse4_1 neon/;
+ add_proto qw/void av1_highbd_upsample_intra_edge/, "uint16_t *p, int sz, int bd";
+ specialize qw/av1_highbd_upsample_intra_edge sse4_1 neon/;
+}
# CFL
add_proto qw/cfl_subtract_average_fn cfl_get_subtract_average_fn/, "TX_SIZE tx_size";
diff --git a/av1/common/av1_txfm.c b/av1/common/av1_txfm.c
index ac43402f4..011403b1f 100644
--- a/av1/common/av1_txfm.c
+++ b/av1/common/av1_txfm.c
@@ -15,7 +15,7 @@
#include "av1/common/av1_txfm.h"
// av1_cospi_arr[i][j] = (int)round(cos(PI*j/128) * (1<<(cos_bit_min+i)));
-const int32_t av1_cospi_arr_data[7][64] = {
+const int32_t av1_cospi_arr_data[4][64] = {
{ 1024, 1024, 1023, 1021, 1019, 1016, 1013, 1009, 1004, 999, 993, 987, 980,
972, 964, 955, 946, 936, 926, 915, 903, 891, 878, 865, 851, 837,
822, 807, 792, 775, 759, 742, 724, 706, 688, 669, 650, 630, 610,
@@ -38,36 +38,153 @@ const int32_t av1_cospi_arr_data[7][64] = {
7027, 6921, 6811, 6698, 6580, 6458, 6333, 6203, 6070, 5933, 5793,
5649, 5501, 5351, 5197, 5040, 4880, 4717, 4551, 4383, 4212, 4038,
3862, 3683, 3503, 3320, 3135, 2948, 2760, 2570, 2378, 2185, 1990,
- 1795, 1598, 1401, 1202, 1003, 803, 603, 402, 201 },
- { 16384, 16379, 16364, 16340, 16305, 16261, 16207, 16143, 16069, 15986, 15893,
- 15791, 15679, 15557, 15426, 15286, 15137, 14978, 14811, 14635, 14449, 14256,
- 14053, 13842, 13623, 13395, 13160, 12916, 12665, 12406, 12140, 11866, 11585,
- 11297, 11003, 10702, 10394, 10080, 9760, 9434, 9102, 8765, 8423, 8076,
- 7723, 7366, 7005, 6639, 6270, 5897, 5520, 5139, 4756, 4370, 3981,
- 3590, 3196, 2801, 2404, 2006, 1606, 1205, 804, 402 },
- { 32768, 32758, 32729, 32679, 32610, 32522, 32413, 32286, 32138, 31972, 31786,
- 31581, 31357, 31114, 30853, 30572, 30274, 29957, 29622, 29269, 28899, 28511,
- 28106, 27684, 27246, 26791, 26320, 25833, 25330, 24812, 24279, 23732, 23170,
- 22595, 22006, 21403, 20788, 20160, 19520, 18868, 18205, 17531, 16846, 16151,
- 15447, 14733, 14010, 13279, 12540, 11793, 11039, 10279, 9512, 8740, 7962,
- 7180, 6393, 5602, 4808, 4011, 3212, 2411, 1608, 804 },
- { 65536, 65516, 65457, 65358, 65220, 65043, 64827, 64571, 64277, 63944, 63572,
- 63162, 62714, 62228, 61705, 61145, 60547, 59914, 59244, 58538, 57798, 57022,
- 56212, 55368, 54491, 53581, 52639, 51665, 50660, 49624, 48559, 47464, 46341,
- 45190, 44011, 42806, 41576, 40320, 39040, 37736, 36410, 35062, 33692, 32303,
- 30893, 29466, 28020, 26558, 25080, 23586, 22078, 20557, 19024, 17479, 15924,
- 14359, 12785, 11204, 9616, 8022, 6424, 4821, 3216, 1608 }
+ 1795, 1598, 1401, 1202, 1003, 803, 603, 402, 201 }
};
// av1_sinpi_arr_data[i][j] = (int)round((sqrt(2) * sin(j*Pi/9) * 2 / 3) * (1
// << (cos_bit_min + i))) modified so that elements j=1,2 sum to element j=4.
-const int32_t av1_sinpi_arr_data[7][5] = {
- { 0, 330, 621, 836, 951 }, { 0, 660, 1241, 1672, 1901 },
- { 0, 1321, 2482, 3344, 3803 }, { 0, 2642, 4964, 6689, 7606 },
- { 0, 5283, 9929, 13377, 15212 }, { 0, 10566, 19858, 26755, 30424 },
- { 0, 21133, 39716, 53510, 60849 }
+const int32_t av1_sinpi_arr_data[4][5] = { { 0, 330, 621, 836, 951 },
+ { 0, 660, 1241, 1672, 1901 },
+ { 0, 1321, 2482, 3344, 3803 },
+ { 0, 2642, 4964, 6689, 7606 } };
+
+// The reduced bit-width arrays are only used in the Arm Neon implementations
+// in av1_fwd_txfm2d_neon.c for now.
+#if HAVE_NEON
+// Constants are stored in groups of four, where symmetrical constants in the
+// cospi array are stored adjacent in memory, followed immediately by the same
+// constants but negated, i.e.:
+// f(i,j) = (int)round(cos(PI*j/128) * (1<<(cos_bit_min+i))) << (3-i)
+// and then in memory we store 4-tuples of constants together as:
+// f4(i,j) = [ f(i,j), f(i,64-j), -f(i,j), -f(i,64-j) ]
+//
+// Constants are stored in Q2.13 format, see:
+// https://en.wikipedia.org/wiki/Q_(number_format)
+//
+// The order of the constants is such that increasing subdivisions of 64 store
+// f4 tuples contiguously:
+// av1_cospi_arr_q13_data[i] = {
+// f4(i,32), // f(i,32) twice
+// f4(i,16), // f(i,16) and f(i,48), f4(i,32) skipped since present above.
+// f4(i,8), f4(i,24), // f4(i,16) and f4(i,32) skipped since present above.
+// f4(i,4), f4(i,12), f4(i,20), f4(i,28),
+// f4(i,2), f4(i,6), f4(i,10), f4(i,14), f4(i,18), ...
+// f4(i,1), f4(i,3), f4(i,5), f4(i,7), f4(i,9), f4(i,11), ...
+// }
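+//
+// Worked example for i=0 (cos_bit_min = 10):
+//   f(0,32) = round(cos(PI/4) * 1024) << 3 = 724 << 3 = 5792
+// which is the first value stored below, and f4(0,32) = { 5792, 5792, -5792,
+// -5792 } since f(0,32) == f(0,64-32).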
+const int16_t av1_cospi_arr_q13_data[4][128] = {
+ {
+ 5792, 5792, -5792, -5792, 7568, 3136, -7568, -3136, 8032, 1600,
+ -8032, -1600, 6808, 4552, -6808, -4552, 8152, 800, -8152, -800,
+ 7840, 2376, -7840, -2376, 7224, 3864, -7224, -3864, 6336, 5200,
+ -6336, -5200, 8184, 400, -8184, -400, 8104, 1200, -8104, -1200,
+ 7944, 1992, -7944, -1992, 7712, 2760, -7712, -2760, 7408, 3504,
+ -7408, -3504, 7024, 4208, -7024, -4208, 6576, 4880, -6576, -4880,
+ 6072, 5504, -6072, -5504, 8192, 200, -8192, -200, 8168, 600,
+ -8168, -600, 8128, 1000, -8128, -1000, 8072, 1400, -8072, -1400,
+ 7992, 1792, -7992, -1792, 7896, 2184, -7896, -2184, 7776, 2568,
+ -7776, -2568, 7640, 2952, -7640, -2952, 7488, 3320, -7488, -3320,
+ 7320, 3680, -7320, -3680, 7128, 4040, -7128, -4040, 6920, 4384,
+ -6920, -4384, 6696, 4720, -6696, -4720, 6456, 5040, -6456, -5040,
+ 6200, 5352, -6200, -5352, 5936, 5648, -5936, -5648,
+ },
+ {
+ 5792, 5792, -5792, -5792, 7568, 3136, -7568, -3136, 8036, 1600,
+ -8036, -1600, 6812, 4552, -6812, -4552, 8152, 804, -8152, -804,
+ 7840, 2380, -7840, -2380, 7224, 3860, -7224, -3860, 6332, 5196,
+ -6332, -5196, 8184, 400, -8184, -400, 8104, 1204, -8104, -1204,
+ 7948, 1992, -7948, -1992, 7712, 2760, -7712, -2760, 7404, 3504,
+ -7404, -3504, 7028, 4212, -7028, -4212, 6580, 4880, -6580, -4880,
+ 6068, 5500, -6068, -5500, 8188, 200, -8188, -200, 8168, 604,
+ -8168, -604, 8132, 1004, -8132, -1004, 8072, 1400, -8072, -1400,
+ 7992, 1796, -7992, -1796, 7896, 2184, -7896, -2184, 7780, 2568,
+ -7780, -2568, 7644, 2948, -7644, -2948, 7488, 3320, -7488, -3320,
+ 7316, 3684, -7316, -3684, 7128, 4036, -7128, -4036, 6920, 4384,
+ -6920, -4384, 6696, 4716, -6696, -4716, 6460, 5040, -6460, -5040,
+ 6204, 5352, -6204, -5352, 5932, 5648, -5932, -5648,
+ },
+ {
+ 5792, 5792, -5792, -5792, 7568, 3134, -7568, -3134, 8034, 1598,
+ -8034, -1598, 6812, 4552, -6812, -4552, 8152, 802, -8152, -802,
+ 7840, 2378, -7840, -2378, 7224, 3862, -7224, -3862, 6332, 5196,
+ -6332, -5196, 8182, 402, -8182, -402, 8104, 1202, -8104, -1202,
+ 7946, 1990, -7946, -1990, 7714, 2760, -7714, -2760, 7406, 3502,
+ -7406, -3502, 7026, 4212, -7026, -4212, 6580, 4880, -6580, -4880,
+ 6070, 5502, -6070, -5502, 8190, 202, -8190, -202, 8170, 602,
+ -8170, -602, 8130, 1002, -8130, -1002, 8072, 1400, -8072, -1400,
+ 7992, 1794, -7992, -1794, 7896, 2184, -7896, -2184, 7778, 2570,
+ -7778, -2570, 7644, 2948, -7644, -2948, 7490, 3320, -7490, -3320,
+ 7318, 3684, -7318, -3684, 7128, 4038, -7128, -4038, 6922, 4382,
+ -6922, -4382, 6698, 4718, -6698, -4718, 6458, 5040, -6458, -5040,
+ 6204, 5350, -6204, -5350, 5934, 5648, -5934, -5648,
+ },
+ {
+ 5793, 5793, -5793, -5793, 7568, 3135, -7568, -3135, 8035, 1598,
+ -8035, -1598, 6811, 4551, -6811, -4551, 8153, 803, -8153, -803,
+ 7839, 2378, -7839, -2378, 7225, 3862, -7225, -3862, 6333, 5197,
+ -6333, -5197, 8182, 402, -8182, -402, 8103, 1202, -8103, -1202,
+ 7946, 1990, -7946, -1990, 7713, 2760, -7713, -2760, 7405, 3503,
+ -7405, -3503, 7027, 4212, -7027, -4212, 6580, 4880, -6580, -4880,
+ 6070, 5501, -6070, -5501, 8190, 201, -8190, -201, 8170, 603,
+ -8170, -603, 8130, 1003, -8130, -1003, 8071, 1401, -8071, -1401,
+ 7993, 1795, -7993, -1795, 7895, 2185, -7895, -2185, 7779, 2570,
+ -7779, -2570, 7643, 2948, -7643, -2948, 7489, 3320, -7489, -3320,
+ 7317, 3683, -7317, -3683, 7128, 4038, -7128, -4038, 6921, 4383,
+ -6921, -4383, 6698, 4717, -6698, -4717, 6458, 5040, -6458, -5040,
+ 6203, 5351, -6203, -5351, 5933, 5649, -5933, -5649,
+ }
};
+// av1_sinpi_arr_q13_data[i][j] =
+// round((sqrt2 * sin((j+1)*Pi/9) * 2/3) * (1 << (cos_bit_min + i))) << (3-i)
+// modified so that elements j=0,1 sum to element j=3.
+// See also: https://en.wikipedia.org/wiki/Q_(number_format)
+const int16_t av1_sinpi_arr_q13_data[4][4] = { { 2640, 4968, 6688, 7608 },
+ { 2640, 4964, 6688, 7604 },
+ { 2642, 4964, 6688, 7606 },
+ { 2642, 4964, 6689, 7606 } };
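+
+// As with the 32-bit table, each row keeps the sum property: e.g. for i=0,
+// 2640 + 4968 == 7608 (elements j=0,1 sum to element j=3).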
+
+// Constants are stored in pairs, where symmetrical constants in the
+// cospi array are stored adjacent in memory, i.e.:
+// f(i,j) = (int)round(cos(PI*j/128) * (1<<(cos_bit_min+i)))
+// and then in memory we store pairs of constants together as:
+// f2(i,j) = [ f(i,j), f(i,64-j) ]
+const int32_t av1_cospi_arr_s32_data[4][66] = {
+ {
+ 1024, 0, 1024, 25, 1023, 50, 1021, 75, 1019, 100, 1016,
+ 125, 1013, 150, 1009, 175, 1004, 200, 999, 224, 993, 249,
+ 987, 273, 980, 297, 972, 321, 964, 345, 955, 369, 946,
+ 392, 936, 415, 926, 438, 915, 460, 903, 483, 891, 505,
+ 878, 526, 865, 548, 851, 569, 837, 590, 822, 610, 807,
+ 630, 792, 650, 775, 669, 759, 688, 742, 706, 724, 724,
+ },
+ {
+ 2048, 0, 2047, 50, 2046, 100, 2042, 151, 2038, 201, 2033,
+ 251, 2026, 301, 2018, 350, 2009, 400, 1998, 449, 1987, 498,
+ 1974, 546, 1960, 595, 1945, 642, 1928, 690, 1911, 737, 1892,
+ 784, 1872, 830, 1851, 876, 1829, 921, 1806, 965, 1782, 1009,
+ 1757, 1053, 1730, 1096, 1703, 1138, 1674, 1179, 1645, 1220, 1615,
+ 1260, 1583, 1299, 1551, 1338, 1517, 1375, 1483, 1412, 1448, 1448,
+ },
+ {
+ 4096, 0, 4095, 101, 4091, 201, 4085, 301, 4076, 401, 4065,
+ 501, 4052, 601, 4036, 700, 4017, 799, 3996, 897, 3973, 995,
+ 3948, 1092, 3920, 1189, 3889, 1285, 3857, 1380, 3822, 1474, 3784,
+ 1567, 3745, 1660, 3703, 1751, 3659, 1842, 3612, 1931, 3564, 2019,
+ 3513, 2106, 3461, 2191, 3406, 2276, 3349, 2359, 3290, 2440, 3229,
+ 2520, 3166, 2598, 3102, 2675, 3035, 2751, 2967, 2824, 2896, 2896,
+ },
+ {
+ 8192, 0, 8190, 201, 8182, 402, 8170, 603, 8153, 803, 8130,
+ 1003, 8103, 1202, 8071, 1401, 8035, 1598, 7993, 1795, 7946, 1990,
+ 7895, 2185, 7839, 2378, 7779, 2570, 7713, 2760, 7643, 2948, 7568,
+ 3135, 7489, 3320, 7405, 3503, 7317, 3683, 7225, 3862, 7128, 4038,
+ 7027, 4212, 6921, 4383, 6811, 4551, 6698, 4717, 6580, 4880, 6458,
+ 5040, 6333, 5197, 6203, 5351, 6070, 5501, 5933, 5649, 5793, 5793,
+ }
+};
+
+#endif // HAVE_NEON
+
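A minimal sketch (editorial, not part of the patch) of how the paired layout above is meant to be consumed. The helper name below is hypothetical; it simply indexes the pairs, whereas a real Neon kernel could fetch both values with a single 64-bit load:

  #include <stdint.h>

  // Fetch the pair [f(i,j), f(i,64-j)] from a row returned by
  // cospi_arr_s32(). Pairs sit at even offsets, so one load covers both.
  static void load_cospi_pair(const int32_t *cospi_row, int j, int32_t *c_j,
                              int32_t *c_64_minus_j) {
    *c_j = cospi_row[2 * j];
    *c_64_minus_j = cospi_row[2 * j + 1];
  }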
void av1_round_shift_array_c(int32_t *arr, int size, int bit) {
int i;
if (bit == 0) {
diff --git a/av1/common/av1_txfm.h b/av1/common/av1_txfm.h
index be1164f8b..7ad70af86 100644
--- a/av1/common/av1_txfm.h
+++ b/av1/common/av1_txfm.h
@@ -31,13 +31,12 @@ extern "C" {
#define DO_RANGE_CHECK_CLAMP 0
#endif
-extern const int32_t av1_cospi_arr_data[7][64];
-extern const int32_t av1_sinpi_arr_data[7][5];
+extern const int32_t av1_cospi_arr_data[4][64];
+extern const int32_t av1_sinpi_arr_data[4][5];
#define MAX_TXFM_STAGE_NUM 12
static const int cos_bit_min = 10;
-static const int cos_bit_max = 16;
#define NewSqrt2Bits ((int32_t)12)
// 2^12 * sqrt(2)
@@ -53,6 +52,29 @@ static INLINE const int32_t *sinpi_arr(int n) {
return av1_sinpi_arr_data[n - cos_bit_min];
}
+// The reduced bit-width and permuted arrays are only used in the Arm Neon
+// implementations in av1_fwd_txfm2d_neon.c and highbd_fwd_txfm_neon.c for now.
+#if HAVE_NEON
+// Store cospi/sinpi costants in Q2.13 format.
+// See: https://en.wikipedia.org/wiki/Q_(number_format)
+extern const int16_t av1_cospi_arr_q13_data[4][128];
+extern const int16_t av1_sinpi_arr_q13_data[4][4];
+
+extern const int32_t av1_cospi_arr_s32_data[4][66];
+
+static INLINE const int16_t *cospi_arr_q13(int n) {
+ return av1_cospi_arr_q13_data[n - cos_bit_min];
+}
+
+static INLINE const int16_t *sinpi_arr_q13(int n) {
+ return av1_sinpi_arr_q13_data[n - cos_bit_min];
+}
+
+static INLINE const int32_t *cospi_arr_s32(int n) {
+ return av1_cospi_arr_s32_data[n - cos_bit_min];
+}
+#endif // HAVE_NEON
+
static INLINE int32_t range_check_value(int32_t value, int8_t bit) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
const int64_t max_value = (1LL << (bit - 1)) - 1;
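For readers unfamiliar with the Q2.13 format referenced in the hunk above: an int16_t stores real_value * 2^13, covering [-4, 4). A plain-C sketch of the arithmetic (editorial; q13_mul is a hypothetical stand-in, not a function from the patch):

  #include <stdint.h>

  // Multiply two Q2.13 values with round-to-nearest, returning Q2.13.
  static int16_t q13_mul(int16_t a, int16_t b) {
    return (int16_t)(((int32_t)a * b + (1 << 12)) >> 13);
  }

  // Example: cos(pi/4) = 0.70710678 becomes round(0.70710678 * 8192) = 5793
  // (see the last row of av1_cospi_arr_q13_data). q13_mul(5793, 5793) == 4097,
  // i.e. ~0.5001, matching cos^2(pi/4) = 0.5 up to quantization error.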
diff --git a/av1/common/cdef.c b/av1/common/cdef.c
index 202f9d6da..12e954544 100644
--- a/av1/common/cdef.c
+++ b/av1/common/cdef.c
@@ -20,6 +20,7 @@
#include "av1/common/cdef.h"
#include "av1/common/cdef_block.h"
#include "av1/common/reconinter.h"
+#include "av1/common/thread_common.h"
static int is_8x8_block_skip(MB_MODE_INFO **grid, int mi_row, int mi_col,
int mi_stride) {
@@ -413,12 +414,25 @@ void av1_cdef_fb_row(const AV1_COMMON *const cm, MACROBLOCKD *xd,
uint16_t **const linebuf, uint16_t **const colbuf,
uint16_t *const src, int fbr,
cdef_init_fb_row_t cdef_init_fb_row_fn,
- struct AV1CdefSyncData *const cdef_sync) {
+ struct AV1CdefSyncData *const cdef_sync,
+ struct aom_internal_error_info *error_info) {
+ // TODO(aomedia:3276): Pass error_info to the low-level functions as required
+ // in the future to handle error propagation.
+ (void)error_info;
CdefBlockInfo fb_info;
int cdef_left[MAX_MB_PLANE] = { 1, 1, 1 };
const int nhfb = (cm->mi_params.mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
cdef_init_fb_row_fn(cm, xd, &fb_info, linebuf, src, cdef_sync, fbr);
+#if CONFIG_MULTITHREAD
+ if (cdef_sync && cm->cdef_info.allocated_num_workers > 1) {
+ pthread_mutex_lock(cdef_sync->mutex_);
+ const bool cdef_mt_exit = cdef_sync->cdef_mt_exit;
+ pthread_mutex_unlock(cdef_sync->mutex_);
+ // Exit in case any worker has encountered an error.
+ if (cdef_mt_exit) return;
+ }
+#endif
for (int fbc = 0; fbc < nhfb; fbc++) {
fb_info.frame_boundary[LEFT] = (MI_SIZE_64X64 * fbc == 0) ? 1 : 0;
if (fbc != nhfb - 1)
@@ -447,5 +461,6 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *const cm,
for (int fbr = 0; fbr < nvfb; fbr++)
av1_cdef_fb_row(cm, xd, cm->cdef_info.linebuf, cm->cdef_info.colbuf,
- cm->cdef_info.srcbuf, fbr, cdef_init_fb_row_fn, NULL);
+ cm->cdef_info.srcbuf, fbr, cdef_init_fb_row_fn, NULL,
+ xd->error_info);
}
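The mutex-guarded early exit added to av1_cdef_fb_row() above follows a common worker-pool pattern: read a shared failure flag under the lock and return before starting new work. A self-contained sketch with hypothetical names (not the patch's actual types):

  #include <pthread.h>
  #include <stdbool.h>

  typedef struct {
    pthread_mutex_t mutex;
    bool mt_exit;  // Set by any worker that hits an error.
  } JobSync;

  // Returns true if some worker has already failed and we should bail out.
  static bool should_exit(JobSync *sync) {
    pthread_mutex_lock(&sync->mutex);
    const bool exit_now = sync->mt_exit;
    pthread_mutex_unlock(&sync->mutex);
    return exit_now;
  }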
diff --git a/av1/common/cdef.h b/av1/common/cdef.h
index e166f4b20..a56cd9db4 100644
--- a/av1/common/cdef.h
+++ b/av1/common/cdef.h
@@ -98,7 +98,8 @@ void av1_cdef_fb_row(const AV1_COMMON *const cm, MACROBLOCKD *xd,
uint16_t **const linebuf, uint16_t **const colbuf,
uint16_t *const src, int fbr,
cdef_init_fb_row_t cdef_init_fb_row_fn,
- struct AV1CdefSyncData *const cdef_sync);
+ struct AV1CdefSyncData *const cdef_sync,
+ struct aom_internal_error_info *error_info);
void av1_cdef_init_fb_row(const AV1_COMMON *const cm,
const MACROBLOCKD *const xd,
CdefBlockInfo *const fb_info,
diff --git a/av1/common/cdef_block.h b/av1/common/cdef_block.h
index 455a896f4..b5e4f124a 100644
--- a/av1/common/cdef_block.h
+++ b/av1/common/cdef_block.h
@@ -47,9 +47,6 @@ typedef void (*cdef_filter_block_func)(void *dest, int dstride,
int coeff_shift, int block_width,
int block_height);
-void copy_cdef_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
- cdef_list *dlist, int cdef_count, int bsize);
-
void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride,
const uint16_t *in, int xdec, int ydec,
int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit,
diff --git a/av1/common/cdef_block_simd.h b/av1/common/cdef_block_simd.h
index 5c62201f1..e86aa7527 100644
--- a/av1/common/cdef_block_simd.h
+++ b/av1/common/cdef_block_simd.h
@@ -158,6 +158,9 @@ static INLINE void array_reverse_transpose_8x8(v128 *in, v128 *res) {
res[0] = v128_ziphi_64(tr1_7, tr1_6);
}
+// There is a separate Neon implementation of this function, so disable this
+// one.
+#if !HAVE_NEON
int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var,
int coeff_shift) {
int i;
@@ -196,6 +199,7 @@ int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var,
*var >>= 10;
return best_dir;
}
+#endif
// Work around compiler out of memory issues with Win32 builds. This issue has
// been observed with Visual Studio 2017, 2019, and 2022 (version 17.4).
@@ -205,6 +209,9 @@ int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var,
#define CDEF_INLINE SIMD_INLINE
#endif
+// There are separate Neon implementations of these functions, so disable
+// the versions here.
+#if !HAVE_NEON
// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
CDEF_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold,
unsigned int adjdamp) {
@@ -823,6 +830,7 @@ void SIMD_FUNC(cdef_filter_16_3)(void *dest, int dstride, const uint16_t *in,
copy_block_4xh(/*is_lowbd=*/0, dest, dstride, in, block_height);
}
}
+#endif // !HAVE_NEON
void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
const uint16_t *src, int sstride,
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index 6d4221e84..0e37d4598 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -184,8 +184,8 @@ static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) {
cfl->are_parameters_computed = 1;
}
-void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
- TX_SIZE tx_size, int plane) {
+void av1_cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
+ TX_SIZE tx_size, int plane) {
CFL_CTX *const cfl = &xd->cfl;
MB_MODE_INFO *mbmi = xd->mi[0];
assert(is_cfl_allowed(xd));
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index af8b83353..dcaa87bd4 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -72,8 +72,8 @@ static INLINE void clear_cfl_dc_pred_cache_flags(CFL_CTX *cfl) {
cfl->dc_pred_is_cached[CFL_PRED_V] = false;
}
-void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
- TX_SIZE tx_size, int plane);
+void av1_cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
+ TX_SIZE tx_size, int plane);
void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size);
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 9bca542a4..bb72e0cbd 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -787,6 +787,120 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
}
}
+// This function produces the same output as av1_highbd_convolve_2d_sr_c, but
+// is optimized for intrabc. It uses the following 2-tap filter:
+// DECLARE_ALIGNED(256, static const int16_t,
+// av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = {
+// 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+// 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+// };
+void av1_highbd_convolve_2d_sr_intrabc_c(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ assert(bits >= 0);
+ assert(subpel_x_qn == 8);
+ assert(subpel_y_qn == 8);
+ assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
+ assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+ (void)filter_params_x;
+ (void)subpel_x_qn;
+ (void)filter_params_y;
+ (void)subpel_y_qn;
+ (void)conv_params;
+
+ int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+ int im_h = h + 1;
+ int im_stride = w;
+ assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
+
+ // horizontal filter
+ // explicitly operate for subpel_x_qn = 8.
+ int16_t *im = im_block;
+ for (int y = 0; y < im_h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t sum = (1 << (bd + FILTER_BITS - 1)) + 64 * (src[x] + src[x + 1]);
+ assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
+ sum = ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+ im[x] = sum;
+ }
+ src += src_stride;
+ im += im_stride;
+ }
+
+ // vertical filter
+ // explicitly operate for subpel_y_qn = 8.
+ int16_t *src_vert = im_block;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ const int32_t sum =
+ (1 << offset_bits) + 64 * (src_vert[x] + src_vert[im_stride + x]);
+ assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+ const int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
+ ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+
+ dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
+ }
+ src_vert += im_stride;
+ dst += dst_stride;
+ }
+}
+
+// This function produces the same output as av1_highbd_convolve_y_sr_c, but
+// is optimized for intrabc.
+void av1_highbd_convolve_y_sr_intrabc_c(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+ int bd) {
+ assert(subpel_y_qn == 8);
+ assert(filter_params_y->taps == 2);
+ (void)filter_params_y;
+ (void)subpel_y_qn;
+
+ // vertical filter
+ // explicitly operate for subpel_y_qn = 8.
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ const int32_t res = src[x] + src[src_stride + x];
+ dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, 1), bd);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+// This function produces the same output as av1_highbd_convolve_x_sr_c, but
+// is optimized for intrabc.
+void av1_highbd_convolve_x_sr_intrabc_c(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params, int bd) {
+ const int bits = FILTER_BITS - conv_params->round_0;
+ assert(bits >= 0);
+ assert(subpel_x_qn == 8);
+ assert(filter_params_x->taps == 2);
+ assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+ (void)filter_params_x;
+ (void)subpel_x_qn;
+
+ // horizontal filter
+ // explicitly operate for subpel_x_qn = 8.
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t res = 64 * (src[x] + src[x + 1]);
+ res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
+ dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
void av1_highbd_dist_wtd_convolve_2d_c(
const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
int h, const InterpFilterParams *filter_params_x,
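Worked example (editorial) for the intrabc paths above: with a subpel offset of 8 the bilinear taps are {64, 64} and FILTER_BITS is 7, so the filter collapses to a rounded average of two neighbours — which is what the y path computes directly. Assuming the default round_0 = 3 (so bits = 4) in the x path:

  // For src[x] = 100 and src[x + 1] = 103:
  //   res = 64 * (100 + 103) = 12992
  //   ROUND_POWER_OF_TWO(12992, 3) = 1624
  //   ROUND_POWER_OF_TWO(1624, 4)  = 102
  // which equals (100 + 103 + 1) >> 1 = 102, the rounded average.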
@@ -1139,14 +1253,31 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
(void)dst_stride;
const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- const int need_filter_params_x = (subpel_x_qn != 0) | scaled;
- const int need_filter_params_y = (subpel_y_qn != 0) | scaled;
- const InterpFilterParams *filter_params_x =
- need_filter_params_x ? interp_filters[0] : NULL;
- const InterpFilterParams *filter_params_y =
- need_filter_params_y ? interp_filters[1] : NULL;
+ const InterpFilterParams *filter_params_x = interp_filters[0];
+ const InterpFilterParams *filter_params_y = interp_filters[1];
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ // A 2-tap filter indicates that this call is for IntraBC.
+ if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
+ assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
+ assert(!scaled);
+ if (subpel_x_qn && subpel_y_qn) {
+ av1_highbd_convolve_2d_sr_intrabc_c(
+ src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
+ return;
+ } else if (subpel_x_qn) {
+ av1_highbd_convolve_x_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, subpel_x_qn,
+ conv_params, bd);
+ return;
+ } else if (subpel_y_qn) {
+ av1_highbd_convolve_y_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_y, subpel_y_qn, bd);
+ return;
+ }
+ }
+
if (scaled) {
if (conv_params->is_compound) {
assert(conv_params->dst != NULL);
@@ -1269,7 +1400,7 @@ void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h,
- const ConvolveParams *conv_params) {
+ const WienerConvolveParams *conv_params) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
@@ -1349,7 +1480,7 @@ void av1_highbd_wiener_convolve_add_src_c(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w, int h,
- const ConvolveParams *conv_params, int bd) {
+ const WienerConvolveParams *conv_params, int bd) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
diff --git a/av1/common/convolve.h b/av1/common/convolve.h
index 36c0c842b..d6dd8763c 100644
--- a/av1/common/convolve.h
+++ b/av1/common/convolve.h
@@ -31,6 +31,11 @@ typedef struct ConvolveParams {
int bck_offset;
} ConvolveParams;
+typedef struct WienerConvolveParams {
+ int round_0;
+ int round_1;
+} WienerConvolveParams;
+
#define ROUND0_BITS 3
#define COMPOUND_ROUND1_BITS 7
#define WIENER_ROUND0_BITS 3
@@ -99,11 +104,8 @@ static INLINE ConvolveParams get_conv_params(int do_average, int plane,
return get_conv_params_no_round(do_average, plane, NULL, 0, 0, bd);
}
-static INLINE ConvolveParams get_conv_params_wiener(int bd) {
- ConvolveParams conv_params;
- (void)bd;
- conv_params.do_average = 0;
- conv_params.is_compound = 0;
+static INLINE WienerConvolveParams get_conv_params_wiener(int bd) {
+ WienerConvolveParams conv_params;
conv_params.round_0 = WIENER_ROUND0_BITS;
conv_params.round_1 = 2 * FILTER_BITS - conv_params.round_0;
const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
@@ -112,9 +114,6 @@ static INLINE ConvolveParams get_conv_params_wiener(int bd) {
conv_params.round_0 += intbufrange - 16;
conv_params.round_1 -= intbufrange - 16;
}
- conv_params.dst = NULL;
- conv_params.dst_stride = 0;
- conv_params.plane = 0;
return conv_params;
}
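To make the bit-depth adjustment in get_conv_params_wiener() concrete (FILTER_BITS is 7 in libaom):

  // bd = 8:  round_0 = 3, round_1 = 11; intbufrange = 8 + 7 - 3 + 2 = 14 <= 16,
  //          so no adjustment is needed.
  // bd = 12: intbufrange = 12 + 7 - 3 + 2 = 18 > 16, so round_0 becomes 5 and
  //          round_1 becomes 9, bringing intermediates back within 16 bits.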
diff --git a/av1/common/enums.h b/av1/common/enums.h
index fb4d75673..b99a13867 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -27,8 +27,6 @@ extern "C" {
/*!\cond */
-#undef MAX_SB_SIZE
-
// Max superblock size
#define MAX_SB_SIZE_LOG2 7
#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
@@ -199,7 +197,7 @@ typedef char PARTITION_CONTEXT;
#define TX_PAD_END 16
#define TX_PAD_2D ((32 + TX_PAD_HOR) * (32 + TX_PAD_VER) + TX_PAD_END)
-// Number of maxium size transform blocks in the maximum size superblock
+// Number of maximum size transform blocks in the maximum size superblock
#define MAX_TX_BLOCKS_IN_MAX_SB_LOG2 ((MAX_SB_SIZE_LOG2 - MAX_TX_SIZE_LOG2) * 2)
#define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2)
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index 3704b8ae3..67fb13fe5 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -1048,41 +1048,6 @@ static void filter_intra_edge_corner(uint8_t *p_above, uint8_t *p_left) {
p_left[-1] = s;
}
-void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength) {
- if (!strength) return;
-
- const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 },
- { 0, 5, 6, 5, 0 },
- { 2, 4, 4, 4, 2 } };
- const int filt = strength - 1;
- uint16_t edge[129];
-
- memcpy(edge, p, sz * sizeof(*p));
- for (int i = 1; i < sz; i++) {
- int s = 0;
- for (int j = 0; j < INTRA_EDGE_TAPS; j++) {
- int k = i - 2 + j;
- k = (k < 0) ? 0 : k;
- k = (k > sz - 1) ? sz - 1 : k;
- s += edge[k] * kernel[filt][j];
- }
- s = (s + 8) >> 4;
- p[i] = s;
- }
-}
-
-#if CONFIG_AV1_HIGHBITDEPTH
-static void filter_intra_edge_corner_high(uint16_t *p_above, uint16_t *p_left) {
- const int kernel[3] = { 5, 6, 5 };
-
- int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) +
- (p_above[0] * kernel[2]);
- s = (s + 8) >> 4;
- p_above[-1] = s;
- p_left[-1] = s;
-}
-#endif
-
void av1_upsample_intra_edge_c(uint8_t *p, int sz) {
// interpolate half-sample positions
assert(sz <= MAX_UPSAMPLE_SZ);
@@ -1106,66 +1071,39 @@ void av1_upsample_intra_edge_c(uint8_t *p, int sz) {
}
}
-void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd) {
- // interpolate half-sample positions
- assert(sz <= MAX_UPSAMPLE_SZ);
-
- uint16_t in[MAX_UPSAMPLE_SZ + 3];
- // copy p[-1..(sz-1)] and extend first and last samples
- in[0] = p[-1];
- in[1] = p[-1];
- for (int i = 0; i < sz; i++) {
- in[i + 2] = p[i];
- }
- in[sz + 2] = p[sz - 1];
-
- // interpolate half-sample edge positions
- p[-2] = in[0];
- for (int i = 0; i < sz; i++) {
- int s = -in[i] + (9 * in[i + 1]) + (9 * in[i + 2]) - in[i + 3];
- s = (s + 8) >> 4;
- s = clip_pixel_highbd(s, bd);
- p[2 * i - 1] = s;
- p[2 * i] = in[i + 2];
- }
-}
-#if CONFIG_AV1_HIGHBITDEPTH
-static void build_intra_predictors_high(
- const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride,
+static void build_intra_predictors(
+ const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride,
PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode,
TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px,
- int n_left_px, int n_bottomleft_px, int intra_edge_filter_type,
- int bit_depth) {
+ int n_left_px, int n_bottomleft_px, int intra_edge_filter_type) {
int i;
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
- DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
- uint16_t *const above_row = above_data + 16;
- uint16_t *const left_col = left_data + 16;
+ const uint8_t *above_ref = ref - ref_stride;
+ const uint8_t *left_ref = ref - 1;
+ DECLARE_ALIGNED(16, uint8_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
+ DECLARE_ALIGNED(16, uint8_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
+ uint8_t *const above_row = above_data + 16;
+ uint8_t *const left_col = left_data + 16;
const int txwpx = tx_size_wide[tx_size];
const int txhpx = tx_size_high[tx_size];
int need_left = extend_modes[mode] & NEED_LEFT;
int need_above = extend_modes[mode] & NEED_ABOVE;
int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
- const uint16_t *above_ref = ref - ref_stride;
- const uint16_t *left_ref = ref - 1;
const int is_dr_mode = av1_is_directional_mode(mode);
const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
- int base = 128 << (bit_depth - 8);
// The left_data, above_data buffers must be zeroed to fix some intermittent
// valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4
- // path in av1_highbd_dr_prediction_z2_avx2()) from left_data, above_data are
- // seen to be the potential reason for this issue.
- aom_memset16(left_data, base + 1, NUM_INTRA_NEIGHBOUR_PIXELS);
- aom_memset16(above_data, base - 1, NUM_INTRA_NEIGHBOUR_PIXELS);
+ // path in av1_dr_prediction_z1_avx2()) from left_data, above_data are seen to
+ // be the potential reason for this issue.
+ memset(left_data, 129, NUM_INTRA_NEIGHBOUR_PIXELS);
+ memset(above_data, 127, NUM_INTRA_NEIGHBOUR_PIXELS);
// The default values if ref pixels are not available:
- // base base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1
- // base+1 A B .. Y Z
- // base+1 C D .. W X
- // base+1 E F .. U V
- // base+1 G H .. S T T T T T
+ // 128 127 127 .. 127 127 127 127 127 127
+ // 129 A B .. Y Z
+ // 129 C D .. W X
+ // 129 E F .. U V
+ // 129 G H .. S T T T T T
+ // ..
if (is_dr_mode) {
if (p_angle <= 90)
@@ -1185,12 +1123,12 @@ static void build_intra_predictors_high(
if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
int val;
if (need_left) {
- val = (n_top_px > 0) ? above_ref[0] : base + 1;
+ val = (n_top_px > 0) ? above_ref[0] : 129;
} else {
- val = (n_left_px > 0) ? left_ref[0] : base - 1;
+ val = (n_left_px > 0) ? left_ref[0] : 127;
}
for (i = 0; i < txhpx; ++i) {
- aom_memset16(dst, val, txwpx);
+ memset(dst, val, txwpx);
dst += dst_stride;
}
return;
@@ -1209,9 +1147,9 @@ static void build_intra_predictors_high(
left_col[i] = left_ref[i * ref_stride];
}
if (i < num_left_pixels_needed)
- aom_memset16(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
+ memset(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
} else if (n_top_px > 0) {
- aom_memset16(left_col, above_ref[0], num_left_pixels_needed);
+ memset(left_col, above_ref[0], num_left_pixels_needed);
}
}
@@ -1219,19 +1157,17 @@ static void build_intra_predictors_high(
if (need_above) {
const int num_top_pixels_needed = txwpx + (n_topright_px >= 0 ? txhpx : 0);
if (n_top_px > 0) {
- memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0]));
+ memcpy(above_row, above_ref, n_top_px);
i = n_top_px;
if (n_topright_px > 0) {
assert(n_top_px == txwpx);
- memcpy(above_row + txwpx, above_ref + txwpx,
- n_topright_px * sizeof(above_ref[0]));
+ memcpy(above_row + txwpx, above_ref + txwpx, n_topright_px);
i += n_topright_px;
}
if (i < num_top_pixels_needed)
- aom_memset16(&above_row[i], above_row[i - 1],
- num_top_pixels_needed - i);
+ memset(&above_row[i], above_row[i - 1], num_top_pixels_needed - i);
} else if (n_left_px > 0) {
- aom_memset16(above_row, left_ref[0], num_top_pixels_needed);
+ memset(above_row, left_ref[0], num_top_pixels_needed);
}
}
@@ -1243,14 +1179,14 @@ static void build_intra_predictors_high(
} else if (n_left_px > 0) {
above_row[-1] = left_ref[0];
} else {
- above_row[-1] = base;
+ above_row[-1] = 128;
}
left_col[-1] = above_row[-1];
}
if (use_filter_intra) {
- highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
- filter_intra_mode, bit_depth);
+ av1_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
+ filter_intra_mode);
return;
}
@@ -1263,82 +1199,143 @@ static void build_intra_predictors_high(
if (p_angle != 90 && p_angle != 180) {
const int ab_le = need_above_left ? 1 : 0;
if (need_above && need_left && (txwpx + txhpx >= 24)) {
- filter_intra_edge_corner_high(above_row, left_col);
+ filter_intra_edge_corner(above_row, left_col);
}
if (need_above && n_top_px > 0) {
const int strength = intra_edge_filter_strength(
txwpx, txhpx, p_angle - 90, intra_edge_filter_type);
const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
- av1_filter_intra_edge_high(above_row - ab_le, n_px, strength);
+ av1_filter_intra_edge(above_row - ab_le, n_px, strength);
}
if (need_left && n_left_px > 0) {
const int strength = intra_edge_filter_strength(
txhpx, txwpx, p_angle - 180, intra_edge_filter_type);
const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
- av1_filter_intra_edge_high(left_col - ab_le, n_px, strength);
+ av1_filter_intra_edge(left_col - ab_le, n_px, strength);
}
}
upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90,
intra_edge_filter_type);
if (need_above && upsample_above) {
const int n_px = txwpx + (need_right ? txhpx : 0);
- av1_upsample_intra_edge_high(above_row, n_px, bit_depth);
+ av1_upsample_intra_edge(above_row, n_px);
}
upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180,
intra_edge_filter_type);
if (need_left && upsample_left) {
const int n_px = txhpx + (need_bottom ? txwpx : 0);
- av1_upsample_intra_edge_high(left_col, n_px, bit_depth);
+ av1_upsample_intra_edge(left_col, n_px);
}
}
- highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col,
- upsample_above, upsample_left, p_angle, bit_depth);
+ dr_predictor(dst, dst_stride, tx_size, above_row, left_col, upsample_above,
+ upsample_left, p_angle);
return;
}
// predict
if (mode == DC_PRED) {
- dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size](
- dst, dst_stride, above_row, left_col, bit_depth);
+ dc_pred[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride, above_row,
+ left_col);
} else {
- pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, bit_depth);
+ pred[mode][tx_size](dst, dst_stride, above_row, left_col);
}
}
-#endif // CONFIG_AV1_HIGHBITDEPTH
-static void build_intra_predictors(
- const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride,
+#if CONFIG_AV1_HIGHBITDEPTH
+void av1_highbd_filter_intra_edge_c(uint16_t *p, int sz, int strength) {
+ if (!strength) return;
+
+ const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 },
+ { 0, 5, 6, 5, 0 },
+ { 2, 4, 4, 4, 2 } };
+ const int filt = strength - 1;
+ uint16_t edge[129];
+
+ memcpy(edge, p, sz * sizeof(*p));
+ for (int i = 1; i < sz; i++) {
+ int s = 0;
+ for (int j = 0; j < INTRA_EDGE_TAPS; j++) {
+ int k = i - 2 + j;
+ k = (k < 0) ? 0 : k;
+ k = (k > sz - 1) ? sz - 1 : k;
+ s += edge[k] * kernel[filt][j];
+ }
+ s = (s + 8) >> 4;
+ p[i] = s;
+ }
+}
+
+static void highbd_filter_intra_edge_corner(uint16_t *p_above,
+ uint16_t *p_left) {
+ const int kernel[3] = { 5, 6, 5 };
+
+ int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) +
+ (p_above[0] * kernel[2]);
+ s = (s + 8) >> 4;
+ p_above[-1] = s;
+ p_left[-1] = s;
+}
+
+void av1_highbd_upsample_intra_edge_c(uint16_t *p, int sz, int bd) {
+ // interpolate half-sample positions
+ assert(sz <= MAX_UPSAMPLE_SZ);
+
+ uint16_t in[MAX_UPSAMPLE_SZ + 3];
+ // copy p[-1..(sz-1)] and extend first and last samples
+ in[0] = p[-1];
+ in[1] = p[-1];
+ for (int i = 0; i < sz; i++) {
+ in[i + 2] = p[i];
+ }
+ in[sz + 2] = p[sz - 1];
+
+ // interpolate half-sample edge positions
+ p[-2] = in[0];
+ for (int i = 0; i < sz; i++) {
+ int s = -in[i] + (9 * in[i + 1]) + (9 * in[i + 2]) - in[i + 3];
+ s = (s + 8) >> 4;
+ s = clip_pixel_highbd(s, bd);
+ p[2 * i - 1] = s;
+ p[2 * i] = in[i + 2];
+ }
+}
+
+static void highbd_build_intra_predictors(
+ const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride,
PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode,
TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px,
- int n_left_px, int n_bottomleft_px, int intra_edge_filter_type) {
+ int n_left_px, int n_bottomleft_px, int intra_edge_filter_type,
+ int bit_depth) {
int i;
- const uint8_t *above_ref = ref - ref_stride;
- const uint8_t *left_ref = ref - 1;
- DECLARE_ALIGNED(16, uint8_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
- DECLARE_ALIGNED(16, uint8_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
- uint8_t *const above_row = above_data + 16;
- uint8_t *const left_col = left_data + 16;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
+ DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
+ uint16_t *const above_row = above_data + 16;
+ uint16_t *const left_col = left_data + 16;
const int txwpx = tx_size_wide[tx_size];
const int txhpx = tx_size_high[tx_size];
int need_left = extend_modes[mode] & NEED_LEFT;
int need_above = extend_modes[mode] & NEED_ABOVE;
int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
+ const uint16_t *above_ref = ref - ref_stride;
+ const uint16_t *left_ref = ref - 1;
const int is_dr_mode = av1_is_directional_mode(mode);
const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
+ int base = 128 << (bit_depth - 8);
// The left_data, above_data buffers must be zeroed to fix some intermittent
// valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4
- // path in av1_dr_prediction_z1_avx2()) from left_data, above_data are seen to
- // be the potential reason for this issue.
- memset(left_data, 129, NUM_INTRA_NEIGHBOUR_PIXELS);
- memset(above_data, 127, NUM_INTRA_NEIGHBOUR_PIXELS);
+ // path in av1_highbd_dr_prediction_z2_avx2()) from left_data, above_data are
+ // seen to be the potential reason for this issue.
+ aom_memset16(left_data, base + 1, NUM_INTRA_NEIGHBOUR_PIXELS);
+ aom_memset16(above_data, base - 1, NUM_INTRA_NEIGHBOUR_PIXELS);
// The default values if ref pixels are not available:
- // 128 127 127 .. 127 127 127 127 127 127
- // 129 A B .. Y Z
- // 129 C D .. W X
- // 129 E F .. U V
- // 129 G H .. S T T T T T
- // ..
+ // base base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1
+ // base+1 A B .. Y Z
+ // base+1 C D .. W X
+ // base+1 E F .. U V
+ // base+1 G H .. S T T T T T
if (is_dr_mode) {
if (p_angle <= 90)
@@ -1358,12 +1355,12 @@ static void build_intra_predictors(
if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
int val;
if (need_left) {
- val = (n_top_px > 0) ? above_ref[0] : 129;
+ val = (n_top_px > 0) ? above_ref[0] : base + 1;
} else {
- val = (n_left_px > 0) ? left_ref[0] : 127;
+ val = (n_left_px > 0) ? left_ref[0] : base - 1;
}
for (i = 0; i < txhpx; ++i) {
- memset(dst, val, txwpx);
+ aom_memset16(dst, val, txwpx);
dst += dst_stride;
}
return;
@@ -1382,9 +1379,9 @@ static void build_intra_predictors(
left_col[i] = left_ref[i * ref_stride];
}
if (i < num_left_pixels_needed)
- memset(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
+ aom_memset16(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
} else if (n_top_px > 0) {
- memset(left_col, above_ref[0], num_left_pixels_needed);
+ aom_memset16(left_col, above_ref[0], num_left_pixels_needed);
}
}
@@ -1392,17 +1389,19 @@ static void build_intra_predictors(
if (need_above) {
const int num_top_pixels_needed = txwpx + (n_topright_px >= 0 ? txhpx : 0);
if (n_top_px > 0) {
- memcpy(above_row, above_ref, n_top_px);
+ memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0]));
i = n_top_px;
if (n_topright_px > 0) {
assert(n_top_px == txwpx);
- memcpy(above_row + txwpx, above_ref + txwpx, n_topright_px);
+ memcpy(above_row + txwpx, above_ref + txwpx,
+ n_topright_px * sizeof(above_ref[0]));
i += n_topright_px;
}
if (i < num_top_pixels_needed)
- memset(&above_row[i], above_row[i - 1], num_top_pixels_needed - i);
+ aom_memset16(&above_row[i], above_row[i - 1],
+ num_top_pixels_needed - i);
} else if (n_left_px > 0) {
- memset(above_row, left_ref[0], num_top_pixels_needed);
+ aom_memset16(above_row, left_ref[0], num_top_pixels_needed);
}
}
@@ -1414,14 +1413,14 @@ static void build_intra_predictors(
} else if (n_left_px > 0) {
above_row[-1] = left_ref[0];
} else {
- above_row[-1] = 128;
+ above_row[-1] = base;
}
left_col[-1] = above_row[-1];
}
if (use_filter_intra) {
- av1_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
- filter_intra_mode);
+ highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
+ filter_intra_mode, bit_depth);
return;
}
@@ -1434,47 +1433,48 @@ static void build_intra_predictors(
if (p_angle != 90 && p_angle != 180) {
const int ab_le = need_above_left ? 1 : 0;
if (need_above && need_left && (txwpx + txhpx >= 24)) {
- filter_intra_edge_corner(above_row, left_col);
+ highbd_filter_intra_edge_corner(above_row, left_col);
}
if (need_above && n_top_px > 0) {
const int strength = intra_edge_filter_strength(
txwpx, txhpx, p_angle - 90, intra_edge_filter_type);
const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
- av1_filter_intra_edge(above_row - ab_le, n_px, strength);
+ av1_highbd_filter_intra_edge(above_row - ab_le, n_px, strength);
}
if (need_left && n_left_px > 0) {
const int strength = intra_edge_filter_strength(
txhpx, txwpx, p_angle - 180, intra_edge_filter_type);
const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
- av1_filter_intra_edge(left_col - ab_le, n_px, strength);
+ av1_highbd_filter_intra_edge(left_col - ab_le, n_px, strength);
}
}
upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90,
intra_edge_filter_type);
if (need_above && upsample_above) {
const int n_px = txwpx + (need_right ? txhpx : 0);
- av1_upsample_intra_edge(above_row, n_px);
+ av1_highbd_upsample_intra_edge(above_row, n_px, bit_depth);
}
upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180,
intra_edge_filter_type);
if (need_left && upsample_left) {
const int n_px = txhpx + (need_bottom ? txwpx : 0);
- av1_upsample_intra_edge(left_col, n_px);
+ av1_highbd_upsample_intra_edge(left_col, n_px, bit_depth);
}
}
- dr_predictor(dst, dst_stride, tx_size, above_row, left_col, upsample_above,
- upsample_left, p_angle);
+ highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col,
+ upsample_above, upsample_left, p_angle, bit_depth);
return;
}
// predict
if (mode == DC_PRED) {
- dc_pred[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride, above_row,
- left_col);
+ dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size](
+ dst, dst_stride, above_row, left_col, bit_depth);
} else {
- pred[mode][tx_size](dst, dst_stride, above_row, left_col);
+ pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, bit_depth);
}
}
+#endif // CONFIG_AV1_HIGHBITDEPTH
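The base = 128 << (bit_depth - 8) defaults in the high-bitdepth path generalize the 8-bit 127/128/129 constants used in the low-bitdepth version. For example:

  // bit_depth = 10: base = 512, so missing above pixels default to 511
  // (base - 1), missing left pixels to 513 (base + 1), and the corner to 512.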
static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x,
int subsampling_y) {
@@ -1631,7 +1631,7 @@ void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size,
const int intra_edge_filter_type = get_intra_edge_filter_type(xd, plane);
#if CONFIG_AV1_HIGHBITDEPTH
if (is_cur_buf_hbd(xd)) {
- build_intra_predictors_high(
+ highbd_build_intra_predictors(
ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode,
tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right,
@@ -1696,7 +1696,7 @@ void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
} else {
cfl_load_dc_pred(xd, dst, dst_stride, tx_size, pred_plane);
}
- cfl_predict_block(xd, dst, dst_stride, tx_size, plane);
+ av1_cfl_predict_block(xd, dst, dst_stride, tx_size, plane);
return;
}
av1_predict_intra_block(
diff --git a/av1/common/resize.c b/av1/common/resize.c
index f4bfcd062..f89f7cacd 100644
--- a/av1/common/resize.c
+++ b/av1/common/resize.c
@@ -644,17 +644,20 @@ static void fill_arr_to_col_double_prec(double *img, int stride, int len,
}
}
-void av1_resize_plane(const uint8_t *const input, int height, int width,
+bool av1_resize_plane(const uint8_t *const input, int height, int width,
int in_stride, uint8_t *output, int height2, int width2,
int out_stride) {
int i;
+ bool mem_status = true;
uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * width2 * height);
uint8_t *tmpbuf =
(uint8_t *)aom_malloc(sizeof(uint8_t) * AOMMAX(width, height));
uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * height);
uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(uint8_t) * height2);
- if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL)
+ if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) {
+ mem_status = false;
goto Error;
+ }
assert(width > 0);
assert(height > 0);
assert(width2 > 0);
@@ -673,16 +676,21 @@ Error:
aom_free(tmpbuf);
aom_free(arrbuf);
aom_free(arrbuf2);
+ return mem_status;
}
-void av1_upscale_plane_double_prec(const double *const input, int height,
+bool av1_upscale_plane_double_prec(const double *const input, int height,
int width, int in_stride, double *output,
int height2, int width2, int out_stride) {
int i;
+ bool mem_status = true;
double *intbuf = (double *)aom_malloc(sizeof(double) * width2 * height);
double *arrbuf = (double *)aom_malloc(sizeof(double) * height);
double *arrbuf2 = (double *)aom_malloc(sizeof(double) * height2);
- if (intbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) goto Error;
+ if (intbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) {
+ mem_status = false;
+ goto Error;
+ }
assert(width > 0);
assert(height > 0);
assert(width2 > 0);
@@ -700,6 +708,7 @@ Error:
aom_free(intbuf);
aom_free(arrbuf);
aom_free(arrbuf2);
+ return mem_status;
}
static bool upscale_normative_rect(const uint8_t *const input, int height,
@@ -1128,35 +1137,49 @@ void av1_resize_frame420(const uint8_t *const y, int y_stride,
int uv_stride, int height, int width, uint8_t *oy,
int oy_stride, uint8_t *ou, uint8_t *ov,
int ouv_stride, int oheight, int owidth) {
- av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
- av1_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2,
- owidth / 2, ouv_stride);
- av1_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2,
- owidth / 2, ouv_stride);
+ if (!av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+ oy_stride))
+ abort();
+ if (!av1_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2,
+ owidth / 2, ouv_stride))
+ abort();
+ if (!av1_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2,
+ owidth / 2, ouv_stride))
+ abort();
}
-void av1_resize_frame422(const uint8_t *const y, int y_stride,
+bool av1_resize_frame422(const uint8_t *const y, int y_stride,
const uint8_t *const u, const uint8_t *const v,
int uv_stride, int height, int width, uint8_t *oy,
int oy_stride, uint8_t *ou, uint8_t *ov,
int ouv_stride, int oheight, int owidth) {
- av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
- av1_resize_plane(u, height, width / 2, uv_stride, ou, oheight, owidth / 2,
- ouv_stride);
- av1_resize_plane(v, height, width / 2, uv_stride, ov, oheight, owidth / 2,
- ouv_stride);
+ if (!av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+ oy_stride))
+ return false;
+ if (!av1_resize_plane(u, height, width / 2, uv_stride, ou, oheight,
+ owidth / 2, ouv_stride))
+ return false;
+ if (!av1_resize_plane(v, height, width / 2, uv_stride, ov, oheight,
+ owidth / 2, ouv_stride))
+ return false;
+ return true;
}
-void av1_resize_frame444(const uint8_t *const y, int y_stride,
+bool av1_resize_frame444(const uint8_t *const y, int y_stride,
const uint8_t *const u, const uint8_t *const v,
int uv_stride, int height, int width, uint8_t *oy,
int oy_stride, uint8_t *ou, uint8_t *ov,
int ouv_stride, int oheight, int owidth) {
- av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
- av1_resize_plane(u, height, width, uv_stride, ou, oheight, owidth,
- ouv_stride);
- av1_resize_plane(v, height, width, uv_stride, ov, oheight, owidth,
- ouv_stride);
+ if (!av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+ oy_stride))
+ return false;
+ if (!av1_resize_plane(u, height, width, uv_stride, ou, oheight, owidth,
+ ouv_stride))
+ return false;
+ if (!av1_resize_plane(v, height, width, uv_stride, ov, oheight, owidth,
+ ouv_stride))
+ return false;
+ return true;
}
#if CONFIG_AV1_HIGHBITDEPTH
@@ -1251,7 +1274,7 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
aom_extend_frame_borders(dst, num_planes);
}
-void av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+bool av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst, int bd,
const int num_planes) {
// TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t
@@ -1261,25 +1284,29 @@ void av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
const int is_uv = i > 0;
#if CONFIG_AV1_HIGHBITDEPTH
- if (src->flags & YV12_FLAG_HIGHBITDEPTH)
+ if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
av1_highbd_resize_plane(src->buffers[i], src->crop_heights[is_uv],
src->crop_widths[is_uv], src->strides[is_uv],
dst->buffers[i], dst->crop_heights[is_uv],
dst->crop_widths[is_uv], dst->strides[is_uv], bd);
- else
- av1_resize_plane(src->buffers[i], src->crop_heights[is_uv],
- src->crop_widths[is_uv], src->strides[is_uv],
- dst->buffers[i], dst->crop_heights[is_uv],
- dst->crop_widths[is_uv], dst->strides[is_uv]);
+ } else if (!av1_resize_plane(src->buffers[i], src->crop_heights[is_uv],
+ src->crop_widths[is_uv], src->strides[is_uv],
+ dst->buffers[i], dst->crop_heights[is_uv],
+ dst->crop_widths[is_uv],
+ dst->strides[is_uv])) {
+ return false;
+ }
#else
(void)bd;
- av1_resize_plane(src->buffers[i], src->crop_heights[is_uv],
- src->crop_widths[is_uv], src->strides[is_uv],
- dst->buffers[i], dst->crop_heights[is_uv],
- dst->crop_widths[is_uv], dst->strides[is_uv]);
+ if (!av1_resize_plane(src->buffers[i], src->crop_heights[is_uv],
+ src->crop_widths[is_uv], src->strides[is_uv],
+ dst->buffers[i], dst->crop_heights[is_uv],
+ dst->crop_widths[is_uv], dst->strides[is_uv]))
+ return false;
#endif
}
aom_extend_frame_borders(dst, num_planes);
+ return true;
}
void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
@@ -1410,15 +1437,19 @@ YV12_BUFFER_CONFIG *av1_realloc_and_scale_if_required(
cm->seq_params->bit_depth == AOM_BITS_8) {
av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes);
} else {
- av1_resize_and_extend_frame_nonnormative(
- unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes);
+ if (!av1_resize_and_extend_frame_nonnormative(
+ unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate buffers during resize");
}
#else
if (use_optimized_scaler && has_optimized_scaler) {
av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes);
} else {
- av1_resize_and_extend_frame_nonnormative(
- unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes);
+ if (!av1_resize_and_extend_frame_nonnormative(
+ unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate buffers during resize");
}
#endif
return scaled;
diff --git a/av1/common/resize.h b/av1/common/resize.h
index 5927d8e8c..d1fab82a8 100644
--- a/av1/common/resize.h
+++ b/av1/common/resize.h
@@ -20,23 +20,25 @@
extern "C" {
#endif
-void av1_resize_plane(const uint8_t *const input, int height, int width,
+bool av1_resize_plane(const uint8_t *const input, int height, int width,
int in_stride, uint8_t *output, int height2, int width2,
int out_stride);
-void av1_upscale_plane_double_prec(const double *const input, int height,
+bool av1_upscale_plane_double_prec(const double *const input, int height,
int width, int in_stride, double *output,
int height2, int width2, int out_stride);
+// TODO(aomedia:3228): In libaom 4.0.0, remove av1_resize_frame420 from
+// av1/exports_com and delete this function.
void av1_resize_frame420(const uint8_t *const y, int y_stride,
const uint8_t *const u, const uint8_t *const v,
int uv_stride, int height, int width, uint8_t *oy,
int oy_stride, uint8_t *ou, uint8_t *ov,
int ouv_stride, int oheight, int owidth);
-void av1_resize_frame422(const uint8_t *const y, int y_stride,
+bool av1_resize_frame422(const uint8_t *const y, int y_stride,
const uint8_t *const u, const uint8_t *const v,
int uv_stride, int height, int width, uint8_t *oy,
int oy_stride, uint8_t *ou, uint8_t *ov,
int ouv_stride, int oheight, int owidth);
-void av1_resize_frame444(const uint8_t *const y, int y_stride,
+bool av1_resize_frame444(const uint8_t *const y, int y_stride,
const uint8_t *const u, const uint8_t *const v,
int uv_stride, int height, int width, uint8_t *oy,
int oy_stride, uint8_t *ou, uint8_t *ov,
@@ -77,7 +79,7 @@ YV12_BUFFER_CONFIG *av1_realloc_and_scale_if_required(
const bool for_psnr, const int border_in_pixels,
const int num_pyramid_levels);
-void av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+bool av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst, int bd,
const int num_planes);
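With the void-to-bool change, callers that cannot simply abort() are expected to propagate the failure. A sketch of the calling convention, mirroring the av1_realloc_and_scale_if_required() hunk above (here `error` is assumed to be an aom_internal_error_info pointer in scope):

  if (!av1_resize_frame444(y, y_stride, u, v, uv_stride, height, width,
                           oy, oy_stride, ou, ov, ouv_stride, oheight,
                           owidth)) {
    aom_internal_error(error, AOM_CODEC_MEM_ERROR,
                       "Failed to allocate buffers during resize");
  }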
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index 822f24094..a26f32981 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -20,6 +20,7 @@
#include "av1/common/av1_common_int.h"
#include "av1/common/resize.h"
#include "av1/common/restoration.h"
+#include "av1/common/thread_common.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_mem/aom_mem.h"
@@ -39,63 +40,43 @@ const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
{ { 2, 0 }, { 56, -1 } }, { { 2, 0 }, { 22, -1 } },
};
-PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
- PixelRect rect;
-
+void av1_get_upsampled_plane_size(const AV1_COMMON *cm, int is_uv, int *plane_w,
+ int *plane_h) {
int ss_x = is_uv && cm->seq_params->subsampling_x;
int ss_y = is_uv && cm->seq_params->subsampling_y;
-
- rect.top = 0;
- rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
- rect.left = 0;
- rect.right = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
- return rect;
+ *plane_w = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
+ *plane_h = ROUND_POWER_OF_TWO(cm->height, ss_y);
}
-// Count horizontal or vertical units per tile (use a width or height for
-// tile_size, respectively). We basically want to divide the tile size by the
+// Count horizontal or vertical units in a plane (use a width or height for
+// plane_size, respectively). We basically want to divide the plane size by the
// size of a restoration unit. Rather than rounding up unconditionally as you
// might expect, we round to nearest, which models the way a right or bottom
-// restoration unit can extend to up to 150% its normal width or height. The
-// max with 1 is to deal with tiles that are smaller than half of a restoration
-// unit.
-int av1_lr_count_units_in_tile(int unit_size, int tile_size) {
- return AOMMAX((tile_size + (unit_size >> 1)) / unit_size, 1);
+// restoration unit can extend to up to 150% its normal width or height.
+//
+// The max with 1 is to deal with small frames, which may be smaller than
+// half of an LR unit in size.
+int av1_lr_count_units(int unit_size, int plane_size) {
+ return AOMMAX((plane_size + (unit_size >> 1)) / unit_size, 1);
}
void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
int is_uv) {
- // We need to allocate enough space for restoration units to cover the
- // largest tile. Without CONFIG_MAX_TILE, this is always the tile at the
- // top-left and we can use av1_get_tile_rect(). With CONFIG_MAX_TILE, we have
- // to do the computation ourselves, iterating over the tiles and keeping
- // track of the largest width and height, then upscaling.
- const PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
- const int max_tile_w = tile_rect.right - tile_rect.left;
- const int max_tile_h = tile_rect.bottom - tile_rect.top;
-
- // To calculate hpertile and vpertile (horizontal and vertical units per
- // tile), we basically want to divide the largest tile width or height by the
- // size of a restoration unit. Rather than rounding up unconditionally as you
- // might expect, we round to nearest, which models the way a right or bottom
- // restoration unit can extend to up to 150% its normal width or height. The
- // max with 1 is to deal with tiles that are smaller than half of a
- // restoration unit.
- const int unit_size = rsi->restoration_unit_size;
- const int hpertile = av1_lr_count_units_in_tile(unit_size, max_tile_w);
- const int vpertile = av1_lr_count_units_in_tile(unit_size, max_tile_h);
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
- rsi->units_per_tile = hpertile * vpertile;
- rsi->horz_units_per_tile = hpertile;
- rsi->vert_units_per_tile = vpertile;
+ const int unit_size = rsi->restoration_unit_size;
+ const int horz_units = av1_lr_count_units(unit_size, plane_w);
+ const int vert_units = av1_lr_count_units(unit_size, plane_h);
- const int ntiles = 1;
- const int nunits = ntiles * rsi->units_per_tile;
+ rsi->num_rest_units = horz_units * vert_units;
+ rsi->horz_units = horz_units;
+ rsi->vert_units = vert_units;
aom_free(rsi->unit_info);
CHECK_MEM_ERROR(cm, rsi->unit_info,
(RestorationUnitInfo *)aom_memalign(
- 16, sizeof(*rsi->unit_info) * nunits));
+ 16, sizeof(*rsi->unit_info) * rsi->num_rest_units));
}
void av1_free_restoration_struct(RestorationInfo *rst_info) {
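Rounding to nearest in av1_lr_count_units() lets the last restoration unit stretch to up to 150% of unit_size, or shrink to as little as 50%. For unit_size = 64:

  // plane_size = 95: (95 + 32) / 64 = 1 unit, stretched to cover all 95 px.
  // plane_size = 96: (96 + 32) / 64 = 2 units (64 px plus a 32 px remainder).
  // plane_size = 20: (20 + 32) / 64 = 0, clamped to 1 by the AOMMAX.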
@@ -174,8 +155,9 @@ static void extend_frame_highbd(uint16_t *data, int width, int height,
}
}
-static void copy_tile_highbd(int width, int height, const uint16_t *src,
- int src_stride, uint16_t *dst, int dst_stride) {
+static void copy_rest_unit_highbd(int width, int height, const uint16_t *src,
+ int src_stride, uint16_t *dst,
+ int dst_stride) {
for (int i = 0; i < height; ++i)
memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst));
}
@@ -194,23 +176,24 @@ void av1_extend_frame(uint8_t *data, int width, int height, int stride,
extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
}
-static void copy_tile_lowbd(int width, int height, const uint8_t *src,
- int src_stride, uint8_t *dst, int dst_stride) {
+static void copy_rest_unit_lowbd(int width, int height, const uint8_t *src,
+ int src_stride, uint8_t *dst, int dst_stride) {
for (int i = 0; i < height; ++i)
memcpy(dst + i * dst_stride, src + i * src_stride, width);
}
-static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride, int highbd) {
+static void copy_rest_unit(int width, int height, const uint8_t *src,
+ int src_stride, uint8_t *dst, int dst_stride,
+ int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
if (highbd) {
- copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
- CONVERT_TO_SHORTPTR(dst), dst_stride);
+ copy_rest_unit_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
+ CONVERT_TO_SHORTPTR(dst), dst_stride);
return;
}
#endif
(void)highbd;
- copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
+ copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride);
}
#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
@@ -218,53 +201,34 @@ static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
// With striped loop restoration, the filtering for each 64-pixel stripe gets
// most of its input from the output of CDEF (stored in data8), but we need to
// fill out a border of 3 pixels above/below the stripe according to the
-// following
-// rules:
-//
-// * At a frame boundary, we copy the outermost row of CDEF pixels three times.
-// This extension is done by a call to av1_extend_frame() at the start of the
-// loop restoration process, so the value of copy_above/copy_below doesn't
-// strictly matter. However, by setting *copy_above = *copy_below = 1 whenever
-// loop filtering across tiles is disabled, we can allow
-// {setup,restore}_processing_stripe_boundary to assume that the top/bottom
-// data has always been copied, simplifying the behaviour at the left and
-// right edges of tiles.
+// following rules:
//
-// * If we're at a tile boundary and loop filtering across tiles is enabled,
-// then there is a logical stripe which is 64 pixels high, but which is split
-// into an 8px high and a 56px high stripe so that the processing (and
-// coefficient set usage) can be aligned to tiles.
-// In this case, we use the 3 rows of CDEF output across the boundary for
-// context; this corresponds to leaving the frame buffer as-is.
+// * At the top and bottom of the frame, we copy the outermost row of CDEF
+// pixels three times. This extension is done by a call to av1_extend_frame()
+// at the start of the loop restoration process, so the value of
+// copy_above/copy_below doesn't strictly matter.
//
-// * If we're at a tile boundary and loop filtering across tiles is disabled,
-// then we take the outermost row of CDEF pixels *within the current tile*
-// and copy it three times. Thus we behave exactly as if the tile were a full
-// frame.
-//
-// * Otherwise, we're at a stripe boundary within a tile. In that case, we
-// take 2 rows of deblocked pixels and extend them to 3 rows of context.
-//
-// The distinction between the latter two cases is handled by the
-// av1_loop_restoration_save_boundary_lines() function, so here we just need
-// to decide if we're overwriting the above/below boundary pixels or not.
+// * All other boundaries are stripe boundaries within the frame. In that case,
+// we take 2 rows of deblocked pixels and extend them to 3 rows of context.
static void get_stripe_boundary_info(const RestorationTileLimits *limits,
- const PixelRect *tile_rect, int ss_y,
+ int plane_w, int plane_h, int ss_y,
int *copy_above, int *copy_below) {
+ (void)plane_w;
+
*copy_above = 1;
*copy_below = 1;
const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
- const int first_stripe_in_tile = (limits->v_start == tile_rect->top);
+ const int first_stripe_in_plane = (limits->v_start == 0);
const int this_stripe_height =
- full_stripe_height - (first_stripe_in_tile ? runit_offset : 0);
- const int last_stripe_in_tile =
- (limits->v_start + this_stripe_height >= tile_rect->bottom);
+ full_stripe_height - (first_stripe_in_plane ? runit_offset : 0);
+ const int last_stripe_in_plane =
+ (limits->v_start + this_stripe_height >= plane_h);
- if (first_stripe_in_tile) *copy_above = 0;
- if (last_stripe_in_tile) *copy_below = 0;
+ if (first_stripe_in_plane) *copy_above = 0;
+ if (last_stripe_in_plane) *copy_below = 0;
}
// Overwrite the border pixels around a processing stripe so that the conditions
@@ -276,10 +240,6 @@ static void get_stripe_boundary_info(const RestorationTileLimits *limits,
// limits gives the rectangular limits of the remaining stripes for the current
// restoration unit. rsb is the stored stripe boundaries (taken from either
// deblock or CDEF output as necessary).
-//
-// tile_rect is the limits of the current tile and tile_stripe0 is the index of
-// the first stripe in this tile (needed to convert the tile-relative stripe
-// index we get from limits into something we can look up in rsb).
static void setup_processing_stripe_boundary(
const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
@@ -300,12 +260,6 @@ static void setup_processing_stripe_boundary(
// to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
// duplicating the topmost of the 2 lines (see the AOMMAX call when
// calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
- //
- // Special case: If we're at the top of a tile, which isn't on the topmost
- // tile row, and we're allowed to loop filter across tiles, then we have a
- // logical 64-pixel-high stripe which has been split into an 8-pixel high
- // stripe and a 56-pixel high stripe (the current one). So, in this case,
- // we want to leave the boundary alone!
if (!opt) {
if (copy_above) {
uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
@@ -370,19 +324,9 @@ static void setup_processing_stripe_boundary(
}
}
-// This function restores the boundary lines modified by
-// setup_processing_stripe_boundary.
-//
-// Note: We need to be careful when handling the corners of the processing
-// unit, because (eg.) the top-left corner is considered to be part of
-// both the left and top borders. This means that, depending on the
-// loop_filter_across_tiles_enabled flag, the corner pixels might get
-// overwritten twice, once as part of the "top" border and once as part
-// of the "left" border (or similar for other corners).
-//
-// Everything works out fine as long as we make sure to reverse the order
-// when restoring, ie. we need to restore the left/right borders followed
-// by the top/bottom borders.
+// Once a processing stripe is finished, this function restores the boundary
+// pixels which were overwritten by setup_processing_stripe_boundary() to
+// their original values.
static void restore_processing_stripe_boundary(
const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
@@ -440,11 +384,13 @@ static void wiener_filter_stripe(const RestorationUnitInfo *rui,
int stripe_width, int stripe_height,
int procunit_width, const uint8_t *src,
int src_stride, uint8_t *dst, int dst_stride,
- int32_t *tmpbuf, int bit_depth) {
+ int32_t *tmpbuf, int bit_depth,
+ struct aom_internal_error_info *error_info) {
(void)tmpbuf;
(void)bit_depth;
+ (void)error_info;
assert(bit_depth == 8);
- const ConvolveParams conv_params = get_conv_params_wiener(8);
+ const WienerConvolveParams conv_params = get_conv_params_wiener(8);
for (int j = 0; j < stripe_width; j += procunit_width) {
int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
@@ -908,19 +854,18 @@ int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
return 0;
}
-void av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
- int height, int stride, int eps,
- const int *xqd, uint8_t *dst8,
- int dst_stride, int32_t *tmpbuf,
- int bit_depth, int highbd) {
+int av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {
int32_t *flt0 = tmpbuf;
int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
assert(width * height <= RESTORATION_UNITPELS_MAX);
const int ret = av1_selfguided_restoration_c(
dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
- (void)ret;
- assert(!ret);
+ if (ret != 0) return ret;
const sgr_params_type *const params = &av1_sgr_params[eps];
int xq[2];
av1_decode_xq(xqd, xq, params);
@@ -947,33 +892,40 @@ void av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
*dst8ij = (uint8_t)out;
}
}
+ return 0;
}
static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
int stripe_width, int stripe_height,
int procunit_width, const uint8_t *src,
int src_stride, uint8_t *dst, int dst_stride,
- int32_t *tmpbuf, int bit_depth) {
+ int32_t *tmpbuf, int bit_depth,
+ struct aom_internal_error_info *error_info) {
(void)bit_depth;
assert(bit_depth == 8);
for (int j = 0; j < stripe_width; j += procunit_width) {
int w = AOMMIN(procunit_width, stripe_width - j);
- av1_apply_selfguided_restoration(
- src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
- rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth, 0);
+ if (av1_apply_selfguided_restoration(
+ src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
+ rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth,
+ 0) != 0) {
+ aom_internal_error(
+ error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating buffer in av1_apply_selfguided_restoration");
+ }
}
}
#if CONFIG_AV1_HIGHBITDEPTH
-static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui,
- int stripe_width, int stripe_height,
- int procunit_width, const uint8_t *src8,
- int src_stride, uint8_t *dst8,
- int dst_stride, int32_t *tmpbuf,
- int bit_depth) {
+static void wiener_filter_stripe_highbd(
+ const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
+ int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf, int bit_depth,
+ struct aom_internal_error_info *error_info) {
(void)tmpbuf;
- const ConvolveParams conv_params = get_conv_params_wiener(bit_depth);
+ (void)error_info;
+ const WienerConvolveParams conv_params = get_conv_params_wiener(bit_depth);
for (int j = 0; j < stripe_width; j += procunit_width) {
int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
@@ -986,17 +938,21 @@ static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui,
}
}
-static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
- int stripe_width, int stripe_height,
- int procunit_width,
- const uint8_t *src8, int src_stride,
- uint8_t *dst8, int dst_stride,
- int32_t *tmpbuf, int bit_depth) {
+static void sgrproj_filter_stripe_highbd(
+ const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
+ int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf, int bit_depth,
+ struct aom_internal_error_info *error_info) {
for (int j = 0; j < stripe_width; j += procunit_width) {
int w = AOMMIN(procunit_width, stripe_width - j);
- av1_apply_selfguided_restoration(
- src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
- rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
+ if (av1_apply_selfguided_restoration(
+ src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
+ rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth,
+ 1) != 0) {
+ aom_internal_error(
+ error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating buffer in av1_apply_selfguided_restoration");
+ }
}
}
#endif // CONFIG_AV1_HIGHBITDEPTH
@@ -1005,7 +961,8 @@ typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
int stripe_width, int stripe_height,
int procunit_width, const uint8_t *src,
int src_stride, uint8_t *dst, int dst_stride,
- int32_t *tmpbuf, int bit_depth);
+ int32_t *tmpbuf, int bit_depth,
+ struct aom_internal_error_info *error_info);
#if CONFIG_AV1_HIGHBITDEPTH
#define NUM_STRIPE_FILTERS 4
@@ -1024,9 +981,9 @@ static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
void av1_loop_restoration_filter_unit(
const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
- const PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
- int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
- int dst_stride, int32_t *tmpbuf, int optimized_lr) {
+ int plane_w, int plane_h, int ss_x, int ss_y, int highbd, int bit_depth,
+ uint8_t *data8, int stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf,
+ int optimized_lr, struct aom_internal_error_info *error_info) {
RestorationType unit_rtype = rui->restoration_type;
int unit_h = limits->v_end - limits->v_start;
@@ -1035,7 +992,8 @@ void av1_loop_restoration_filter_unit(
uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start;
if (unit_rtype == RESTORE_NONE) {
- copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd);
+ copy_rest_unit(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride,
+ highbd);
return;
}
@@ -1045,32 +1003,30 @@ void av1_loop_restoration_filter_unit(
const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
- // Convolve the whole tile one stripe at a time
+ // Filter the whole image one stripe at a time
RestorationTileLimits remaining_stripes = *limits;
int i = 0;
while (i < unit_h) {
int copy_above, copy_below;
remaining_stripes.v_start = limits->v_start + i;
- get_stripe_boundary_info(&remaining_stripes, tile_rect, ss_y, &copy_above,
- &copy_below);
+ get_stripe_boundary_info(&remaining_stripes, plane_w, plane_h, ss_y,
+ &copy_above, &copy_below);
const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
// Work out where this stripe's boundaries are within
// rsb->stripe_boundary_{above,below}
- const int tile_stripe =
- (remaining_stripes.v_start - tile_rect->top + runit_offset) /
- full_stripe_height;
- const int frame_stripe = tile_stripe0 + tile_stripe;
+ const int frame_stripe =
+ (remaining_stripes.v_start + runit_offset) / full_stripe_height;
const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
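+    // For example, with RESTORATION_PROC_UNIT_SIZE = 64 and
+    // RESTORATION_UNIT_OFFSET = 8 on an unsubsampled plane, stripes start at
+    // rows 0, 56, 120, ..., so v_start values of 0, 56 and 120 map to
+    // frame_stripe 0, 1 and 2, and rsb_row advances by
+    // RESTORATION_CTX_VERT = 2 saved boundary lines per stripe.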
// Calculate this stripe's height, based on two rules:
- // * The topmost stripe in each tile is 8 luma pixels shorter than usual.
+ // * The topmost stripe in the frame is 8 luma pixels shorter than usual.
// * We can't extend past the end of the current restoration unit
const int nominal_stripe_height =
- full_stripe_height - ((tile_stripe == 0) ? runit_offset : 0);
+ full_stripe_height - ((frame_stripe == 0) ? runit_offset : 0);
const int h = AOMMIN(nominal_stripe_height,
remaining_stripes.v_end - remaining_stripes.v_start);
@@ -1079,7 +1035,8 @@ void av1_loop_restoration_filter_unit(
copy_below, optimized_lr);
stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
- dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth);
+ dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth,
+ error_info);
restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
data8, stride, copy_above, copy_below,
@@ -1090,17 +1047,17 @@ void av1_loop_restoration_filter_unit(
}
static void filter_frame_on_unit(const RestorationTileLimits *limits,
- const PixelRect *tile_rect, int rest_unit_idx,
- void *priv, int32_t *tmpbuf,
- RestorationLineBuffers *rlbs) {
+ int rest_unit_idx, void *priv, int32_t *tmpbuf,
+ RestorationLineBuffers *rlbs,
+ struct aom_internal_error_info *error_info) {
FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
const RestorationInfo *rsi = ctxt->rsi;
av1_loop_restoration_filter_unit(
- limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs, tile_rect,
- ctxt->tile_stripe0, ctxt->ss_x, ctxt->ss_y, ctxt->highbd, ctxt->bit_depth,
- ctxt->data8, ctxt->data_stride, ctxt->dst8, ctxt->dst_stride, tmpbuf,
- rsi->optimized_lr);
+ limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs,
+ ctxt->plane_w, ctxt->plane_h, ctxt->ss_x, ctxt->ss_y, ctxt->highbd,
+ ctxt->bit_depth, ctxt->data8, ctxt->data_stride, ctxt->dst8,
+ ctxt->dst_stride, tmpbuf, rsi->optimized_lr, error_info);
}
void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
@@ -1127,31 +1084,33 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
RestorationInfo *rsi = &cm->rst_info[plane];
RestorationType rtype = rsi->frame_restoration_type;
rsi->optimized_lr = optimized_lr;
+ lr_ctxt->ctxt[plane].rsi = rsi;
if (rtype == RESTORE_NONE) {
continue;
}
const int is_uv = plane > 0;
- const int plane_width = frame->crop_widths[is_uv];
- const int plane_height = frame->crop_heights[is_uv];
- FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+ assert(plane_w == frame->crop_widths[is_uv]);
+ assert(plane_h == frame->crop_heights[is_uv]);
- av1_extend_frame(frame->buffers[plane], plane_width, plane_height,
+ av1_extend_frame(frame->buffers[plane], plane_w, plane_h,
frame->strides[is_uv], RESTORATION_BORDER,
RESTORATION_BORDER, highbd);
- lr_plane_ctxt->rsi = rsi;
+ FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
+ lr_plane_ctxt->plane_w = plane_w;
+ lr_plane_ctxt->plane_h = plane_h;
lr_plane_ctxt->highbd = highbd;
lr_plane_ctxt->bit_depth = bit_depth;
lr_plane_ctxt->data8 = frame->buffers[plane];
lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
lr_plane_ctxt->data_stride = frame->strides[is_uv];
lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
- lr_plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv);
- lr_plane_ctxt->tile_stripe0 = 0;
}
}
@@ -1166,9 +1125,9 @@ void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
assert(num_planes <= 3);
for (int plane = 0; plane < num_planes; ++plane) {
if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
- PixelRect tile_rect = loop_rest_ctxt->ctxt[plane].tile_rect;
- copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, tile_rect.left,
- tile_rect.right, tile_rect.top, tile_rect.bottom);
+ FilterFrameCtxt *lr_plane_ctxt = &loop_rest_ctxt->ctxt[plane];
+ copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, 0,
+ lr_plane_ctxt->plane_w, 0, lr_plane_ctxt->plane_h);
}
}
@@ -1182,8 +1141,7 @@ static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
}
av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
- &ctxt[plane], &ctxt[plane].tile_rect,
- cm->rst_tmpbuf, cm->rlbs);
+ &ctxt[plane], cm->rst_tmpbuf, cm->rlbs);
}
}
@@ -1204,24 +1162,23 @@ void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
}
void av1_foreach_rest_unit_in_row(
- RestorationTileLimits *limits, const PixelRect *tile_rect,
+ RestorationTileLimits *limits, int plane_w,
rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
- int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
- void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
- sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
- struct AV1LrSyncData *const lr_sync) {
- const int tile_w = tile_rect->right - tile_rect->left;
+ int hnum_rest_units, int vnum_rest_units, int plane, void *priv,
+ int32_t *tmpbuf, RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read,
+ sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync,
+ struct aom_internal_error_info *error_info) {
const int ext_size = unit_size * 3 / 2;
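+  // Note: ext_size = unit_size * 3 / 2 merges a trailing partial unit
+  // narrower than half a unit into its neighbour. For example (a sketch), if
+  // plane_w = 150 and unit_size = 64 then ext_size = 96: the first unit is 64
+  // pixels wide and the second takes the remaining 86 (< 96) pixels, giving
+  // two restoration units rather than three.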
int x0 = 0, j = 0;
- while (x0 < tile_w) {
- int remaining_w = tile_w - x0;
+ while (x0 < plane_w) {
+ int remaining_w = plane_w - x0;
int w = (remaining_w < ext_size) ? remaining_w : unit_size;
- limits->h_start = tile_rect->left + x0;
- limits->h_end = tile_rect->left + x0 + w;
- assert(limits->h_end <= tile_rect->right);
+ limits->h_start = x0;
+ limits->h_end = x0 + w;
+ assert(limits->h_end <= plane_w);
- const int unit_idx = unit_idx0 + row_number * hunits_per_tile + j;
+ const int unit_idx = row_number * hnum_rest_units + j;
// No sync for even numbered rows
// For odd numbered rows, Loop Restoration of current block requires the LR
@@ -1229,13 +1186,23 @@ void av1_foreach_rest_unit_in_row(
// top-right sync
on_sync_read(lr_sync, row_number, j, plane);
- if ((row_number + 1) < vunits_per_tile)
+ if ((row_number + 1) < vnum_rest_units)
// bottom-right sync
on_sync_read(lr_sync, row_number + 2, j, plane);
- on_rest_unit(limits, tile_rect, unit_idx, priv, tmpbuf, rlbs);
+#if CONFIG_MULTITHREAD
+ if (lr_sync && lr_sync->num_workers > 1) {
+ pthread_mutex_lock(lr_sync->job_mutex);
+ const bool lr_mt_exit = lr_sync->lr_mt_exit;
+ pthread_mutex_unlock(lr_sync->job_mutex);
+ // Exit in case any worker has encountered an error.
+ if (lr_mt_exit) return;
+ }
+#endif
+
+ on_rest_unit(limits, unit_idx, priv, tmpbuf, rlbs, error_info);
- on_sync_write(lr_sync, row_number, j, hunits_per_tile, plane);
+ on_sync_write(lr_sync, row_number, j, hnum_rest_units, plane);
x0 += w;
++j;
@@ -1258,57 +1225,45 @@ void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
(void)plane;
}
-static void foreach_rest_unit_in_tile(
- const PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols,
- int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size,
- int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv,
- int32_t *tmpbuf, RestorationLineBuffers *rlbs) {
- const int tile_h = tile_rect->bottom - tile_rect->top;
- const int ext_size = unit_size * 3 / 2;
+void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
+ rest_unit_visitor_t on_rest_unit,
+ void *priv, int32_t *tmpbuf,
+ RestorationLineBuffers *rlbs) {
+ const RestorationInfo *rsi = &cm->rst_info[plane];
+ const int hnum_rest_units = rsi->horz_units;
+ const int vnum_rest_units = rsi->vert_units;
+ const int unit_size = rsi->restoration_unit_size;
- const int tile_idx = tile_col + tile_row * tile_cols;
- const int unit_idx0 = tile_idx * units_per_tile;
+ const int is_uv = plane > 0;
+ const int ss_y = is_uv && cm->seq_params->subsampling_y;
+ const int ext_size = unit_size * 3 / 2;
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
int y0 = 0, i = 0;
- while (y0 < tile_h) {
- int remaining_h = tile_h - y0;
+ while (y0 < plane_h) {
+ int remaining_h = plane_h - y0;
int h = (remaining_h < ext_size) ? remaining_h : unit_size;
RestorationTileLimits limits;
- limits.v_start = tile_rect->top + y0;
- limits.v_end = tile_rect->top + y0 + h;
- assert(limits.v_end <= tile_rect->bottom);
- // Offset the tile upwards to align with the restoration processing stripe
+ limits.v_start = y0;
+ limits.v_end = y0 + h;
+ assert(limits.v_end <= plane_h);
+ // Offset upwards to align with the restoration processing stripe
const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
- limits.v_start = AOMMAX(tile_rect->top, limits.v_start - voffset);
- if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset;
+ limits.v_start = AOMMAX(0, limits.v_start - voffset);
+ if (limits.v_end < plane_h) limits.v_end -= voffset;
- av1_foreach_rest_unit_in_row(
- &limits, tile_rect, on_rest_unit, i, unit_size, unit_idx0,
- hunits_per_tile, vunits_per_tile, plane, priv, tmpbuf, rlbs,
- av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL);
+ av1_foreach_rest_unit_in_row(&limits, plane_w, on_rest_unit, i, unit_size,
+ hnum_rest_units, vnum_rest_units, plane, priv,
+ tmpbuf, rlbs, av1_lr_sync_read_dummy,
+ av1_lr_sync_write_dummy, NULL, cm->error);
y0 += h;
++i;
}
}
-void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
- rest_unit_visitor_t on_rest_unit,
- void *priv, PixelRect *tile_rect,
- int32_t *tmpbuf,
- RestorationLineBuffers *rlbs) {
- const int is_uv = plane > 0;
- const int ss_y = is_uv && cm->seq_params->subsampling_y;
-
- const RestorationInfo *rsi = &cm->rst_info[plane];
-
- foreach_rest_unit_in_tile(tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS,
- rsi->horz_units_per_tile, rsi->vert_units_per_tile,
- rsi->units_per_tile, rsi->restoration_unit_size,
- ss_y, plane, on_rest_unit, priv, tmpbuf, rlbs);
-}
-
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
int mi_row, int mi_col, BLOCK_SIZE bsize,
int *rcol0, int *rcol1, int *rrow0,
@@ -1316,33 +1271,21 @@ int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
assert(rcol0 && rcol1 && rrow0 && rrow1);
if (bsize != cm->seq_params->sb_size) return 0;
- if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) return 0;
assert(!cm->features.all_lossless);
const int is_uv = plane > 0;
- const PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
- const int tile_w = tile_rect.right - tile_rect.left;
- const int tile_h = tile_rect.bottom - tile_rect.top;
-
- const int mi_top = 0;
- const int mi_left = 0;
-
- // Compute the mi-unit corners of the superblock relative to the top-left of
- // the tile
- const int mi_rel_row0 = mi_row - mi_top;
- const int mi_rel_col0 = mi_col - mi_left;
- const int mi_rel_row1 = mi_rel_row0 + mi_size_high[bsize];
- const int mi_rel_col1 = mi_rel_col0 + mi_size_wide[bsize];
+ // Compute the mi-unit corners of the superblock
+ const int mi_row0 = mi_row;
+ const int mi_col0 = mi_col;
+ const int mi_row1 = mi_row0 + mi_size_high[bsize];
+ const int mi_col1 = mi_col0 + mi_size_wide[bsize];
const RestorationInfo *rsi = &cm->rst_info[plane];
const int size = rsi->restoration_unit_size;
-
- // Calculate the number of restoration units in this tile (which might be
- // strictly less than rsi->horz_units_per_tile and rsi->vert_units_per_tile)
- const int horz_units = av1_lr_count_units_in_tile(size, tile_w);
- const int vert_units = av1_lr_count_units_in_tile(size, tile_h);
+ const int horz_units = rsi->horz_units;
+ const int vert_units = rsi->vert_units;
// The size of an MI-unit on this plane of the image
const int ss_x = is_uv && cm->seq_params->subsampling_x;
@@ -1367,19 +1310,18 @@ int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
const int rnd_x = denom_x - 1;
const int rnd_y = denom_y - 1;
- // rcol0/rrow0 should be the first column/row of restoration units (relative
- // to the top-left of the tile) that doesn't start left/below of
- // mi_col/mi_row. For this calculation, we need to round up the division (if
- // the sb starts at runit column 10.1, the first matching runit has column
- // index 11)
- *rcol0 = (mi_rel_col0 * mi_to_num_x + rnd_x) / denom_x;
- *rrow0 = (mi_rel_row0 * mi_to_num_y + rnd_y) / denom_y;
+ // rcol0/rrow0 should be the first column/row of restoration units that
+ // doesn't start left/below of mi_col/mi_row. For this calculation, we need
+ // to round up the division (if the sb starts at runit column 10.1, the first
+ // matching runit has column index 11)
+ *rcol0 = (mi_col0 * mi_to_num_x + rnd_x) / denom_x;
+ *rrow0 = (mi_row0 * mi_to_num_y + rnd_y) / denom_y;
// rel_col1/rel_row1 is the equivalent calculation, but for the superblock
- // below-right. If we're at the bottom or right of the tile, this restoration
+ // below-right. If we're at the bottom or right of the frame, this restoration
// unit might not exist, in which case we'll clamp accordingly.
- *rcol1 = AOMMIN((mi_rel_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units);
- *rrow1 = AOMMIN((mi_rel_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units);
+ *rcol1 = AOMMIN((mi_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units);
+ *rrow1 = AOMMIN((mi_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units);
return *rcol0 < *rcol1 && *rrow0 < *rrow1;
}
@@ -1480,73 +1422,59 @@ static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
: src_width;
const int line_bytes = upscaled_width << use_highbd;
for (int i = 0; i < RESTORATION_CTX_VERT; i++) {
- // Copy the line at 'row' into both context lines. This is because
- // we want to (effectively) extend the outermost row of CDEF data
- // from this tile to produce a border, rather than using deblocked
- // pixels from the tile above/below.
+ // Copy the line at 'src_rows' into both context lines
memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
}
extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
RESTORATION_EXTRA_HORZ, use_highbd);
}
-static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
- int use_highbd, int plane,
- AV1_COMMON *cm, int after_cdef) {
+static void save_boundary_lines(const YV12_BUFFER_CONFIG *frame, int use_highbd,
+ int plane, AV1_COMMON *cm, int after_cdef) {
const int is_uv = plane > 0;
const int ss_y = is_uv && cm->seq_params->subsampling_y;
const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
- // Get the tile rectangle, with height rounded up to the next multiple of 8
- // luma pixels (only relevant for the bottom tile of the frame)
- const PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
- const int stripe0 = 0;
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries;
const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);
- int tile_stripe;
- for (tile_stripe = 0;; ++tile_stripe) {
- const int rel_y0 = AOMMAX(0, tile_stripe * stripe_height - stripe_off);
- const int y0 = tile_rect.top + rel_y0;
- if (y0 >= tile_rect.bottom) break;
+ int stripe_idx;
+ for (stripe_idx = 0;; ++stripe_idx) {
+ const int rel_y0 = AOMMAX(0, stripe_idx * stripe_height - stripe_off);
+ const int y0 = rel_y0;
+ if (y0 >= plane_h) break;
- const int rel_y1 = (tile_stripe + 1) * stripe_height - stripe_off;
- const int y1 = AOMMIN(tile_rect.top + rel_y1, tile_rect.bottom);
+ const int rel_y1 = (stripe_idx + 1) * stripe_height - stripe_off;
+ const int y1 = AOMMIN(rel_y1, plane_h);
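+    // For example, for a luma plane (stripe_height = 64, stripe_off = 8),
+    // successive stripe_idx values give [y0, y1) ranges of [0, 56), [56, 120),
+    // [120, 184), ..., with the final range clamped to plane_h.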
- const int frame_stripe = stripe0 + tile_stripe;
-
- // In this case, we should only use CDEF pixels at the top
- // and bottom of the frame as a whole; internal tile boundaries
- // can use deblocked pixels from adjacent tiles for context.
- const int use_deblock_above = (frame_stripe > 0);
+ // Extend using CDEF pixels at the top and bottom of the frame,
+ // and deblocked pixels at internal stripe boundaries
+ const int use_deblock_above = (stripe_idx > 0);
const int use_deblock_below = (y1 < plane_height);
if (!after_cdef) {
- // Save deblocked context where needed.
+ // Save deblocked context at internal stripe boundaries
if (use_deblock_above) {
save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
- frame_stripe, use_highbd, 1, boundaries);
+ stripe_idx, use_highbd, 1, boundaries);
}
if (use_deblock_below) {
- save_deblock_boundary_lines(frame, cm, plane, y1, frame_stripe,
+ save_deblock_boundary_lines(frame, cm, plane, y1, stripe_idx,
use_highbd, 0, boundaries);
}
} else {
- // Save CDEF context where needed. Note that we need to save the CDEF
- // context for a particular boundary iff we *didn't* save deblocked
- // context for that boundary.
- //
- // In addition, we need to save copies of the outermost line within
- // the tile, rather than using data from outside the tile.
+ // Save CDEF context at frame boundaries
if (!use_deblock_above) {
- save_cdef_boundary_lines(frame, cm, plane, y0, frame_stripe, use_highbd,
+ save_cdef_boundary_lines(frame, cm, plane, y0, stripe_idx, use_highbd,
1, boundaries);
}
if (!use_deblock_below) {
- save_cdef_boundary_lines(frame, cm, plane, y1 - 1, frame_stripe,
+ save_cdef_boundary_lines(frame, cm, plane, y1 - 1, stripe_idx,
use_highbd, 0, boundaries);
}
}
@@ -1561,6 +1489,6 @@ void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
const int num_planes = av1_num_planes(cm);
const int use_highbd = cm->seq_params->use_highbitdepth;
for (int p = 0; p < num_planes; ++p) {
- save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
+ save_boundary_lines(frame, use_highbd, p, cm, after_cdef);
}
}
diff --git a/av1/common/restoration.h b/av1/common/restoration.h
index bf2130341..d5da81d1d 100644
--- a/av1/common/restoration.h
+++ b/av1/common/restoration.h
@@ -33,7 +33,7 @@ extern "C" {
#define RESTORATION_PROC_UNIT_SIZE 64
-// Filter tile grid offset upwards compared to the superblock grid
+// Filter stripe grid offset upwards compared to the superblock grid
#define RESTORATION_UNIT_OFFSET 8
#define SGRPROJ_BORDER_VERT 3 // Vertical border used for Sgr
@@ -180,10 +180,6 @@ extern "C" {
#error "Wiener filter currently only works if WIENER_FILT_PREC_BITS == 7"
#endif
-#define LR_TILE_ROW 0
-#define LR_TILE_COL 0
-#define LR_TILE_COLS 1
-
typedef struct {
int r[2]; // radii
int s[2]; // sgr parameters for r[0] and r[1], based on GenSgrprojVtable()
@@ -215,12 +211,6 @@ typedef struct {
#define RESTORATION_LINEBUFFER_WIDTH \
(RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_EXTRA_HORZ)
-// Similarly, the column buffers (used when we're at a vertical tile edge
-// that we can't filter across) need space for one processing unit's worth
-// of pixels, plus the top/bottom border width
-#define RESTORATION_COLBUFFER_HEIGHT \
- (RESTORATION_PROC_UNIT_SIZE + 2 * RESTORATION_BORDER)
-
typedef struct {
// Temporary buffers to save/restore 3 lines above/below the restoration
// stripe.
@@ -266,32 +256,26 @@ typedef struct {
/**
* \name Fields allocated and initialised by av1_alloc_restoration_struct.
- * (horz_)units_per_tile give the number of restoration units in
- * (one row of) the largest tile in the frame.
*/
/**@{*/
/*!
- * Number of units per tile for the largest tile in the frame
+ * Total number of restoration units in this plane
*/
- int units_per_tile;
+ int num_rest_units;
/*!
- * Number of vertical units per tile
+ * Number of vertical restoration units in this plane
*/
- int vert_units_per_tile;
+ int vert_units;
/*!
- * Number of horizontal units per tile for the largest tile in the frame
+ * Number of horizontal restoration units in this plane
*/
- int horz_units_per_tile;
+ int horz_units;
/**@}*/
/*!
- * List of info for units in tile.
- * The data in unit_info is laid out with units_per_tile entries for each
- * tile, which have stride horz_units_per_tile.
- * Even if there are tiles of different sizes, the data in unit_info is
- * laid out as if all tiles are of full size.
+ * Parameters for each restoration unit in this plane
*/
RestorationUnitInfo *unit_info;
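+  // The units are stored in raster order, so the parameters for unit
+  // (row, col) live at unit_info[row * horz_units + col]; this matches the
+  // unit_idx computation in av1_foreach_rest_unit_in_row().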
@@ -332,19 +316,18 @@ typedef struct {
} RestorationTileLimits;
typedef void (*rest_unit_visitor_t)(const RestorationTileLimits *limits,
- const PixelRect *tile_rect,
int rest_unit_idx, void *priv,
int32_t *tmpbuf,
- RestorationLineBuffers *rlbs);
+ RestorationLineBuffers *rlbs,
+ struct aom_internal_error_info *error_info);
typedef struct FilterFrameCtxt {
const RestorationInfo *rsi;
- int tile_stripe0;
int ss_x, ss_y;
+ int plane_w, plane_h;
int highbd, bit_depth;
uint8_t *data8, *dst8;
int data_stride, dst_stride;
- PixelRect tile_rect;
} FilterFrameCtxt;
typedef struct AV1LrStruct {
@@ -375,27 +358,29 @@ void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params);
* This function applies the loop restoration filter to a single
* loop restoration unit.
*
- * \param[in] limits Limits of the unit
- * \param[in] rui The parameters to use for this unit and its
- * coefficients
- * \param[in] rsb Deblocked pixels to use for stripe boundaries
- * \param[in] rlbs Space to use as a scratch buffer
- * \param[in] tile_rect Limits of the tile containing this unit
- * \param[in] tile_stripe0 Index of the first stripe in this tile
- * \param[in] ss_x Horizontal subsampling for plane
- * \param[in] ss_y Vertical subsampling for plane
- * \param[in] highbd Whether high bitdepth pipeline is used
- * \param[in] bit_depth Bit-depth of the video
- * \param[in] data8 Frame data (pointing at the top-left corner of
- * the frame, not the restoration unit).
- * \param[in] stride Stride of \c data8
- * \param[out] dst8 Buffer where the results will be written. Like
- * \c data8, \c dst8 should point at the top-left
- * corner of the frame
- * \param[in] dst_stride Stride of \c dst8
- * \param[in] tmpbuf Scratch buffer used by the sgrproj filter which
- * should be at least SGRPROJ_TMPBUF_SIZE big.
- * \param[in] optimized_lr Whether to use fast optimized Loop Restoration
+ * \param[in] limits Limits of the unit
+ * \param[in] rui The parameters to use for this unit and its
+ * coefficients
+ * \param[in] rsb Deblocked pixels to use for stripe boundaries
+ * \param[in] rlbs Space to use as a scratch buffer
+ * \param[in] ss_x Horizontal subsampling for plane
+ * \param[in] ss_y Vertical subsampling for plane
+ * \param[in] plane_w Width of the current plane
+ * \param[in] plane_h Height of the current plane
+ * \param[in] highbd Whether high bitdepth pipeline is used
+ * \param[in] bit_depth Bit-depth of the video
+ * \param[in] data8 Frame data (pointing at the top-left corner of
+ * the frame, not the restoration unit).
+ * \param[in] stride Stride of \c data8
+ * \param[out] dst8 Buffer where the results will be written. Like
+ * \c data8, \c dst8 should point at the top-left
+ * corner of the frame
+ * \param[in] dst_stride Stride of \c dst8
+ * \param[in]     tmpbuf       Scratch buffer used by the sgrproj filter,
+ *                             which should be at least SGRPROJ_TMPBUF_SIZE
+ *                             in size.
+ * \param[in] optimized_lr Whether to use fast optimized Loop Restoration
+ * \param[in,out] error_info   Error info used to report failures encountered
+ *                             during filtering
*
* \remark Nothing is returned. Instead, the filtered unit is output in
* \c dst8 at the proper restoration unit offset.
@@ -403,17 +388,17 @@ void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params);
void av1_loop_restoration_filter_unit(
const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
- const PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
- int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
- int dst_stride, int32_t *tmpbuf, int optimized_lr);
+ int plane_w, int plane_h, int ss_x, int ss_y, int highbd, int bit_depth,
+ uint8_t *data8, int stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf,
+ int optimized_lr, struct aom_internal_error_info *error_info);
/*!\brief Function for applying loop restoration filter to a frame
*
* \ingroup in_loop_restoration
* This function applies the loop restoration filter to a frame.
*
- * \param[in, out] frame Compressed frame buffer
- * \param[in, out] cm Pointer to top level common structure
+ * \param[in,out] frame Compressed frame buffer
+ * \param[in,out] cm Pointer to top level common structure
* \param[in] optimized_lr Whether to use fast optimized Loop Restoration
* \param[in] lr_ctxt Loop restoration context
*
@@ -427,8 +412,6 @@ void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
void av1_loop_restoration_precal();
-typedef void (*rest_tile_start_visitor_t)(int tile_row, int tile_col,
- void *priv);
struct AV1LrSyncData;
typedef void (*sync_read_fn_t)(void *const lr_sync, int r, int c, int plane);
@@ -439,8 +422,7 @@ typedef void (*sync_write_fn_t)(void *const lr_sync, int r, int c,
// Call on_rest_unit for each loop restoration unit in the plane.
void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
rest_unit_visitor_t on_rest_unit,
- void *priv, PixelRect *tile_rect,
- int32_t *tmpbuf,
+ void *priv, int32_t *tmpbuf,
RestorationLineBuffers *rlbs);
// Return 1 iff the block at mi_row, mi_col with size bsize is a
@@ -448,10 +430,9 @@ void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
// loop restoration unit.
//
// If the block is a top-level superblock, the function writes to
-// *rcol0, *rcol1, *rrow0, *rrow1. The rectangle of restoration unit
-// indices given by [*rcol0, *rcol1) x [*rrow0, *rrow1) are relative
-// to the current tile, whose starting index is returned as
-// *tile_tl_idx.
+// *rcol0, *rcol1, *rrow0, *rrow1. This means that the parameters for all
+// restoration units in the rectangle [*rcol0, *rcol1) x [*rrow0, *rrow1)
+// are signaled in this superblock.
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
int mi_row, int mi_col, BLOCK_SIZE bsize,
int *rcol0, int *rcol1, int *rrow0,
@@ -467,14 +448,16 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
struct AV1Common *cm, int num_planes);
void av1_foreach_rest_unit_in_row(
- RestorationTileLimits *limits, const PixelRect *tile_rect,
+ RestorationTileLimits *limits, int plane_w,
rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
- int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
- void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
- sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
- struct AV1LrSyncData *const lr_sync);
-PixelRect av1_whole_frame_rect(const struct AV1Common *cm, int is_uv);
-int av1_lr_count_units_in_tile(int unit_size, int tile_size);
+ int hnum_rest_units, int vnum_rest_units, int plane, void *priv,
+ int32_t *tmpbuf, RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read,
+ sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync,
+ struct aom_internal_error_info *error_info);
+
+void av1_get_upsampled_plane_size(const struct AV1Common *cm, int is_uv,
+ int *plane_w, int *plane_h);
+int av1_lr_count_units(int unit_size, int plane_size);
void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane);
void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
const int sb_cols, int plane);
diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c
index 6a4427b54..8a6f290a9 100644
--- a/av1/common/thread_common.c
+++ b/av1/common/thread_common.c
@@ -57,6 +57,7 @@ static INLINE int get_lr_sync_range(int width) {
void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows,
int width, int num_workers) {
lf_sync->rows = rows;
+ lf_sync->lf_mt_exit = false;
#if CONFIG_MULTITHREAD
{
int i, j;
@@ -252,8 +253,12 @@ void av1_thread_loop_filter_rows(
const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
struct macroblockd_plane *planes, MACROBLOCKD *xd, int mi_row, int plane,
int dir, int lpf_opt_level, AV1LfSync *const lf_sync,
+ struct aom_internal_error_info *error_info,
AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf,
int num_mis_in_lpf_unit_height_log2) {
+  // TODO(aomedia:3276): Pass error_info to the low-level functions as
+  // required in the future to handle error propagation.
+ (void)error_info;
const int sb_cols =
CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2);
const int r = mi_row >> num_mis_in_lpf_unit_height_log2;
@@ -300,6 +305,16 @@ void av1_thread_loop_filter_rows(
sync_read(lf_sync, r + 1, c, plane);
}
+#if CONFIG_MULTITHREAD
+ if (lf_sync && lf_sync->num_workers > 1) {
+ pthread_mutex_lock(lf_sync->job_mutex);
+ const bool lf_mt_exit = lf_sync->lf_mt_exit;
+ pthread_mutex_unlock(lf_sync->job_mutex);
+ // Exit in case any worker has encountered an error.
+ if (lf_mt_exit) return;
+ }
+#endif
+
av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer,
mi_row, mi_col, plane, plane + num_planes);
if (lpf_opt_level) {
@@ -320,27 +335,93 @@ void av1_thread_loop_filter_rows(
}
}
+void av1_set_vert_loop_filter_done(AV1_COMMON *cm, AV1LfSync *lf_sync,
+ int num_mis_in_lpf_unit_height_log2) {
+ int plane, sb_row;
+ const int sb_cols =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, num_mis_in_lpf_unit_height_log2);
+ const int sb_rows =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, num_mis_in_lpf_unit_height_log2);
+
+  // In case of loopfilter row-multithreading, the worker on an SB row waits
+  // for the vertical edge filtering of the right and top-right SBs. Hence, if
+  // a thread (main or worker) encounters an error, mark the vertical
+  // loopfiltering of every SB row in the frame as complete so that dependent
+  // workers do not wait indefinitely.
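+  // (sync_write() publishes the per-row progress counter, so writing the
+  // final column index below releases any worker blocked in sync_read() on
+  // that row.)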
+ for (sb_row = 0; sb_row < sb_rows; ++sb_row)
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane)
+ sync_write(lf_sync, sb_row, sb_cols - 1, sb_cols, plane);
+}
+
+static AOM_INLINE void sync_lf_workers(AVxWorker *const workers,
+ AV1_COMMON *const cm, int num_workers) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ int had_error = workers[0].had_error;
+ struct aom_internal_error_info error_info;
+
+ // Read the error_info of main thread.
+ if (had_error) {
+ AVxWorker *const worker = &workers[0];
+ error_info = ((LFWorkerData *)worker->data2)->error_info;
+ }
+
+ // Wait till all rows are finished.
+ for (int i = num_workers - 1; i > 0; --i) {
+ AVxWorker *const worker = &workers[i];
+ if (!winterface->sync(worker)) {
+ had_error = 1;
+ error_info = ((LFWorkerData *)worker->data2)->error_info;
+ }
+ }
+ if (had_error)
+ aom_internal_error(cm->error, error_info.error_code, "%s",
+ error_info.detail);
+}
+
// Row-based multi-threaded loopfilter hook
static int loop_filter_row_worker(void *arg1, void *arg2) {
AV1LfSync *const lf_sync = (AV1LfSync *)arg1;
LFWorkerData *const lf_data = (LFWorkerData *)arg2;
AV1LfMTInfo *cur_job_info;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *job_mutex_ = lf_sync->job_mutex;
+#endif
+
+ struct aom_internal_error_info *const error_info = &lf_data->error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(job_mutex_);
+ lf_sync->lf_mt_exit = true;
+ pthread_mutex_unlock(job_mutex_);
+#endif
+ av1_set_vert_loop_filter_done(lf_data->cm, lf_sync, MAX_MIB_SIZE_LOG2);
+ return 0;
+ }
+ error_info->setjmp = 1;
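+  // While 'setjmp' is 1, an aom_internal_error() call on this error_info
+  // longjmp()s back to the handler above instead of returning; that is how a
+  // failure during row filtering aborts this worker.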
+
while ((cur_job_info = get_lf_job_info(lf_sync)) != NULL) {
const int lpf_opt_level = cur_job_info->lpf_opt_level;
av1_thread_loop_filter_rows(
lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd,
cur_job_info->mi_row, cur_job_info->plane, cur_job_info->dir,
- lpf_opt_level, lf_sync, lf_data->params_buf, lf_data->tx_buf,
- MAX_MIB_SIZE_LOG2);
+ lpf_opt_level, lf_sync, error_info, lf_data->params_buf,
+ lf_data->tx_buf, MAX_MIB_SIZE_LOG2);
}
+ error_info->setjmp = 0;
return 1;
}
static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
MACROBLOCKD *xd, int start, int stop,
- const int planes_to_lf[3], AVxWorker *workers,
- int num_workers, AV1LfSync *lf_sync,
- int lpf_opt_level) {
+ const int planes_to_lf[MAX_MB_PLANE],
+ AVxWorker *workers, int num_workers,
+ AV1LfSync *lf_sync, int lpf_opt_level) {
const AVxWorkerInterface *const winterface = aom_get_worker_interface();
int i;
loop_filter_frame_mt_init(cm, start, stop, planes_to_lf, num_workers, lf_sync,
@@ -359,6 +440,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
loop_filter_data_reset(lf_data, frame, cm, xd);
// Start loopfiltering
+ worker->had_error = 0;
if (i == 0) {
winterface->execute(worker);
} else {
@@ -366,15 +448,13 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
}
}
- // Wait till all rows are finished
- for (i = 1; i < num_workers; ++i) {
- winterface->sync(&workers[i]);
- }
+ sync_lf_workers(workers, cm, num_workers);
}
static void loop_filter_rows(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
MACROBLOCKD *xd, int start, int stop,
- const int planes_to_lf[3], int lpf_opt_level) {
+ const int planes_to_lf[MAX_MB_PLANE],
+ int lpf_opt_level) {
// Filter top rows of all planes first, in case the output can be partially
// reconstructed row by row.
int mi_row, plane, dir;
@@ -382,7 +462,7 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
AV1_DEBLOCKING_PARAMETERS params_buf[MAX_MIB_SIZE];
TX_SIZE tx_buf[MAX_MIB_SIZE];
for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
- for (plane = 0; plane < 3; ++plane) {
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
if (skip_loop_filter_plane(planes_to_lf, plane, lpf_opt_level)) {
continue;
}
@@ -390,7 +470,8 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
for (dir = 0; dir < 2; ++dir) {
av1_thread_loop_filter_rows(frame, cm, xd->plane, xd, mi_row, plane,
dir, lpf_opt_level, /*lf_sync=*/NULL,
- params_buf, tx_buf, MAX_MIB_SIZE_LOG2);
+ xd->error_info, params_buf, tx_buf,
+ MAX_MIB_SIZE_LOG2);
}
}
}
@@ -402,7 +483,7 @@ void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
int num_workers, AV1LfSync *lf_sync,
int lpf_opt_level) {
int start_mi_row, end_mi_row, mi_rows_to_filter;
- int planes_to_lf[3];
+ int planes_to_lf[MAX_MB_PLANE];
if (!check_planes_to_loop_filter(&cm->lf, planes_to_lf, plane_start,
plane_end))
@@ -536,6 +617,7 @@ void av1_loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm,
}
lr_sync->num_workers = num_workers;
+ lr_sync->lr_mt_exit = false;
for (int j = 0; j < num_planes; j++) {
CHECK_MEM_ERROR(
@@ -611,7 +693,7 @@ static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt,
for (int plane = 0; plane < num_planes; plane++) {
if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
num_even_lr_jobs =
- num_even_lr_jobs + ((ctxt[plane].rsi->vert_units_per_tile + 1) >> 1);
+ num_even_lr_jobs + ((ctxt[plane].rsi->vert_units + 1) >> 1);
}
lr_job_counter[0] = 0;
lr_job_counter[1] = num_even_lr_jobs;
@@ -620,26 +702,23 @@ static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt,
if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
const int is_uv = plane > 0;
const int ss_y = is_uv && cm->seq_params->subsampling_y;
-
- PixelRect tile_rect = ctxt[plane].tile_rect;
const int unit_size = ctxt[plane].rsi->restoration_unit_size;
-
- const int tile_h = tile_rect.bottom - tile_rect.top;
+ const int plane_h = ctxt[plane].plane_h;
const int ext_size = unit_size * 3 / 2;
int y0 = 0, i = 0;
- while (y0 < tile_h) {
- int remaining_h = tile_h - y0;
+ while (y0 < plane_h) {
+ int remaining_h = plane_h - y0;
int h = (remaining_h < ext_size) ? remaining_h : unit_size;
RestorationTileLimits limits;
- limits.v_start = tile_rect.top + y0;
- limits.v_end = tile_rect.top + y0 + h;
- assert(limits.v_end <= tile_rect.bottom);
- // Offset the tile upwards to align with the restoration processing stripe
+ limits.v_start = y0;
+ limits.v_end = y0 + h;
+ assert(limits.v_end <= plane_h);
+ // Offset upwards to align with the restoration processing stripe
const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
- limits.v_start = AOMMAX(tile_rect.top, limits.v_start - voffset);
- if (limits.v_end < tile_rect.bottom) limits.v_end -= voffset;
+ limits.v_start = AOMMAX(0, limits.v_start - voffset);
+ if (limits.v_end < plane_h) limits.v_end -= voffset;
assert(lr_job_counter[0] <= num_even_lr_jobs);
@@ -654,18 +733,18 @@ static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt,
lr_job_queue[lr_job_counter[i & 1]].v_copy_end =
limits.v_end - RESTORATION_BORDER;
if (i == 0) {
- assert(limits.v_start == tile_rect.top);
- lr_job_queue[lr_job_counter[i & 1]].v_copy_start = tile_rect.top;
+ assert(limits.v_start == 0);
+ lr_job_queue[lr_job_counter[i & 1]].v_copy_start = 0;
}
- if (i == (ctxt[plane].rsi->vert_units_per_tile - 1)) {
- assert(limits.v_end == tile_rect.bottom);
- lr_job_queue[lr_job_counter[i & 1]].v_copy_end = tile_rect.bottom;
+ if (i == (ctxt[plane].rsi->vert_units - 1)) {
+ assert(limits.v_end == plane_h);
+ lr_job_queue[lr_job_counter[i & 1]].v_copy_end = plane_h;
}
} else {
lr_job_queue[lr_job_counter[i & 1]].v_copy_start =
- AOMMAX(limits.v_start - RESTORATION_BORDER, tile_rect.top);
+ AOMMAX(limits.v_start - RESTORATION_BORDER, 0);
lr_job_queue[lr_job_counter[i & 1]].v_copy_end =
- AOMMIN(limits.v_end + RESTORATION_BORDER, tile_rect.bottom);
+ AOMMIN(limits.v_end + RESTORATION_BORDER, plane_h);
}
lr_job_counter[i & 1]++;
lr_sync->jobs_enqueued++;
@@ -682,7 +761,7 @@ static AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) {
#if CONFIG_MULTITHREAD
pthread_mutex_lock(lr_sync->job_mutex);
- if (lr_sync->jobs_dequeued < lr_sync->jobs_enqueued) {
+ if (!lr_sync->lr_mt_exit && lr_sync->jobs_dequeued < lr_sync->jobs_enqueued) {
cur_job_info = lr_sync->job_queue + lr_sync->jobs_dequeued;
lr_sync->jobs_dequeued++;
}
@@ -695,6 +774,26 @@ static AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) {
return cur_job_info;
}
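+
+// Marks the loop restoration of every unit row in every restored plane as
+// complete. The y0/h stepping below mirrors the row partitioning used by
+// enqueue_lr_jobs(), so row_number matches the lr_unit_row values that
+// blocked workers wait on in their sync reads.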
+static void set_loop_restoration_done(AV1LrSync *const lr_sync,
+ FilterFrameCtxt *const ctxt) {
+ for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ if (ctxt[plane].rsi->frame_restoration_type == RESTORE_NONE) continue;
+ int y0 = 0, row_number = 0;
+ const int unit_size = ctxt[plane].rsi->restoration_unit_size;
+ const int plane_h = ctxt[plane].plane_h;
+ const int ext_size = unit_size * 3 / 2;
+ const int hnum_rest_units = ctxt[plane].rsi->horz_units;
+ while (y0 < plane_h) {
+ const int remaining_h = plane_h - y0;
+ const int h = (remaining_h < ext_size) ? remaining_h : unit_size;
+ lr_sync_write(lr_sync, row_number, hnum_rest_units - 1, hnum_rest_units,
+ plane);
+ y0 += h;
+ ++row_number;
+ }
+ }
+}
+
// Implement row loop restoration for each thread.
static int loop_restoration_row_worker(void *arg1, void *arg2) {
AV1LrSync *const lr_sync = (AV1LrSync *)arg1;
@@ -703,16 +802,39 @@ static int loop_restoration_row_worker(void *arg1, void *arg2) {
FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
int lr_unit_row;
int plane;
- const int tile_row = LR_TILE_ROW;
- const int tile_col = LR_TILE_COL;
- const int tile_cols = LR_TILE_COLS;
- const int tile_idx = tile_col + tile_row * tile_cols;
+ int plane_w;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *job_mutex_ = lr_sync->job_mutex;
+#endif
+ struct aom_internal_error_info *const error_info = &lrworkerdata->error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(job_mutex_);
+ lr_sync->lr_mt_exit = true;
+ pthread_mutex_unlock(job_mutex_);
+#endif
+    // In case of loop restoration multithreading, the worker on an even lr
+    // block row waits for the filtering of the top-right and bottom-right
+    // blocks to complete. Hence, if a thread (main or worker) encounters an
+    // error, mark the filtering of every row in the frame as complete so that
+    // dependent workers do not wait indefinitely.
+ set_loop_restoration_done(lr_sync, lr_ctxt->ctxt);
+ return 0;
+ }
+ error_info->setjmp = 1;
+
typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
int vstart, int vend);
- static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y,
- aom_yv12_partial_coloc_copy_u,
- aom_yv12_partial_coloc_copy_v };
+ static const copy_fun copy_funs[MAX_MB_PLANE] = {
+ aom_yv12_partial_coloc_copy_y, aom_yv12_partial_coloc_copy_u,
+ aom_yv12_partial_coloc_copy_v
+ };
while (1) {
AV1LrMTInfo *cur_job_info = get_lr_job_info(lr_sync);
@@ -724,7 +846,7 @@ static int loop_restoration_row_worker(void *arg1, void *arg2) {
limits.v_end = cur_job_info->v_end;
lr_unit_row = cur_job_info->lr_unit_row;
plane = cur_job_info->plane;
- const int unit_idx0 = tile_idx * ctxt[plane].rsi->units_per_tile;
+ plane_w = ctxt[plane].plane_w;
// sync_mode == 1 implies only sync read is required in LR Multi-threading
// sync_mode == 0 implies only sync write is required.
@@ -734,16 +856,14 @@ static int loop_restoration_row_worker(void *arg1, void *arg2) {
: av1_lr_sync_write_dummy;
av1_foreach_rest_unit_in_row(
- &limits, &(ctxt[plane].tile_rect), lr_ctxt->on_rest_unit, lr_unit_row,
- ctxt[plane].rsi->restoration_unit_size, unit_idx0,
- ctxt[plane].rsi->horz_units_per_tile,
- ctxt[plane].rsi->vert_units_per_tile, plane, &ctxt[plane],
+ &limits, plane_w, lr_ctxt->on_rest_unit, lr_unit_row,
+ ctxt[plane].rsi->restoration_unit_size, ctxt[plane].rsi->horz_units,
+ ctxt[plane].rsi->vert_units, plane, &ctxt[plane],
lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, on_sync_read,
- on_sync_write, lr_sync);
+ on_sync_write, lr_sync, error_info);
- copy_funs[plane](lr_ctxt->dst, lr_ctxt->frame, ctxt[plane].tile_rect.left,
- ctxt[plane].tile_rect.right, cur_job_info->v_copy_start,
- cur_job_info->v_copy_end);
+ copy_funs[plane](lr_ctxt->dst, lr_ctxt->frame, 0, plane_w,
+ cur_job_info->v_copy_start, cur_job_info->v_copy_end);
if (lrworkerdata->do_extend_border) {
aom_extend_frame_borders_plane_row(lr_ctxt->frame, plane,
@@ -754,11 +874,37 @@ static int loop_restoration_row_worker(void *arg1, void *arg2) {
break;
}
}
+ error_info->setjmp = 0;
return 1;
}
+static AOM_INLINE void sync_lr_workers(AVxWorker *const workers,
+ AV1_COMMON *const cm, int num_workers) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ int had_error = workers[0].had_error;
+ struct aom_internal_error_info error_info;
+
+ // Read the error_info of main thread.
+ if (had_error) {
+ AVxWorker *const worker = &workers[0];
+ error_info = ((LRWorkerData *)worker->data2)->error_info;
+ }
+
+ // Wait till all rows are finished.
+ for (int i = num_workers - 1; i > 0; --i) {
+ AVxWorker *const worker = &workers[i];
+ if (!winterface->sync(worker)) {
+ had_error = 1;
+ error_info = ((LRWorkerData *)worker->data2)->error_info;
+ }
+ }
+ if (had_error)
+ aom_internal_error(cm->error, error_info.error_code, "%s",
+ error_info.detail);
+}
+
static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
- AVxWorker *workers, int nworkers,
+ AVxWorker *workers, int num_workers,
AV1LrSync *lr_sync, AV1_COMMON *cm,
int do_extend_border) {
FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
@@ -771,16 +917,12 @@ static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
for (int plane = 0; plane < num_planes; plane++) {
if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
- const PixelRect tile_rect = ctxt[plane].tile_rect;
- const int max_tile_h = tile_rect.bottom - tile_rect.top;
-
+ const int plane_h = ctxt[plane].plane_h;
const int unit_size = cm->rst_info[plane].restoration_unit_size;
- num_rows_lr =
- AOMMAX(num_rows_lr, av1_lr_count_units_in_tile(unit_size, max_tile_h));
+ num_rows_lr = AOMMAX(num_rows_lr, av1_lr_count_units(unit_size, plane_h));
}
- const int num_workers = nworkers;
int i;
assert(MAX_MB_PLANE == 3);
@@ -809,6 +951,7 @@ static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
worker->data2 = &lr_sync->lrworkerdata[i];
// Start loop restoration
+ worker->had_error = 0;
if (i == 0) {
winterface->execute(worker);
} else {
@@ -816,10 +959,7 @@ static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
}
}
- // Wait till all rows are finished
- for (i = 1; i < num_workers; ++i) {
- winterface->sync(&workers[i]);
- }
+ sync_lr_workers(workers, cm, num_workers);
}
void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
@@ -852,6 +992,7 @@ static AOM_INLINE void launch_cdef_workers(AVxWorker *const workers,
const AVxWorkerInterface *const winterface = aom_get_worker_interface();
for (int i = num_workers - 1; i >= 0; i--) {
AVxWorker *const worker = &workers[i];
+ worker->had_error = 0;
if (i == 0)
winterface->execute(worker);
else
@@ -863,16 +1004,26 @@ static AOM_INLINE void sync_cdef_workers(AVxWorker *const workers,
AV1_COMMON *const cm,
int num_workers) {
const AVxWorkerInterface *const winterface = aom_get_worker_interface();
- int had_error = 0;
+ int had_error = workers[0].had_error;
+ struct aom_internal_error_info error_info;
- // Wait for completion of Cdef frame.
- for (int i = num_workers - 1; i > 0; i--) {
+ // Read the error_info of main thread.
+ if (had_error) {
+ AVxWorker *const worker = &workers[0];
+ error_info = ((AV1CdefWorkerData *)worker->data2)->error_info;
+ }
+
+ // Wait till all rows are finished.
+ for (int i = num_workers - 1; i > 0; --i) {
AVxWorker *const worker = &workers[i];
- had_error |= !winterface->sync(worker);
+ if (!winterface->sync(worker)) {
+ had_error = 1;
+ error_info = ((AV1CdefWorkerData *)worker->data2)->error_info;
+ }
}
if (had_error)
- aom_internal_error(cm->error, AOM_CODEC_ERROR,
- "Failed to process cdef frame");
+ aom_internal_error(cm->error, error_info.error_code, "%s",
+ error_info.detail);
}
// Updates the row index of the next job to be processed.
@@ -888,14 +1039,15 @@ static void update_cdef_row_next_job_info(AV1CdefSync *const cdef_sync,
// Checks if a job is available. If job is available,
// populates next job information and returns 1, else returns 0.
static AOM_INLINE int get_cdef_row_next_job(AV1CdefSync *const cdef_sync,
- int *cur_fbr, const int nvfb) {
+ volatile int *cur_fbr,
+ const int nvfb) {
#if CONFIG_MULTITHREAD
pthread_mutex_lock(cdef_sync->mutex_);
#endif // CONFIG_MULTITHREAD
int do_next_row = 0;
// Populates information needed for current job and update the row
// index of the next row to be processed.
- if (cdef_sync->end_of_frame == 0) {
+ if (!cdef_sync->cdef_mt_exit && cdef_sync->end_of_frame == 0) {
do_next_row = 1;
*cur_fbr = cdef_sync->fbr;
update_cdef_row_next_job_info(cdef_sync, nvfb);
@@ -906,19 +1058,49 @@ static AOM_INLINE int get_cdef_row_next_job(AV1CdefSync *const cdef_sync,
return do_next_row;
}
+static void set_cdef_init_fb_row_done(AV1CdefSync *const cdef_sync, int nvfb) {
+ for (int fbr = 0; fbr < nvfb; fbr++) cdef_row_mt_sync_write(cdef_sync, fbr);
+}
+
// Hook function for each thread in CDEF multi-threading.
static int cdef_sb_row_worker_hook(void *arg1, void *arg2) {
AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg1;
AV1CdefWorkerData *const cdef_worker = (AV1CdefWorkerData *)arg2;
AV1_COMMON *cm = cdef_worker->cm;
const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
- int cur_fbr;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *job_mutex_ = cdef_sync->mutex_;
+#endif
+ struct aom_internal_error_info *const error_info = &cdef_worker->error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(job_mutex_);
+ cdef_sync->cdef_mt_exit = true;
+ pthread_mutex_unlock(job_mutex_);
+#endif
+    // In case of cdef row-multithreading, the worker on a filter block row
+    // (fbr) waits for the top and bottom line buffers of the row above to be
+    // copied. Hence, if a thread (main or worker) encounters an error before
+    // copying the line buffers, mark the line buffer copy of every row as
+    // complete so that dependent workers do not wait indefinitely.
+ set_cdef_init_fb_row_done(cdef_sync, nvfb);
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ volatile int cur_fbr;
const int num_planes = av1_num_planes(cm);
while (get_cdef_row_next_job(cdef_sync, &cur_fbr, nvfb)) {
MACROBLOCKD *xd = cdef_worker->xd;
av1_cdef_fb_row(cm, xd, cdef_worker->linebuf, cdef_worker->colbuf,
cdef_worker->srcbuf, cur_fbr,
- cdef_worker->cdef_init_fb_row_fn, cdef_sync);
+ cdef_worker->cdef_init_fb_row_fn, cdef_sync, error_info);
if (cdef_worker->do_extend_border) {
for (int plane = 0; plane < num_planes; ++plane) {
const YV12_BUFFER_CONFIG *ybf = &cm->cur_frame->buf;
@@ -932,6 +1114,7 @@ static int cdef_sb_row_worker_hook(void *arg1, void *arg2) {
}
}
}
+ error_info->setjmp = 0;
return 1;
}
diff --git a/av1/common/thread_common.h b/av1/common/thread_common.h
index b6485c365..6d695e8d0 100644
--- a/av1/common/thread_common.h
+++ b/av1/common/thread_common.h
@@ -54,6 +54,10 @@ typedef struct AV1LfSyncData {
AV1LfMTInfo *job_queue;
int jobs_enqueued;
int jobs_dequeued;
+
+ // Initialized to false, set to true by the worker thread that encounters an
+ // error in order to abort the processing of other worker threads.
+ bool lf_mt_exit;
} AV1LfSync;
typedef struct AV1LrMTInfo {
@@ -71,6 +75,7 @@ typedef struct LoopRestorationWorkerData {
void *rlbs;
void *lr_ctxt;
int do_extend_border;
+ struct aom_internal_error_info error_info;
} LRWorkerData;
// Looprestoration row synchronization
@@ -98,6 +103,9 @@ typedef struct AV1LrSyncData {
AV1LrMTInfo *job_queue;
int jobs_enqueued;
int jobs_dequeued;
+ // Initialized to false, set to true by the worker thread that encounters
+ // an error in order to abort the processing of other worker threads.
+ bool lr_mt_exit;
} AV1LrSync;
typedef struct AV1CdefWorker {
@@ -108,6 +116,7 @@ typedef struct AV1CdefWorker {
uint16_t *linebuf[MAX_MB_PLANE];
cdef_init_fb_row_t cdef_init_fb_row_fn;
int do_extend_border;
+ struct aom_internal_error_info error_info;
} AV1CdefWorkerData;
typedef struct AV1CdefRowSync {
@@ -132,6 +141,9 @@ typedef struct AV1CdefSyncData {
int fbr;
// Column index in units of 64x64 block
int fbc;
+ // Initialized to false, set to true by the worker thread that encounters
+ // an error in order to abort the processing of other worker threads.
+ bool cdef_mt_exit;
} AV1CdefSync;
void av1_cdef_frame_mt(AV1_COMMON *const cm, MACROBLOCKD *const xd,
@@ -164,6 +176,9 @@ void av1_loop_filter_dealloc(AV1LfSync *lf_sync);
void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows,
int width, int num_workers);
+void av1_set_vert_loop_filter_done(AV1_COMMON *cm, AV1LfSync *lf_sync,
+ int num_mis_in_lpf_unit_height_log2);
+
void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
struct macroblockd *xd, int plane_start,
int plane_end, int partial_frame,
@@ -185,11 +200,11 @@ void av1_thread_loop_filter_rows(
const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
struct macroblockd_plane *planes, MACROBLOCKD *xd, int mi_row, int plane,
int dir, int lpf_opt_level, AV1LfSync *const lf_sync,
+ struct aom_internal_error_info *error_info,
AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf, int mib_size_log2);
-static AOM_FORCE_INLINE bool skip_loop_filter_plane(const int planes_to_lf[3],
- int plane,
- int lpf_opt_level) {
+static AOM_FORCE_INLINE bool skip_loop_filter_plane(
+ const int planes_to_lf[MAX_MB_PLANE], int plane, int lpf_opt_level) {
// If LPF_PICK_METHOD is LPF_PICK_FROM_Q, we have the option to filter both
// chroma planes together
if (lpf_opt_level == 2) {
@@ -212,7 +227,7 @@ static AOM_FORCE_INLINE bool skip_loop_filter_plane(const int planes_to_lf[3],
}
static AOM_INLINE void enqueue_lf_jobs(AV1LfSync *lf_sync, int start, int stop,
- const int planes_to_lf[3],
+ const int planes_to_lf[MAX_MB_PLANE],
int lpf_opt_level,
int num_mis_in_lpf_unit_height) {
int mi_row, plane, dir;
@@ -225,7 +240,7 @@ static AOM_INLINE void enqueue_lf_jobs(AV1LfSync *lf_sync, int start, int stop,
// partially reconstructed row by row.
for (dir = 0; dir < 2; ++dir) {
for (mi_row = start; mi_row < stop; mi_row += num_mis_in_lpf_unit_height) {
- for (plane = 0; plane < 3; ++plane) {
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
if (skip_loop_filter_plane(planes_to_lf, plane, lpf_opt_level)) {
continue;
}
@@ -242,9 +257,9 @@ static AOM_INLINE void enqueue_lf_jobs(AV1LfSync *lf_sync, int start, int stop,
}
static AOM_INLINE void loop_filter_frame_mt_init(
- AV1_COMMON *cm, int start_mi_row, int end_mi_row, const int planes_to_lf[3],
- int num_workers, AV1LfSync *lf_sync, int lpf_opt_level,
- int num_mis_in_lpf_unit_height_log2) {
+ AV1_COMMON *cm, int start_mi_row, int end_mi_row,
+ const int planes_to_lf[MAX_MB_PLANE], int num_workers, AV1LfSync *lf_sync,
+ int lpf_opt_level, int num_mis_in_lpf_unit_height_log2) {
// Number of superblock rows
const int sb_rows =
CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, num_mis_in_lpf_unit_height_log2);
@@ -271,7 +286,7 @@ static AOM_INLINE AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) {
#if CONFIG_MULTITHREAD
pthread_mutex_lock(lf_sync->job_mutex);
- if (lf_sync->jobs_dequeued < lf_sync->jobs_enqueued) {
+ if (!lf_sync->lf_mt_exit && lf_sync->jobs_dequeued < lf_sync->jobs_enqueued) {
cur_job_info = lf_sync->job_queue + lf_sync->jobs_dequeued;
lf_sync->jobs_dequeued++;
}
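
The lf_mt_exit guard added to get_lf_job_info() is the same early-out that get_cdef_row_next_job() gained above: once any thread flags an error, the job queue drains without dispensing more work. Reduced to its shape (JobQueue and JobInfo are hypothetical):

static JobInfo *get_job(JobQueue *q) {
  JobInfo *job = NULL;
#if CONFIG_MULTITHREAD
  pthread_mutex_lock(q->mutex);
#endif
  // mt_exit short-circuits the dequeue so that no new work starts after
  // an error has been flagged by any thread.
  if (!q->mt_exit && q->jobs_dequeued < q->jobs_enqueued)
    job = &q->jobs[q->jobs_dequeued++];
#if CONFIG_MULTITHREAD
  pthread_mutex_unlock(q->mutex);
#endif
  return job;
}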
diff --git a/av1/common/tile_common.c b/av1/common/tile_common.c
index 508fe30e7..b964f259b 100644
--- a/av1/common/tile_common.c
+++ b/av1/common/tile_common.c
@@ -167,12 +167,12 @@ void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) {
assert(tile->mi_col_end > tile->mi_col_start);
}
-int av1_get_sb_rows_in_tile(AV1_COMMON *cm, const TileInfo *tile) {
+int av1_get_sb_rows_in_tile(const AV1_COMMON *cm, const TileInfo *tile) {
return CEIL_POWER_OF_TWO(tile->mi_row_end - tile->mi_row_start,
cm->seq_params->mib_size_log2);
}
-int av1_get_sb_cols_in_tile(AV1_COMMON *cm, const TileInfo *tile) {
+int av1_get_sb_cols_in_tile(const AV1_COMMON *cm, const TileInfo *tile) {
return CEIL_POWER_OF_TWO(tile->mi_col_end - tile->mi_col_start,
cm->seq_params->mib_size_log2);
}
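
The constified helpers above divide a tile's mi-unit extent by the superblock size, rounding up. CEIL_POWER_OF_TWO in libaom is equivalent to the sketch below (assuming value >= 0 and a shift that does not overflow):

static int ceil_power_of_two(int value, int n) {
  return (value + (1 << n) - 1) >> n;
}
// e.g. a tile 19 mi units tall with 64x64 superblocks (mib_size_log2 == 4)
// spans ceil_power_of_two(19, 4) == 2 superblock rows.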
diff --git a/av1/common/tile_common.h b/av1/common/tile_common.h
index 8615a2c2d..5383ae940 100644
--- a/av1/common/tile_common.h
+++ b/av1/common/tile_common.h
@@ -40,8 +40,8 @@ void av1_tile_init(TileInfo *tile, const struct AV1Common *cm, int row,
void av1_tile_set_row(TileInfo *tile, const struct AV1Common *cm, int row);
void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col);
-int av1_get_sb_rows_in_tile(struct AV1Common *cm, const TileInfo *tile);
-int av1_get_sb_cols_in_tile(struct AV1Common *cm, const TileInfo *tile);
+int av1_get_sb_rows_in_tile(const struct AV1Common *cm, const TileInfo *tile);
+int av1_get_sb_cols_in_tile(const struct AV1Common *cm, const TileInfo *tile);
// Return the pixel extents of the given tile
PixelRect av1_get_tile_rect(const TileInfo *tile_info,
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index 83f410e9f..f376e1674 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -17,6 +17,7 @@
#include "config/av1_rtcd.h"
+#include "av1/common/av1_common_int.h"
#include "av1/common/warped_motion.h"
#include "av1/common/scale.h"
@@ -214,10 +215,42 @@ static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
return 1;
}
+#ifndef NDEBUG
+// Check that the given warp model satisfies the relevant constraints for
+// its stated model type
+static void check_model_consistency(WarpedMotionParams *wm) {
+ switch (wm->wmtype) {
+ case IDENTITY:
+ assert(wm->wmmat[0] == 0);
+ assert(wm->wmmat[1] == 0);
+ AOM_FALLTHROUGH_INTENDED;
+ case TRANSLATION:
+ assert(wm->wmmat[2] == 1 << WARPEDMODEL_PREC_BITS);
+ assert(wm->wmmat[3] == 0);
+ AOM_FALLTHROUGH_INTENDED;
+ case ROTZOOM:
+ assert(wm->wmmat[4] == -wm->wmmat[3]);
+ assert(wm->wmmat[5] == wm->wmmat[2]);
+ AOM_FALLTHROUGH_INTENDED;
+ case AFFINE: break;
+ default: assert(0 && "Bad wmtype");
+ }
+}
+#endif // NDEBUG
+
// Returns 1 on success or 0 on an invalid affine set
int av1_get_shear_params(WarpedMotionParams *wm) {
+#ifndef NDEBUG
+ // Check that models have been constructed sensibly
+ // This is a good place to check, because this function does not need to
+ // be called until after model construction is complete, but must be called
+ // before the model can be used for prediction.
+ check_model_consistency(wm);
+#endif // NDEBUG
+
const int32_t *mat = wm->wmmat;
if (!is_affine_valid(wm)) return 0;
+
wm->alpha =
clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
@@ -247,17 +280,6 @@ int av1_get_shear_params(WarpedMotionParams *wm) {
}
#if CONFIG_AV1_HIGHBITDEPTH
-static INLINE int highbd_error_measure(int err, int bd) {
- const int b = bd - 8;
- const int bmask = (1 << b) - 1;
- const int v = (1 << b);
- err = abs(err);
- const int e1 = err >> b;
- const int e2 = err & bmask;
- return error_measure_lut[255 + e1] * (v - e2) +
- error_measure_lut[256 + e1] * e2;
-}
-
/* Note: For an explanation of the warp algorithm, and some notes on bit widths
for hardware implementations, see the comments above av1_warp_affine_c
*/
@@ -392,11 +414,6 @@ void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
int p_col, int p_row, int p_width, int p_height,
int p_stride, int subsampling_x, int subsampling_y,
int bd, ConvolveParams *conv_params) {
- assert(wm->wmtype <= AFFINE);
- if (wm->wmtype == ROTZOOM) {
- wm->wmmat[5] = wm->wmmat[2];
- wm->wmmat[4] = -wm->wmmat[3];
- }
const int32_t *const mat = wm->wmmat;
const int16_t alpha = wm->alpha;
const int16_t beta = wm->beta;
@@ -408,46 +425,6 @@ void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
subsampling_y, bd, conv_params, alpha, beta, gamma,
delta);
}
-
-int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride,
- const uint16_t *const dst, int p_width,
- int p_height, int p_stride, int bd) {
- int64_t sum_error = 0;
- for (int i = 0; i < p_height; ++i) {
- for (int j = 0; j < p_width; ++j) {
- sum_error +=
- highbd_error_measure(dst[j + i * p_stride] - ref[j + i * stride], bd);
- }
- }
- return sum_error;
-}
-
-static int64_t highbd_segmented_frame_error(
- const uint16_t *const ref, int stride, const uint16_t *const dst,
- int p_width, int p_height, int p_stride, int bd, uint8_t *segment_map,
- int segment_map_stride) {
- int patch_w, patch_h;
- const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
- const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
- int64_t sum_error = 0;
- for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
- for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
- int seg_x = j >> WARP_ERROR_BLOCK_LOG;
- int seg_y = i >> WARP_ERROR_BLOCK_LOG;
- // Only compute the error if this block contains inliers from the motion
- // model
- if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
-
- // avoid computing error into the frame padding
- patch_w = AOMMIN(error_bsize_w, p_width - j);
- patch_h = AOMMIN(error_bsize_h, p_height - i);
- sum_error += av1_calc_highbd_frame_error(ref + j + i * stride, stride,
- dst + j + i * p_stride, patch_w,
- patch_h, p_stride, bd);
- }
- }
- return sum_error;
-}
#endif // CONFIG_AV1_HIGHBITDEPTH
/* The warp filter for ROTZOOM and AFFINE models works as follows:
@@ -669,11 +646,6 @@ void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width,
int height, int stride, uint8_t *pred, int p_col, int p_row,
int p_width, int p_height, int p_stride, int subsampling_x,
int subsampling_y, ConvolveParams *conv_params) {
- assert(wm->wmtype <= AFFINE);
- if (wm->wmtype == ROTZOOM) {
- wm->wmmat[5] = wm->wmmat[2];
- wm->wmmat[4] = -wm->wmmat[3];
- }
const int32_t *const mat = wm->wmmat;
const int16_t alpha = wm->alpha;
const int16_t beta = wm->beta;
@@ -684,79 +656,6 @@ void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width,
alpha, beta, gamma, delta);
}
-int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride,
- const uint8_t *const dst, int p_width,
- int p_height, int p_stride) {
- int64_t sum_error = 0;
- for (int i = 0; i < p_height; ++i) {
- for (int j = 0; j < p_width; ++j) {
- sum_error +=
- (int64_t)error_measure(dst[j + i * p_stride] - ref[j + i * stride]);
- }
- }
- return sum_error;
-}
-
-static int64_t segmented_frame_error(const uint8_t *const ref, int stride,
- const uint8_t *const dst, int p_width,
- int p_height, int p_stride,
- uint8_t *segment_map,
- int segment_map_stride) {
- int patch_w, patch_h;
- const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
- const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
- int64_t sum_error = 0;
- for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
- for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
- int seg_x = j >> WARP_ERROR_BLOCK_LOG;
- int seg_y = i >> WARP_ERROR_BLOCK_LOG;
- // Only compute the error if this block contains inliers from the motion
- // model
- if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
-
- // avoid computing error into the frame padding
- patch_w = AOMMIN(error_bsize_w, p_width - j);
- patch_h = AOMMIN(error_bsize_h, p_height - i);
- sum_error += av1_calc_frame_error(ref + j + i * stride, stride,
- dst + j + i * p_stride, patch_w,
- patch_h, p_stride);
- }
- }
- return sum_error;
-}
-
-int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
- uint8_t *dst, int p_width, int p_height, int p_stride) {
-#if CONFIG_AV1_HIGHBITDEPTH
- if (use_hbd) {
- return av1_calc_highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
- CONVERT_TO_SHORTPTR(dst), p_width,
- p_height, p_stride, bd);
- }
-#endif
- (void)use_hbd;
- (void)bd;
- return av1_calc_frame_error(ref, stride, dst, p_width, p_height, p_stride);
-}
-
-int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
- int stride, uint8_t *dst, int p_width,
- int p_height, int p_stride,
- uint8_t *segment_map,
- int segment_map_stride) {
-#if CONFIG_AV1_HIGHBITDEPTH
- if (use_hbd) {
- return highbd_segmented_frame_error(
- CONVERT_TO_SHORTPTR(ref), stride, CONVERT_TO_SHORTPTR(dst), p_width,
- p_height, p_stride, bd, segment_map, segment_map_stride);
- }
-#endif
- (void)use_hbd;
- (void)bd;
- return segmented_frame_error(ref, stride, dst, p_width, p_height, p_stride,
- segment_map, segment_map_stride);
-}
-
void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
const uint8_t *ref, int width, int height, int stride,
uint8_t *pred, int p_col, int p_row, int p_width,
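
With the fix-up code deleted from warp_plane() and highbd_warp_plane(), a ROTZOOM model must now arrive with wmmat[4] and wmmat[5] already derived; check_model_consistency() enforces that in debug builds, and av1_get_shear_params() is the single choke point before prediction. A hedged sketch of constructing a conforming model (build_rotzoom is hypothetical; default_warp_params is assumed to be libaom's identity initializer):

static int build_rotzoom(WarpedMotionParams *wm, int32_t diag_q16,
                         int32_t offdiag_q16) {
  *wm = default_warp_params;  // Assumed identity-initialized model.
  wm->wmtype = ROTZOOM;
  wm->wmmat[2] = diag_q16;       // Diagonal term, Q(WARPEDMODEL_PREC_BITS).
  wm->wmmat[3] = offdiag_q16;    // Off-diagonal term.
  wm->wmmat[4] = -wm->wmmat[3];  // Required: wmmat[4] == -wmmat[3].
  wm->wmmat[5] = wm->wmmat[2];   // Required: wmmat[5] ==  wmmat[2].
  // Derives alpha/beta/gamma/delta; returns 0 on an invalid affine set.
  return av1_get_shear_params(wm);
}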
diff --git a/av1/common/warped_motion.h b/av1/common/warped_motion.h
index d6fe325e0..d772df887 100644
--- a/av1/common/warped_motion.h
+++ b/av1/common/warped_motion.h
@@ -38,76 +38,6 @@ extern const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
DECLARE_ALIGNED(8, extern const int8_t,
av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]);
-/* clang-format off */
-static const int error_measure_lut[512] = {
- // pow 0.7
- 16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068,
- 16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703,
- 15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335,
- 15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963,
- 14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587,
- 14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206,
- 14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822,
- 13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432,
- 13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038,
- 12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639,
- 12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234,
- 12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823,
- 11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406,
- 11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982,
- 10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552,
- 10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113,
- 10058, 10002, 9947, 9891, 9835, 9779, 9723, 9666,
- 9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211,
- 9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745,
- 8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269,
- 8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780,
- 7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278,
- 7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760,
- 6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225,
- 6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670,
- 5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090,
- 5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480,
- 4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832,
- 3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133,
- 3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359,
- 2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452,
- 1323, 1187, 1045, 894, 731, 550, 339, 0,
- 339, 550, 731, 894, 1045, 1187, 1323, 1452,
- 1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359,
- 2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133,
- 3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832,
- 3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480,
- 4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090,
- 5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670,
- 5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225,
- 6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760,
- 6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278,
- 7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780,
- 7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269,
- 8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745,
- 8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211,
- 9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666,
- 9723, 9779, 9835, 9891, 9947, 10002, 10058, 10113,
- 10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552,
- 10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982,
- 11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406,
- 11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823,
- 11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234,
- 12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639,
- 12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038,
- 13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432,
- 13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822,
- 13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206,
- 14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587,
- 14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963,
- 15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335,
- 15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703,
- 15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068,
- 16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384,
-};
-/* clang-format on */
-
static const uint8_t warp_pad_left[14][16] = {
{ 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
@@ -142,24 +72,6 @@ static const uint8_t warp_pad_right[14][16] = {
{ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }
};
-static INLINE int error_measure(int err) {
- return error_measure_lut[255 + err];
-}
-
-// Returns the error between the frame described by 'ref' and the frame
-// described by 'dst'.
-int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
- uint8_t *dst, int p_width, int p_height, int p_stride);
-
-int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
- int stride, uint8_t *dst, int p_width,
- int p_height, int p_stride,
- uint8_t *segment_map, int segment_map_stride);
-
-int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride,
- const uint16_t *const dst, int p_width,
- int p_height, int p_stride, int bd);
-
void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
int width, int height, int stride, uint16_t *const pred,
int p_col, int p_row, int p_width, int p_height,
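
The deleted error_measure_lut follows a |err|^0.7 curve scaled to Q14; the sketch below reproduces every entry up to the table's rounding. The trailing duplicate 16384 appears to pad index 511 so the interpolating read at 256 + e1 in the deleted highbd_error_measure() stays in bounds (my inference, not stated in the source):

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Regenerate the pow-0.7 LUT: lut[255 + e] = round(16384 * (|e|/255)^0.7).
int main(void) {
  int lut[512];
  for (int e = -255; e <= 255; ++e)
    lut[255 + e] = (int)lround(16384.0 * pow(abs(e) / 255.0, 0.7));
  lut[511] = 16384;                                  // Padding entry.
  printf("%d %d %d\n", lut[0], lut[255], lut[256]);  // 16384 0 339
  return 0;
}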
diff --git a/av1/common/x86/highbd_wiener_convolve_avx2.c b/av1/common/x86/highbd_wiener_convolve_avx2.c
index 0c8a8505b..ea8b35bde 100644
--- a/av1/common/x86/highbd_wiener_convolve_avx2.c
+++ b/av1/common/x86/highbd_wiener_convolve_avx2.c
@@ -29,7 +29,7 @@ void av1_highbd_wiener_convolve_add_src_avx2(
const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w, int h,
- const ConvolveParams *conv_params, int bd) {
+ const WienerConvolveParams *conv_params, int bd) {
assert(x_step_q4 == 16 && y_step_q4 == 16);
assert(!(w & 7));
assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
diff --git a/av1/common/x86/highbd_wiener_convolve_ssse3.c b/av1/common/x86/highbd_wiener_convolve_ssse3.c
index 818b1099c..1c884741f 100644
--- a/av1/common/x86/highbd_wiener_convolve_ssse3.c
+++ b/av1/common/x86/highbd_wiener_convolve_ssse3.c
@@ -22,7 +22,7 @@ void av1_highbd_wiener_convolve_add_src_ssse3(
const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w, int h,
- const ConvolveParams *conv_params, int bd) {
+ const WienerConvolveParams *conv_params, int bd) {
assert(x_step_q4 == 16 && y_step_q4 == 16);
assert(!(w & 7));
assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
diff --git a/av1/common/x86/intra_edge_sse4.c b/av1/common/x86/intra_edge_sse4.c
index f025f7917..3eee46fae 100644
--- a/av1/common/x86/intra_edge_sse4.c
+++ b/av1/common/x86/intra_edge_sse4.c
@@ -113,7 +113,69 @@ void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {
}
}
-void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength) {
+void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {
+ // interpolate half-sample positions
+ assert(sz <= 24);
+
+ DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
+ { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
+ };
+
+ DECLARE_ALIGNED(
+ 16, static const int8_t,
+ v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
+ { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } };
+
+ // Extend first/last samples (upper-left p[-1], last p[sz-1])
+ // to support 4-tap filter
+ p[-2] = p[-1];
+ p[sz] = p[sz - 1];
+
+ uint8_t *in = &p[-2];
+ uint8_t *out = &p[-2];
+
+ int n = sz + 1; // Input length including upper-left sample
+
+ __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
+ __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
+
+ __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
+ __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
+ __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);
+
+ while (n > 0) {
+ __m128i in8 = _mm_alignr_epi8(in16, in0, 8);
+ __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
+ __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
+ __m128i d2 = _mm_shuffle_epi8(in8, shuf0);
+ __m128i d3 = _mm_shuffle_epi8(in8, shuf1);
+ d0 = _mm_maddubs_epi16(d0, coef0);
+ d1 = _mm_maddubs_epi16(d1, coef0);
+ d2 = _mm_maddubs_epi16(d2, coef0);
+ d3 = _mm_maddubs_epi16(d3, coef0);
+ d0 = _mm_hadd_epi16(d0, d1);
+ d2 = _mm_hadd_epi16(d2, d3);
+ __m128i eight = _mm_set1_epi16(8);
+ d0 = _mm_add_epi16(d0, eight);
+ d2 = _mm_add_epi16(d2, eight);
+ d0 = _mm_srai_epi16(d0, 4);
+ d2 = _mm_srai_epi16(d2, 4);
+ d0 = _mm_packus_epi16(d0, d2);
+ __m128i in1 = _mm_alignr_epi8(in16, in0, 1);
+ __m128i out0 = _mm_unpacklo_epi8(in1, d0);
+ __m128i out1 = _mm_unpackhi_epi8(in1, d0);
+ _mm_storeu_si128((__m128i *)&out[0], out0);
+ _mm_storeu_si128((__m128i *)&out[16], out1);
+ in0 = in16;
+ in16 = _mm_setzero_si128();
+ out += 32;
+ n -= 16;
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+void av1_highbd_filter_intra_edge_sse4_1(uint16_t *p, int sz, int strength) {
if (!strength) return;
DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = {
@@ -204,67 +266,7 @@ void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength) {
}
}
-void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {
- // interpolate half-sample positions
- assert(sz <= 24);
-
- DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
- { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
- };
-
- DECLARE_ALIGNED(
- 16, static const int8_t,
- v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
- { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } };
-
- // Extend first/last samples (upper-left p[-1], last p[sz-1])
- // to support 4-tap filter
- p[-2] = p[-1];
- p[sz] = p[sz - 1];
-
- uint8_t *in = &p[-2];
- uint8_t *out = &p[-2];
-
- int n = sz + 1; // Input length including upper-left sample
-
- __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
- __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
-
- __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
- __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
- __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);
-
- while (n > 0) {
- __m128i in8 = _mm_alignr_epi8(in16, in0, 8);
- __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
- __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
- __m128i d2 = _mm_shuffle_epi8(in8, shuf0);
- __m128i d3 = _mm_shuffle_epi8(in8, shuf1);
- d0 = _mm_maddubs_epi16(d0, coef0);
- d1 = _mm_maddubs_epi16(d1, coef0);
- d2 = _mm_maddubs_epi16(d2, coef0);
- d3 = _mm_maddubs_epi16(d3, coef0);
- d0 = _mm_hadd_epi16(d0, d1);
- d2 = _mm_hadd_epi16(d2, d3);
- __m128i eight = _mm_set1_epi16(8);
- d0 = _mm_add_epi16(d0, eight);
- d2 = _mm_add_epi16(d2, eight);
- d0 = _mm_srai_epi16(d0, 4);
- d2 = _mm_srai_epi16(d2, 4);
- d0 = _mm_packus_epi16(d0, d2);
- __m128i in1 = _mm_alignr_epi8(in16, in0, 1);
- __m128i out0 = _mm_unpacklo_epi8(in1, d0);
- __m128i out1 = _mm_unpackhi_epi8(in1, d0);
- _mm_storeu_si128((__m128i *)&out[0], out0);
- _mm_storeu_si128((__m128i *)&out[16], out1);
- in0 = in16;
- in16 = _mm_setzero_si128();
- out += 32;
- n -= 16;
- }
-}
-
-void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd) {
+void av1_highbd_upsample_intra_edge_sse4_1(uint16_t *p, int sz, int bd) {
// interpolate half-sample positions
assert(sz <= 24);
@@ -316,3 +318,5 @@ void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd) {
n -= 8;
}
}
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
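
The relocated av1_upsample_intra_edge_sse4_1() vectorizes a 4-tap half-pel interpolator; per output pair it emits the original sample interleaved with a filtered half-sample. A scalar model inferred from the {-1, 9, 9, -1} kernel and the round/shift/saturate steps in the SIMD code (not the exact C reference implementation):

static unsigned char half_sample(const unsigned char *in, int i) {
  // maddubs/hadd accumulate the taps; +8 and >>4 round; packus saturates.
  int v = -in[i - 1] + 9 * in[i] + 9 * in[i + 1] - in[i + 2];
  v = (v + 8) >> 4;
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}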
diff --git a/av1/common/x86/selfguided_avx2.c b/av1/common/x86/selfguided_avx2.c
index 4ab35e808..5ab6c46f8 100644
--- a/av1/common/x86/selfguided_avx2.c
+++ b/av1/common/x86/selfguided_avx2.c
@@ -630,18 +630,17 @@ int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
return 0;
}
-void av1_apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
- int height, int stride, int eps,
- const int *xqd, uint8_t *dst8,
- int dst_stride, int32_t *tmpbuf,
- int bit_depth, int highbd) {
+int av1_apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {
int32_t *flt0 = tmpbuf;
int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
assert(width * height <= RESTORATION_UNITPELS_MAX);
const int ret = av1_selfguided_restoration_avx2(
dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
- (void)ret;
- assert(!ret);
+ if (ret != 0) return ret;
const sgr_params_type *const params = &av1_sgr_params[eps];
int xq[2];
av1_decode_xq(xqd, xq, params);
@@ -721,4 +720,5 @@ void av1_apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
}
}
}
+ return 0;
}
diff --git a/av1/common/x86/selfguided_sse4.c b/av1/common/x86/selfguided_sse4.c
index 948bbfbf0..ac850f569 100644
--- a/av1/common/x86/selfguided_sse4.c
+++ b/av1/common/x86/selfguided_sse4.c
@@ -582,18 +582,17 @@ int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
return 0;
}
-void av1_apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
- int height, int stride, int eps,
- const int *xqd, uint8_t *dst8,
- int dst_stride, int32_t *tmpbuf,
- int bit_depth, int highbd) {
+int av1_apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {
int32_t *flt0 = tmpbuf;
int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
assert(width * height <= RESTORATION_UNITPELS_MAX);
const int ret = av1_selfguided_restoration_sse4_1(
dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
- (void)ret;
- assert(!ret);
+ if (ret != 0) return ret;
const sgr_params_type *const params = &av1_sgr_params[eps];
int xq[2];
av1_decode_xq(xqd, xq, params);
@@ -659,4 +658,5 @@ void av1_apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
}
}
}
+ return 0;
}
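
Changing these kernels from void to int turns the old debug-only assert into a runtime error path, so callers can forward the failure instead of continuing on corrupt intermediates. A sketch of the expected call-site shape (error_info, the surrounding context, and the error code are illustrative; the real call sites live in av1/common/restoration.c):

if (av1_apply_selfguided_restoration(dat8, width, height, stride, eps, xqd,
                                     dst8, dst_stride, tmpbuf, bit_depth,
                                     highbd) != 0)
  aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
                     "Failed to apply self-guided restoration");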
diff --git a/av1/common/x86/warp_plane_avx2.c b/av1/common/x86/warp_plane_avx2.c
index ceb836ea6..663b8cde9 100644
--- a/av1/common/x86/warp_plane_avx2.c
+++ b/av1/common/x86/warp_plane_avx2.c
@@ -1022,116 +1022,6 @@ static INLINE void prepare_warp_horizontal_filter_avx2(
shuffle_src);
}
-int64_t av1_calc_frame_error_avx2(const uint8_t *const ref, int ref_stride,
- const uint8_t *const dst, int p_width,
- int p_height, int dst_stride) {
- int64_t sum_error = 0;
- int i, j;
- __m256i row_error, col_error;
- __m256i zero = _mm256_setzero_si256();
- __m256i dup_255 = _mm256_set1_epi16(255);
- col_error = zero;
-
- for (i = 0; i < (p_height / 4); i++) {
- row_error = _mm256_setzero_si256();
- for (j = 0; j < (p_width / 16); j++) {
- __m256i ref_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
- (__m128i *)(ref + (j * 16) + (((i * 4) + 0) * ref_stride))));
- __m256i dst_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
- (__m128i *)(dst + (j * 16) + (((i * 4) + 0) * dst_stride))));
- __m256i ref_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
- (__m128i *)(ref + (j * 16) + (((i * 4) + 1) * ref_stride))));
- __m256i dst_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
- (__m128i *)(dst + (j * 16) + (((i * 4) + 1) * dst_stride))));
- __m256i ref_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
- (__m128i *)(ref + (j * 16) + (((i * 4) + 2) * ref_stride))));
- __m256i dst_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
- (__m128i *)(dst + (j * 16) + (((i * 4) + 2) * dst_stride))));
- __m256i ref_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
- (__m128i *)(ref + (j * 16) + (((i * 4) + 3) * ref_stride))));
- __m256i dst_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
- (__m128i *)(dst + (j * 16) + (((i * 4) + 3) * dst_stride))));
-
- __m256i diff_1 =
- _mm256_add_epi16(_mm256_sub_epi16(dst_1_16, ref_1_16), dup_255);
- __m256i diff_2 =
- _mm256_add_epi16(_mm256_sub_epi16(dst_2_16, ref_2_16), dup_255);
- __m256i diff_3 =
- _mm256_add_epi16(_mm256_sub_epi16(dst_3_16, ref_3_16), dup_255);
- __m256i diff_4 =
- _mm256_add_epi16(_mm256_sub_epi16(dst_4_16, ref_4_16), dup_255);
-
- __m256i diff_1_lo = _mm256_unpacklo_epi16(diff_1, zero);
- __m256i diff_1_hi = _mm256_unpackhi_epi16(diff_1, zero);
- __m256i diff_2_lo = _mm256_unpacklo_epi16(diff_2, zero);
- __m256i diff_2_hi = _mm256_unpackhi_epi16(diff_2, zero);
- __m256i diff_3_lo = _mm256_unpacklo_epi16(diff_3, zero);
- __m256i diff_3_hi = _mm256_unpackhi_epi16(diff_3, zero);
- __m256i diff_4_lo = _mm256_unpacklo_epi16(diff_4, zero);
- __m256i diff_4_hi = _mm256_unpackhi_epi16(diff_4, zero);
-
- __m256i error_1_lo =
- _mm256_i32gather_epi32(error_measure_lut, diff_1_lo, 4);
- __m256i error_1_hi =
- _mm256_i32gather_epi32(error_measure_lut, diff_1_hi, 4);
- __m256i error_2_lo =
- _mm256_i32gather_epi32(error_measure_lut, diff_2_lo, 4);
- __m256i error_2_hi =
- _mm256_i32gather_epi32(error_measure_lut, diff_2_hi, 4);
- __m256i error_3_lo =
- _mm256_i32gather_epi32(error_measure_lut, diff_3_lo, 4);
- __m256i error_3_hi =
- _mm256_i32gather_epi32(error_measure_lut, diff_3_hi, 4);
- __m256i error_4_lo =
- _mm256_i32gather_epi32(error_measure_lut, diff_4_lo, 4);
- __m256i error_4_hi =
- _mm256_i32gather_epi32(error_measure_lut, diff_4_hi, 4);
-
- __m256i error_1 = _mm256_add_epi32(error_1_lo, error_1_hi);
- __m256i error_2 = _mm256_add_epi32(error_2_lo, error_2_hi);
- __m256i error_3 = _mm256_add_epi32(error_3_lo, error_3_hi);
- __m256i error_4 = _mm256_add_epi32(error_4_lo, error_4_hi);
-
- __m256i error_1_2 = _mm256_add_epi32(error_1, error_2);
- __m256i error_3_4 = _mm256_add_epi32(error_3, error_4);
-
- __m256i error_1_2_3_4 = _mm256_add_epi32(error_1_2, error_3_4);
- row_error = _mm256_add_epi32(row_error, error_1_2_3_4);
- }
- __m256i col_error_lo = _mm256_unpacklo_epi32(row_error, zero);
- __m256i col_error_hi = _mm256_unpackhi_epi32(row_error, zero);
- __m256i col_error_temp = _mm256_add_epi64(col_error_lo, col_error_hi);
- col_error = _mm256_add_epi64(col_error, col_error_temp);
- // Error summation for remaining width, which is not multiple of 16
- if (p_width & 0xf) {
- for (int k = 0; k < 4; ++k) {
- for (int l = j * 16; l < p_width; ++l) {
- sum_error +=
- (int64_t)error_measure(dst[l + ((i * 4) + k) * dst_stride] -
- ref[l + ((i * 4) + k) * ref_stride]);
- }
- }
- }
- }
- __m128i sum_error_q_0 = _mm256_castsi256_si128(col_error);
- __m128i sum_error_q_1 = _mm256_extracti128_si256(col_error, 1);
- sum_error_q_0 = _mm_add_epi64(sum_error_q_0, sum_error_q_1);
- int64_t sum_error_d_0, sum_error_d_1;
- xx_storel_64(&sum_error_d_0, sum_error_q_0);
- xx_storel_64(&sum_error_d_1, _mm_srli_si128(sum_error_q_0, 8));
- sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
- // Error summation for remaining height, which is not multiple of 4
- if (p_height & 0x3) {
- for (int k = i * 4; k < p_height; ++k) {
- for (int l = 0; l < p_width; ++l) {
- sum_error += (int64_t)error_measure(dst[l + k * dst_stride] -
- ref[l + k * ref_stride]);
- }
- }
- }
- return sum_error;
-}
-
void av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width,
int height, int stride, uint8_t *pred, int p_col,
int p_row, int p_width, int p_height, int p_stride,
diff --git a/av1/common/x86/warp_plane_sse2.c b/av1/common/x86/warp_plane_sse2.c
deleted file mode 100644
index f8fe578e9..000000000
--- a/av1/common/x86/warp_plane_sse2.c
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-
-#include "aom_dsp/x86/synonyms.h"
-#include "av1/common/warped_motion.h"
-#include "config/av1_rtcd.h"
-
-int64_t av1_calc_frame_error_sse2(const uint8_t *const ref, int ref_stride,
- const uint8_t *const dst, int p_width,
- int p_height, int dst_stride) {
- int64_t sum_error = 0;
- int i, j;
- __m128i row_error, col_error;
- __m128i zero = _mm_setzero_si128();
- __m128i dup_255 = _mm_set1_epi16(255);
- col_error = zero;
- for (i = 0; i < (p_height); i++) {
- row_error = zero;
- for (j = 0; j < (p_width / 16); j++) {
- __m128i ref_8 =
- _mm_load_si128((__m128i *)(ref + (j * 16) + (i * ref_stride)));
- __m128i dst_8 =
- _mm_load_si128((__m128i *)(dst + (j * 16) + (i * dst_stride)));
- __m128i ref_16_lo = _mm_unpacklo_epi8(ref_8, zero);
- __m128i ref_16_hi = _mm_unpackhi_epi8(ref_8, zero);
- __m128i dst_16_lo = _mm_unpacklo_epi8(dst_8, zero);
- __m128i dst_16_hi = _mm_unpackhi_epi8(dst_8, zero);
-
- __m128i diff_1 =
- _mm_add_epi16(_mm_sub_epi16(dst_16_lo, ref_16_lo), dup_255);
- __m128i diff_2 =
- _mm_add_epi16(_mm_sub_epi16(dst_16_hi, ref_16_hi), dup_255);
-
- __m128i error_1_lo =
- _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 3)],
- error_measure_lut[_mm_extract_epi16(diff_1, 2)],
- error_measure_lut[_mm_extract_epi16(diff_1, 1)],
- error_measure_lut[_mm_extract_epi16(diff_1, 0)]);
- __m128i error_1_hi =
- _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 7)],
- error_measure_lut[_mm_extract_epi16(diff_1, 6)],
- error_measure_lut[_mm_extract_epi16(diff_1, 5)],
- error_measure_lut[_mm_extract_epi16(diff_1, 4)]);
- __m128i error_2_lo =
- _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 3)],
- error_measure_lut[_mm_extract_epi16(diff_2, 2)],
- error_measure_lut[_mm_extract_epi16(diff_2, 1)],
- error_measure_lut[_mm_extract_epi16(diff_2, 0)]);
- __m128i error_2_hi =
- _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 7)],
- error_measure_lut[_mm_extract_epi16(diff_2, 6)],
- error_measure_lut[_mm_extract_epi16(diff_2, 5)],
- error_measure_lut[_mm_extract_epi16(diff_2, 4)]);
-
- __m128i error_1 = _mm_add_epi32(error_1_lo, error_1_hi);
- __m128i error_2 = _mm_add_epi32(error_2_lo, error_2_hi);
- __m128i error_1_2 = _mm_add_epi32(error_1, error_2);
-
- row_error = _mm_add_epi32(row_error, error_1_2);
- }
- __m128i col_error_lo = _mm_unpacklo_epi32(row_error, zero);
- __m128i col_error_hi = _mm_unpackhi_epi32(row_error, zero);
- __m128i col_error_temp = _mm_add_epi64(col_error_lo, col_error_hi);
- col_error = _mm_add_epi64(col_error, col_error_temp);
- // Error summation for remaining width, which is not multiple of 16
- if (p_width & 0xf) {
- for (int l = j * 16; l < p_width; ++l) {
- sum_error += (int64_t)error_measure(dst[l + i * dst_stride] -
- ref[l + i * ref_stride]);
- }
- }
- }
- int64_t sum_error_d_0, sum_error_d_1;
- xx_storel_64(&sum_error_d_0, col_error);
- xx_storel_64(&sum_error_d_1, _mm_srli_si128(col_error, 8));
- sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
- return sum_error;
-}
diff --git a/av1/common/x86/wiener_convolve_avx2.c b/av1/common/x86/wiener_convolve_avx2.c
index b7ac68383..3de630f20 100644
--- a/av1/common/x86/wiener_convolve_avx2.c
+++ b/av1/common/x86/wiener_convolve_avx2.c
@@ -45,7 +45,7 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h,
- const ConvolveParams *conv_params) {
+ const WienerConvolveParams *conv_params) {
const int bd = 8;
assert(x_step_q4 == 16 && y_step_q4 == 16);
assert(!(w & 7));
diff --git a/av1/common/x86/wiener_convolve_sse2.c b/av1/common/x86/wiener_convolve_sse2.c
index f9d00b733..1c039e80c 100644
--- a/av1/common/x86/wiener_convolve_sse2.c
+++ b/av1/common/x86/wiener_convolve_sse2.c
@@ -23,7 +23,7 @@ void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h,
- const ConvolveParams *conv_params) {
+ const WienerConvolveParams *conv_params) {
const int bd = 8;
assert(x_step_q4 == 16 && y_step_q4 == 16);
assert(!(w & 7));
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 5b76de846..e3cce40c2 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -1275,9 +1275,13 @@ static AOM_INLINE void decode_partition(AV1Decoder *const pbi,
const int num_planes = av1_num_planes(cm);
for (int plane = 0; plane < num_planes; ++plane) {
int rcol0, rcol1, rrow0, rrow1;
+
+ // Skip some unnecessary work if loop restoration is disabled
+ if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+
if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
&rcol0, &rcol1, &rrow0, &rrow1)) {
- const int rstride = cm->rst_info[plane].horz_units_per_tile;
+ const int rstride = cm->rst_info[plane].horz_units;
for (int rrow = rrow0; rrow < rrow1; ++rrow) {
for (int rcol = rcol0; rcol < rcol1; ++rcol) {
const int runit_idx = rcol + rrow * rstride;
@@ -4326,7 +4330,6 @@ static int read_global_motion_params(WarpedMotionParams *params,
trans_dec_factor;
}
- assert(params->wmtype <= AFFINE);
int good_shear_params = av1_get_shear_params(params);
if (!good_shear_params) return 0;
@@ -5219,6 +5222,9 @@ static AOM_INLINE void setup_frame_info(AV1Decoder *pbi) {
cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
av1_alloc_restoration_buffers(cm, /*is_sgr_enabled =*/true);
+ for (int p = 0; p < av1_num_planes(cm); p++) {
+ av1_alloc_restoration_struct(cm, &cm->rst_info[p], p > 0);
+ }
}
const int use_highbd = cm->seq_params->use_highbitdepth;
@@ -5238,6 +5244,7 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
MACROBLOCKD *const xd = &pbi->dcb.xd;
const int tile_count_tg = end_tile - start_tile + 1;
+ xd->error_info = cm->error;
if (initialize_flag) setup_frame_info(pbi);
const int num_planes = av1_num_planes(cm);
diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c
index 5f114f9cb..bb0ccf5fd 100644
--- a/av1/decoder/decodemv.c
+++ b/av1/decoder/decodemv.c
@@ -311,7 +311,7 @@ static int dec_get_segment_id(const AV1_COMMON *cm, const uint8_t *segment_ids,
}
static int read_intra_segment_id(AV1_COMMON *const cm,
- const MACROBLOCKD *const xd, int bsize,
+ const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
aom_reader *r, int skip) {
struct segmentation *const seg = &cm->seg;
if (!seg->enabled) return 0; // Default for disabled segmentation
@@ -825,13 +825,13 @@ static void read_intra_frame_mode_info(AV1_COMMON *const cm,
if (mbmi->uv_mode == UV_CFL_PRED) {
mbmi->cfl_alpha_idx = read_cfl_alphas(ec_ctx, r, &mbmi->cfl_alpha_signs);
}
+ const PREDICTION_MODE intra_mode = get_uv_mode(mbmi->uv_mode);
mbmi->angle_delta[PLANE_TYPE_UV] =
- (use_angle_delta && av1_is_directional_mode(get_uv_mode(mbmi->uv_mode)))
- ? read_angle_delta(r,
- ec_ctx->angle_delta_cdf[mbmi->uv_mode - V_PRED])
+ (use_angle_delta && av1_is_directional_mode(intra_mode))
+ ? read_angle_delta(r, ec_ctx->angle_delta_cdf[intra_mode - V_PRED])
: 0;
} else {
- // Avoid decoding angle_info if there is is no chroma prediction
+ // Avoid decoding angle_info if there is no chroma prediction
mbmi->uv_mode = UV_DC_PRED;
}
xd->cfl.store_y = store_cfl_required(cm, xd);
@@ -1086,13 +1086,13 @@ static void read_intra_block_mode_info(AV1_COMMON *const cm,
mbmi->cfl_alpha_idx =
read_cfl_alphas(xd->tile_ctx, r, &mbmi->cfl_alpha_signs);
}
+ const PREDICTION_MODE intra_mode = get_uv_mode(mbmi->uv_mode);
mbmi->angle_delta[PLANE_TYPE_UV] =
- use_angle_delta && av1_is_directional_mode(get_uv_mode(mbmi->uv_mode))
- ? read_angle_delta(r,
- ec_ctx->angle_delta_cdf[mbmi->uv_mode - V_PRED])
+ use_angle_delta && av1_is_directional_mode(intra_mode)
+ ? read_angle_delta(r, ec_ctx->angle_delta_cdf[intra_mode - V_PRED])
: 0;
} else {
- // Avoid decoding angle_info if there is is no chroma prediction
+ // Avoid decoding angle_info if there is no chroma prediction
mbmi->uv_mode = UV_DC_PRED;
}
xd->cfl.store_y = store_cfl_required(cm, xd);
diff --git a/av1/encoder/allintra_vis.c b/av1/encoder/allintra_vis.c
index 236b296cf..a59d0d710 100644
--- a/av1/encoder/allintra_vis.c
+++ b/av1/encoder/allintra_vis.c
@@ -29,6 +29,26 @@
#include "av1/encoder/model_rd.h"
#include "av1/encoder/rdopt_utils.h"
+#define MB_WIENER_PRED_BLOCK_SIZE BLOCK_128X128
+#define MB_WIENER_PRED_BUF_STRIDE 128
+
+void av1_alloc_mb_wiener_var_pred_buf(AV1_COMMON *cm, ThreadData *td) {
+ const int is_high_bitdepth = is_cur_buf_hbd(&td->mb.e_mbd);
+ assert(MB_WIENER_PRED_BLOCK_SIZE < BLOCK_SIZES_ALL);
+ const int buf_width = block_size_wide[MB_WIENER_PRED_BLOCK_SIZE];
+ const int buf_height = block_size_high[MB_WIENER_PRED_BLOCK_SIZE];
+ assert(buf_width == MB_WIENER_PRED_BUF_STRIDE);
+ const size_t buf_size =
+ (buf_width * buf_height * sizeof(*td->wiener_tmp_pred_buf))
+ << is_high_bitdepth;
+ CHECK_MEM_ERROR(cm, td->wiener_tmp_pred_buf, aom_memalign(32, buf_size));
+}
+
+void av1_dealloc_mb_wiener_var_pred_buf(ThreadData *td) {
+ aom_free(td->wiener_tmp_pred_buf);
+ td->wiener_tmp_pred_buf = NULL;
+}
+
void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi) {
AV1_COMMON *cm = &cpi->common;
@@ -236,7 +256,7 @@ void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x,
int16_t *src_diff, tran_low_t *coeff,
tran_low_t *qcoeff, tran_low_t *dqcoeff,
double *sum_rec_distortion,
- double *sum_est_rate) {
+ double *sum_est_rate, uint8_t *pred_buffer) {
AV1_COMMON *const cm = &cpi->common;
uint8_t *buffer = cpi->source->y_buffer;
int buf_stride = cpi->source->y_stride;
@@ -261,16 +281,9 @@ void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x,
int mt_unit_col = 0;
const int is_high_bitdepth = is_cur_buf_hbd(xd);
- // We use a scratch buffer to store the prediction.
- // The stride is the max block size (128).
- uint8_t *pred_buffer;
- const int dst_buffer_stride = 128;
- const int buf_width = 128;
- const int buf_height = 128;
- const size_t buf_size = (buf_width * buf_height * sizeof(*pred_buffer))
- << is_high_bitdepth;
- CHECK_MEM_ERROR(cm, pred_buffer, aom_memalign(32, buf_size));
uint8_t *dst_buffer = pred_buffer;
+ const int dst_buffer_stride = MB_WIENER_PRED_BUF_STRIDE;
+
if (is_high_bitdepth) {
uint16_t *pred_buffer_16 = (uint16_t *)pred_buffer;
dst_buffer = CONVERT_TO_BYTEPTR(pred_buffer_16);
@@ -434,7 +447,6 @@ void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x,
}
// Set the pointer to null since mbmi is only allocated inside this function.
xd->mi = NULL;
- aom_free(pred_buffer);
}
static void calc_mb_wiener_var(AV1_COMP *const cpi, double *sum_rec_distortion,
@@ -449,7 +461,8 @@ static void calc_mb_wiener_var(AV1_COMP *const cpi, double *sum_rec_distortion,
DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
for (int mi_row = 0; mi_row < cpi->frame_info.mi_rows; mi_row += mb_step) {
av1_calc_mb_wiener_var_row(cpi, x, xd, mi_row, src_diff, coeff, qcoeff,
- dqcoeff, sum_rec_distortion, sum_est_rate);
+ dqcoeff, sum_rec_distortion, sum_est_rate,
+ cpi->td.wiener_tmp_pred_buf);
}
}
@@ -565,6 +578,7 @@ void av1_set_mb_wiener_variance(AV1_COMP *cpi) {
NULL, cpi->image_pyramid_levels, 0))
aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
+ av1_alloc_mb_wiener_var_pred_buf(&cpi->common, &cpi->td);
cpi->norm_wiener_variance = 0;
MACROBLOCK *x = &cpi->td.mb;
@@ -647,6 +661,7 @@ void av1_set_mb_wiener_variance(AV1_COMP *cpi) {
// Set the pointer to null since mbmi is only allocated inside this function.
xd->mi = NULL;
aom_free_frame_buffer(&cm->cur_frame->buf);
+ av1_dealloc_mb_wiener_var_pred_buf(&cpi->td);
}
static int get_rate_guided_quantizer(AV1_COMP *const cpi, BLOCK_SIZE bsize,
diff --git a/av1/encoder/allintra_vis.h b/av1/encoder/allintra_vis.h
index 9e10566dd..ab39968a4 100644
--- a/av1/encoder/allintra_vis.h
+++ b/av1/encoder/allintra_vis.h
@@ -27,7 +27,7 @@ void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x,
int16_t *src_diff, tran_low_t *coeff,
tran_low_t *qcoeff, tran_low_t *dqcoeff,
double *sum_rec_distortion,
- double *sum_est_rate);
+ double *sum_est_rate, uint8_t *pred_buffer);
void av1_set_mb_wiener_variance(AV1_COMP *cpi);
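
This refactor hoists the 128x128 prediction scratch buffer out of av1_calc_mb_wiener_var_row() into ThreadData, so it is allocated once per thread rather than once per row call. The size arithmetic, for reference (a worked restatement of the allocation above, not new behavior):

static size_t wiener_pred_buf_size(int is_high_bitdepth) {
  // 128 * 128 samples: 16384 bytes at 8-bit, doubled to 32768 bytes for
  // high bitdepth (uint16_t samples); aom_memalign(32, ...) keeps the
  // buffer AVX2-friendly.
  return (size_t)(128 * 128) << is_high_bitdepth;
}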
diff --git a/av1/encoder/aq_cyclicrefresh.c b/av1/encoder/aq_cyclicrefresh.c
index be51ba14f..f48ff11e5 100644
--- a/av1/encoder/aq_cyclicrefresh.c
+++ b/av1/encoder/aq_cyclicrefresh.c
@@ -48,7 +48,8 @@ void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
// mode, and rate/distortion.
static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
const MB_MODE_INFO *mbmi, int64_t rate,
- int64_t dist, int bsize, int noise_level) {
+ int64_t dist, BLOCK_SIZE bsize,
+ int noise_level) {
MV mv = mbmi->mv[0].as_mv;
int is_compound = has_second_ref(mbmi);
// Reject the block for lower-qp coding for non-compound mode if
@@ -642,11 +643,15 @@ void av1_cyclic_refresh_reset_resize(AV1_COMP *const cpi) {
int av1_cyclic_refresh_disable_lf_cdef(AV1_COMP *const cpi) {
CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
- // TODO(marpan): Tune these conditons, add QP dependence.
- if (cpi->sf.rt_sf.skip_lf_screen > 1 && !cpi->rc.high_source_sad) return 1;
+ const int qindex = cpi->common.quant_params.base_qindex;
if (cpi->rc.frames_since_key > 30 && cr->percent_refresh > 0 &&
cr->counter_encode_maxq_scene_change > 300 / cr->percent_refresh &&
- cpi->rc.frame_source_sad < 1000)
+ cpi->rc.frame_source_sad < 1000 &&
+ qindex < 7 * (cpi->rc.worst_quality >> 3))
+ return 1;
+ // More aggressive skip.
+ else if (cpi->sf.rt_sf.skip_lf_screen > 1 && !cpi->rc.high_source_sad &&
+ cpi->rc.frame_source_sad < 50000 && qindex < cpi->rc.worst_quality)
return 1;
return 0;
}
diff --git a/av1/encoder/arm/crc32/hash_crc32.c b/av1/encoder/arm/crc32/hash_arm_crc32.c
index 771496c08..91fc1e00a 100644
--- a/av1/encoder/arm/crc32/hash_crc32.c
+++ b/av1/encoder/arm/crc32/hash_arm_crc32.c
@@ -9,9 +9,14 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include <stdint.h>
-#include <stddef.h>
+#if defined(_MSC_VER) && !defined(__clang__)
+#include <intrin.h>
+#else
#include <arm_acle.h>
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
#include "config/aom_config.h"
diff --git a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
index ee8b1156f..a17a41ad1 100644
--- a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
+++ b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
@@ -12,792 +12,597 @@
#include <arm_neon.h>
#include <assert.h>
-#include "aom_dsp/txfm_common.h"
#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/txfm_common.h"
#include "aom_ports/mem.h"
#include "av1/common/av1_txfm.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
+#include "shift_neon.h"
+#include "txfm_neon.h"
+
+#define TXFM_COS_BIT_MAX 13
+
+// A note on butterfly helper naming:
+//
+// butterfly_[input_ty]_[acc_ty]_[input_num]_[weight_num]_[weight_neg]_neon
+// e.g. butterfly_s32_s32_x4_0231_neon
+// | | | ^ Weights are applied as indices 0, 2, 3, 1
+// | | | (see more detail below)
+// | | ^ (int32)x4 input/output parameters
+// | ^ 32-bit accumulators internally
+// ^ 32-bit input/output parameters
+//
+// Weights are stored as 4-tuples in Q2.13 format as (w0, 1-w0, -w0, w0-1) to
+// avoid needing separate negation instructions. This is represented in the
+// helper naming by referring to the lane index in the loaded tuple that each
+// multiply is performed with:
+//
+// in0 in1
+// /----------
+// out0 | w0 w1 ==> out0 = in0 * w0 + in1 * w1
+// out1 | w2 w3 ==> out1 = in0 * w2 + in1 * w3
+//
+// So for indices 0231 from the earlier example, we end up with:
+//
+// in0 in1
+// /------------------
+// out0 | (lane 0) (lane 2) ==> out0 = in0 * w0 + in1 * -w0
+// out1 | (lane 3) (lane 1) ==> out1 = in0 * (w0-1) + in1 * (1-w0)
+
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0112_neon(
+ const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+ int32x4_t *out0, int32x4_t *out1) {
+ int32x4_t w0101 = vmovl_s16(w0101_s16);
+ int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
+ o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 1);
+ int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
+ o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0);
+ *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
+ *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
+}
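
In scalar terms, the _0112 variant just shown computes the following per lane; vrshrq_n_s32(x, 13) is the rounding shift (x + (1 << 12)) >> 13, and w[] is the loaded Q2.13 tuple. This is a model of the lane arithmetic, not libaom code:

static void butterfly_0112_scalar(const int w[4], int in0, int in1,
                                  int *out0, int *out1) {
  // Lane indices 0,1,1,2 select from w[4] = { w0, 1-w0, -w0, w0-1 },
  // each weight scaled by 1 << 13 (TXFM_COS_BIT_MAX).
  *out0 = (in0 * w[0] + in1 * w[1] + (1 << 12)) >> 13;
  *out1 = (in0 * w[1] + in1 * w[2] + (1 << 12)) >> 13;
}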
-#define custom_packs_s32(w0, w1) vcombine_s16(vqmovn_s32(w0), vqmovn_s32(w1))
-
-static INLINE void transpose_16bit_4x4(const int16x8_t *const in,
- int16x8_t *const out) {
-#if AOM_ARCH_AARCH64
- const int16x8_t a0 = vzip1q_s16(in[0], in[1]);
- const int16x8_t a1 = vzip1q_s16(in[2], in[3]);
-#else
- int16x4x2_t temp;
- temp = vzip_s16(vget_low_s16(in[0]), vget_low_s16(in[1]));
- const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
- temp = vzip_s16(vget_low_s16(in[2]), vget_low_s16(in[3]));
- const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
-#endif
-
- int32x4x2_t a01 =
- vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
- out[0] = vreinterpretq_s16_s32(a01.val[0]);
- out[1] = vextq_s16(vreinterpretq_s16_s32(a01.val[0]), out[1], 4);
- out[2] = vreinterpretq_s16_s32(a01.val[1]);
- out[3] = vextq_s16(vreinterpretq_s16_s32(a01.val[1]), out[3], 4);
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0332_neon(
+ const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+ int32x4_t *out0, int32x4_t *out1) {
+ int32x4_t w0101 = vmovl_s16(w0101_s16);
+ int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
+ o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 1);
+ int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 1);
+ o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0);
+ *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
+ *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
}
-static INLINE void transpose_16bit_4x8(const int16x8_t *const in,
- int16x8_t *const out) {
-#if AOM_ARCH_AARCH64
- const int16x8_t a0 = vzip1q_s16(in[0], in[1]);
- const int16x8_t a1 = vzip1q_s16(in[2], in[3]);
- const int16x8_t a2 = vzip1q_s16(in[4], in[5]);
- const int16x8_t a3 = vzip1q_s16(in[6], in[7]);
-#else
- int16x4x2_t temp;
- temp = vzip_s16(vget_low_s16(in[0]), vget_low_s16(in[1]));
- const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
- temp = vzip_s16(vget_low_s16(in[2]), vget_low_s16(in[3]));
- const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
- temp = vzip_s16(vget_low_s16(in[4]), vget_low_s16(in[5]));
- const int16x8_t a2 = vcombine_s16(temp.val[0], temp.val[1]);
- temp = vzip_s16(vget_low_s16(in[6]), vget_low_s16(in[7]));
- const int16x8_t a3 = vcombine_s16(temp.val[0], temp.val[1]);
-#endif
-
- const int32x4x2_t b02 =
- vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
- const int32x4x2_t b13 =
- vzipq_s32(vreinterpretq_s32_s16(a2), vreinterpretq_s32_s16(a3));
-
-#if AOM_ARCH_AARCH64
- out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[0]),
- vreinterpretq_s64_s32(b13.val[0])));
- out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[0]),
- vreinterpretq_s64_s32(b13.val[0])));
- out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[1]),
- vreinterpretq_s64_s32(b13.val[1])));
- out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[1]),
- vreinterpretq_s64_s32(b13.val[1])));
-#else
- out[0] = vreinterpretq_s16_s32(
- vextq_s32(vextq_s32(b02.val[0], b02.val[0], 2), b13.val[0], 2));
- out[2] = vreinterpretq_s16_s32(
- vextq_s32(vextq_s32(b02.val[1], b02.val[1], 2), b13.val[1], 2));
- out[1] = vreinterpretq_s16_s32(
- vextq_s32(b02.val[0], vextq_s32(b13.val[0], b13.val[0], 2), 2));
- out[3] = vreinterpretq_s16_s32(
- vextq_s32(b02.val[1], vextq_s32(b13.val[1], b13.val[1], 2), 2));
-#endif
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1003_neon(
+ const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+ int32x4_t *out0, int32x4_t *out1) {
+ int32x4_t w0101 = vmovl_s16(w0101_s16);
+ int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
+ o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 0);
+ int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
+ o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1);
+ *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
+ *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
}
-static INLINE void transpose_16bit_8x4(const int16x8_t *const in,
- int16x8_t *const out) {
- const int16x8x2_t a04 = vzipq_s16(in[0], in[1]);
- const int16x8x2_t a15 = vzipq_s16(in[2], in[3]);
-
- const int32x4x2_t b01 = vzipq_s32(vreinterpretq_s32_s16(a04.val[0]),
- vreinterpretq_s32_s16(a15.val[0]));
- const int32x4x2_t b45 = vzipq_s32(vreinterpretq_s32_s16(a04.val[1]),
- vreinterpretq_s32_s16(a15.val[1]));
-
- const int32x4_t zeros = vdupq_n_s32(0);
-
-#if AOM_ARCH_AARCH64
- out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b01.val[0]),
- vreinterpretq_s64_s32(zeros)));
- out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b01.val[0]),
- vreinterpretq_s64_s32(zeros)));
- out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b01.val[1]),
- vreinterpretq_s64_s32(zeros)));
- out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b01.val[1]),
- vreinterpretq_s64_s32(zeros)));
- out[4] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b45.val[0]),
- vreinterpretq_s64_s32(zeros)));
- out[5] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b45.val[0]),
- vreinterpretq_s64_s32(zeros)));
- out[6] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b45.val[1]),
- vreinterpretq_s64_s32(zeros)));
- out[7] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b45.val[1]),
- vreinterpretq_s64_s32(zeros)));
-#else
- out[0] = vreinterpretq_s16_s32(
- vextq_s32(vextq_s32(b01.val[0], b01.val[0], 2), zeros, 2));
- out[1] = vreinterpretq_s16_s32(vextq_s32(b01.val[0], zeros, 2));
- out[2] = vreinterpretq_s16_s32(
- vextq_s32(vextq_s32(b01.val[1], b01.val[1], 2), zeros, 2));
- out[3] = vreinterpretq_s16_s32(vextq_s32(b01.val[1], zeros, 2));
- out[4] = vreinterpretq_s16_s32(
- vextq_s32(vextq_s32(b45.val[0], b45.val[0], 2), zeros, 2));
- out[5] = vreinterpretq_s16_s32(vextq_s32(b45.val[0], zeros, 2));
- out[6] = vreinterpretq_s16_s32(
- vextq_s32(vextq_s32(b45.val[1], b45.val[1], 2), zeros, 2));
- out[7] = vreinterpretq_s16_s32(vextq_s32(b45.val[1], zeros, 2));
-#endif
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1223_neon(
+ const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+ int32x4_t *out0, int32x4_t *out1) {
+ int32x4_t w0101 = vmovl_s16(w0101_s16);
+ int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
+ o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 0);
+ int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 0);
+ o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1);
+ *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
+ *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
}
-static INLINE void transpose_16bit_8x8(const int16x8_t *const in,
- int16x8_t *const out) {
- const int16x8x2_t a04 = vzipq_s16(in[0], in[1]);
- const int16x8x2_t a15 = vzipq_s16(in[2], in[3]);
- const int16x8x2_t a26 = vzipq_s16(in[4], in[5]);
- const int16x8x2_t a37 = vzipq_s16(in[6], in[7]);
-
- const int32x4x2_t b04 = vzipq_s32(vreinterpretq_s32_s16(a04.val[0]),
- vreinterpretq_s32_s16(a15.val[0]));
- const int32x4x2_t b15 = vzipq_s32(vreinterpretq_s32_s16(a26.val[0]),
- vreinterpretq_s32_s16(a37.val[0]));
- const int32x4x2_t b26 = vzipq_s32(vreinterpretq_s32_s16(a04.val[1]),
- vreinterpretq_s32_s16(a15.val[1]));
- const int32x4x2_t b37 = vzipq_s32(vreinterpretq_s32_s16(a26.val[1]),
- vreinterpretq_s32_s16(a37.val[1]));
-
-#if AOM_ARCH_AARCH64
- out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b04.val[0]),
- vreinterpretq_s64_s32(b15.val[0])));
- out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b04.val[0]),
- vreinterpretq_s64_s32(b15.val[0])));
- out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b04.val[1]),
- vreinterpretq_s64_s32(b15.val[1])));
- out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b04.val[1]),
- vreinterpretq_s64_s32(b15.val[1])));
- out[4] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b26.val[0]),
- vreinterpretq_s64_s32(b37.val[0])));
- out[5] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b26.val[0]),
- vreinterpretq_s64_s32(b37.val[0])));
- out[6] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b26.val[1]),
- vreinterpretq_s64_s32(b37.val[1])));
- out[7] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b26.val[1]),
- vreinterpretq_s64_s32(b37.val[1])));
-#else
- out[0] = vreinterpretq_s16_s32(
- vextq_s32(vextq_s32(b04.val[0], b04.val[0], 2), b15.val[0], 2));
- out[1] = vreinterpretq_s16_s32(
- vextq_s32(b04.val[0], vextq_s32(b15.val[0], b15.val[0], 2), 2));
- out[2] = vreinterpretq_s16_s32(
- vextq_s32(vextq_s32(b04.val[1], b04.val[1], 2), b15.val[1], 2));
- out[3] = vreinterpretq_s16_s32(
- vextq_s32(b04.val[1], vextq_s32(b15.val[1], b15.val[1], 2), 2));
- out[4] = vreinterpretq_s16_s32(
- vextq_s32(vextq_s32(b26.val[0], b26.val[0], 2), b37.val[0], 2));
- out[5] = vreinterpretq_s16_s32(
- vextq_s32(b26.val[0], vextq_s32(b37.val[0], b37.val[0], 2), 2));
- out[6] = vreinterpretq_s16_s32(
- vextq_s32(vextq_s32(b26.val[1], b26.val[1], 2), b37.val[1], 2));
- out[7] = vreinterpretq_s16_s32(
- vextq_s32(b26.val[1], vextq_s32(b37.val[1], b37.val[1], 2), 2));
-#endif
+#define butterfly_s16_s32_x4_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \
+ out0, out1) \
+ do { \
+ int32x4_t u0 = vmull_lane_s16(in0, wvec, lane0); \
+ u0 = vmlal_lane_s16(u0, in1, wvec, lane1); \
+ int32x4_t v0 = vmull_lane_s16(in0, wvec, lane2); \
+ v0 = vmlal_lane_s16(v0, in1, wvec, lane3); \
+ *out0 = vqrshrn_n_s32(u0, TXFM_COS_BIT_MAX); \
+ *out1 = vqrshrn_n_s32(v0, TXFM_COS_BIT_MAX); \
+ } while (0)
+
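+// The _0112/_0332/_1003/_1223 suffixes on the wrappers below name the four
+// weight lanes passed to the helper as (lane0, lane1, lane2, lane3). For
+// example, the _0112 variant computes, per the helper above:
+//   *out0 = (w[0] * in0 + w[1] * in1) >> TXFM_COS_BIT_MAX
+//   *out1 = (w[1] * in0 + w[2] * in1) >> TXFM_COS_BIT_MAX
+// using rounding, saturating narrowing shifts.
+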
+static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0112_neon(
+ const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
+ int16x4_t *out0, int16x4_t *out1) {
+ butterfly_s16_s32_x4_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1);
}
-static INLINE void av1_round_shift_rect_array_32_neon(int32x4_t *input,
- int32x4_t *output,
- const int size) {
- int i;
- for (i = 0; i < size; i++) {
- output[i] = vrshrq_n_s32(vmulq_n_s32(vrshrq_n_s32(input[i], 2), NewSqrt2),
- NewSqrt2Bits);
- }
+static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0332_neon(
+ const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
+ int16x4_t *out0, int16x4_t *out1) {
+ butterfly_s16_s32_x4_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1);
}
-static INLINE void av1_round_shift_array_32_neon(int32x4_t *input,
- int32x4_t *output,
- const int size) {
- int i;
- for (i = 0; i < size; i++) output[i] = vrshrq_n_s32(input[i], 2);
+static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1003_neon(
+ const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
+ int16x4_t *out0, int16x4_t *out1) {
+ butterfly_s16_s32_x4_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1);
}
-#define btf_32_neon(w0, w1, in0, in1, out0, out1, v_cos_bit) \
- do { \
- out0 = vmulq_n_s32(in0, w0); \
- out0 = vmlaq_n_s32(out0, in1, w1); \
- out0 = vrshlq_s32(out0, v_cos_bit); \
- out1 = vmulq_n_s32(in0, w1); \
- out1 = vmlsq_n_s32(out1, in1, w0); \
- out1 = vrshlq_s32(out1, v_cos_bit); \
- } while (0)
+static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1223_neon(
+ const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
+ int16x4_t *out0, int16x4_t *out1) {
+ butterfly_s16_s32_x4_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1);
+}
-#define btf_32_type1_neon(w0, w1, in0, in1, out0, out1, v_cos_bit) \
- do { \
- btf_32_neon(w1, w0, in1, in0, out0, out1, v_cos_bit); \
+#define butterfly_s16_s32_x8_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \
+ out0, out1) \
+ do { \
+ int32x4_t u0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane0); \
+ u0 = vmlal_lane_s16(u0, vget_low_s16(in1), wvec, lane1); \
+ int32x4_t u1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane0); \
+ u1 = vmlal_lane_s16(u1, vget_high_s16(in1), wvec, lane1); \
+ int32x4_t v0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane2); \
+ v0 = vmlal_lane_s16(v0, vget_low_s16(in1), wvec, lane3); \
+ int32x4_t v1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane2); \
+ v1 = vmlal_lane_s16(v1, vget_high_s16(in1), wvec, lane3); \
+ const int16x4_t c0 = vrshrn_n_s32(u0, TXFM_COS_BIT_MAX); \
+ const int16x4_t c1 = vrshrn_n_s32(u1, TXFM_COS_BIT_MAX); \
+ const int16x4_t d0 = vrshrn_n_s32(v0, TXFM_COS_BIT_MAX); \
+ const int16x4_t d1 = vrshrn_n_s32(v1, TXFM_COS_BIT_MAX); \
+ *out0 = vcombine_s16(c0, c1); \
+ *out1 = vcombine_s16(d0, d1); \
} while (0)
-#define btf_32_neon_mode0(w0, w1, in0, in1, out0, out1, v_cos_bit) \
- do { \
- out0 = vmulq_n_s32(in1, w1); \
- out0 = vmlsq_n_s32(out0, in0, w0); \
- out0 = vrshlq_s32(out0, v_cos_bit); \
- out1 = vmulq_n_s32(in0, w1); \
- out1 = vmlaq_n_s32(out1, in1, w0); \
- out1 = vrshlq_s32(out1, v_cos_bit); \
- } while (0)
+static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0112_neon(
+ const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
+ int16x8_t *out0, int16x8_t *out1) {
+ butterfly_s16_s32_x8_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1);
+}
-#define btf_32_neon_mode01(w0, w1, in0, in1, out0, out1, v_cos_bit) \
- do { \
- out0 = vmulq_n_s32(in1, w1); \
- out0 = vmlaq_n_s32(out0, in0, w0); \
- out0 = vrshlq_s32(vnegq_s32(out0), v_cos_bit); \
- out1 = vmulq_n_s32(in1, w0); \
- out1 = vmlsq_n_s32(out1, in0, w1); \
- out1 = vrshlq_s32(out1, v_cos_bit); \
- } while (0)
+static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0332_neon(
+ const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
+ int16x8_t *out0, int16x8_t *out1) {
+ butterfly_s16_s32_x8_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1003_neon(
+ const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
+ int16x8_t *out0, int16x8_t *out1) {
+ butterfly_s16_s32_x8_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1);
+}
-static INLINE void flip_buf_neon(int16x8_t *in, int16x8_t *out, int size) {
+static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1223_neon(
+ const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
+ int16x8_t *out0, int16x8_t *out1) {
+ butterfly_s16_s32_x8_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void flip_buf_4_neon(int16x4_t *in, int16x4_t *out,
+ int size) {
for (int i = 0; i < size; ++i) {
out[size - i - 1] = in[i];
}
}
-static INLINE void store_16bit_to_32bit_w4(const int16x8_t a,
- int32_t *const b) {
- vst1q_s32(b, vmovl_s16(vget_low_s16(a)));
-}
-
-static INLINE void store_16bit_to_32bit(int16x8_t a, int32_t *b) {
- vst1q_s32(b, vmovl_s16(vget_low_s16(a)));
- vst1q_s32((b + 4), vmovl_s16(vget_high_s16(a)));
+static AOM_FORCE_INLINE void flip_buf_8_neon(int16x8_t *in, int16x8_t *out,
+ int size) {
+ for (int i = 0; i < size; ++i) {
+ out[size - i - 1] = in[i];
+ }
}
-static INLINE void store_output_32bit_w8(int32_t *const out,
- const int32x4_t *const in1,
- const int32x4_t *const in2,
- const int stride, const int out_size) {
+static AOM_FORCE_INLINE void store_buffer_interleaved_s32_x8(
+ int32_t *const out, const int32x4_t *const in1, const int32x4_t *const in2,
+ const int stride, const int out_size) {
for (int i = 0; i < out_size; ++i) {
vst1q_s32(out + stride * i, in1[i]);
vst1q_s32(out + stride * i + 4, in2[i]);
}
}
-static INLINE void store_rect_16bit_to_32bit_w4(
- const int16x8_t a, int32_t *const b, const int16x4_t *v_newsqrt2,
- const int32x4_t *v_newsqrt2bits) {
- const int32x4_t b_lo =
- vrshlq_s32(vmull_s16(vget_low_s16(a), *v_newsqrt2), *v_newsqrt2bits);
- vst1q_s32(b, b_lo);
+static AOM_FORCE_INLINE void load_buffer_s16_x4(const int16_t *in,
+ const int stride,
+ int16x4_t *const out,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = vld1_s16(in);
+ in += stride;
+ }
}
-static INLINE void store_rect_16bit_to_32bit(const int16x8_t a,
- int32_t *const b,
- const int16x4_t *v_newsqrt2,
- const int32x4_t *v_newsqrt2bits) {
- const int32x4_t b_lo =
- vrshlq_s32(vmull_s16(vget_low_s16(a), *v_newsqrt2), *v_newsqrt2bits);
- const int32x4_t b_hi =
- vrshlq_s32(vmull_s16(vget_high_s16(a), *v_newsqrt2), *v_newsqrt2bits);
- vst1q_s32(b, b_lo);
- vst1q_s32((b + 4), b_hi);
+static AOM_FORCE_INLINE void load_buffer_s16_x8(const int16_t *in, int stride,
+ int16x8_t *out, int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = vld1q_s16(in + i * stride);
+ }
}
-static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *in,
+static AOM_FORCE_INLINE void store_buffer_s16_x4(const int16x4_t *const in,
+ int32_t *const out,
const int stride,
- int16x8_t *const out,
const int out_size) {
for (int i = 0; i < out_size; ++i) {
- // vld1q_dup_u64 is used rather than vld1q_lane_u64(lane=0) to avoid
- // -Wmaybe-uninitialized warnings with some versions of gcc. This assumes
- // the upper lane is unused or further modified after this call. The
- // latency should be similar between the two.
- out[i] = vreinterpretq_s16_u64(vld1q_dup_u64((uint64_t *)in));
- in += stride;
+ vst1q_s32(out + i * stride, vmovl_s16(in[i]));
}
}
-static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *in,
- const int stride,
- int16x8_t *const out,
- const int out_size) {
- for (int i = out_size - 1; i >= 0; --i) {
- // vld1q_dup_u64 is used rather than vld1q_lane_u64(lane=0) to avoid
- // -Wmaybe-uninitialized warnings with some versions of gcc. This assumes
- // the upper lane is unused or further modified after this call. The
- // latency should be similar between the two.
- out[i] = vreinterpretq_s16_u64(vld1q_dup_u64((uint64_t *)in));
- in += stride;
+static AOM_FORCE_INLINE void store_buffer_s16_x8(const int16x8_t *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ vst1q_s32(out + i * stride + 0, vmovl_s16(vget_low_s16(in[i])));
+ vst1q_s32(out + i * stride + 4, vmovl_s16(vget_high_s16(in[i])));
}
}
-static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
- int16x8_t *out, int out_size) {
- for (int i = 0; i < out_size; ++i) {
- out[i] = vld1q_s16(in + i * stride);
- }
+// A note on naming:
+//   round_shift_[sqrt2]_s16_s32_4x1_neon(...)
+//               |       |   |     ^ 1 => a single vector
+//               |       |   |       n => an array of vectors
+//               |       |   |   ^ input/output vector element count
+//               |       |   ^ output type
+//               |       ^ input type
+//               ^ multiplicand and shift identifier
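+//
+// For example, round_shift_2sqrt2_s16_s16_8xn_neon below scales an array of
+// int16x8_t vectors by 2 * NewSqrt2 / 2^NewSqrt2Bits (approximately
+// 2 * sqrt(2)), rounding and saturating on the way back to int16.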
+
+static AOM_FORCE_INLINE int16x4_t
+round_shift_sqrt2_s16_s16_4x1_neon(int16x4_t a) {
+ return vqrshrn_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits);
}
-static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in,
- int stride, int16x8_t *out,
- int out_size) {
- for (int i = 0; i < out_size; ++i) {
- out[out_size - i - 1] = vld1q_s16(in + i * stride);
- }
+static AOM_FORCE_INLINE int16x8_t
+round_shift_sqrt2_s16_s16_8x1_neon(int16x8_t a) {
+ return vcombine_s16(round_shift_sqrt2_s16_s16_4x1_neon(vget_low_s16(a)),
+ round_shift_sqrt2_s16_s16_4x1_neon(vget_high_s16(a)));
}
-static INLINE void store_buffer_16bit_to_32bit_w4(const int16x8_t *const in,
- int32_t *const out,
- const int stride,
- const int out_size) {
- for (int i = 0; i < out_size; ++i) {
- store_16bit_to_32bit_w4(in[i], out + i * stride);
- }
+static AOM_FORCE_INLINE int16x4_t
+round_shift_2sqrt2_s16_s16_4x1_neon(int16x4_t a) {
+ return vqrshrn_n_s32(vmull_n_s16(a, 2 * NewSqrt2), NewSqrt2Bits);
}
-static INLINE void store_buffer_16bit_to_32bit_w8(const int16x8_t *const in,
- int32_t *const out,
- const int stride,
- const int out_size) {
- for (int i = 0; i < out_size; ++i) {
- store_16bit_to_32bit(in[i], out + i * stride);
- }
+static AOM_FORCE_INLINE int16x8_t
+round_shift_2sqrt2_s16_s16_8x1_neon(int16x8_t a) {
+ return vcombine_s16(round_shift_2sqrt2_s16_s16_4x1_neon(vget_low_s16(a)),
+ round_shift_2sqrt2_s16_s16_4x1_neon(vget_high_s16(a)));
}
-static INLINE void store_rect_buffer_16bit_to_32bit_w4(
- const int16x8_t *const in, int32_t *const out, const int stride,
- const int out_size) {
- const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
- const int32x4_t v_newsqrt2bits = vdupq_n_s32(-NewSqrt2Bits);
- for (int i = 0; i < out_size; ++i) {
- store_rect_16bit_to_32bit_w4(in[i], out + i * stride, &v_newsqrt2,
- &v_newsqrt2bits);
- }
+static AOM_FORCE_INLINE int32x4_t
+round_shift_sqrt2_s16_s32_4x1_neon(int16x4_t a) {
+ return vrshrq_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits);
}
-static INLINE void store_rect_buffer_16bit_to_32bit_w8(
- const int16x8_t *const in, int32_t *const out, const int stride,
- const int out_size) {
- const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
- const int32x4_t v_newsqrt2bits = vdupq_n_s32(-NewSqrt2Bits);
- for (int i = 0; i < out_size; ++i) {
- store_rect_16bit_to_32bit(in[i], out + i * stride, &v_newsqrt2,
- &v_newsqrt2bits);
- }
+static AOM_FORCE_INLINE int32x4_t
+round_shift_sqrt2_s32_s32_4x1_neon(int32x4_t a) {
+ return vrshrq_n_s32(vmulq_n_s32(a, NewSqrt2), NewSqrt2Bits);
}
-static INLINE void round_shift_16bit(int16x8_t *in, int size, int bit) {
- const int16x8_t vbit = vdupq_n_s16(bit);
- for (int i = 0; i < size; ++i) {
- in[i] = vrshlq_s16(in[i], vbit);
+#define ROUND_SHIFT_SQRT_LOOP_HELPER(name, type0, type1, fn) \
+ static AOM_FORCE_INLINE void name(const type0 *in, type1 *out, int size) { \
+ for (int i = 0; i < size; ++i) { \
+ out[i] = fn(in[i]); \
+ } \
}
-}
-static INLINE void round_shift_16bit_vector(int16x8_t *in, int size,
- const int16x8_t *v_bit) {
- for (int i = 0; i < size; ++i) {
- in[i] = vrshlq_s16(in[i], *v_bit);
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s32_s32_4xn_neon, int32x4_t,
+ int32x4_t, round_shift_sqrt2_s32_s32_4x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_4xn_neon, int16x4_t,
+ int16x4_t, round_shift_sqrt2_s16_s16_4x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_8xn_neon, int16x8_t,
+ int16x8_t, round_shift_sqrt2_s16_s16_8x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_4xn_neon, int16x4_t,
+ int16x4_t, round_shift_2sqrt2_s16_s16_4x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_8xn_neon, int16x8_t,
+ int16x8_t, round_shift_2sqrt2_s16_s16_8x1_neon)
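+
+// Each ROUND_SHIFT_SQRT_LOOP_HELPER line above expands to a helper of the
+// form:
+//   static AOM_FORCE_INLINE void round_shift_sqrt2_s32_s32_4xn_neon(
+//       const int32x4_t *in, int32x4_t *out, int size);
+// applying the named per-vector kernel to each of `size` vectors.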
+
+static AOM_FORCE_INLINE void store_rect_buffer_s16_x4(const int16x4_t *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ vst1q_s32(out + i * stride, round_shift_sqrt2_s16_s32_4x1_neon(in[i]));
}
}
-void av1_fadst4x4_neon(const int16x8_t *input, int16x8_t *output,
- int8_t cos_bit, const int8_t *stage_range) {
- (void)stage_range;
- const int32_t *sinpi = sinpi_arr(cos_bit);
+static AOM_FORCE_INLINE void store_rect_buffer_s16_x8(const int16x8_t *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ vst1q_s32(out + i * stride + 0,
+ round_shift_sqrt2_s16_s32_4x1_neon(vget_low_s16(in[i])));
+ vst1q_s32(out + i * stride + 4,
+ round_shift_sqrt2_s16_s32_4x1_neon(vget_high_s16(in[i])));
+ }
+}
+static AOM_FORCE_INLINE void fadst4x4_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
int32x4_t u[6], v[6];
-
- u[0] = vmovl_s16(vget_low_s16(input[0]));
- u[1] = vmovl_s16(vget_low_s16(input[1]));
- u[2] = vmovl_s16(vget_low_s16(input[2]));
- u[3] = vmovl_s16(vget_low_s16(input[3]));
- u[4] = vaddq_s32(u[0], u[1]);
- v[5] = vmulq_n_s32(u[2], sinpi[3]);
- v[0] = vmulq_n_s32(u[1], sinpi[2]);
- v[0] = vmlaq_n_s32(v[0], u[0], sinpi[1]);
- v[1] = vmlaq_n_s32(v[5], u[3], sinpi[4]);
- v[2] = vmulq_n_s32(u[4], sinpi[3]);
- v[3] = vmulq_n_s32(u[0], sinpi[4]);
- v[3] = vmlsq_n_s32(v[3], u[1], sinpi[1]);
- v[4] = vmlsq_n_s32(v[5], u[3], sinpi[2]);
+ const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit));
+ const int16x4_t u01 = vqadd_s16(input[0], input[1]);
+
+ v[5] = vmull_lane_s16(input[2], sinpi, 2);
+ v[0] = vmull_lane_s16(input[1], sinpi, 1);
+ v[0] = vmlal_lane_s16(v[0], input[0], sinpi, 0);
+ v[1] = vmlal_lane_s16(v[5], input[3], sinpi, 3);
+ v[2] = vmull_lane_s16(u01, sinpi, 2);
+ v[3] = vmull_lane_s16(input[0], sinpi, 3);
+ v[3] = vmlsl_lane_s16(v[3], input[1], sinpi, 0);
+ v[4] = vmlsl_lane_s16(v[5], input[3], sinpi, 1);
u[0] = vaddq_s32(v[0], v[1]);
- u[1] = vmlsq_n_s32(v[2], u[3], sinpi[3]);
+ u[1] = vmlsl_lane_s16(v[2], input[3], sinpi, 2);
u[2] = vsubq_s32(v[3], v[4]);
u[3] = vsubq_s32(u[2], u[0]);
- u[5] = vmlaq_n_s32(u[3], v[5], 3);
-
- int32x4_t vshift = vdupq_n_s32(-cos_bit);
- u[0] = vrshlq_s32(u[0], vshift);
- u[1] = vrshlq_s32(u[1], vshift);
- u[2] = vrshlq_s32(u[2], vshift);
- u[3] = vrshlq_s32(u[5], vshift);
+ u[3] = vmlaq_n_s32(u[3], v[5], 3);
- output[0] = custom_packs_s32(u[0], u[2]);
-
- output[1] = custom_packs_s32(u[1], u[3]);
- output[2] = vextq_s16(output[0], output[0], 4);
- output[3] = vextq_s16(output[1], output[1], 4);
+ output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX);
+ output[1] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX);
+ output[2] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX);
+ output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX);
}
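+// Per column, with x0..x3 the inputs and s1..s4 = sinpi[1..4] (lanes 0..3 of
+// the q13 sinpi table), fadst4x4_neon computes:
+//   output[0] = s1*x0 + s2*x1 + s3*x2 + s4*x3
+//   output[1] = s3*(x0 + x1 - x3)
+//   output[2] = s4*x0 - s1*x1 - s3*x2 + s2*x3
+//   output[3] = s2*x0 - s4*x1 + s3*x2 - s1*x3
+// each rounded right by TXFM_COS_BIT_MAX. The last row relies on the
+// identity sinpi[1] + sinpi[2] == sinpi[4]; the final
+// vmlaq_n_s32(u[3], v[5], 3) supplies the 3*s3*x2 term that completes it.
+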
-#define btf_16_w4_neon(w0_l, w0_h, w1_l, w1_h, in0, in1, out0, out1, \
- v_cos_bit) \
- do { \
- int32x4_t in0_l = vmovl_s16(vget_low_s16(in0)); \
- int32x4_t in1_l = vmovl_s16(vget_low_s16(in1)); \
- int32x4_t u0 = vmulq_n_s32(in0_l, w0_l); \
- u0 = vmlaq_n_s32(u0, in1_l, w0_h); \
- int32x4_t v0 = vmulq_n_s32(in0_l, w1_l); \
- v0 = vmlaq_n_s32(v0, in1_l, w1_h); \
- int32x4_t c0 = vrshlq_s32(u0, v_cos_bit); \
- int32x4_t d0 = vrshlq_s32(v0, v_cos_bit); \
- const int16x4_t c1 = vqmovn_s32(c0); \
- const int16x4_t d1 = vqmovn_s32(d0); \
- out0 = vcombine_s16(c1, c1); \
- out1 = vcombine_s16(d1, c1); \
- } while (0)
+static AOM_FORCE_INLINE void fadst4x8_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
-#define btf_16_w4_neon_mode0(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
- do { \
- int32x4_t in0_l = vmovl_s16(vget_low_s16(in0)); \
- int32x4_t in1_l = vmovl_s16(vget_low_s16(in1)); \
- int32x4_t u0 = vmulq_n_s32(in1_l, w0_h); \
- u0 = vmlsq_n_s32(u0, in0_l, w0_l); \
- int32x4_t v0 = vmulq_n_s32(in0_l, w0_h); \
- v0 = vmlaq_n_s32(v0, in1_l, w0_l); \
- int32x4_t c0 = vrshlq_s32(u0, v_cos_bit); \
- int32x4_t d0 = vrshlq_s32(v0, v_cos_bit); \
- const int16x4_t c1 = vqmovn_s32(c0); \
- const int16x4_t d1 = vqmovn_s32(d0); \
- out0 = vcombine_s16(c1, c1); \
- out1 = vcombine_s16(d1, c1); \
- } while (0)
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
-#define btf_16_w4_neon_mode2(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
- do { \
- int32x4_t in0_l = vmovl_s16(vget_low_s16(in0)); \
- int32x4_t in1_l = vmovl_s16(vget_low_s16(in1)); \
- int32x4_t u0 = vmulq_n_s32(in0_l, w0_l); \
- u0 = vmlaq_n_s32(u0, in1_l, w0_h); \
- int32x4_t v0 = vmulq_n_s32(in1_l, w0_l); \
- v0 = vmlsq_n_s32(v0, in0_l, w0_h); \
- int32x4_t c0 = vrshlq_s32(u0, v_cos_bit); \
- int32x4_t d0 = vrshlq_s32(v0, v_cos_bit); \
- const int16x4_t c1 = vqmovn_s32(c0); \
- const int16x4_t d1 = vqmovn_s32(d0); \
- out0 = vcombine_s16(c1, c1); \
- out1 = vcombine_s16(d1, c1); \
- } while (0)
-
-#define btf_16_w4_neon_mode3(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
- do { \
- int32x4_t in0_l = vmovl_s16(vget_low_s16(in0)); \
- int32x4_t in1_l = vmovl_s16(vget_low_s16(in1)); \
- int32x4_t u0 = vmulq_n_s32(in0_l, w0_l); \
- u0 = vmlaq_n_s32(u0, in1_l, w0_h); \
- int32x4_t v0 = vmulq_n_s32(in0_l, w0_h); \
- v0 = vmlsq_n_s32(v0, in1_l, w0_l); \
- int32x4_t c0 = vrshlq_s32(u0, v_cos_bit); \
- int32x4_t d0 = vrshlq_s32(v0, v_cos_bit); \
- const int16x4_t c1 = vqmovn_s32(c0); \
- const int16x4_t d1 = vqmovn_s32(d0); \
- out0 = vcombine_s16(c1, c1); \
- out1 = vcombine_s16(d1, c1); \
- } while (0)
-
-static void fadst4x8_neon(const int16x8_t *input, int16x8_t *output,
- int8_t cos_bit, const int8_t *stage_range) {
- (void)stage_range;
- const int32_t *cospi = cospi_arr(cos_bit);
- const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
// stage 1-2
- int16x8_t x2[8];
- btf_16_w4_neon_mode3(cospi[32], cospi[32], vqnegq_s16(input[3]), input[4],
- x2[2], x2[3], v_cos_bit);
- btf_16_w4_neon_mode3(cospi[32], cospi[32], input[2], vqnegq_s16(input[5]),
- x2[6], x2[7], v_cos_bit);
+ int16x4_t x2[8];
+ butterfly_s16_s32_x4_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]);
+ butterfly_s16_s32_x4_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]);
// stage 3
- int16x8_t x3[8];
- x3[0] = vqaddq_s16(input[0], x2[2]);
- x3[2] = vqsubq_s16(input[0], x2[2]);
- x3[1] = vqsubq_s16(x2[3], input[7]);
- x3[3] = vqsubq_s16(vqnegq_s16(input[7]), x2[3]);
- x3[4] = vqaddq_s16(vqnegq_s16(input[1]), x2[6]);
- x3[6] = vqsubq_s16(vqnegq_s16(input[1]), x2[6]);
- x3[5] = vqaddq_s16(input[6], x2[7]);
- x3[7] = vqsubq_s16(input[6], x2[7]);
+ int16x4_t x3[8];
+ x3[0] = vqadd_s16(input[0], x2[2]);
+ x3[1] = vqsub_s16(x2[3], input[7]);
+ x3[2] = vqsub_s16(input[0], x2[2]);
+ x3[3] = vqadd_s16(input[7], x2[3]);
+ x3[4] = vqsub_s16(x2[6], input[1]);
+ x3[5] = vqadd_s16(input[6], x2[7]);
+ x3[6] = vqadd_s16(input[1], x2[6]);
+ x3[7] = vqsub_s16(input[6], x2[7]);
// stage 4
- int16x8_t x4[8];
-
- btf_16_w4_neon_mode3(cospi[16], cospi[48], x3[4], x3[5], x4[4], x4[5],
- v_cos_bit);
- btf_16_w4_neon_mode0(cospi[48], cospi[16], x3[6], x3[7], x4[6], x4[7],
- v_cos_bit);
+ int16x4_t x4[8];
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x4[4], &x4[5]);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x4[6], &x4[7]);
// stage 5
- int16x8_t x5[8];
- x5[0] = vqaddq_s16(x3[0], x4[4]);
- x5[4] = vqsubq_s16(x3[0], x4[4]);
- x5[1] = vqaddq_s16(x3[1], x4[5]);
- x5[5] = vqsubq_s16(x3[1], x4[5]);
- x5[2] = vqaddq_s16(x3[2], x4[6]);
- x5[6] = vqsubq_s16(x3[2], x4[6]);
- x5[3] = vqaddq_s16(x3[3], x4[7]);
- x5[7] = vqsubq_s16(x3[3], x4[7]);
+ int16x4_t x5[8];
+ x5[0] = vqadd_s16(x3[0], x4[4]);
+ x5[1] = vqadd_s16(x3[1], x4[5]);
+ x5[2] = vqadd_s16(x3[2], x4[6]);
+ x5[3] = vqsub_s16(x4[7], x3[3]);
+ x5[4] = vqsub_s16(x3[0], x4[4]);
+ x5[5] = vqsub_s16(x3[1], x4[5]);
+ x5[6] = vqsub_s16(x3[2], x4[6]);
+ x5[7] = vqadd_s16(x3[3], x4[7]);
// stage 6-7
- btf_16_w4_neon_mode3(cospi[4], cospi[60], x5[0], x5[1], output[7], output[0],
- v_cos_bit);
- btf_16_w4_neon_mode3(cospi[20], cospi[44], x5[2], x5[3], output[5], output[2],
- v_cos_bit);
- btf_16_w4_neon_mode3(cospi[36], cospi[28], x5[4], x5[5], output[3], output[4],
- v_cos_bit);
- btf_16_w4_neon_mode3(cospi[52], cospi[12], x5[6], x5[7], output[1], output[6],
- v_cos_bit);
+ butterfly_s16_s32_x4_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]);
+ butterfly_s16_s32_x4_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]);
+ butterfly_s16_s32_x4_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]);
+ butterfly_s16_s32_x4_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]);
}
-static void fadst8x4_neon(const int16x8_t *input, int16x8_t *output,
- int8_t cos_bit, const int8_t *stage_range) {
- (void)stage_range;
- const int32_t *sinpi = sinpi_arr(cos_bit);
-
- const int16x8_t in7 = vaddq_s16(input[0], input[1]);
- int32x4_t u_lo[8], u_hi[8], v_hi[8];
-
- int32x4_t in0_l = vmovl_s16(vget_low_s16(input[0]));
- int32x4_t in0_h = vmovl_s16(vget_high_s16(input[0]));
- int32x4_t in1_l = vmovl_s16(vget_low_s16(input[1]));
- int32x4_t in1_h = vmovl_s16(vget_high_s16(input[1]));
- int32x4_t in2_l = vmovl_s16(vget_low_s16(input[2]));
- int32x4_t in2_h = vmovl_s16(vget_high_s16(input[2]));
- int32x4_t in3_l = vmovl_s16(vget_low_s16(input[3]));
- int32x4_t in3_h = vmovl_s16(vget_high_s16(input[3]));
- int32x4_t in7_l = vmovl_s16(vget_low_s16(in7));
- int32x4_t in7_h = vmovl_s16(vget_high_s16(in7));
-
- u_lo[0] = vmulq_n_s32(in1_l, sinpi[2]);
- u_lo[0] = vmlaq_n_s32(u_lo[0], in0_l, sinpi[1]);
-
- u_hi[0] = vmulq_n_s32(in1_h, sinpi[2]);
- u_hi[0] = vmlaq_n_s32(u_hi[0], in0_h, sinpi[1]);
+static AOM_FORCE_INLINE void fadst8x4_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ int32x4_t u_lo[4], u_hi[4];
+ const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit));
+ const int16x8_t u01 = vqaddq_s16(input[0], input[1]);
- u_lo[0] = vmlaq_n_s32(u_lo[0], in3_l, sinpi[4]);
- u_lo[0] = vmlaq_n_s32(u_lo[0], in2_l, sinpi[3]);
+ u_lo[0] = vmull_lane_s16(vget_low_s16(input[1]), sinpi, 1);
+ u_hi[0] = vmull_lane_s16(vget_high_s16(input[1]), sinpi, 1);
- u_hi[0] = vmlaq_n_s32(u_hi[0], in3_h, sinpi[4]);
- u_hi[0] = vmlaq_n_s32(u_hi[0], in2_h, sinpi[3]);
+ u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[0]), sinpi, 0);
+ u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[0]), sinpi, 0);
- u_lo[1] = vmulq_n_s32(in7_l, sinpi[3]);
+ u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[3]), sinpi, 3);
+ u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[3]), sinpi, 3);
- v_hi[2] = vmulq_n_s32(in7_h, sinpi[3]);
- u_lo[2] = vmulq_n_s32(in0_l, sinpi[4]);
- u_lo[2] = vmlsq_n_s32(u_lo[2], in1_l, sinpi[1]);
+ u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[2]), sinpi, 2);
+ u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[2]), sinpi, 2);
- u_hi[2] = vmulq_n_s32(in0_h, sinpi[4]);
- u_hi[2] = vmlsq_n_s32(u_hi[2], in1_h, sinpi[1]);
+ u_lo[1] = vmull_lane_s16(vget_low_s16(u01), sinpi, 2);
+ u_hi[1] = vmull_lane_s16(vget_high_s16(u01), sinpi, 2);
- u_lo[2] = vmlaq_n_s32(u_lo[2], in3_l, sinpi[2]);
- u_lo[2] = vmlsq_n_s32(u_lo[2], in2_l, sinpi[3]);
+ u_lo[2] = vmull_lane_s16(vget_low_s16(input[0]), sinpi, 3);
+ u_hi[2] = vmull_lane_s16(vget_high_s16(input[0]), sinpi, 3);
- u_hi[2] = vmlaq_n_s32(u_hi[2], in3_h, sinpi[2]);
- u_hi[2] = vmlsq_n_s32(u_hi[2], in2_h, sinpi[3]);
+ u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[1]), sinpi, 0);
+ u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[1]), sinpi, 0);
- u_lo[1] = vmlsq_n_s32(u_lo[1], in3_l, sinpi[3]);
+ u_lo[2] = vmlal_lane_s16(u_lo[2], vget_low_s16(input[3]), sinpi, 1);
+ u_hi[2] = vmlal_lane_s16(u_hi[2], vget_high_s16(input[3]), sinpi, 1);
- const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+ u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[2]), sinpi, 2);
+ u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[2]), sinpi, 2);
- u_hi[1] = vmlsq_n_s32(v_hi[2], in3_h, sinpi[3]);
+ u_lo[1] = vmlsl_lane_s16(u_lo[1], vget_low_s16(input[3]), sinpi, 2);
+ u_hi[1] = vmlsl_lane_s16(u_hi[1], vget_high_s16(input[3]), sinpi, 2);
u_lo[3] = vsubq_s32(u_lo[2], u_lo[0]);
u_hi[3] = vsubq_s32(u_hi[2], u_hi[0]);
- u_lo[6] = vmlaq_n_s32(u_lo[3], in2_l, sinpi[3] * 3);
- u_hi[6] = vmlaq_n_s32(u_hi[3], in2_h, sinpi[3] * 3);
-
- u_lo[0] = vrshlq_s32(u_lo[0], v_cos_bit);
- u_hi[0] = vrshlq_s32(u_hi[0], v_cos_bit);
- u_lo[1] = vrshlq_s32(u_lo[1], v_cos_bit);
- u_hi[1] = vrshlq_s32(u_hi[1], v_cos_bit);
- u_lo[2] = vrshlq_s32(u_lo[2], v_cos_bit);
- u_hi[2] = vrshlq_s32(u_hi[2], v_cos_bit);
- u_lo[3] = vrshlq_s32(u_lo[6], v_cos_bit);
- u_hi[3] = vrshlq_s32(u_hi[6], v_cos_bit);
-
- output[0] = custom_packs_s32(u_lo[0], u_hi[0]);
- output[1] = custom_packs_s32(u_lo[1], u_hi[1]);
- output[2] = custom_packs_s32(u_lo[2], u_hi[2]);
- output[3] = custom_packs_s32(u_lo[3], u_hi[3]);
+ const int16x4_t sinpix3 = vmul_n_s16(sinpi, 3);
+ u_lo[3] = vmlal_lane_s16(u_lo[3], vget_low_s16(input[2]), sinpix3, 2);
+ u_hi[3] = vmlal_lane_s16(u_hi[3], vget_high_s16(input[2]), sinpix3, 2);
+
+ output[0] = vcombine_s16(vrshrn_n_s32(u_lo[0], TXFM_COS_BIT_MAX),
+ vrshrn_n_s32(u_hi[0], TXFM_COS_BIT_MAX));
+ output[1] = vcombine_s16(vrshrn_n_s32(u_lo[1], TXFM_COS_BIT_MAX),
+ vrshrn_n_s32(u_hi[1], TXFM_COS_BIT_MAX));
+ output[2] = vcombine_s16(vrshrn_n_s32(u_lo[2], TXFM_COS_BIT_MAX),
+ vrshrn_n_s32(u_hi[2], TXFM_COS_BIT_MAX));
+ output[3] = vcombine_s16(vrshrn_n_s32(u_lo[3], TXFM_COS_BIT_MAX),
+ vrshrn_n_s32(u_hi[3], TXFM_COS_BIT_MAX));
}
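+
+// fadst8x4_neon evaluates the same 4-point ADST as fadst4x4_neon, but across
+// eight columns at a time, handling the low and high halves of each
+// int16x8_t separately (u_lo/u_hi).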
-void av1_fdct4x4_neon(const int16x8_t *input, int16x8_t *output, int8_t cos_bit,
- const int8_t *stage_range) {
- (void)stage_range;
- const int32_t *cospi = cospi_arr(cos_bit);
- const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+static AOM_FORCE_INLINE void fdct4x4_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+ const int16x4_t cospi16 = vld1_s16(&cospi[4 * 1]);
- int32x4_t u[4];
+ int16x4_t in12a = vadd_s16(input[1], input[2]);
+ int16x4_t in12s = vsub_s16(input[1], input[2]);
+ int16x4_t in03a = vadd_s16(input[0], input[3]);
+ int16x4_t in03s = vsub_s16(input[0], input[3]);
- int32x4_t in12a = vaddl_s16(vget_low_s16(input[1]), vget_low_s16(input[2]));
- int32x4_t in12s = vsubl_s16(vget_low_s16(input[1]), vget_low_s16(input[2]));
- int32x4_t in03a = vaddl_s16(vget_low_s16(input[0]), vget_low_s16(input[3]));
- int32x4_t in03s = vsubl_s16(vget_low_s16(input[0]), vget_low_s16(input[3]));
+ int32x4_t u0ad1 = vmull_n_s16(in12a, cospi[4 * 0]);
+ int32x4_t u0ad2 = vmull_n_s16(in03a, cospi[4 * 0]);
- int32x4_t u0ad1 = vmulq_n_s32(in12a, cospi[32]);
- int32x4_t u0ad2 = vmulq_n_s32(in03a, cospi[32]);
+ int32x4_t u[4];
u[0] = vaddq_s32(u0ad1, u0ad2);
u[1] = vsubq_s32(u0ad2, u0ad1);
- u[2] = vmulq_n_s32(in12s, cospi[48]);
- u[2] = vmlaq_n_s32(u[2], in03s, cospi[16]);
-
- u[3] = vmulq_n_s32(in03s, cospi[48]);
- u[3] = vmlsq_n_s32(u[3], in12s, cospi[16]);
+ u[2] = vmull_lane_s16(in12s, cospi16, 1);
+ u[2] = vmlal_lane_s16(u[2], in03s, cospi16, 0);
+ u[3] = vmull_lane_s16(in03s, cospi16, 1);
+ u[3] = vmlsl_lane_s16(u[3], in12s, cospi16, 0);
+
+ output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX);
+ output[1] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX);
+ output[2] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX);
+ output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX);
+}
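+
+// Per column, with c16/c32/c48 denoting the cospi[16]/cospi[32]/cospi[48]
+// weights, fdct4x4_neon computes:
+//   output[0] = c32*(x0 + x1 + x2 + x3)
+//   output[1] = c16*(x0 - x3) + c48*(x1 - x2)
+//   output[2] = c32*((x0 + x3) - (x1 + x2))
+//   output[3] = c48*(x0 - x3) - c16*(x1 - x2)
+// each rounded right by TXFM_COS_BIT_MAX.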
- u[0] = vrshlq_s32(u[0], v_cos_bit);
- u[1] = vrshlq_s32(u[1], v_cos_bit);
- u[2] = vrshlq_s32(u[2], v_cos_bit);
- u[3] = vrshlq_s32(u[3], v_cos_bit);
+// Butterfly pre-processing:
+// e.g. n=4:
+// out[0] = in[0] + in[3]
+// out[1] = in[1] + in[2]
+// out[2] = in[1] - in[2]
+// out[3] = in[0] - in[3]
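+//
+// The first n/2 outputs are the symmetric sums that feed the half-length
+// transform; the last n/2 are the antisymmetric differences that feed the
+// odd-frequency stages.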
+
+static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x4(const int16x4_t *input,
+ int16x4_t *output,
+ int n) {
+ for (int i = 0; i < n / 2; ++i) {
+ output[i] = vqadd_s16(input[i], input[n - i - 1]);
+ }
+ for (int i = 0; i < n / 2; ++i) {
+ output[n / 2 + i] = vqsub_s16(input[n / 2 - i - 1], input[n / 2 + i]);
+ }
+}
- output[0] = custom_packs_s32(u[0], u[1]);
- output[1] = custom_packs_s32(u[2], u[3]);
- output[2] = vextq_s16(output[0], output[0], 4);
- output[3] = vextq_s16(output[1], output[1], 4);
+static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x8(const int16x8_t *input,
+ int16x8_t *output,
+ int n) {
+ for (int i = 0; i < n / 2; ++i) {
+ output[i] = vqaddq_s16(input[i], input[n - i - 1]);
+ }
+ for (int i = 0; i < n / 2; ++i) {
+ output[n / 2 + i] = vqsubq_s16(input[n / 2 - i - 1], input[n / 2 + i]);
+ }
}
-#define btf_16_neon(w0_l, w0_h, w1_l, w1_h, in0, in1, out0, out1) \
- do { \
- int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0)); \
- int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0)); \
- int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1)); \
- int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1)); \
- int32x4_t u0 = vmulq_n_s32(in_low1, w0_h); \
- u0 = vmlaq_n_s32(u0, in_low0, w0_l); \
- int32x4_t u1 = vmulq_n_s32(in_high1, w0_h); \
- u1 = vmlaq_n_s32(u1, in_high0, w0_l); \
- int32x4_t v0 = vmulq_n_s32(in_low1, w1_h); \
- v0 = vmlaq_n_s32(v0, in_low0, w1_l); \
- int32x4_t v1 = vmulq_n_s32(in_high1, w1_h); \
- v1 = vmlaq_n_s32(v1, in_high0, w1_l); \
- int32x4_t c0 = vrshlq_s32(u0, v_cos_bit); \
- int32x4_t c1 = vrshlq_s32(u1, v_cos_bit); \
- int32x4_t d0 = vrshlq_s32(v0, v_cos_bit); \
- int32x4_t d1 = vrshlq_s32(v1, v_cos_bit); \
- out0 = custom_packs_s32(c0, c1); \
- out1 = custom_packs_s32(d0, d1); \
- } while (0)
+static AOM_FORCE_INLINE void butterfly_dct_pre_s32_x4(const int32x4_t *input,
+ int32x4_t *output,
+ int n) {
+ for (int i = 0; i < n / 2; ++i) {
+ output[i] = vqaddq_s32(input[i], input[n - i - 1]);
+ }
+ for (int i = 0; i < n / 2; ++i) {
+ output[n / 2 + i] = vqsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]);
+ }
+}
-#define btf_16_neon_mode0(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
- do { \
- int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0)); \
- int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0)); \
- int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1)); \
- int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1)); \
- int32x4_t u0 = vmulq_n_s32(in_low1, w0_h); \
- u0 = vmlsq_n_s32(u0, in_low0, w0_l); \
- int32x4_t u1 = vmulq_n_s32(in_high1, w0_h); \
- u1 = vmlsq_n_s32(u1, in_high0, w0_l); \
- int32x4_t v0 = vmulq_n_s32(in_low1, w0_l); \
- v0 = vmlaq_n_s32(v0, in_low0, w0_h); \
- int32x4_t v1 = vmulq_n_s32(in_high1, w0_l); \
- v1 = vmlaq_n_s32(v1, in_high0, w0_h); \
- int32x4_t c0 = vrshlq_s32(u0, v_cos_bit); \
- int32x4_t c1 = vrshlq_s32(u1, v_cos_bit); \
- int32x4_t d0 = vrshlq_s32(v0, v_cos_bit); \
- int32x4_t d1 = vrshlq_s32(v1, v_cos_bit); \
- out0 = custom_packs_s32(c0, c1); \
- out1 = custom_packs_s32(d0, d1); \
- } while (0)
+// Butterfly post-processing:
+// e.g. n=8:
+// out[0] = in0[0] + in1[3];
+// out[1] = in0[1] + in1[2];
+// out[2] = in0[1] - in1[2];
+// out[3] = in0[0] - in1[3];
+// out[4] = in0[7] - in1[4];
+// out[5] = in0[6] - in1[5];
+// out[6] = in0[6] + in1[5];
+// out[7] = in0[7] + in1[4];
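+//
+// i.e. the outer quarters hold sums and the middle two quarters hold
+// differences, merging the saved in0 terms with the rotated in1 terms.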
+
+static AOM_FORCE_INLINE void butterfly_dct_post_s16_x4(const int16x4_t *in0,
+ const int16x4_t *in1,
+ int16x4_t *output,
+ int n) {
+ for (int i = 0; i < n / 4; ++i) {
+ output[i] = vqadd_s16(in0[i], in1[n / 2 - i - 1]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 4 + i] = vqsub_s16(in0[n / 4 - i - 1], in1[n / 4 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 2 + i] = vqsub_s16(in0[n - i - 1], in1[n / 2 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[(3 * n) / 4 + i] =
+ vqadd_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
+ }
+}
-#define btf_16_neon_mode1(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
- do { \
- int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0)); \
- int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0)); \
- int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1)); \
- int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1)); \
- int32x4_t u0 = vmulq_n_s32(in_low0, w0_l); \
- u0 = vmlsq_n_s32(u0, in_low1, w0_h); \
- int32x4_t u1 = vmulq_n_s32(in_high0, w0_l); \
- u1 = vmlsq_n_s32(u1, in_high1, w0_h); \
- int32x4_t v0 = vmulq_n_s32(in_low1, w0_l); \
- v0 = vmlaq_n_s32(v0, in_low0, w0_h); \
- int32x4_t v1 = vmulq_n_s32(in_high1, w0_l); \
- v1 = vmlaq_n_s32(v1, in_high0, w0_h); \
- int32x4_t c0 = vrshlq_s32(u0, v_cos_bit); \
- int32x4_t c1 = vrshlq_s32(u1, v_cos_bit); \
- int32x4_t d0 = vrshlq_s32(v0, v_cos_bit); \
- int32x4_t d1 = vrshlq_s32(v1, v_cos_bit); \
- out0 = custom_packs_s32(c0, c1); \
- out1 = custom_packs_s32(d0, d1); \
- } while (0)
+static AOM_FORCE_INLINE void butterfly_dct_post_s16_x8(const int16x8_t *in0,
+ const int16x8_t *in1,
+ int16x8_t *output,
+ int n) {
+ for (int i = 0; i < n / 4; ++i) {
+ output[i] = vqaddq_s16(in0[i], in1[n / 2 - i - 1]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 4 + i] = vqsubq_s16(in0[n / 4 - i - 1], in1[n / 4 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 2 + i] = vqsubq_s16(in0[n - i - 1], in1[n / 2 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[(3 * n) / 4 + i] =
+ vqaddq_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
+ }
+}
-#define btf_16_neon_mode02(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
- do { \
- int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0)); \
- int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0)); \
- int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1)); \
- int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1)); \
- int32x4_t u0 = vmulq_n_s32(in_low1, -w0_h); \
- u0 = vmlsq_n_s32(u0, in_low0, w0_l); \
- int32x4_t u1 = vmulq_n_s32(in_high1, -w0_h); \
- u1 = vmlsq_n_s32(u1, in_high0, w0_l); \
- int32x4_t v0 = vmulq_n_s32(in_low1, w0_l); \
- v0 = vmlsq_n_s32(v0, in_low0, w0_h); \
- int32x4_t v1 = vmulq_n_s32(in_high1, w0_l); \
- v1 = vmlsq_n_s32(v1, in_high0, w0_h); \
- int32x4_t c0 = vrshlq_s32(u0, v_cos_bit); \
- int32x4_t c1 = vrshlq_s32(u1, v_cos_bit); \
- int32x4_t d0 = vrshlq_s32(v0, v_cos_bit); \
- int32x4_t d1 = vrshlq_s32(v1, v_cos_bit); \
- out0 = custom_packs_s32(c0, c1); \
- out1 = custom_packs_s32(d0, d1); \
- } while (0)
+static AOM_FORCE_INLINE void butterfly_dct_post_s32_x4(const int32x4_t *in0,
+ const int32x4_t *in1,
+ int32x4_t *output,
+ int n) {
+ for (int i = 0; i < n / 4; ++i) {
+ output[i] = vqaddq_s32(in0[i], in1[n / 2 - i - 1]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 4 + i] = vqsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 2 + i] = vqsubq_s32(in0[n - i - 1], in1[n / 2 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[(3 * n) / 4 + i] =
+ vqaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
+ }
+}
-#define btf_16_neon_mode2(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
- do { \
- int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0)); \
- int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0)); \
- int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1)); \
- int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1)); \
- int32x4_t u0 = vmulq_n_s32(in_low1, w0_h); \
- u0 = vmlaq_n_s32(u0, in_low0, w0_l); \
- int32x4_t u1 = vmulq_n_s32(in_high1, w0_h); \
- u1 = vmlaq_n_s32(u1, in_high0, w0_l); \
- int32x4_t v0 = vmulq_n_s32(in_low1, w0_l); \
- v0 = vmlsq_n_s32(v0, in_low0, w0_h); \
- int32x4_t v1 = vmulq_n_s32(in_high1, w0_l); \
- v1 = vmlsq_n_s32(v1, in_high0, w0_h); \
- int32x4_t c0 = vrshlq_s32(u0, v_cos_bit); \
- int32x4_t c1 = vrshlq_s32(u1, v_cos_bit); \
- int32x4_t d0 = vrshlq_s32(v0, v_cos_bit); \
- int32x4_t d1 = vrshlq_s32(v1, v_cos_bit); \
- out0 = custom_packs_s32(c0, c1); \
- out1 = custom_packs_s32(d0, d1); \
- } while (0)
+static AOM_FORCE_INLINE void fdct8x4_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
-#define btf_16_neon_mode3(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
- do { \
- int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0)); \
- int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0)); \
- int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1)); \
- int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1)); \
- int32x4_t u0 = vmulq_n_s32(in_low1, w0_h); \
- u0 = vmlaq_n_s32(u0, in_low0, w0_l); \
- int32x4_t u1 = vmulq_n_s32(in_high1, w0_h); \
- u1 = vmlaq_n_s32(u1, in_high0, w0_l); \
- int32x4_t v0 = vmulq_n_s32(in_low0, w0_h); \
- v0 = vmlsq_n_s32(v0, in_low1, w0_l); \
- int32x4_t v1 = vmulq_n_s32(in_high0, w0_h); \
- v1 = vmlsq_n_s32(v1, in_high1, w0_l); \
- int32x4_t c0 = vrshlq_s32(u0, v_cos_bit); \
- int32x4_t c1 = vrshlq_s32(u1, v_cos_bit); \
- int32x4_t d0 = vrshlq_s32(v0, v_cos_bit); \
- int32x4_t d1 = vrshlq_s32(v1, v_cos_bit); \
- out0 = custom_packs_s32(c0, c1); \
- out1 = custom_packs_s32(d0, d1); \
- } while (0)
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
-static void fdct8x4_neon(const int16x8_t *input, int16x8_t *output,
- int8_t cos_bit, const int8_t *stage_range) {
- (void)stage_range;
- const int32_t *cospi = cospi_arr(cos_bit);
- const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
// stage 1
int16x8_t x1[4];
- x1[0] = vqaddq_s16(input[0], input[3]);
- x1[3] = vqsubq_s16(input[0], input[3]);
- x1[1] = vqaddq_s16(input[1], input[2]);
- x1[2] = vqsubq_s16(input[1], input[2]);
+ butterfly_dct_pre_s16_x8(input, x1, 4);
// stage 2
int16x8_t x2[4];
- btf_16_neon_mode3(cospi[32], cospi[32], x1[0], x1[1], x2[0], x2[1],
- v_cos_bit);
- btf_16_neon_mode2(cospi[48], cospi[16], x1[2], x1[3], x2[2], x2[3],
- v_cos_bit);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[0], x1[1], &x2[0], &x2[1]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x1[3], x1[2], &x2[2], &x2[3]);
// stage 3
output[0] = x2[0];
@@ -806,988 +611,528 @@ static void fdct8x4_neon(const int16x8_t *input, int16x8_t *output,
output[3] = x2[3];
}
-static void fdct4x8_neon(const int16x8_t *input, int16x8_t *output,
- int8_t cos_bit, const int8_t *stage_range) {
- (void)stage_range;
- const int32_t *cospi = cospi_arr(cos_bit);
- const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+static AOM_FORCE_INLINE void fdct4x8_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
// stage 1
- int16x8_t x1[8];
- x1[0] = vqaddq_s16(input[0], input[7]);
- x1[7] = vqsubq_s16(input[0], input[7]);
- x1[1] = vqaddq_s16(input[1], input[6]);
- x1[6] = vqsubq_s16(input[1], input[6]);
- x1[2] = vqaddq_s16(input[2], input[5]);
- x1[5] = vqsubq_s16(input[2], input[5]);
- x1[3] = vqaddq_s16(input[3], input[4]);
- x1[4] = vqsubq_s16(input[3], input[4]);
+ int16x4_t x1[8];
+ butterfly_dct_pre_s16_x4(input, x1, 8);
// stage 2
- int16x8_t x2[8];
- x2[0] = vqaddq_s16(x1[0], x1[3]);
- x2[3] = vqsubq_s16(x1[0], x1[3]);
- x2[1] = vqaddq_s16(x1[1], x1[2]);
- x2[2] = vqsubq_s16(x1[1], x1[2]);
-
- btf_16_w4_neon_mode0(cospi[32], cospi[32], x1[5], x1[6], x2[5], x2[6],
- v_cos_bit);
+ int16x4_t x2[8];
+ butterfly_dct_pre_s16_x4(x1, x2, 4);
+ butterfly_s16_s32_x4_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]);
// stage 3
- int16x8_t x3[8];
- btf_16_w4_neon_mode3(cospi[32], cospi[32], x2[0], x2[1], output[0], output[4],
- v_cos_bit);
-
- btf_16_w4_neon_mode2(cospi[48], cospi[16], x2[2], x2[3], output[2], output[6],
- v_cos_bit);
- x3[4] = vqaddq_s16(x1[4], x2[5]);
- x3[5] = vqsubq_s16(x1[4], x2[5]);
- x3[6] = vqsubq_s16(x1[7], x2[6]);
- x3[7] = vqaddq_s16(x1[7], x2[6]);
+ int16x4_t x3[8];
+ butterfly_s16_s32_x4_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]);
+ butterfly_dct_post_s16_x4(x1 + 4, x2 + 4, x3 + 4, 4);
// stage 4-5
- btf_16_w4_neon_mode2(cospi[56], cospi[8], x3[4], x3[7], output[1], output[7],
- v_cos_bit);
- btf_16_w4_neon_mode2(cospi[24], cospi[40], x3[5], x3[6], output[5], output[3],
- v_cos_bit);
+ butterfly_s16_s32_x4_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]);
+ butterfly_s16_s32_x4_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]);
}
-void fdct8x8_neon(const int16x8_t *input, int16x8_t *output, int8_t cos_bit,
- const int8_t *stage_range) {
- (void)stage_range;
- const int32_t *cospi = cospi_arr(cos_bit);
- const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+static AOM_FORCE_INLINE void fdct8x8_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
// stage 1
int16x8_t x1[8];
- x1[0] = vqaddq_s16(input[0], input[7]);
- x1[7] = vqsubq_s16(input[0], input[7]);
- x1[1] = vqaddq_s16(input[1], input[6]);
- x1[6] = vqsubq_s16(input[1], input[6]);
- x1[2] = vqaddq_s16(input[2], input[5]);
- x1[5] = vqsubq_s16(input[2], input[5]);
- x1[3] = vqaddq_s16(input[3], input[4]);
- x1[4] = vqsubq_s16(input[3], input[4]);
+ butterfly_dct_pre_s16_x8(input, x1, 8);
// stage 2
int16x8_t x2[8];
- x2[0] = vqaddq_s16(x1[0], x1[3]);
- x2[3] = vqsubq_s16(x1[0], x1[3]);
- x2[1] = vqaddq_s16(x1[1], x1[2]);
- x2[2] = vqsubq_s16(x1[1], x1[2]);
- btf_16_neon_mode0(cospi[32], cospi[32], x1[5], x1[6], x2[5], x2[6],
- v_cos_bit);
+ butterfly_dct_pre_s16_x8(x1, x2, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]);
// stage 3
int16x8_t x3[8];
- btf_16_neon_mode3(cospi[32], cospi[32], x2[0], x2[1], output[0], output[4],
- v_cos_bit);
- btf_16_neon_mode2(cospi[48], cospi[16], x2[2], x2[3], output[2], output[6],
- v_cos_bit);
- x3[4] = vqaddq_s16(x1[4], x2[5]);
- x3[5] = vqsubq_s16(x1[4], x2[5]);
- x3[6] = vqsubq_s16(x1[7], x2[6]);
- x3[7] = vqaddq_s16(x1[7], x2[6]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]);
+ butterfly_dct_post_s16_x8(x1 + 4, x2 + 4, x3 + 4, 4);
// stage 4-5
- btf_16_neon_mode2(cospi[56], cospi[8], x3[4], x3[7], output[1], output[7],
- v_cos_bit);
- btf_16_neon_mode2(cospi[24], cospi[40], x3[5], x3[6], output[5], output[3],
- v_cos_bit);
+ butterfly_s16_s32_x8_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]);
+}
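+
+// fdct4x8_neon and fdct8x8_neon follow the usual radix-2 DCT split:
+// butterfly_dct_pre folds the input to half length, the even-index outputs
+// fall out of the cospi32/cospi16 butterflies, and the odd-index outputs
+// come from the cospi32 rotation plus butterfly_dct_post and a final
+// cospi8/cospi24 stage. fdct4x16_neon and fdct8x16_neon below apply the
+// same recursion one level deeper.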
+
+static AOM_FORCE_INLINE void fdct4x16_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+
+ // stage 1
+ int16x4_t x1[16];
+ butterfly_dct_pre_s16_x4(input, x1, 16);
+
+ // stage 2
+ int16x4_t x2[16];
+ butterfly_dct_pre_s16_x4(x1, x2, 8);
+ butterfly_s16_s32_x4_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]);
+ butterfly_s16_s32_x4_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]);
+
+ // stage 3
+ int16x4_t x3[16];
+ butterfly_dct_pre_s16_x4(x2, x3, 4);
+ butterfly_s16_s32_x4_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]);
+ butterfly_dct_post_s16_x4(x1 + 8, x2 + 8, x3 + 8, 8);
+
+ // stage 4
+ int16x4_t x4[16];
+ butterfly_s16_s32_x4_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[3], x3[2], &output[4],
+ &output[12]);
+ butterfly_dct_post_s16_x4(x2 + 4, x3 + 4, x4 + 4, 4);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]);
+ butterfly_s16_s32_x4_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]);
+
+ // stage 5
+ int16x4_t x5[16];
+ butterfly_s16_s32_x4_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]);
+ butterfly_s16_s32_x4_1003_neon(cospi24, x4[6], x4[5], &output[10],
+ &output[6]);
+ butterfly_dct_post_s16_x4(x3 + 8, x4 + 8, x5 + 8, 4);
+ butterfly_dct_post_s16_x4(x3 + 12, x4 + 12, x5 + 12, 4);
+
+ // stage 6-7
+ butterfly_s16_s32_x4_0112_neon(cospi4, x5[15], x5[8], &output[1],
+ &output[15]);
+ butterfly_s16_s32_x4_1003_neon(cospi28, x5[14], x5[9], &output[9],
+ &output[7]);
+ butterfly_s16_s32_x4_0112_neon(cospi20, x5[13], x5[10], &output[5],
+ &output[11]);
+ butterfly_s16_s32_x4_1003_neon(cospi12, x5[12], x5[11], &output[13],
+ &output[3]);
}
-static void fdct8x16_neon(const int16x8_t *input, int16x8_t *output,
- int8_t cos_bit, const int8_t *stage_range) {
- (void)stage_range;
- const int32_t *cospi = cospi_arr(cos_bit);
- const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+static AOM_FORCE_INLINE void fdct8x16_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
// stage 1
int16x8_t x1[16];
- x1[0] = vqaddq_s16(input[0], input[15]);
- x1[15] = vqsubq_s16(input[0], input[15]);
- x1[1] = vqaddq_s16(input[1], input[14]);
- x1[14] = vqsubq_s16(input[1], input[14]);
- x1[2] = vqaddq_s16(input[2], input[13]);
- x1[13] = vqsubq_s16(input[2], input[13]);
- x1[3] = vqaddq_s16(input[3], input[12]);
- x1[12] = vqsubq_s16(input[3], input[12]);
- x1[4] = vqaddq_s16(input[4], input[11]);
- x1[11] = vqsubq_s16(input[4], input[11]);
- x1[5] = vqaddq_s16(input[5], input[10]);
- x1[10] = vqsubq_s16(input[5], input[10]);
- x1[6] = vqaddq_s16(input[6], input[9]);
- x1[9] = vqsubq_s16(input[6], input[9]);
- x1[7] = vqaddq_s16(input[7], input[8]);
- x1[8] = vqsubq_s16(input[7], input[8]);
+ butterfly_dct_pre_s16_x8(input, x1, 16);
// stage 2
int16x8_t x2[16];
- x2[0] = vqaddq_s16(x1[0], x1[7]);
- x2[7] = vqsubq_s16(x1[0], x1[7]);
- x2[1] = vqaddq_s16(x1[1], x1[6]);
- x2[6] = vqsubq_s16(x1[1], x1[6]);
- x2[2] = vqaddq_s16(x1[2], x1[5]);
- x2[5] = vqsubq_s16(x1[2], x1[5]);
- x2[3] = vqaddq_s16(x1[3], x1[4]);
- x2[4] = vqsubq_s16(x1[3], x1[4]);
-
- btf_16_neon_mode0(cospi[32], cospi[32], x1[10], x1[13], x2[10], x2[13],
- v_cos_bit);
- btf_16_neon_mode0(cospi[32], cospi[32], x1[11], x1[12], x2[11], x2[12],
- v_cos_bit);
+ butterfly_dct_pre_s16_x8(x1, x2, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]);
// stage 3
int16x8_t x3[16];
- x3[0] = vqaddq_s16(x2[0], x2[3]);
- x3[3] = vqsubq_s16(x2[0], x2[3]);
- x3[1] = vqaddq_s16(x2[1], x2[2]);
- x3[2] = vqsubq_s16(x2[1], x2[2]);
-
- btf_16_neon_mode0(cospi[32], cospi[32], x2[5], x2[6], x3[5], x3[6],
- v_cos_bit);
-
- x3[8] = vqaddq_s16(x1[8], x2[11]);
- x3[11] = vqsubq_s16(x1[8], x2[11]);
- x3[9] = vqaddq_s16(x1[9], x2[10]);
- x3[10] = vqsubq_s16(x1[9], x2[10]);
- x3[12] = vqsubq_s16(x1[15], x2[12]);
- x3[15] = vqaddq_s16(x1[15], x2[12]);
- x3[13] = vqsubq_s16(x1[14], x2[13]);
- x3[14] = vqaddq_s16(x1[14], x2[13]);
+ butterfly_dct_pre_s16_x8(x2, x3, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]);
+ butterfly_dct_post_s16_x8(x1 + 8, x2 + 8, x3 + 8, 8);
// stage 4
int16x8_t x4[16];
- btf_16_neon(cospi[32], cospi[32], cospi[32], -cospi[32], x3[0], x3[1],
- output[0], output[8]);
- btf_16_neon(cospi[48], cospi[16], -cospi[16], cospi[48], x3[2], x3[3],
- output[4], output[12]);
- x4[4] = vqaddq_s16(x2[4], x3[5]);
- x4[5] = vqsubq_s16(x2[4], x3[5]);
- x4[6] = vqsubq_s16(x2[7], x3[6]);
- x4[7] = vqaddq_s16(x2[7], x3[6]);
- btf_16_neon_mode0(cospi[16], cospi[48], x3[9], x3[14], x4[9], x4[14],
- v_cos_bit);
- btf_16_neon_mode02(cospi[48], cospi[16], x3[10], x3[13], x4[10], x4[13],
- v_cos_bit);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[3], x3[2], &output[4],
+ &output[12]);
+ butterfly_dct_post_s16_x8(x2 + 4, x3 + 4, x4 + 4, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]);
// stage 5
int16x8_t x5[16];
-
- btf_16_neon_mode2(cospi[56], cospi[8], x4[4], x4[7], output[2], output[14],
- v_cos_bit);
- btf_16_neon_mode2(cospi[24], cospi[40], x4[5], x4[6], output[10], output[6],
- v_cos_bit);
- x5[8] = vqaddq_s16(x3[8], x4[9]);
- x5[9] = vqsubq_s16(x3[8], x4[9]);
- x5[10] = vqsubq_s16(x3[11], x4[10]);
- x5[11] = vqaddq_s16(x3[11], x4[10]);
- x5[12] = vqaddq_s16(x3[12], x4[13]);
- x5[13] = vqsubq_s16(x3[12], x4[13]);
- x5[14] = vqsubq_s16(x3[15], x4[14]);
- x5[15] = vqaddq_s16(x3[15], x4[14]);
+ butterfly_s16_s32_x8_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x4[6], x4[5], &output[10],
+ &output[6]);
+ butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 4);
+ butterfly_dct_post_s16_x8(x3 + 12, x4 + 12, x5 + 12, 4);
// stage 6-7
- btf_16_neon_mode2(cospi[60], cospi[4], x5[8], x5[15], output[1], output[15],
- v_cos_bit);
- btf_16_neon_mode2(cospi[28], cospi[36], x5[9], x5[14], output[9], output[7],
- v_cos_bit);
- btf_16_neon_mode2(cospi[44], cospi[20], x5[10], x5[13], output[5], output[11],
- v_cos_bit);
- btf_16_neon_mode2(cospi[12], cospi[52], x5[11], x5[12], output[13], output[3],
- v_cos_bit);
+ butterfly_s16_s32_x8_0112_neon(cospi4, x5[15], x5[8], &output[1],
+ &output[15]);
+ butterfly_s16_s32_x8_1003_neon(cospi28, x5[14], x5[9], &output[9],
+ &output[7]);
+ butterfly_s16_s32_x8_0112_neon(cospi20, x5[13], x5[10], &output[5],
+ &output[11]);
+ butterfly_s16_s32_x8_1003_neon(cospi12, x5[12], x5[11], &output[13],
+ &output[3]);
}
-void av1_fdct8x32_neon(const int16x8_t *input, int16x8_t *output,
- int8_t cos_bit, const int8_t *stage_range) {
- (void)stage_range;
- const int32_t *cospi = cospi_arr(cos_bit);
- const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+static AOM_FORCE_INLINE void fdct8x32_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
// stage 1
int16x8_t x1[32];
- x1[0] = vqaddq_s16(input[0], input[31]);
- x1[31] = vqsubq_s16(input[0], input[31]);
- x1[1] = vqaddq_s16(input[1], input[30]);
- x1[30] = vqsubq_s16(input[1], input[30]);
- x1[2] = vqaddq_s16(input[2], input[29]);
- x1[29] = vqsubq_s16(input[2], input[29]);
- x1[3] = vqaddq_s16(input[3], input[28]);
- x1[28] = vqsubq_s16(input[3], input[28]);
- x1[4] = vqaddq_s16(input[4], input[27]);
- x1[27] = vqsubq_s16(input[4], input[27]);
- x1[5] = vqaddq_s16(input[5], input[26]);
- x1[26] = vqsubq_s16(input[5], input[26]);
- x1[6] = vqaddq_s16(input[6], input[25]);
- x1[25] = vqsubq_s16(input[6], input[25]);
- x1[7] = vqaddq_s16(input[7], input[24]);
- x1[24] = vqsubq_s16(input[7], input[24]);
- x1[8] = vqaddq_s16(input[8], input[23]);
- x1[23] = vqsubq_s16(input[8], input[23]);
- x1[9] = vqaddq_s16(input[9], input[22]);
- x1[22] = vqsubq_s16(input[9], input[22]);
- x1[10] = vqaddq_s16(input[10], input[21]);
- x1[21] = vqsubq_s16(input[10], input[21]);
- x1[11] = vqaddq_s16(input[11], input[20]);
- x1[20] = vqsubq_s16(input[11], input[20]);
- x1[12] = vqaddq_s16(input[12], input[19]);
- x1[19] = vqsubq_s16(input[12], input[19]);
- x1[13] = vqaddq_s16(input[13], input[18]);
- x1[18] = vqsubq_s16(input[13], input[18]);
- x1[14] = vqaddq_s16(input[14], input[17]);
- x1[17] = vqsubq_s16(input[14], input[17]);
- x1[15] = vqaddq_s16(input[15], input[16]);
- x1[16] = vqsubq_s16(input[15], input[16]);
+ butterfly_dct_pre_s16_x8(input, x1, 32);
// stage 2
int16x8_t x2[32];
- x2[0] = vqaddq_s16(x1[0], x1[15]);
- x2[15] = vqsubq_s16(x1[0], x1[15]);
- x2[1] = vqaddq_s16(x1[1], x1[14]);
- x2[14] = vqsubq_s16(x1[1], x1[14]);
- x2[2] = vqaddq_s16(x1[2], x1[13]);
- x2[13] = vqsubq_s16(x1[2], x1[13]);
- x2[3] = vqaddq_s16(x1[3], x1[12]);
- x2[12] = vqsubq_s16(x1[3], x1[12]);
- x2[4] = vqaddq_s16(x1[4], x1[11]);
- x2[11] = vqsubq_s16(x1[4], x1[11]);
- x2[5] = vqaddq_s16(x1[5], x1[10]);
- x2[10] = vqsubq_s16(x1[5], x1[10]);
- x2[6] = vqaddq_s16(x1[6], x1[9]);
- x2[9] = vqsubq_s16(x1[6], x1[9]);
- x2[7] = vqaddq_s16(x1[7], x1[8]);
- x2[8] = vqsubq_s16(x1[7], x1[8]);
-
- btf_16_neon_mode0(cospi[32], cospi[32], x1[20], x1[27], x2[20], x2[27],
- v_cos_bit);
- btf_16_neon_mode0(cospi[32], cospi[32], x1[21], x1[26], x2[21], x2[26],
- v_cos_bit);
- btf_16_neon_mode0(cospi[32], cospi[32], x1[22], x1[25], x2[22], x2[25],
- v_cos_bit);
- btf_16_neon_mode0(cospi[32], cospi[32], x1[23], x1[24], x2[23], x2[24],
- v_cos_bit);
+ butterfly_dct_pre_s16_x8(x1, x2, 16);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[27], x1[20], &x2[27], &x2[20]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[26], x1[21], &x2[26], &x2[21]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[25], x1[22], &x2[25], &x2[22]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[24], x1[23], &x2[24], &x2[23]);
// stage 3
int16x8_t x3[32];
- x3[0] = vqaddq_s16(x2[0], x2[7]);
- x3[7] = vqsubq_s16(x2[0], x2[7]);
- x3[1] = vqaddq_s16(x2[1], x2[6]);
- x3[6] = vqsubq_s16(x2[1], x2[6]);
- x3[2] = vqaddq_s16(x2[2], x2[5]);
- x3[5] = vqsubq_s16(x2[2], x2[5]);
- x3[3] = vqaddq_s16(x2[3], x2[4]);
- x3[4] = vqsubq_s16(x2[3], x2[4]);
-
- btf_16_neon_mode0(cospi[32], cospi[32], x2[10], x2[13], x3[10], x3[13],
- v_cos_bit);
- btf_16_neon_mode0(cospi[32], cospi[32], x2[11], x2[12], x3[11], x3[12],
- v_cos_bit);
-
- x3[16] = vqaddq_s16(x1[16], x2[23]);
- x3[23] = vqsubq_s16(x1[16], x2[23]);
- x3[17] = vqaddq_s16(x1[17], x2[22]);
- x3[22] = vqsubq_s16(x1[17], x2[22]);
- x3[18] = vqaddq_s16(x1[18], x2[21]);
- x3[21] = vqsubq_s16(x1[18], x2[21]);
- x3[19] = vqaddq_s16(x1[19], x2[20]);
- x3[20] = vqsubq_s16(x1[19], x2[20]);
- x3[24] = vqsubq_s16(x1[31], x2[24]);
- x3[31] = vqaddq_s16(x1[31], x2[24]);
- x3[25] = vqsubq_s16(x1[30], x2[25]);
- x3[30] = vqaddq_s16(x1[30], x2[25]);
- x3[26] = vqsubq_s16(x1[29], x2[26]);
- x3[29] = vqaddq_s16(x1[29], x2[26]);
- x3[27] = vqsubq_s16(x1[28], x2[27]);
- x3[28] = vqaddq_s16(x1[28], x2[27]);
+ butterfly_dct_pre_s16_x8(x2, x3, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[13], x2[10], &x3[13], &x3[10]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[12], x2[11], &x3[12], &x3[11]);
+ butterfly_dct_post_s16_x8(x1 + 16, x2 + 16, x3 + 16, 16);
// stage 4
int16x8_t x4[32];
- x4[0] = vqaddq_s16(x3[0], x3[3]);
- x4[3] = vqsubq_s16(x3[0], x3[3]);
- x4[1] = vqaddq_s16(x3[1], x3[2]);
- x4[2] = vqsubq_s16(x3[1], x3[2]);
- btf_16_neon_mode0(cospi[32], cospi[32], x3[5], x3[6], x4[5], x4[6],
- v_cos_bit);
- x4[8] = vqaddq_s16(x2[8], x3[11]);
- x4[11] = vqsubq_s16(x2[8], x3[11]);
- x4[9] = vqaddq_s16(x2[9], x3[10]);
- x4[10] = vqsubq_s16(x2[9], x3[10]);
- x4[12] = vqsubq_s16(x2[15], x3[12]);
- x4[15] = vqaddq_s16(x2[15], x3[12]);
- x4[13] = vqsubq_s16(x2[14], x3[13]);
- x4[14] = vqaddq_s16(x2[14], x3[13]);
-
- btf_16_neon_mode0(cospi[16], cospi[48], x3[18], x3[29], x4[18], x4[29],
- v_cos_bit);
- btf_16_neon_mode0(cospi[16], cospi[48], x3[19], x3[28], x4[19], x4[28],
- v_cos_bit);
- btf_16_neon_mode02(cospi[48], cospi[16], x3[20], x3[27], x4[20], x4[27],
- v_cos_bit);
- btf_16_neon_mode02(cospi[48], cospi[16], x3[21], x3[26], x4[21], x4[26],
- v_cos_bit);
+ butterfly_dct_pre_s16_x8(x3, x4, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x3[6], x3[5], &x4[6], &x4[5]);
+ butterfly_dct_post_s16_x8(x2 + 8, x3 + 8, x4 + 8, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[29], x3[18], &x4[29], &x4[18]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[28], x3[19], &x4[28], &x4[19]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[27], x3[20], &x4[27], &x4[20]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[26], x3[21], &x4[26], &x4[21]);
// stage 5
int16x8_t x5[32];
- btf_16_neon_mode3(cospi[32], cospi[32], x4[0], x4[1], output[0], output[16],
- v_cos_bit);
- btf_16_neon_mode2(cospi[48], cospi[16], x4[2], x4[3], output[8], output[24],
- v_cos_bit);
- x5[4] = vqaddq_s16(x3[4], x4[5]);
- x5[5] = vqsubq_s16(x3[4], x4[5]);
- x5[6] = vqsubq_s16(x3[7], x4[6]);
- x5[7] = vqaddq_s16(x3[7], x4[6]);
-
- btf_16_neon_mode0(cospi[16], cospi[48], x4[9], x4[14], x5[9], x5[14],
- v_cos_bit);
- btf_16_neon_mode02(cospi[48], cospi[16], x4[10], x4[13], x5[10], x5[13],
- v_cos_bit);
-
- x5[16] = vqaddq_s16(x3[16], x4[19]);
- x5[19] = vqsubq_s16(x3[16], x4[19]);
- x5[17] = vqaddq_s16(x3[17], x4[18]);
- x5[18] = vqsubq_s16(x3[17], x4[18]);
- x5[20] = vqsubq_s16(x3[23], x4[20]);
- x5[23] = vqaddq_s16(x3[23], x4[20]);
- x5[21] = vqsubq_s16(x3[22], x4[21]);
- x5[22] = vqaddq_s16(x3[22], x4[21]);
- x5[24] = vqaddq_s16(x3[24], x4[27]);
- x5[27] = vqsubq_s16(x3[24], x4[27]);
- x5[25] = vqaddq_s16(x3[25], x4[26]);
- x5[26] = vqsubq_s16(x3[25], x4[26]);
- x5[28] = vqsubq_s16(x3[31], x4[28]);
- x5[31] = vqaddq_s16(x3[31], x4[28]);
- x5[29] = vqsubq_s16(x3[30], x4[29]);
- x5[30] = vqaddq_s16(x3[30], x4[29]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x4[0], x4[1], &output[0],
+ &output[16]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x4[3], x4[2], &output[8],
+ &output[24]);
+ butterfly_dct_post_s16_x8(x3 + 4, x4 + 4, x5 + 4, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x4[14], x4[9], &x5[14], &x5[9]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x4[13], x4[10], &x5[13], &x5[10]);
+ butterfly_dct_post_s16_x8(x3 + 16, x4 + 16, x5 + 16, 8);
+ butterfly_dct_post_s16_x8(x3 + 24, x4 + 24, x5 + 24, 8);
// stage 6
int16x8_t x6[32];
- btf_16_neon_mode2(cospi[56], cospi[8], x5[4], x5[7], output[4], output[28],
- v_cos_bit);
- btf_16_neon_mode2(cospi[24], cospi[40], x5[5], x5[6], output[20], output[12],
- v_cos_bit);
- x6[8] = vqaddq_s16(x4[8], x5[9]);
- x6[9] = vqsubq_s16(x4[8], x5[9]);
- x6[10] = vqsubq_s16(x4[11], x5[10]);
- x6[11] = vqaddq_s16(x4[11], x5[10]);
- x6[12] = vqaddq_s16(x4[12], x5[13]);
- x6[13] = vqsubq_s16(x4[12], x5[13]);
- x6[14] = vqsubq_s16(x4[15], x5[14]);
- x6[15] = vqaddq_s16(x4[15], x5[14]);
- btf_16_neon_mode0(cospi[8], cospi[56], x5[17], x5[30], x6[17], x6[30],
- v_cos_bit);
- btf_16_neon_mode02(cospi[56], cospi[8], x5[18], x5[29], x6[18], x6[29],
- v_cos_bit);
- btf_16_neon_mode0(cospi[40], cospi[24], x5[21], x5[26], x6[21], x6[26],
- v_cos_bit);
- btf_16_neon_mode02(cospi[24], cospi[40], x5[22], x5[25], x6[22], x6[25],
- v_cos_bit);
+ butterfly_s16_s32_x8_0112_neon(cospi8, x5[7], x5[4], &output[4], &output[28]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[6], x5[5], &output[20],
+ &output[12]);
+ butterfly_dct_post_s16_x8(x4 + 8, x5 + 8, x6 + 8, 4);
+ butterfly_dct_post_s16_x8(x4 + 12, x5 + 12, x6 + 12, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi8, x5[30], x5[17], &x6[30], &x6[17]);
+ butterfly_s16_s32_x8_1223_neon(cospi8, x5[29], x5[18], &x6[29], &x6[18]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[26], x5[21], &x6[26], &x6[21]);
+ butterfly_s16_s32_x8_0332_neon(cospi24, x5[25], x5[22], &x6[25], &x6[22]);
// stage 7
int16x8_t x7[32];
- btf_16_neon_mode2(cospi[60], cospi[4], x6[8], x6[15], output[2], output[30],
- v_cos_bit);
- btf_16_neon_mode2(cospi[28], cospi[36], x6[9], x6[14], output[18], output[14],
- v_cos_bit);
- btf_16_neon_mode2(cospi[44], cospi[20], x6[10], x6[13], output[10],
- output[22], v_cos_bit);
- btf_16_neon_mode2(cospi[12], cospi[52], x6[11], x6[12], output[26], output[6],
- v_cos_bit);
- x7[16] = vqaddq_s16(x5[16], x6[17]);
- x7[17] = vqsubq_s16(x5[16], x6[17]);
- x7[18] = vqsubq_s16(x5[19], x6[18]);
- x7[19] = vqaddq_s16(x5[19], x6[18]);
- x7[20] = vqaddq_s16(x5[20], x6[21]);
- x7[21] = vqsubq_s16(x5[20], x6[21]);
- x7[22] = vqsubq_s16(x5[23], x6[22]);
- x7[23] = vqaddq_s16(x5[23], x6[22]);
- x7[24] = vqaddq_s16(x5[24], x6[25]);
- x7[25] = vqsubq_s16(x5[24], x6[25]);
- x7[26] = vqsubq_s16(x5[27], x6[26]);
- x7[27] = vqaddq_s16(x5[27], x6[26]);
- x7[28] = vqaddq_s16(x5[28], x6[29]);
- x7[29] = vqsubq_s16(x5[28], x6[29]);
- x7[30] = vqsubq_s16(x5[31], x6[30]);
- x7[31] = vqaddq_s16(x5[31], x6[30]);
-
- btf_16_neon_mode2(cospi[62], cospi[2], x7[16], x7[31], output[1], output[31],
- v_cos_bit);
- btf_16_neon_mode2(cospi[30], cospi[34], x7[17], x7[30], output[17],
- output[15], v_cos_bit);
- btf_16_neon_mode2(cospi[46], cospi[18], x7[18], x7[29], output[9], output[23],
- v_cos_bit);
- btf_16_neon_mode2(cospi[14], cospi[50], x7[19], x7[28], output[25], output[7],
- v_cos_bit);
- btf_16_neon_mode2(cospi[54], cospi[10], x7[20], x7[27], output[5], output[27],
- v_cos_bit);
- btf_16_neon_mode2(cospi[22], cospi[42], x7[21], x7[26], output[21],
- output[11], v_cos_bit);
- btf_16_neon_mode2(cospi[38], cospi[26], x7[22], x7[25], output[13],
- output[19], v_cos_bit);
- btf_16_neon_mode2(cospi[6], cospi[58], x7[23], x7[24], output[29], output[3],
- v_cos_bit);
+ butterfly_s16_s32_x8_0112_neon(cospi4, x6[15], x6[8], &output[2],
+ &output[30]);
+ butterfly_s16_s32_x8_1003_neon(cospi28, x6[14], x6[9], &output[18],
+ &output[14]);
+ butterfly_s16_s32_x8_0112_neon(cospi20, x6[13], x6[10], &output[10],
+ &output[22]);
+ butterfly_s16_s32_x8_1003_neon(cospi12, x6[12], x6[11], &output[26],
+ &output[6]);
+ butterfly_dct_post_s16_x8(x5 + 16, x6 + 16, x7 + 16, 4);
+ butterfly_dct_post_s16_x8(x5 + 20, x6 + 20, x7 + 20, 4);
+ butterfly_dct_post_s16_x8(x5 + 24, x6 + 24, x7 + 24, 4);
+ butterfly_dct_post_s16_x8(x5 + 28, x6 + 28, x7 + 28, 4);
+
+ butterfly_s16_s32_x8_0112_neon(cospi2, x7[31], x7[16], &output[1],
+ &output[31]);
+ butterfly_s16_s32_x8_1003_neon(cospi30, x7[30], x7[17], &output[17],
+ &output[15]);
+ butterfly_s16_s32_x8_0112_neon(cospi18, x7[29], x7[18], &output[9],
+ &output[23]);
+ butterfly_s16_s32_x8_1003_neon(cospi14, x7[28], x7[19], &output[25],
+ &output[7]);
+ butterfly_s16_s32_x8_0112_neon(cospi10, x7[27], x7[20], &output[5],
+ &output[27]);
+ butterfly_s16_s32_x8_1003_neon(cospi22, x7[26], x7[21], &output[21],
+ &output[11]);
+ butterfly_s16_s32_x8_0112_neon(cospi26, x7[25], x7[22], &output[13],
+ &output[19]);
+ butterfly_s16_s32_x8_1003_neon(cospi6, x7[24], x7[23], &output[29],
+ &output[3]);
}
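
Each removed btf_16_neon_mode* call maps onto one of four butterfly_s16_s32_x8_{0112,1003,1223,0332}_neon variants. Matching operands against the removed code, the four-digit suffix reads as indices into the weight set {w0, w1, -w0, -w1}, so that out0 = d0*in0 + d1*in1 and out1 = d2*in0 + d3*in1, where w0 and w1 are the two lanes of the passed cospi pair (cos(a*pi/128) and cos((64-a)*pi/128) in fixed point; the cospi_arr_q13 name suggests Q13). A hypothetical stand-alone sketch of the _0112_ pattern, where the lane layout, shift amount, and saturating narrow are assumptions rather than the in-tree macro:

#include <arm_neon.h>

// Sketch of the assumed 0112 rotation: out0 = w0*in0 + w1*in1 and
// out1 = w1*in0 - w0*in1, widened to 32 bits (the _s16_s32_ infix)
// and rounded back down to 16 bits by `bits`.
static inline void butterfly_0112_sketch(int16x4_t w01, int16x8_t in0,
                                         int16x8_t in1, int16x8_t *out0,
                                         int16x8_t *out1, int bits) {
  int32x4_t t0_lo = vmull_lane_s16(vget_low_s16(in0), w01, 0);
  int32x4_t t0_hi = vmull_lane_s16(vget_high_s16(in0), w01, 0);
  t0_lo = vmlal_lane_s16(t0_lo, vget_low_s16(in1), w01, 1);
  t0_hi = vmlal_lane_s16(t0_hi, vget_high_s16(in1), w01, 1);
  int32x4_t t1_lo = vmull_lane_s16(vget_low_s16(in0), w01, 1);
  int32x4_t t1_hi = vmull_lane_s16(vget_high_s16(in0), w01, 1);
  t1_lo = vmlsl_lane_s16(t1_lo, vget_low_s16(in1), w01, 0);
  t1_hi = vmlsl_lane_s16(t1_hi, vget_high_s16(in1), w01, 0);
  const int32x4_t v_shift = vdupq_n_s32(-bits);
  *out0 = vcombine_s16(vqmovn_s32(vrshlq_s32(t0_lo, v_shift)),
                       vqmovn_s32(vrshlq_s32(t0_hi, v_shift)));
  *out1 = vcombine_s16(vqmovn_s32(vrshlq_s32(t1_lo, v_shift)),
                       vqmovn_s32(vrshlq_s32(t1_hi, v_shift)));
}

Read this way, butterfly_s16_s32_x8_0112_neon(cospi8, x5[7], x5[4], &output[4], &output[28]) in stage 6 above gives output[4] = cospi8*x5[7] + cospi56*x5[4] and output[28] = cospi56*x5[7] - cospi8*x5[4], matching the btf_16_neon_mode2(cospi[56], cospi[8], ...) call it replaces.
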
-void av1_fdct8x64_stage_1234_neon(const int16x8_t *input, int16x8_t *x3,
- int16x8_t *x4, const int32_t *cospi32,
- const int32x4_t *v_cos_bit) {
+static AOM_FORCE_INLINE void fdct8x64_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+ const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]);
+ const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]);
+ const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]);
+ const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]);
+ const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]);
+ const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]);
+ const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]);
+ const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+ const int16x4_t cospi1 = vget_low_s16(cospi1_3);
+ const int16x4_t cospi3 = vget_high_s16(cospi1_3);
+ const int16x4_t cospi5 = vget_low_s16(cospi5_7);
+ const int16x4_t cospi7 = vget_high_s16(cospi5_7);
+ const int16x4_t cospi9 = vget_low_s16(cospi9_11);
+ const int16x4_t cospi11 = vget_high_s16(cospi9_11);
+ const int16x4_t cospi13 = vget_low_s16(cospi13_15);
+ const int16x4_t cospi15 = vget_high_s16(cospi13_15);
+ const int16x4_t cospi17 = vget_low_s16(cospi17_19);
+ const int16x4_t cospi19 = vget_high_s16(cospi17_19);
+ const int16x4_t cospi21 = vget_low_s16(cospi21_23);
+ const int16x4_t cospi23 = vget_high_s16(cospi21_23);
+ const int16x4_t cospi25 = vget_low_s16(cospi25_27);
+ const int16x4_t cospi27 = vget_high_s16(cospi25_27);
+ const int16x4_t cospi29 = vget_low_s16(cospi29_31);
+ const int16x4_t cospi31 = vget_high_s16(cospi29_31);
+
+ // stage 1
int16x8_t x1[64];
+ butterfly_dct_pre_s16_x8(input, x1, 64);
+
+ // stage 2
int16x8_t x2[64];
- x1[0] = vqaddq_s16(input[0], input[63]);
- x1[63] = vqsubq_s16(input[0], input[63]);
- x1[1] = vqaddq_s16(input[1], input[62]);
- x1[62] = vqsubq_s16(input[1], input[62]);
- x1[2] = vqaddq_s16(input[2], input[61]);
- x1[61] = vqsubq_s16(input[2], input[61]);
- x1[3] = vqaddq_s16(input[3], input[60]);
- x1[60] = vqsubq_s16(input[3], input[60]);
- x1[4] = vqaddq_s16(input[4], input[59]);
- x1[59] = vqsubq_s16(input[4], input[59]);
- x1[5] = vqaddq_s16(input[5], input[58]);
- x1[58] = vqsubq_s16(input[5], input[58]);
- x1[6] = vqaddq_s16(input[6], input[57]);
- x1[57] = vqsubq_s16(input[6], input[57]);
- x1[7] = vqaddq_s16(input[7], input[56]);
- x1[56] = vqsubq_s16(input[7], input[56]);
- x1[8] = vqaddq_s16(input[8], input[55]);
- x1[55] = vqsubq_s16(input[8], input[55]);
- x1[9] = vqaddq_s16(input[9], input[54]);
- x1[54] = vqsubq_s16(input[9], input[54]);
- x1[10] = vqaddq_s16(input[10], input[53]);
- x1[53] = vqsubq_s16(input[10], input[53]);
- x1[11] = vqaddq_s16(input[11], input[52]);
- x1[52] = vqsubq_s16(input[11], input[52]);
- x1[12] = vqaddq_s16(input[12], input[51]);
- x1[51] = vqsubq_s16(input[12], input[51]);
- x1[13] = vqaddq_s16(input[13], input[50]);
- x1[50] = vqsubq_s16(input[13], input[50]);
- x1[14] = vqaddq_s16(input[14], input[49]);
- x1[49] = vqsubq_s16(input[14], input[49]);
- x1[15] = vqaddq_s16(input[15], input[48]);
- x1[48] = vqsubq_s16(input[15], input[48]);
- x1[16] = vqaddq_s16(input[16], input[47]);
- x1[47] = vqsubq_s16(input[16], input[47]);
- x1[17] = vqaddq_s16(input[17], input[46]);
- x1[46] = vqsubq_s16(input[17], input[46]);
- x1[18] = vqaddq_s16(input[18], input[45]);
- x1[45] = vqsubq_s16(input[18], input[45]);
- x1[19] = vqaddq_s16(input[19], input[44]);
- x1[44] = vqsubq_s16(input[19], input[44]);
- x1[20] = vqaddq_s16(input[20], input[43]);
- x1[43] = vqsubq_s16(input[20], input[43]);
- x1[21] = vqaddq_s16(input[21], input[42]);
- x1[42] = vqsubq_s16(input[21], input[42]);
- x1[22] = vqaddq_s16(input[22], input[41]);
- x1[41] = vqsubq_s16(input[22], input[41]);
- x1[23] = vqaddq_s16(input[23], input[40]);
- x1[40] = vqsubq_s16(input[23], input[40]);
- x1[24] = vqaddq_s16(input[24], input[39]);
- x1[39] = vqsubq_s16(input[24], input[39]);
- x1[25] = vqaddq_s16(input[25], input[38]);
- x1[38] = vqsubq_s16(input[25], input[38]);
- x1[26] = vqaddq_s16(input[26], input[37]);
- x1[37] = vqsubq_s16(input[26], input[37]);
- x1[27] = vqaddq_s16(input[27], input[36]);
- x1[36] = vqsubq_s16(input[27], input[36]);
- x1[28] = vqaddq_s16(input[28], input[35]);
- x1[35] = vqsubq_s16(input[28], input[35]);
- x1[29] = vqaddq_s16(input[29], input[34]);
- x1[34] = vqsubq_s16(input[29], input[34]);
- x1[30] = vqaddq_s16(input[30], input[33]);
- x1[33] = vqsubq_s16(input[30], input[33]);
- x1[31] = vqaddq_s16(input[31], input[32]);
- x1[32] = vqsubq_s16(input[31], input[32]);
-
- x2[0] = vqaddq_s16(x1[0], x1[31]);
- x2[31] = vqsubq_s16(x1[0], x1[31]);
- x2[1] = vqaddq_s16(x1[1], x1[30]);
- x2[30] = vqsubq_s16(x1[1], x1[30]);
- x2[2] = vqaddq_s16(x1[2], x1[29]);
- x2[29] = vqsubq_s16(x1[2], x1[29]);
- x2[3] = vqaddq_s16(x1[3], x1[28]);
- x2[28] = vqsubq_s16(x1[3], x1[28]);
- x2[4] = vqaddq_s16(x1[4], x1[27]);
- x2[27] = vqsubq_s16(x1[4], x1[27]);
- x2[5] = vqaddq_s16(x1[5], x1[26]);
- x2[26] = vqsubq_s16(x1[5], x1[26]);
- x2[6] = vqaddq_s16(x1[6], x1[25]);
- x2[25] = vqsubq_s16(x1[6], x1[25]);
- x2[7] = vqaddq_s16(x1[7], x1[24]);
- x2[24] = vqsubq_s16(x1[7], x1[24]);
- x2[8] = vqaddq_s16(x1[8], x1[23]);
- x2[23] = vqsubq_s16(x1[8], x1[23]);
- x2[9] = vqaddq_s16(x1[9], x1[22]);
- x2[22] = vqsubq_s16(x1[9], x1[22]);
- x2[10] = vqaddq_s16(x1[10], x1[21]);
- x2[21] = vqsubq_s16(x1[10], x1[21]);
- x2[11] = vqaddq_s16(x1[11], x1[20]);
- x2[20] = vqsubq_s16(x1[11], x1[20]);
- x2[12] = vqaddq_s16(x1[12], x1[19]);
- x2[19] = vqsubq_s16(x1[12], x1[19]);
- x2[13] = vqaddq_s16(x1[13], x1[18]);
- x2[18] = vqsubq_s16(x1[13], x1[18]);
- x2[14] = vqaddq_s16(x1[14], x1[17]);
- x2[17] = vqsubq_s16(x1[14], x1[17]);
- x2[15] = vqaddq_s16(x1[15], x1[16]);
- x2[16] = vqsubq_s16(x1[15], x1[16]);
-
- btf_16_neon_mode0(*cospi32, *cospi32, x1[40], x1[55], x2[40], x2[55],
- *v_cos_bit);
- btf_16_neon_mode0(*cospi32, *cospi32, x1[41], x1[54], x2[41], x2[54],
- *v_cos_bit);
- btf_16_neon_mode0(*cospi32, *cospi32, x1[42], x1[53], x2[42], x2[53],
- *v_cos_bit);
- btf_16_neon_mode0(*cospi32, *cospi32, x1[43], x1[52], x2[43], x2[52],
- *v_cos_bit);
- btf_16_neon_mode0(*cospi32, *cospi32, x1[44], x1[51], x2[44], x2[51],
- *v_cos_bit);
- btf_16_neon_mode0(*cospi32, *cospi32, x1[45], x1[50], x2[45], x2[50],
- *v_cos_bit);
- btf_16_neon_mode0(*cospi32, *cospi32, x1[46], x1[49], x2[46], x2[49],
- *v_cos_bit);
- btf_16_neon_mode0(*cospi32, *cospi32, x1[47], x1[48], x2[47], x2[48],
- *v_cos_bit);
+ butterfly_dct_pre_s16_x8(x1, x2, 32);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]);
// stage 3
- x3[0] = vqaddq_s16(x2[0], x2[15]);
- x3[15] = vqsubq_s16(x2[0], x2[15]);
- x3[1] = vqaddq_s16(x2[1], x2[14]);
- x3[14] = vqsubq_s16(x2[1], x2[14]);
- x3[2] = vqaddq_s16(x2[2], x2[13]);
- x3[13] = vqsubq_s16(x2[2], x2[13]);
- x3[3] = vqaddq_s16(x2[3], x2[12]);
- x3[12] = vqsubq_s16(x2[3], x2[12]);
- x3[4] = vqaddq_s16(x2[4], x2[11]);
- x3[11] = vqsubq_s16(x2[4], x2[11]);
- x3[5] = vqaddq_s16(x2[5], x2[10]);
- x3[10] = vqsubq_s16(x2[5], x2[10]);
- x3[6] = vqaddq_s16(x2[6], x2[9]);
- x3[9] = vqsubq_s16(x2[6], x2[9]);
- x3[7] = vqaddq_s16(x2[7], x2[8]);
- x3[8] = vqsubq_s16(x2[7], x2[8]);
+ int16x8_t x3[64];
+ butterfly_dct_pre_s16_x8(x2, x3, 16);
x3[16] = x2[16];
x3[17] = x2[17];
x3[18] = x2[18];
x3[19] = x2[19];
- btf_16_neon_mode0(*cospi32, *cospi32, x2[20], x2[27], x3[20], x3[27],
- *v_cos_bit);
- btf_16_neon_mode0(*cospi32, *cospi32, x2[21], x2[26], x3[21], x3[26],
- *v_cos_bit);
- btf_16_neon_mode0(*cospi32, *cospi32, x2[22], x2[25], x3[22], x3[25],
- *v_cos_bit);
- btf_16_neon_mode0(*cospi32, *cospi32, x2[23], x2[24], x3[23], x3[24],
- *v_cos_bit);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]);
x3[28] = x2[28];
x3[29] = x2[29];
x3[30] = x2[30];
x3[31] = x2[31];
- x3[32] = vqaddq_s16(x1[32], x2[47]);
- x3[47] = vqsubq_s16(x1[32], x2[47]);
- x3[33] = vqaddq_s16(x1[33], x2[46]);
- x3[46] = vqsubq_s16(x1[33], x2[46]);
- x3[34] = vqaddq_s16(x1[34], x2[45]);
- x3[45] = vqsubq_s16(x1[34], x2[45]);
- x3[35] = vqaddq_s16(x1[35], x2[44]);
- x3[44] = vqsubq_s16(x1[35], x2[44]);
- x3[36] = vqaddq_s16(x1[36], x2[43]);
- x3[43] = vqsubq_s16(x1[36], x2[43]);
- x3[37] = vqaddq_s16(x1[37], x2[42]);
- x3[42] = vqsubq_s16(x1[37], x2[42]);
- x3[38] = vqaddq_s16(x1[38], x2[41]);
- x3[41] = vqsubq_s16(x1[38], x2[41]);
- x3[39] = vqaddq_s16(x1[39], x2[40]);
- x3[40] = vqsubq_s16(x1[39], x2[40]);
- x3[48] = vqsubq_s16(x1[63], x2[48]);
- x3[63] = vqaddq_s16(x1[63], x2[48]);
- x3[49] = vqsubq_s16(x1[62], x2[49]);
- x3[62] = vqaddq_s16(x1[62], x2[49]);
- x3[50] = vqsubq_s16(x1[61], x2[50]);
- x3[61] = vqaddq_s16(x1[61], x2[50]);
- x3[51] = vqsubq_s16(x1[60], x2[51]);
- x3[60] = vqaddq_s16(x1[60], x2[51]);
- x3[52] = vqsubq_s16(x1[59], x2[52]);
- x3[59] = vqaddq_s16(x1[59], x2[52]);
- x3[53] = vqsubq_s16(x1[58], x2[53]);
- x3[58] = vqaddq_s16(x1[58], x2[53]);
- x3[54] = vqsubq_s16(x1[57], x2[54]);
- x3[57] = vqaddq_s16(x1[57], x2[54]);
- x3[55] = vqsubq_s16(x1[56], x2[55]);
- x3[56] = vqaddq_s16(x1[56], x2[55]);
+ butterfly_dct_post_s16_x8(x1 + 32, x2 + 32, x3 + 32, 32);
// stage 4
- x4[0] = vqaddq_s16(x3[0], x3[7]);
- x4[7] = vqsubq_s16(x3[0], x3[7]);
- x4[1] = vqaddq_s16(x3[1], x3[6]);
- x4[6] = vqsubq_s16(x3[1], x3[6]);
- x4[2] = vqaddq_s16(x3[2], x3[5]);
- x4[5] = vqsubq_s16(x3[2], x3[5]);
- x4[3] = vqaddq_s16(x3[3], x3[4]);
- x4[4] = vqsubq_s16(x3[3], x3[4]);
-
- btf_16_neon_mode0(*cospi32, *cospi32, x3[10], x3[13], x4[10], x4[13],
- *v_cos_bit);
- btf_16_neon_mode0(*cospi32, *cospi32, x3[11], x3[12], x4[11], x4[12],
- *v_cos_bit);
-
- x4[16] = vqaddq_s16(x3[16], x3[23]);
- x4[23] = vqsubq_s16(x3[16], x3[23]);
- x4[17] = vqaddq_s16(x3[17], x3[22]);
- x4[22] = vqsubq_s16(x3[17], x3[22]);
- x4[18] = vqaddq_s16(x3[18], x3[21]);
- x4[21] = vqsubq_s16(x3[18], x3[21]);
- x4[19] = vqaddq_s16(x3[19], x3[20]);
- x4[20] = vqsubq_s16(x3[19], x3[20]);
- x4[24] = vqsubq_s16(x3[31], x3[24]);
- x4[31] = vqaddq_s16(x3[31], x3[24]);
- x4[25] = vqsubq_s16(x3[30], x3[25]);
- x4[30] = vqaddq_s16(x3[30], x3[25]);
- x4[26] = vqsubq_s16(x3[29], x3[26]);
- x4[29] = vqaddq_s16(x3[29], x3[26]);
- x4[27] = vqsubq_s16(x3[28], x3[27]);
- x4[28] = vqaddq_s16(x3[28], x3[27]);
-}
-
-void av1_fdct8x64_neon(const int16x8_t *input, int16x8_t *output,
- int8_t cos_bit, const int8_t *stage_range) {
- (void)stage_range;
- const int32_t *cospi = cospi_arr(cos_bit);
- const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
-
- int16x8_t x3[64];
int16x8_t x4[64];
-
- av1_fdct8x64_stage_1234_neon(input, x3, x4, &cospi[32], &v_cos_bit);
-
- btf_16_neon_mode0(cospi[16], cospi[48], x3[36], x3[59], x4[36], x4[59],
- v_cos_bit);
- btf_16_neon_mode0(cospi[16], cospi[48], x3[37], x3[58], x4[37], x4[58],
- v_cos_bit);
- btf_16_neon_mode0(cospi[16], cospi[48], x3[38], x3[57], x4[38], x4[57],
- v_cos_bit);
- btf_16_neon_mode0(cospi[16], cospi[48], x3[39], x3[56], x4[39], x4[56],
- v_cos_bit);
- btf_16_neon_mode02(cospi[48], cospi[16], x3[40], x3[55], x4[40], x4[55],
- v_cos_bit);
- btf_16_neon_mode02(cospi[48], cospi[16], x3[41], x3[54], x4[41], x4[54],
- v_cos_bit);
- btf_16_neon_mode02(cospi[48], cospi[16], x3[42], x3[53], x4[42], x4[53],
- v_cos_bit);
- btf_16_neon_mode02(cospi[48], cospi[16], x3[43], x3[52], x4[43], x4[52],
- v_cos_bit);
+ butterfly_dct_pre_s16_x8(x3, x4, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]);
+ butterfly_dct_post_s16_x8(x3 + 16, x3 + 16, x4 + 16, 16);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]);
// stage 5
int16x8_t x5[64];
- x5[0] = vqaddq_s16(x4[0], x4[3]);
- x5[3] = vqsubq_s16(x4[0], x4[3]);
- x5[1] = vqaddq_s16(x4[1], x4[2]);
- x5[2] = vqsubq_s16(x4[1], x4[2]);
-
- btf_16_neon_mode0(cospi[32], cospi[32], x4[5], x4[6], x5[5], x5[6],
- v_cos_bit);
-
- x5[8] = vqaddq_s16(x3[8], x4[11]);
- x5[11] = vqsubq_s16(x3[8], x4[11]);
- x5[9] = vqaddq_s16(x3[9], x4[10]);
- x5[10] = vqsubq_s16(x3[9], x4[10]);
- x5[12] = vqsubq_s16(x3[15], x4[12]);
- x5[15] = vqaddq_s16(x3[15], x4[12]);
- x5[13] = vqsubq_s16(x3[14], x4[13]);
- x5[14] = vqaddq_s16(x3[14], x4[13]);
-
- btf_16_neon_mode0(cospi[16], cospi[48], x4[18], x4[29], x5[18], x5[29],
- v_cos_bit);
- btf_16_neon_mode0(cospi[16], cospi[48], x4[19], x4[28], x5[19], x5[28],
- v_cos_bit);
- btf_16_neon_mode02(cospi[48], cospi[16], x4[20], x4[27], x5[20], x5[27],
- v_cos_bit);
- btf_16_neon_mode02(cospi[48], cospi[16], x4[21], x4[26], x5[21], x5[26],
- v_cos_bit);
-
- x5[32] = vqaddq_s16(x3[32], x4[39]);
- x5[39] = vqsubq_s16(x3[32], x4[39]);
- x5[33] = vqaddq_s16(x3[33], x4[38]);
- x5[38] = vqsubq_s16(x3[33], x4[38]);
- x5[34] = vqaddq_s16(x3[34], x4[37]);
- x5[37] = vqsubq_s16(x3[34], x4[37]);
- x5[35] = vqaddq_s16(x3[35], x4[36]);
- x5[36] = vqsubq_s16(x3[35], x4[36]);
- x5[40] = vqsubq_s16(x3[47], x4[40]);
- x5[47] = vqaddq_s16(x3[47], x4[40]);
- x5[41] = vqsubq_s16(x3[46], x4[41]);
- x5[46] = vqaddq_s16(x3[46], x4[41]);
- x5[42] = vqsubq_s16(x3[45], x4[42]);
- x5[45] = vqaddq_s16(x3[45], x4[42]);
- x5[43] = vqsubq_s16(x3[44], x4[43]);
- x5[44] = vqaddq_s16(x3[44], x4[43]);
- x5[48] = vqaddq_s16(x3[48], x4[55]);
- x5[55] = vqsubq_s16(x3[48], x4[55]);
- x5[49] = vqaddq_s16(x3[49], x4[54]);
- x5[54] = vqsubq_s16(x3[49], x4[54]);
- x5[50] = vqaddq_s16(x3[50], x4[53]);
- x5[53] = vqsubq_s16(x3[50], x4[53]);
- x5[51] = vqaddq_s16(x3[51], x4[52]);
- x5[52] = vqsubq_s16(x3[51], x4[52]);
- x5[56] = vqsubq_s16(x3[63], x4[56]);
- x5[63] = vqaddq_s16(x3[63], x4[56]);
- x5[57] = vqsubq_s16(x3[62], x4[57]);
- x5[62] = vqaddq_s16(x3[62], x4[57]);
- x5[58] = vqsubq_s16(x3[61], x4[58]);
- x5[61] = vqaddq_s16(x3[61], x4[58]);
- x5[59] = vqsubq_s16(x3[60], x4[59]);
- x5[60] = vqaddq_s16(x3[60], x4[59]);
+ butterfly_dct_pre_s16_x8(x4, x5, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]);
+ butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]);
+ butterfly_dct_post_s16_x8(x3 + 32, x4 + 32, x5 + 32, 16);
+ butterfly_dct_post_s16_x8(x3 + 48, x4 + 48, x5 + 48, 16);
// stage 6
int16x8_t x6[64];
- btf_16_neon_mode2(cospi[32], cospi[32], x5[0], x5[1], x6[0], x6[1],
- v_cos_bit);
- btf_16_neon_mode2(cospi[48], cospi[16], x5[2], x5[3], x6[2], x6[3],
- v_cos_bit);
- x6[4] = vqaddq_s16(x4[4], x5[5]);
- x6[5] = vqsubq_s16(x4[4], x5[5]);
- x6[6] = vqsubq_s16(x4[7], x5[6]);
- x6[7] = vqaddq_s16(x4[7], x5[6]);
-
- btf_16_neon_mode0(cospi[16], cospi[48], x5[9], x5[14], x6[9], x6[14],
- v_cos_bit);
- btf_16_neon_mode02(cospi[48], cospi[16], x5[10], x5[13], x6[10], x6[13],
- v_cos_bit);
-
- x6[16] = vqaddq_s16(x4[16], x5[19]);
- x6[19] = vqsubq_s16(x4[16], x5[19]);
- x6[17] = vqaddq_s16(x4[17], x5[18]);
- x6[18] = vqsubq_s16(x4[17], x5[18]);
- x6[20] = vqsubq_s16(x4[23], x5[20]);
- x6[23] = vqaddq_s16(x4[23], x5[20]);
- x6[21] = vqsubq_s16(x4[22], x5[21]);
- x6[22] = vqaddq_s16(x4[22], x5[21]);
- x6[24] = vqaddq_s16(x4[24], x5[27]);
- x6[27] = vqsubq_s16(x4[24], x5[27]);
- x6[25] = vqaddq_s16(x4[25], x5[26]);
- x6[26] = vqsubq_s16(x4[25], x5[26]);
- x6[28] = vqsubq_s16(x4[31], x5[28]);
- x6[31] = vqaddq_s16(x4[31], x5[28]);
- x6[29] = vqsubq_s16(x4[30], x5[29]);
- x6[30] = vqaddq_s16(x4[30], x5[29]);
-
- btf_16_neon_mode0(cospi[8], cospi[56], x5[34], x5[61], x6[34], x6[61],
- v_cos_bit);
- btf_16_neon_mode0(cospi[8], cospi[56], x5[35], x5[60], x6[35], x6[60],
- v_cos_bit);
- btf_16_neon_mode02(cospi[56], cospi[8], x5[36], x5[59], x6[36], x6[59],
- v_cos_bit);
- btf_16_neon_mode02(cospi[56], cospi[8], x5[37], x5[58], x6[37], x6[58],
- v_cos_bit);
- btf_16_neon_mode0(cospi[40], cospi[24], x5[42], x5[53], x6[42], x6[53],
- v_cos_bit);
- btf_16_neon_mode0(cospi[40], cospi[24], x5[43], x5[52], x6[43], x6[52],
- v_cos_bit);
- btf_16_neon_mode02(cospi[24], cospi[40], x5[44], x5[51], x6[44], x6[51],
- v_cos_bit);
- btf_16_neon_mode02(cospi[24], cospi[40], x5[45], x5[50], x6[45], x6[50],
- v_cos_bit);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x5[1], x5[0], &x6[0], &x6[1]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]);
+ butterfly_dct_post_s16_x8(x4 + 4, x5 + 4, x6 + 4, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]);
+ butterfly_dct_post_s16_x8(x4 + 16, x5 + 16, x6 + 16, 8);
+ butterfly_dct_post_s16_x8(x4 + 24, x5 + 24, x6 + 24, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]);
+ butterfly_s16_s32_x8_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]);
+ butterfly_s16_s32_x8_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]);
+ butterfly_s16_s32_x8_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]);
+ butterfly_s16_s32_x8_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]);
+ butterfly_s16_s32_x8_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]);
// stage 7
int16x8_t x7[64];
-
- btf_16_neon_mode2(cospi[56], cospi[8], x6[4], x6[7], x7[4], x7[7], v_cos_bit);
- btf_16_neon_mode2(cospi[24], cospi[40], x6[5], x6[6], x7[5], x7[6],
- v_cos_bit);
- x7[8] = vqaddq_s16(x5[8], x6[9]);
- x7[9] = vqsubq_s16(x5[8], x6[9]);
- x7[10] = vqsubq_s16(x5[11], x6[10]);
- x7[11] = vqaddq_s16(x5[11], x6[10]);
- x7[12] = vqaddq_s16(x5[12], x6[13]);
- x7[13] = vqsubq_s16(x5[12], x6[13]);
- x7[14] = vqsubq_s16(x5[15], x6[14]);
- x7[15] = vqaddq_s16(x5[15], x6[14]);
-
- btf_16_neon_mode0(cospi[8], cospi[56], x6[17], x6[30], x7[17], x7[30],
- v_cos_bit);
- btf_16_neon_mode02(cospi[56], cospi[8], x6[18], x6[29], x7[18], x7[29],
- v_cos_bit);
-
- btf_16_neon_mode0(cospi[40], cospi[24], x6[21], x6[26], x7[21], x7[26],
- v_cos_bit);
- btf_16_neon_mode02(cospi[24], cospi[40], x6[22], x6[25], x7[22], x7[25],
- v_cos_bit);
-
- x7[32] = vqaddq_s16(x5[32], x6[35]);
- x7[35] = vqsubq_s16(x5[32], x6[35]);
- x7[33] = vqaddq_s16(x5[33], x6[34]);
- x7[34] = vqsubq_s16(x5[33], x6[34]);
- x7[36] = vqsubq_s16(x5[39], x6[36]);
- x7[39] = vqaddq_s16(x5[39], x6[36]);
- x7[37] = vqsubq_s16(x5[38], x6[37]);
- x7[38] = vqaddq_s16(x5[38], x6[37]);
- x7[40] = vqaddq_s16(x5[40], x6[43]);
- x7[43] = vqsubq_s16(x5[40], x6[43]);
- x7[41] = vqaddq_s16(x5[41], x6[42]);
- x7[42] = vqsubq_s16(x5[41], x6[42]);
- x7[44] = vqsubq_s16(x5[47], x6[44]);
- x7[47] = vqaddq_s16(x5[47], x6[44]);
- x7[45] = vqsubq_s16(x5[46], x6[45]);
- x7[46] = vqaddq_s16(x5[46], x6[45]);
- x7[48] = vqaddq_s16(x5[48], x6[51]);
- x7[51] = vqsubq_s16(x5[48], x6[51]);
- x7[49] = vqaddq_s16(x5[49], x6[50]);
- x7[50] = vqsubq_s16(x5[49], x6[50]);
- x7[52] = vqsubq_s16(x5[55], x6[52]);
- x7[55] = vqaddq_s16(x5[55], x6[52]);
- x7[53] = vqsubq_s16(x5[54], x6[53]);
- x7[54] = vqaddq_s16(x5[54], x6[53]);
- x7[56] = vqaddq_s16(x5[56], x6[59]);
- x7[59] = vqsubq_s16(x5[56], x6[59]);
- x7[57] = vqaddq_s16(x5[57], x6[58]);
- x7[58] = vqsubq_s16(x5[57], x6[58]);
- x7[60] = vqsubq_s16(x5[63], x6[60]);
- x7[63] = vqaddq_s16(x5[63], x6[60]);
- x7[61] = vqsubq_s16(x5[62], x6[61]);
- x7[62] = vqaddq_s16(x5[62], x6[61]);
+ butterfly_s16_s32_x8_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]);
+ butterfly_dct_post_s16_x8(x5 + 8, x6 + 8, x7 + 8, 4);
+ butterfly_dct_post_s16_x8(x5 + 12, x6 + 12, x7 + 12, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]);
+ butterfly_s16_s32_x8_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]);
+ butterfly_s16_s32_x8_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]);
+ butterfly_dct_post_s16_x8(x5 + 32, x6 + 32, x7 + 32, 8);
+ butterfly_dct_post_s16_x8(x5 + 40, x6 + 40, x7 + 40, 8);
+ butterfly_dct_post_s16_x8(x5 + 48, x6 + 48, x7 + 48, 8);
+ butterfly_dct_post_s16_x8(x5 + 56, x6 + 56, x7 + 56, 8);
// stage 8
int16x8_t x8[64];
-
- btf_16_neon_mode2(cospi[60], cospi[4], x7[8], x7[15], x8[8], x8[15],
- v_cos_bit);
- btf_16_neon_mode2(cospi[28], cospi[36], x7[9], x7[14], x8[9], x8[14],
- v_cos_bit);
- btf_16_neon_mode2(cospi[44], cospi[20], x7[10], x7[13], x8[10], x8[13],
- v_cos_bit);
- btf_16_neon_mode2(cospi[12], cospi[52], x7[11], x7[12], x8[11], x8[12],
- v_cos_bit);
- x8[16] = vqaddq_s16(x6[16], x7[17]);
- x8[17] = vqsubq_s16(x6[16], x7[17]);
- x8[18] = vqsubq_s16(x6[19], x7[18]);
- x8[19] = vqaddq_s16(x6[19], x7[18]);
- x8[20] = vqaddq_s16(x6[20], x7[21]);
- x8[21] = vqsubq_s16(x6[20], x7[21]);
- x8[22] = vqsubq_s16(x6[23], x7[22]);
- x8[23] = vqaddq_s16(x6[23], x7[22]);
- x8[24] = vqaddq_s16(x6[24], x7[25]);
- x8[25] = vqsubq_s16(x6[24], x7[25]);
- x8[26] = vqsubq_s16(x6[27], x7[26]);
- x8[27] = vqaddq_s16(x6[27], x7[26]);
- x8[28] = vqaddq_s16(x6[28], x7[29]);
- x8[29] = vqsubq_s16(x6[28], x7[29]);
- x8[30] = vqsubq_s16(x6[31], x7[30]);
- x8[31] = vqaddq_s16(x6[31], x7[30]);
-
- btf_16_neon_mode0(cospi[4], cospi[60], x7[33], x7[62], x8[33], x8[62],
- v_cos_bit);
- btf_16_neon_mode02(cospi[60], cospi[4], x7[34], x7[61], x8[34], x8[61],
- v_cos_bit);
- btf_16_neon_mode0(cospi[36], cospi[28], x7[37], x7[58], x8[37], x8[58],
- v_cos_bit);
- btf_16_neon_mode02(cospi[28], cospi[36], x7[38], x7[57], x8[38], x8[57],
- v_cos_bit);
- btf_16_neon_mode0(cospi[20], cospi[44], x7[41], x7[54], x8[41], x8[54],
- v_cos_bit);
- btf_16_neon_mode02(cospi[44], cospi[20], x7[42], x7[53], x8[42], x8[53],
- v_cos_bit);
- btf_16_neon_mode0(cospi[52], cospi[12], x7[45], x7[50], x8[45], x8[50],
- v_cos_bit);
- btf_16_neon_mode02(cospi[12], cospi[52], x7[46], x7[49], x8[46], x8[49],
- v_cos_bit);
+ butterfly_s16_s32_x8_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]);
+ butterfly_s16_s32_x8_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]);
+ butterfly_s16_s32_x8_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]);
+ butterfly_s16_s32_x8_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]);
+ butterfly_dct_post_s16_x8(x6 + 16, x7 + 16, x8 + 16, 4);
+ butterfly_dct_post_s16_x8(x6 + 20, x7 + 20, x8 + 20, 4);
+ butterfly_dct_post_s16_x8(x6 + 24, x7 + 24, x8 + 24, 4);
+ butterfly_dct_post_s16_x8(x6 + 28, x7 + 28, x8 + 28, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]);
+ butterfly_s16_s32_x8_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]);
+ butterfly_s16_s32_x8_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]);
+ butterfly_s16_s32_x8_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]);
+ butterfly_s16_s32_x8_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]);
+ butterfly_s16_s32_x8_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]);
+ butterfly_s16_s32_x8_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]);
+ butterfly_s16_s32_x8_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]);
// stage 9
int16x8_t x9[64];
-
- btf_16_neon_mode2(cospi[62], cospi[2], x8[16], x8[31], x9[16], x9[31],
- v_cos_bit);
- btf_16_neon_mode2(cospi[30], cospi[34], x8[17], x8[30], x9[17], x9[30],
- v_cos_bit);
- btf_16_neon_mode2(cospi[46], cospi[18], x8[18], x8[29], x9[18], x9[29],
- v_cos_bit);
- btf_16_neon_mode2(cospi[14], cospi[50], x8[19], x8[28], x9[19], x9[28],
- v_cos_bit);
- btf_16_neon_mode2(cospi[54], cospi[10], x8[20], x8[27], x9[20], x9[27],
- v_cos_bit);
- btf_16_neon_mode2(cospi[22], cospi[42], x8[21], x8[26], x9[21], x9[26],
- v_cos_bit);
- btf_16_neon_mode2(cospi[38], cospi[26], x8[22], x8[25], x9[22], x9[25],
- v_cos_bit);
- btf_16_neon_mode2(cospi[6], cospi[58], x8[23], x8[24], x9[23], x9[24],
- v_cos_bit);
- x9[32] = vqaddq_s16(x7[32], x8[33]);
- x9[33] = vqsubq_s16(x7[32], x8[33]);
- x9[34] = vqsubq_s16(x7[35], x8[34]);
- x9[35] = vqaddq_s16(x7[35], x8[34]);
- x9[36] = vqaddq_s16(x7[36], x8[37]);
- x9[37] = vqsubq_s16(x7[36], x8[37]);
- x9[38] = vqsubq_s16(x7[39], x8[38]);
- x9[39] = vqaddq_s16(x7[39], x8[38]);
- x9[40] = vqaddq_s16(x7[40], x8[41]);
- x9[41] = vqsubq_s16(x7[40], x8[41]);
- x9[42] = vqsubq_s16(x7[43], x8[42]);
- x9[43] = vqaddq_s16(x7[43], x8[42]);
- x9[44] = vqaddq_s16(x7[44], x8[45]);
- x9[45] = vqsubq_s16(x7[44], x8[45]);
- x9[46] = vqsubq_s16(x7[47], x8[46]);
- x9[47] = vqaddq_s16(x7[47], x8[46]);
- x9[48] = vqaddq_s16(x7[48], x8[49]);
- x9[49] = vqsubq_s16(x7[48], x8[49]);
- x9[50] = vqsubq_s16(x7[51], x8[50]);
- x9[51] = vqaddq_s16(x7[51], x8[50]);
- x9[52] = vqaddq_s16(x7[52], x8[53]);
- x9[53] = vqsubq_s16(x7[52], x8[53]);
- x9[54] = vqsubq_s16(x7[55], x8[54]);
- x9[55] = vqaddq_s16(x7[55], x8[54]);
- x9[56] = vqaddq_s16(x7[56], x8[57]);
- x9[57] = vqsubq_s16(x7[56], x8[57]);
- x9[58] = vqsubq_s16(x7[59], x8[58]);
- x9[59] = vqaddq_s16(x7[59], x8[58]);
- x9[60] = vqaddq_s16(x7[60], x8[61]);
- x9[61] = vqsubq_s16(x7[60], x8[61]);
- x9[62] = vqsubq_s16(x7[63], x8[62]);
- x9[63] = vqaddq_s16(x7[63], x8[62]);
+ butterfly_s16_s32_x8_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]);
+ butterfly_s16_s32_x8_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]);
+ butterfly_s16_s32_x8_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]);
+ butterfly_s16_s32_x8_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]);
+ butterfly_s16_s32_x8_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]);
+ butterfly_s16_s32_x8_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]);
+ butterfly_s16_s32_x8_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]);
+ butterfly_s16_s32_x8_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]);
+ butterfly_dct_post_s16_x8(x7 + 32, x8 + 32, x9 + 32, 4);
+ butterfly_dct_post_s16_x8(x7 + 36, x8 + 36, x9 + 36, 4);
+ butterfly_dct_post_s16_x8(x7 + 40, x8 + 40, x9 + 40, 4);
+ butterfly_dct_post_s16_x8(x7 + 44, x8 + 44, x9 + 44, 4);
+ butterfly_dct_post_s16_x8(x7 + 48, x8 + 48, x9 + 48, 4);
+ butterfly_dct_post_s16_x8(x7 + 52, x8 + 52, x9 + 52, 4);
+ butterfly_dct_post_s16_x8(x7 + 56, x8 + 56, x9 + 56, 4);
+ butterfly_dct_post_s16_x8(x7 + 60, x8 + 60, x9 + 60, 4);
// stage 10
- btf_16_neon_mode2(cospi[63], cospi[1], x9[32], x9[63], output[1], output[63],
- v_cos_bit);
-
- btf_16_neon_mode2(cospi[31], cospi[33], x9[33], x9[62], output[33],
- output[31], v_cos_bit);
-
- btf_16_neon_mode2(cospi[47], cospi[17], x9[34], x9[61], output[17],
- output[47], v_cos_bit);
-
- btf_16_neon_mode2(cospi[15], cospi[49], x9[35], x9[60], output[49],
- output[15], v_cos_bit);
-
- btf_16_neon_mode2(cospi[55], cospi[9], x9[36], x9[59], output[9], output[55],
- v_cos_bit);
-
- btf_16_neon_mode2(cospi[23], cospi[41], x9[37], x9[58], output[41],
- output[23], v_cos_bit);
-
- btf_16_neon_mode2(cospi[39], cospi[25], x9[38], x9[57], output[25],
- output[39], v_cos_bit);
-
- btf_16_neon_mode2(cospi[7], cospi[57], x9[39], x9[56], output[57], output[7],
- v_cos_bit);
-
- btf_16_neon_mode2(cospi[59], cospi[5], x9[40], x9[55], output[5], output[59],
- v_cos_bit);
-
- btf_16_neon_mode2(cospi[27], cospi[37], x9[41], x9[54], output[37],
- output[27], v_cos_bit);
-
- btf_16_neon_mode2(cospi[43], cospi[21], x9[42], x9[53], output[21],
- output[43], v_cos_bit);
-
- btf_16_neon_mode2(cospi[11], cospi[53], x9[43], x9[52], output[53],
- output[11], v_cos_bit);
-
- btf_16_neon_mode2(cospi[51], cospi[13], x9[44], x9[51], output[13],
- output[51], v_cos_bit);
-
- btf_16_neon_mode2(cospi[19], cospi[45], x9[45], x9[50], output[45],
- output[19], v_cos_bit);
-
- btf_16_neon_mode2(cospi[35], cospi[29], x9[46], x9[49], output[29],
- output[35], v_cos_bit);
-
- btf_16_neon_mode2(cospi[3], cospi[61], x9[47], x9[48], output[61], output[3],
- v_cos_bit);
+ butterfly_s16_s32_x8_0112_neon(cospi1, x9[63], x9[32], &output[1],
+ &output[63]);
+ butterfly_s16_s32_x8_1003_neon(cospi31, x9[62], x9[33], &output[33],
+ &output[31]);
+ butterfly_s16_s32_x8_0112_neon(cospi17, x9[61], x9[34], &output[17],
+ &output[47]);
+ butterfly_s16_s32_x8_1003_neon(cospi15, x9[60], x9[35], &output[49],
+ &output[15]);
+ butterfly_s16_s32_x8_0112_neon(cospi9, x9[59], x9[36], &output[9],
+ &output[55]);
+ butterfly_s16_s32_x8_1003_neon(cospi23, x9[58], x9[37], &output[41],
+ &output[23]);
+ butterfly_s16_s32_x8_0112_neon(cospi25, x9[57], x9[38], &output[25],
+ &output[39]);
+ butterfly_s16_s32_x8_1003_neon(cospi7, x9[56], x9[39], &output[57],
+ &output[7]);
+ butterfly_s16_s32_x8_0112_neon(cospi5, x9[55], x9[40], &output[5],
+ &output[59]);
+ butterfly_s16_s32_x8_1003_neon(cospi27, x9[54], x9[41], &output[37],
+ &output[27]);
+ butterfly_s16_s32_x8_0112_neon(cospi21, x9[53], x9[42], &output[21],
+ &output[43]);
+ butterfly_s16_s32_x8_1003_neon(cospi11, x9[52], x9[43], &output[53],
+ &output[11]);
+ butterfly_s16_s32_x8_0112_neon(cospi13, x9[51], x9[44], &output[13],
+ &output[51]);
+ butterfly_s16_s32_x8_1003_neon(cospi19, x9[50], x9[45], &output[45],
+ &output[19]);
+ butterfly_s16_s32_x8_0112_neon(cospi29, x9[49], x9[46], &output[29],
+ &output[35]);
+ butterfly_s16_s32_x8_1003_neon(cospi3, x9[48], x9[47], &output[61],
+ &output[3]);
// stage 11
output[0] = x6[0];
@@ -1823,1377 +1168,1297 @@ void av1_fdct8x64_neon(const int16x8_t *input, int16x8_t *output,
output[62] = x9[31];
}
-void fadst_8x8_neon(const int16x8_t *input, int16x8_t *output, int8_t cos_bit,
- const int8_t *stage_range) {
- (void)stage_range;
- const int32_t *cospi = cospi_arr(cos_bit);
- const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+static AOM_FORCE_INLINE void fadst8x8_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
- // stage 1
- int16x8_t x1[4];
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
- x1[0] = vqnegq_s16(input[7]);
- x1[1] = vqnegq_s16(input[3]);
- x1[2] = vqnegq_s16(input[1]);
- x1[3] = vqnegq_s16(input[5]);
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
// stage 2
int16x8_t x2[8];
+ butterfly_s16_s32_x8_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]);
- btf_16_neon_mode3(cospi[32], cospi[32], x1[1], input[4], x2[2], x2[3],
- v_cos_bit);
- btf_16_neon_mode3(cospi[32], cospi[32], input[2], x1[3], x2[6], x2[7],
- v_cos_bit);
// stage 3
int16x8_t x3[8];
x3[0] = vqaddq_s16(input[0], x2[2]);
+ x3[1] = vqsubq_s16(x2[3], input[7]);
x3[2] = vqsubq_s16(input[0], x2[2]);
- x3[1] = vqaddq_s16(x1[0], x2[3]);
- x3[3] = vqsubq_s16(x1[0], x2[3]);
- x3[4] = vqaddq_s16(x1[2], x2[6]);
- x3[6] = vqsubq_s16(x1[2], x2[6]);
+ x3[3] = vqaddq_s16(input[7], x2[3]);
+ x3[4] = vqsubq_s16(x2[6], input[1]);
x3[5] = vqaddq_s16(input[6], x2[7]);
+ x3[6] = vqaddq_s16(input[1], x2[6]);
x3[7] = vqsubq_s16(input[6], x2[7]);
// stage 4
- btf_16_neon_mode3(cospi[16], cospi[48], x3[4], x3[5], x3[4], x3[5],
- v_cos_bit);
- btf_16_neon_mode0(cospi[48], cospi[16], x3[6], x3[7], x3[6], x3[7],
- v_cos_bit);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
// stage 5
int16x8_t x5[8];
x5[0] = vqaddq_s16(x3[0], x3[4]);
- x5[4] = vqsubq_s16(x3[0], x3[4]);
x5[1] = vqaddq_s16(x3[1], x3[5]);
- x5[5] = vqsubq_s16(x3[1], x3[5]);
x5[2] = vqaddq_s16(x3[2], x3[6]);
+ x5[3] = vqsubq_s16(x3[7], x3[3]);
+ x5[4] = vqsubq_s16(x3[0], x3[4]);
+ x5[5] = vqsubq_s16(x3[1], x3[5]);
x5[6] = vqsubq_s16(x3[2], x3[6]);
- x5[3] = vqaddq_s16(x3[3], x3[7]);
- x5[7] = vqsubq_s16(x3[3], x3[7]);
+ x5[7] = vqaddq_s16(x3[3], x3[7]);
// stage 6
- btf_16_neon_mode3(cospi[4], cospi[60], x5[0], x5[1], output[7], output[0],
- v_cos_bit);
- btf_16_neon_mode3(cospi[20], cospi[44], x5[2], x5[3], output[5], output[2],
- v_cos_bit);
- btf_16_neon_mode3(cospi[36], cospi[28], x5[4], x5[5], output[3], output[4],
- v_cos_bit);
- btf_16_neon_mode3(cospi[52], cospi[12], x5[6], x5[7], output[1], output[6],
- v_cos_bit);
+ butterfly_s16_s32_x8_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]);
+ butterfly_s16_s32_x8_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]);
+ butterfly_s16_s32_x8_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]);
+ butterfly_s16_s32_x8_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]);
}
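
One detail of the fADST rewrite above: the old code materialised sign flips up front (x1[0] = vqnegq_s16(input[7])) and only then added, while the new code folds each negation into the operand order of a saturating subtract. A stand-alone illustration (not in-tree code):

#include <arm_neon.h>

// Folding a saturating negate-then-add into one reversed subtract.
static inline int16x8_t neg_then_add_folded(int16x8_t a, int16x8_t b) {
  // Old shape: vqaddq_s16(vqnegq_s16(a), b) - two instructions, and
  // vqnegq_s16 saturates INT16_MIN to INT16_MAX before the add.
  // New shape: one instruction, same value except at that boundary.
  return vqsubq_s16(b, a);
}
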
-static void fadst8x16_neon(const int16x8_t *input, int16x8_t *output,
- int8_t cos_bit, const int8_t *stage_range) {
- (void)stage_range;
- const int32_t *cospi = cospi_arr(cos_bit);
- const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+static AOM_FORCE_INLINE void fadst4x16_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
- // stage 1
- int16x8_t x1[12];
- x1[0] = vqnegq_s16(input[15]);
- x1[1] = vqnegq_s16(input[3]);
- x1[2] = vqnegq_s16(input[1]);
- x1[3] = vqnegq_s16(input[13]);
+ // stage 2
+ int16x4_t x2[8];
+ butterfly_s16_s32_x4_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]);
+ butterfly_s16_s32_x4_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]);
+ butterfly_s16_s32_x4_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]);
+ butterfly_s16_s32_x4_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]);
+
+ // stage 3
+ int16x4_t x3[16];
+ x3[0] = vqadd_s16(input[0], x2[0]);
+ x3[1] = vqsub_s16(x2[1], input[15]);
+ x3[2] = vqsub_s16(input[0], x2[0]);
+ x3[3] = vqadd_s16(input[15], x2[1]);
+ x3[4] = vqsub_s16(x2[2], input[3]);
+ x3[5] = vqadd_s16(input[12], x2[3]);
+ x3[6] = vqadd_s16(input[3], x2[2]);
+ x3[7] = vqsub_s16(input[12], x2[3]);
+ x3[8] = vqsub_s16(x2[4], input[1]);
+ x3[9] = vqadd_s16(input[14], x2[5]);
+ x3[10] = vqadd_s16(input[1], x2[4]);
+ x3[11] = vqsub_s16(input[14], x2[5]);
+ x3[12] = vqadd_s16(input[2], x2[6]);
+ x3[13] = vqsub_s16(x2[7], input[13]);
+ x3[14] = vqsub_s16(input[2], x2[6]);
+ x3[15] = vqadd_s16(input[13], x2[7]);
+
+ // stage 4
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]);
+ butterfly_s16_s32_x4_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]);
+
+ // stage 5
+ int16x4_t x5[16];
+ x5[0] = vqadd_s16(x3[0], x3[4]);
+ x5[1] = vqadd_s16(x3[1], x3[5]);
+ x5[2] = vqadd_s16(x3[2], x3[6]);
+ x5[3] = vqsub_s16(x3[7], x3[3]);
+ x5[4] = vqsub_s16(x3[0], x3[4]);
+ x5[5] = vqsub_s16(x3[1], x3[5]);
+ x5[6] = vqsub_s16(x3[2], x3[6]);
+ x5[7] = vqadd_s16(x3[3], x3[7]);
+ x5[8] = vqadd_s16(x3[8], x3[12]);
+ x5[9] = vqadd_s16(x3[9], x3[13]);
+ x5[10] = vqsub_s16(x3[14], x3[10]);
+ x5[11] = vqadd_s16(x3[11], x3[15]);
+ x5[12] = vqsub_s16(x3[8], x3[12]);
+ x5[13] = vqsub_s16(x3[9], x3[13]);
+ x5[14] = vqadd_s16(x3[10], x3[14]);
+ x5[15] = vqsub_s16(x3[11], x3[15]);
+
+ // stage 6
+ butterfly_s16_s32_x4_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]);
+ butterfly_s16_s32_x4_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]);
+ butterfly_s16_s32_x4_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]);
+ butterfly_s16_s32_x4_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]);
+
+ // stage 7
+ int16x4_t x7[16];
+ x7[0] = vqadd_s16(x5[0], x5[8]);
+ x7[1] = vqadd_s16(x5[1], x5[9]);
+ x7[2] = vqadd_s16(x5[2], x5[10]);
+ x7[3] = vqadd_s16(x5[3], x5[11]);
+ x7[4] = vqadd_s16(x5[4], x5[12]);
+ x7[5] = vqadd_s16(x5[5], x5[13]);
+ x7[6] = vqadd_s16(x5[6], x5[14]);
+ x7[7] = vqsub_s16(x5[15], x5[7]);
+ x7[8] = vqsub_s16(x5[0], x5[8]);
+ x7[9] = vqsub_s16(x5[1], x5[9]);
+ x7[10] = vqsub_s16(x5[2], x5[10]);
+ x7[11] = vqsub_s16(x5[3], x5[11]);
+ x7[12] = vqsub_s16(x5[4], x5[12]);
+ x7[13] = vqsub_s16(x5[5], x5[13]);
+ x7[14] = vqsub_s16(x5[6], x5[14]);
+ x7[15] = vqadd_s16(x5[7], x5[15]);
+
+ // stage 8
+ butterfly_s16_s32_x4_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]);
+ butterfly_s16_s32_x4_0112_neon(cospi10, x7[2], x7[3], &output[13],
+ &output[2]);
+ butterfly_s16_s32_x4_0112_neon(cospi18, x7[4], x7[5], &output[11],
+ &output[4]);
+ butterfly_s16_s32_x4_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]);
+ butterfly_s16_s32_x4_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]);
+ butterfly_s16_s32_x4_1003_neon(cospi22, x7[10], x7[11], &output[5],
+ &output[10]);
+ butterfly_s16_s32_x4_1003_neon(cospi14, x7[12], x7[13], &output[3],
+ &output[12]);
+ butterfly_s16_s32_x4_0112_neon(cospi6, x7[14], x7[15], &output[14],
+ &output[1]);
+}
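
fadst4x16_neon above is the 4-lane twin of the fadst8x16_neon that follows: each row is a single int16x4_t, so it uses vqadd_s16/vqsub_s16 and the _x4_ butterfly variants, which need no low/high half split. A hypothetical 4-lane counterpart of the earlier _0112_ sketch, under the same assumptions:

#include <arm_neon.h>

// Same assumed rotation as the x8 sketch, on one 4-lane vector per operand.
static inline void butterfly_0112_x4_sketch(int16x4_t w01, int16x4_t in0,
                                            int16x4_t in1, int16x4_t *out0,
                                            int16x4_t *out1, int bits) {
  int32x4_t t0 = vmull_lane_s16(in0, w01, 0);
  t0 = vmlal_lane_s16(t0, in1, w01, 1);
  int32x4_t t1 = vmull_lane_s16(in0, w01, 1);
  t1 = vmlsl_lane_s16(t1, in1, w01, 0);
  const int32x4_t v_shift = vdupq_n_s32(-bits);
  *out0 = vqmovn_s32(vrshlq_s32(t0, v_shift));
  *out1 = vqmovn_s32(vrshlq_s32(t1, v_shift));
}
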
+
+static AOM_FORCE_INLINE void fadst8x16_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
// stage 2
- btf_16_neon(-cospi[32], cospi[32], -cospi[32], -cospi[32], input[7], input[8],
- x1[4], x1[5]);
- btf_16_neon_mode1(cospi[32], cospi[32], input[4], input[11], x1[6], x1[7],
- v_cos_bit);
- btf_16_neon_mode1(cospi[32], cospi[32], input[6], input[9], x1[8], x1[9],
- v_cos_bit);
- btf_16_neon(-cospi[32], cospi[32], -cospi[32], -cospi[32], input[5],
- input[10], x1[10], x1[11]);
+ int16x8_t x2[8];
+ butterfly_s16_s32_x8_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]);
+ butterfly_s16_s32_x8_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]);
+
// stage 3
int16x8_t x3[16];
- x3[0] = vqaddq_s16(input[0], x1[4]);
- x3[2] = vqsubq_s16(input[0], x1[4]);
- x3[1] = vqaddq_s16(x1[0], x1[5]);
- x3[3] = vqsubq_s16(x1[0], x1[5]);
- x3[4] = vqaddq_s16(x1[1], x1[6]);
- x3[6] = vqsubq_s16(x1[1], x1[6]);
- x3[5] = vqaddq_s16(input[12], x1[7]);
- x3[7] = vqsubq_s16(input[12], x1[7]);
- x3[8] = vqaddq_s16(x1[2], x1[8]);
- x3[10] = vqsubq_s16(x1[2], x1[8]);
- x3[9] = vqaddq_s16(input[14], x1[9]);
- x3[11] = vqsubq_s16(input[14], x1[9]);
- x3[12] = vqaddq_s16(input[2], x1[10]);
- x3[14] = vqsubq_s16(input[2], x1[10]);
- x3[13] = vqaddq_s16(x1[3], x1[11]);
- x3[15] = vqsubq_s16(x1[3], x1[11]);
+ x3[0] = vqaddq_s16(input[0], x2[0]);
+ x3[1] = vqsubq_s16(x2[1], input[15]);
+ x3[2] = vqsubq_s16(input[0], x2[0]);
+ x3[3] = vqaddq_s16(input[15], x2[1]);
+ x3[4] = vqsubq_s16(x2[2], input[3]);
+ x3[5] = vqaddq_s16(input[12], x2[3]);
+ x3[6] = vqaddq_s16(input[3], x2[2]);
+ x3[7] = vqsubq_s16(input[12], x2[3]);
+ x3[8] = vqsubq_s16(x2[4], input[1]);
+ x3[9] = vqaddq_s16(input[14], x2[5]);
+ x3[10] = vqaddq_s16(input[1], x2[4]);
+ x3[11] = vqsubq_s16(input[14], x2[5]);
+ x3[12] = vqaddq_s16(input[2], x2[6]);
+ x3[13] = vqsubq_s16(x2[7], input[13]);
+ x3[14] = vqsubq_s16(input[2], x2[6]);
+ x3[15] = vqaddq_s16(input[13], x2[7]);
// stage 4
- btf_16_neon_mode3(cospi[16], cospi[48], x3[4], x3[5], x3[4], x3[5],
- v_cos_bit);
- btf_16_neon_mode0(cospi[48], cospi[16], x3[6], x3[7], x3[6], x3[7],
- v_cos_bit);
- btf_16_neon_mode3(cospi[16], cospi[48], x3[12], x3[13], x3[12], x3[13],
- v_cos_bit);
- btf_16_neon_mode0(cospi[48], cospi[16], x3[14], x3[15], x3[14], x3[15],
- v_cos_bit);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]);
+ butterfly_s16_s32_x8_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]);
// stage 5
int16x8_t x5[16];
x5[0] = vqaddq_s16(x3[0], x3[4]);
- x5[4] = vqsubq_s16(x3[0], x3[4]);
x5[1] = vqaddq_s16(x3[1], x3[5]);
- x5[5] = vqsubq_s16(x3[1], x3[5]);
x5[2] = vqaddq_s16(x3[2], x3[6]);
+ x5[3] = vqsubq_s16(x3[7], x3[3]);
+ x5[4] = vqsubq_s16(x3[0], x3[4]);
+ x5[5] = vqsubq_s16(x3[1], x3[5]);
x5[6] = vqsubq_s16(x3[2], x3[6]);
- x5[3] = vqaddq_s16(x3[3], x3[7]);
- x5[7] = vqsubq_s16(x3[3], x3[7]);
+ x5[7] = vqaddq_s16(x3[3], x3[7]);
x5[8] = vqaddq_s16(x3[8], x3[12]);
- x5[12] = vqsubq_s16(x3[8], x3[12]);
x5[9] = vqaddq_s16(x3[9], x3[13]);
- x5[13] = vqsubq_s16(x3[9], x3[13]);
- x5[10] = vqaddq_s16(x3[10], x3[14]);
- x5[14] = vqsubq_s16(x3[10], x3[14]);
+ x5[10] = vqsubq_s16(x3[14], x3[10]);
x5[11] = vqaddq_s16(x3[11], x3[15]);
+ x5[12] = vqsubq_s16(x3[8], x3[12]);
+ x5[13] = vqsubq_s16(x3[9], x3[13]);
+ x5[14] = vqaddq_s16(x3[10], x3[14]);
x5[15] = vqsubq_s16(x3[11], x3[15]);
// stage 6
- btf_16_neon_mode3(cospi[8], cospi[56], x5[8], x5[9], x5[8], x5[9], v_cos_bit);
- btf_16_neon_mode3(cospi[40], cospi[24], x5[10], x5[11], x5[10], x5[11],
- v_cos_bit);
- btf_16_neon_mode0(cospi[56], cospi[8], x5[12], x5[13], x5[12], x5[13],
- v_cos_bit);
- btf_16_neon_mode0(cospi[24], cospi[40], x5[14], x5[15], x5[14], x5[15],
- v_cos_bit);
+ butterfly_s16_s32_x8_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]);
+ butterfly_s16_s32_x8_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]);
// stage 7
int16x8_t x7[16];
x7[0] = vqaddq_s16(x5[0], x5[8]);
- x7[8] = vqsubq_s16(x5[0], x5[8]);
x7[1] = vqaddq_s16(x5[1], x5[9]);
- x7[9] = vqsubq_s16(x5[1], x5[9]);
x7[2] = vqaddq_s16(x5[2], x5[10]);
- x7[10] = vqsubq_s16(x5[2], x5[10]);
x7[3] = vqaddq_s16(x5[3], x5[11]);
- x7[11] = vqsubq_s16(x5[3], x5[11]);
x7[4] = vqaddq_s16(x5[4], x5[12]);
- x7[12] = vqsubq_s16(x5[4], x5[12]);
x7[5] = vqaddq_s16(x5[5], x5[13]);
- x7[13] = vqsubq_s16(x5[5], x5[13]);
x7[6] = vqaddq_s16(x5[6], x5[14]);
+ x7[7] = vqsubq_s16(x5[15], x5[7]);
+ x7[8] = vqsubq_s16(x5[0], x5[8]);
+ x7[9] = vqsubq_s16(x5[1], x5[9]);
+ x7[10] = vqsubq_s16(x5[2], x5[10]);
+ x7[11] = vqsubq_s16(x5[3], x5[11]);
+ x7[12] = vqsubq_s16(x5[4], x5[12]);
+ x7[13] = vqsubq_s16(x5[5], x5[13]);
x7[14] = vqsubq_s16(x5[6], x5[14]);
- x7[7] = vqaddq_s16(x5[7], x5[15]);
- x7[15] = vqsubq_s16(x5[7], x5[15]);
+ x7[15] = vqaddq_s16(x5[7], x5[15]);
// stage 8
- btf_16_neon_mode3(cospi[2], cospi[62], x7[0], x7[1], output[15], output[0],
- v_cos_bit);
- btf_16_neon_mode3(cospi[10], cospi[54], x7[2], x7[3], output[13], output[2],
- v_cos_bit);
- btf_16_neon_mode3(cospi[18], cospi[46], x7[4], x7[5], output[11], output[4],
- v_cos_bit);
- btf_16_neon_mode3(cospi[26], cospi[38], x7[6], x7[7], output[9], output[6],
- v_cos_bit);
- btf_16_neon_mode3(cospi[34], cospi[30], x7[8], x7[9], output[7], output[8],
- v_cos_bit);
- btf_16_neon_mode3(cospi[42], cospi[22], x7[10], x7[11], output[5], output[10],
- v_cos_bit);
- btf_16_neon_mode3(cospi[50], cospi[14], x7[12], x7[13], output[3], output[12],
- v_cos_bit);
- btf_16_neon_mode3(cospi[58], cospi[6], x7[14], x7[15], output[1], output[14],
- v_cos_bit);
+ butterfly_s16_s32_x8_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]);
+ butterfly_s16_s32_x8_0112_neon(cospi10, x7[2], x7[3], &output[13],
+ &output[2]);
+ butterfly_s16_s32_x8_0112_neon(cospi18, x7[4], x7[5], &output[11],
+ &output[4]);
+ butterfly_s16_s32_x8_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]);
+ butterfly_s16_s32_x8_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]);
+ butterfly_s16_s32_x8_1003_neon(cospi22, x7[10], x7[11], &output[5],
+ &output[10]);
+ butterfly_s16_s32_x8_1003_neon(cospi14, x7[12], x7[13], &output[3],
+ &output[12]);
+ butterfly_s16_s32_x8_0112_neon(cospi6, x7[14], x7[15], &output[14],
+ &output[1]);
}
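
For reference, the butterfly_s16_s32_x8_* helpers introduced above all vectorize the same scalar primitive as half_btf()/round_shift() in av1/common/av1_txfm.h: widen to 32 bits, multiply by a pair of cosine constants whose indices sum to 64, and narrow back with a rounding shift by cos_bit. A minimal scalar sketch follows; the suffix digits (_0112_, _1003_, _0332_) select the operand/sign pattern, and the reading below is inferred from the call sites rather than quoted from the headers (the cospiN vectors passed in are assumed to pack cospi[N] and cospi[64 - N], set up outside this hunk).

#include <stdint.h>

/* Rounding shift as in av1/common/av1_txfm.h: add half an LSB, shift. */
static int32_t round_shift_model(int64_t x, int bit) {
  return (int32_t)((x + ((int64_t)1 << (bit - 1))) >> bit);
}

/* One plane rotation: c0 = cospi[a], c1 = cospi[64 - a] from
 * av1_cospi_arr(cos_bit). A "_0112_"-style butterfly computes a pair of
 * outputs like this; the other suffixes permute operands and signs. */
static void butterfly_model(int32_t c0, int32_t c1, int16_t in0, int16_t in1,
                            int16_t *out0, int16_t *out1, int cos_bit) {
  *out0 = (int16_t)round_shift_model((int64_t)c0 * in0 + (int64_t)c1 * in1,
                                     cos_bit);
  *out1 = (int16_t)round_shift_model((int64_t)c1 * in0 - (int64_t)c0 * in1,
                                     cos_bit);
}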
-void av1_fidentity4x4_neon(const int16x8_t *const input,
- int16x8_t *const output, const int8_t cos_bit,
- const int8_t *stage_range) {
+static AOM_FORCE_INLINE void fidentity4x4_neon(const int16x4_t *const input,
+ int16x4_t *const output,
+ const int cos_bit) {
(void)cos_bit;
- (void)stage_range;
- const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
- for (int i = 0; i < 4; ++i) {
- const int16x4_t b = vqrshrn_n_s32(
- vmull_s16(vget_low_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
- output[i] = vcombine_s16(b, b);
- }
+ round_shift_sqrt2_s16_s16_4xn_neon(input, output, 4);
}
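
The multiply-and-narrow sequence deleted above (vmull_s16 by NewSqrt2, then vqrshrn_n_s32 by NewSqrt2Bits) is the sqrt(2) scaling an identity transform of length 4 needs to stay unit-norm, and round_shift_sqrt2_s16_s16_4xn_neon presumably wraps the same arithmetic. A scalar model using the libaom constants NewSqrt2 = 5793 and NewSqrt2Bits = 12 (5793 / 4096 ~ sqrt(2)); the 16-point identities further down use 2*sqrt(2) the same way:

#include <stdint.h>

/* y = round(x * 5793 / 2^12) ~= x * sqrt(2), saturated to int16 to match
 * the vqrshrn narrowing in the deleted loop above. */
static int16_t round_shift_sqrt2_model(int16_t x) {
  const int32_t kNewSqrt2 = 5793;  /* round(sqrt(2) * 2^12) */
  const int kNewSqrt2Bits = 12;
  int32_t y = (x * kNewSqrt2 + (1 << (kNewSqrt2Bits - 1))) >> kNewSqrt2Bits;
  if (y > INT16_MAX) y = INT16_MAX;
  if (y < INT16_MIN) y = INT16_MIN;
  return (int16_t)y;
}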
-static INLINE void fidentity8x4_neon(const int16x8_t *const input,
- int16x8_t *const output,
- const int8_t cos_bit,
- const int8_t *stage_range) {
- (void)stage_range;
+static AOM_FORCE_INLINE void fidentity8x4_neon(const int16x8_t *const input,
+ int16x8_t *const output,
+ const int cos_bit) {
(void)cos_bit;
- const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
- for (int i = 0; i < 4; ++i) {
- const int16x4_t b_lo = vqrshrn_n_s32(
- vmull_s16(vget_low_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
- const int16x4_t b_hi = vqrshrn_n_s32(
- vmull_s16(vget_high_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
- output[i] = vcombine_s16(b_lo, b_hi);
- }
+ round_shift_sqrt2_s16_s16_8xn_neon(input, output, 4);
}
-void fidentity8x8_neon(const int16x8_t *input, int16x8_t *output,
- int8_t cos_bit, const int8_t *stage_range) {
+static AOM_FORCE_INLINE void fidentity4x8_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
(void)cos_bit;
- (void)stage_range;
- int16x8_t one = vdupq_n_s16(1);
- output[0] = vqrshlq_s16(input[0], one);
- output[1] = vqrshlq_s16(input[1], one);
- output[2] = vqrshlq_s16(input[2], one);
- output[3] = vqrshlq_s16(input[3], one);
- output[4] = vqrshlq_s16(input[4], one);
- output[5] = vqrshlq_s16(input[5], one);
- output[6] = vqrshlq_s16(input[6], one);
- output[7] = vqrshlq_s16(input[7], one);
+ shift_left_1_s16_x4(input, output, 8);
}
-static INLINE void fidentity8x16_neon(const int16x8_t *input, int16x8_t *output,
- int8_t cos_bit,
- const int8_t *stage_range) {
- (void)stage_range;
+static AOM_FORCE_INLINE void fidentity8x8_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
(void)cos_bit;
- const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2 * 2);
- for (int i = 0; i < 16; ++i) {
- const int16x4_t b_lo = vqrshrn_n_s32(
- vmull_s16(vget_low_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
- const int16x4_t b_hi = vqrshrn_n_s32(
- vmull_s16(vget_high_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
- output[i] = vcombine_s16(b_lo, b_hi);
- }
+ shift_left_1_s16_x8(input, output, 8);
}
-static INLINE void fidentity8x32_neon(const int16x8_t *input, int16x8_t *output,
- int8_t cos_bit,
- const int8_t *stage_range) {
- (void)stage_range;
+static AOM_FORCE_INLINE void fidentity4x16_neon(const int16x4_t *input,
+ int16x4_t *output,
+ int cos_bit) {
(void)cos_bit;
- for (int i = 0; i < 32; ++i) {
- output[i] = vshlq_n_s16(input[i], 2);
- }
+ round_shift_2sqrt2_s16_s16_4xn_neon(input, output, 16);
+}
+
+static AOM_FORCE_INLINE void fidentity8x16_neon(const int16x8_t *input,
+ int16x8_t *output,
+ int cos_bit) {
+ (void)cos_bit;
+ round_shift_2sqrt2_s16_s16_8xn_neon(input, output, 16);
}
-typedef void (*transform_1d_lbd_neon)(const int16x8_t *input, int16x8_t *output,
- int8_t cos_bit,
- const int8_t *stage_range);
-
-static const transform_1d_lbd_neon col_txfm4x4_arr[TX_TYPES] = {
- av1_fdct4x4_neon, // DCT_DCT
- av1_fadst4x4_neon, // ADST_DCT
- av1_fdct4x4_neon, // DCT_ADST
- av1_fadst4x4_neon, // ADST_ADST
- av1_fadst4x4_neon, // FLIPADST_DCT
- av1_fdct4x4_neon, // DCT_FLIPADST
- av1_fadst4x4_neon, // FLIPADST_FLIPADST
- av1_fadst4x4_neon, // ADST_FLIPADST
- av1_fadst4x4_neon, // FLIPADST_ADST
- av1_fidentity4x4_neon, // IDTX
- av1_fdct4x4_neon, // V_DCT
- av1_fidentity4x4_neon, // H_DCT
- av1_fadst4x4_neon, // V_ADST
- av1_fidentity4x4_neon, // H_ADST
- av1_fadst4x4_neon, // V_FLIPADST
- av1_fidentity4x4_neon // H_FLIPADST
+static AOM_FORCE_INLINE void fidentity8x32_neon(const int16x8_t *input,
+ int16x8_t *output,
+ int cos_bit) {
+ (void)cos_bit;
+ shift_left_2_s16_x8(input, output, 32);
+}
+
+#define TRANSFORM_COL(name, tw, n) \
+ static void name##_col_neon(const int16_t *input, int16x##tw##_t *output, \
+ int stride, int cos_bit) { \
+ int16x##tw##_t buf0[n]; \
+ load_buffer_s16_x##tw(input, stride, buf0, n); \
+ shift_left_2_s16_x##tw(buf0, buf0, n); \
+ name##_neon(buf0, output, cos_bit); \
+ }
+
+TRANSFORM_COL(fadst4x4, 4, 4)
+TRANSFORM_COL(fadst4x8, 4, 8)
+TRANSFORM_COL(fadst4x16, 4, 16)
+TRANSFORM_COL(fadst8x4, 8, 4)
+TRANSFORM_COL(fadst8x8, 8, 8)
+TRANSFORM_COL(fadst8x16, 8, 16)
+TRANSFORM_COL(fdct4x4, 4, 4)
+TRANSFORM_COL(fdct4x8, 4, 8)
+TRANSFORM_COL(fdct4x16, 4, 16)
+TRANSFORM_COL(fdct8x4, 8, 4)
+TRANSFORM_COL(fdct8x8, 8, 8)
+TRANSFORM_COL(fdct8x16, 8, 16)
+TRANSFORM_COL(fdct8x32, 8, 32)
+TRANSFORM_COL(fidentity4x4, 4, 4)
+TRANSFORM_COL(fidentity4x8, 4, 8)
+TRANSFORM_COL(fidentity4x16, 4, 16)
+TRANSFORM_COL(fidentity8x4, 8, 4)
+TRANSFORM_COL(fidentity8x8, 8, 8)
+TRANSFORM_COL(fidentity8x16, 8, 16)
+TRANSFORM_COL(fidentity8x32, 8, 32)
+
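
The TRANSFORM_COL wrapper above fuses three steps the old 2D functions performed separately: load an n-row strip from the strided source, apply the stage-0 shift (shift[0] is 2 for every block size handled here, hence the hard-coded shift_left_2), and run the 1D kernel. A scalar sketch of the same shape, processing one column where the NEON code handles 4 or 8 in parallel; tx1d is a hypothetical stand-in for the fdct/fadst/fidentity kernels:

#include <stdint.h>

typedef void (*tx1d_fn)(const int16_t *in, int16_t *out, int n, int cos_bit);

/* Scalar analogue of a name##_col_neon wrapper, for a single column. */
static void transform_col_model(const int16_t *input, int stride, int n,
                                tx1d_fn tx1d, int cos_bit, int16_t *output) {
  int16_t buf[32];
  for (int r = 0; r < n; ++r)
    buf[r] = (int16_t)(input[r * stride] << 2);  /* stage-0 shift of 2 */
  tx1d(buf, output, n, cos_bit);
}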
+#define TRANSFORM_ROW(name, tw, n) \
+ static void name##_row_neon(const int16x##tw##_t *input, int32_t *output, \
+ int stride, int cos_bit) { \
+ int16x##tw##_t buf0[n]; \
+ name##_neon(input, buf0, cos_bit); \
+ store_buffer_s16_x##tw(buf0, output, stride, n); \
+ }
+
+#define TRANSFORM_ROW_RECT(name, tw, n) \
+ static void name##_row_rect_neon(const int16x##tw##_t *input, \
+ int32_t *output, int stride, int cos_bit) { \
+ int16x##tw##_t buf0[n]; \
+ name##_neon(input, buf0, cos_bit); \
+ store_rect_buffer_s16_x##tw(buf0, output, stride, n); \
+ }
+
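
TRANSFORM_ROW_RECT differs from TRANSFORM_ROW only in the store: rectangular (2:1) blocks pick up an extra 1/sqrt(2) so the 2D transform stays orthonormal, and store_rect_buffer_s16_x* presumably folds that scale into the widening store, as the old store_rect_buffer_16bit_to_32bit_w* helpers did. A scalar model with the libaom constant NewInvSqrt2 = 2896 (2896 / 4096 ~ 1/sqrt(2)):

#include <stdint.h>

/* Widen to 32 bits and scale by 1/sqrt(2):
 * NewInvSqrt2 = round(2^12 / sqrt(2)) = 2896 in av1/common/av1_txfm.h. */
static int32_t store_rect_model(int16_t x) {
  const int32_t kNewInvSqrt2 = 2896;
  const int kNewSqrt2Bits = 12;
  return (x * kNewInvSqrt2 + (1 << (kNewSqrt2Bits - 1))) >> kNewSqrt2Bits;
}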
+TRANSFORM_ROW(fadst4x4, 4, 4)
+TRANSFORM_ROW(fadst4x16, 4, 16)
+TRANSFORM_ROW(fadst8x4, 8, 4)
+TRANSFORM_ROW(fadst8x8, 8, 8)
+TRANSFORM_ROW(fadst8x16, 8, 16)
+TRANSFORM_ROW(fdct4x4, 4, 4)
+TRANSFORM_ROW(fdct4x16, 4, 16)
+TRANSFORM_ROW(fdct8x4, 8, 4)
+TRANSFORM_ROW(fdct8x8, 8, 8)
+TRANSFORM_ROW(fdct8x16, 8, 16)
+TRANSFORM_ROW(fdct8x32, 8, 32)
+TRANSFORM_ROW(fidentity4x4, 4, 4)
+TRANSFORM_ROW(fidentity4x16, 4, 16)
+TRANSFORM_ROW(fidentity8x4, 8, 4)
+TRANSFORM_ROW(fidentity8x8, 8, 8)
+TRANSFORM_ROW(fidentity8x16, 8, 16)
+TRANSFORM_ROW(fidentity8x32, 8, 32)
+
+TRANSFORM_ROW_RECT(fadst4x8, 4, 8)
+TRANSFORM_ROW_RECT(fadst8x4, 8, 4)
+TRANSFORM_ROW_RECT(fadst8x8, 8, 8)
+TRANSFORM_ROW_RECT(fadst8x16, 8, 16)
+TRANSFORM_ROW_RECT(fdct4x8, 4, 8)
+TRANSFORM_ROW_RECT(fdct8x4, 8, 4)
+TRANSFORM_ROW_RECT(fdct8x8, 8, 8)
+TRANSFORM_ROW_RECT(fdct8x16, 8, 16)
+TRANSFORM_ROW_RECT(fdct8x32, 8, 32)
+TRANSFORM_ROW_RECT(fidentity4x8, 4, 8)
+TRANSFORM_ROW_RECT(fidentity8x4, 8, 4)
+TRANSFORM_ROW_RECT(fidentity8x8, 8, 8)
+TRANSFORM_ROW_RECT(fidentity8x16, 8, 16)
+TRANSFORM_ROW_RECT(fidentity8x32, 8, 32)
+
+typedef void (*transform_1d_lbd_4_neon)(const int16x4_t *input,
+ int16x4_t *output, int cos_bit);
+typedef void (*transform_1d_lbd_8_neon)(const int16x8_t *input,
+ int16x8_t *output, int cos_bit);
+
+typedef void (*col_transform_1d_lbd_4_neon)(const int16_t *input,
+ int16x4_t *output, int stride,
+ int cos_bit);
+typedef void (*col_transform_1d_lbd_8_neon)(const int16_t *input,
+ int16x8_t *output, int stride,
+ int cos_bit);
+
+typedef void (*row_transform_1d_lbd_4_neon)(const int16x4_t *input,
+ int32_t *output, int stride,
+ int cos_bit);
+typedef void (*row_transform_1d_lbd_8_neon)(const int16x8_t *input,
+ int32_t *output, int stride,
+ int cos_bit);
+
+static const col_transform_1d_lbd_4_neon col_txfm4x4_arr[TX_TYPES] = {
+ fdct4x4_col_neon, // DCT_DCT
+ fadst4x4_col_neon, // ADST_DCT
+ fdct4x4_col_neon, // DCT_ADST
+ fadst4x4_col_neon, // ADST_ADST
+ fadst4x4_col_neon, // FLIPADST_DCT
+ fdct4x4_col_neon, // DCT_FLIPADST
+ fadst4x4_col_neon, // FLIPADST_FLIPADST
+ fadst4x4_col_neon, // ADST_FLIPADST
+ fadst4x4_col_neon, // FLIPADST_ADST
+ fidentity4x4_col_neon, // IDTX
+ fdct4x4_col_neon, // V_DCT
+ fidentity4x4_col_neon, // H_DCT
+ fadst4x4_col_neon, // V_ADST
+ fidentity4x4_col_neon, // H_ADST
+ fadst4x4_col_neon, // V_FLIPADST
+ fidentity4x4_col_neon // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_4_neon row_txfm4x4_arr[TX_TYPES] = {
+ fdct4x4_row_neon, // DCT_DCT
+ fdct4x4_row_neon, // ADST_DCT
+ fadst4x4_row_neon, // DCT_ADST
+ fadst4x4_row_neon, // ADST_ADST
+ fdct4x4_row_neon, // FLIPADST_DCT
+ fadst4x4_row_neon, // DCT_FLIPADST
+ fadst4x4_row_neon, // FLIPADST_FLIPADST
+ fadst4x4_row_neon, // ADST_FLIPADST
+ fadst4x4_row_neon, // FLIPADST_ADST
+ fidentity4x4_row_neon, // IDTX
+ fidentity4x4_row_neon, // V_DCT
+ fdct4x4_row_neon, // H_DCT
+ fidentity4x4_row_neon, // V_ADST
+ fadst4x4_row_neon, // H_ADST
+ fidentity4x4_row_neon, // V_FLIPADST
+ fadst4x4_row_neon // H_FLIPADST
+};
+
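
Read together, the col_txfm* and row_txfm* tables encode the standard AV1 split of a 2D tx_type into a vertical kernel (column pass) and a horizontal kernel (row pass): FLIPADST reuses the ADST kernel plus the ud_flip/lr_flip flags from get_flip_cfg(), and the V_*/H_* types pair one real kernel with the identity. The mapping below is read directly off the two tables above; the enum is illustrative only:

/* { vertical, horizontal } 1D kernel per 2D TX_TYPE. */
typedef enum { TX1D_DCT, TX1D_ADST, TX1D_IDTX } tx1d_kind;

static const tx1d_kind tx2d_to_1d[16][2] = {
  { TX1D_DCT,  TX1D_DCT  },  /* DCT_DCT */
  { TX1D_ADST, TX1D_DCT  },  /* ADST_DCT */
  { TX1D_DCT,  TX1D_ADST },  /* DCT_ADST */
  { TX1D_ADST, TX1D_ADST },  /* ADST_ADST */
  { TX1D_ADST, TX1D_DCT  },  /* FLIPADST_DCT      (+ ud_flip) */
  { TX1D_DCT,  TX1D_ADST },  /* DCT_FLIPADST      (+ lr_flip) */
  { TX1D_ADST, TX1D_ADST },  /* FLIPADST_FLIPADST (+ both)    */
  { TX1D_ADST, TX1D_ADST },  /* ADST_FLIPADST     (+ lr_flip) */
  { TX1D_ADST, TX1D_ADST },  /* FLIPADST_ADST     (+ ud_flip) */
  { TX1D_IDTX, TX1D_IDTX },  /* IDTX */
  { TX1D_DCT,  TX1D_IDTX },  /* V_DCT */
  { TX1D_IDTX, TX1D_DCT  },  /* H_DCT */
  { TX1D_ADST, TX1D_IDTX },  /* V_ADST */
  { TX1D_IDTX, TX1D_ADST },  /* H_ADST */
  { TX1D_ADST, TX1D_IDTX },  /* V_FLIPADST        (+ ud_flip) */
  { TX1D_IDTX, TX1D_ADST }   /* H_FLIPADST        (+ lr_flip) */
};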
+static const col_transform_1d_lbd_4_neon col_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_col_neon, // DCT_DCT
+ fadst4x8_col_neon, // ADST_DCT
+ fdct4x8_col_neon, // DCT_ADST
+ fadst4x8_col_neon, // ADST_ADST
+ fadst4x8_col_neon, // FLIPADST_DCT
+ fdct4x8_col_neon, // DCT_FLIPADST
+ fadst4x8_col_neon, // FLIPADST_FLIPADST
+ fadst4x8_col_neon, // ADST_FLIPADST
+ fadst4x8_col_neon, // FLIPADST_ADST
+ fidentity4x8_col_neon, // IDTX
+ fdct4x8_col_neon, // V_DCT
+ fidentity4x8_col_neon, // H_DCT
+ fadst4x8_col_neon, // V_ADST
+ fidentity4x8_col_neon, // H_ADST
+ fadst4x8_col_neon, // V_FLIPADST
+ fidentity4x8_col_neon // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_row_neon, // DCT_DCT
+ fdct8x4_row_neon, // ADST_DCT
+ fadst8x4_row_neon, // DCT_ADST
+ fadst8x4_row_neon, // ADST_ADST
+ fdct8x4_row_neon, // FLIPADST_DCT
+ fadst8x4_row_neon, // DCT_FLIPADST
+ fadst8x4_row_neon, // FLIPADST_FLIPADST
+ fadst8x4_row_neon, // ADST_FLIPADST
+ fadst8x4_row_neon, // FLIPADST_ADST
+ fidentity8x4_row_neon, // IDTX
+ fidentity8x4_row_neon, // V_DCT
+ fdct8x4_row_neon, // H_DCT
+ fidentity8x4_row_neon, // V_ADST
+ fadst8x4_row_neon, // H_ADST
+ fidentity8x4_row_neon, // V_FLIPADST
+ fadst8x4_row_neon // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_row_rect_neon, // DCT_DCT
+ fdct8x4_row_rect_neon, // ADST_DCT
+ fadst8x4_row_rect_neon, // DCT_ADST
+ fadst8x4_row_rect_neon, // ADST_ADST
+ fdct8x4_row_rect_neon, // FLIPADST_DCT
+ fadst8x4_row_rect_neon, // DCT_FLIPADST
+ fadst8x4_row_rect_neon, // FLIPADST_FLIPADST
+ fadst8x4_row_rect_neon, // ADST_FLIPADST
+ fadst8x4_row_rect_neon, // FLIPADST_ADST
+ fidentity8x4_row_rect_neon, // IDTX
+ fidentity8x4_row_rect_neon, // V_DCT
+ fdct8x4_row_rect_neon, // H_DCT
+ fidentity8x4_row_rect_neon, // V_ADST
+ fadst8x4_row_rect_neon, // H_ADST
+ fidentity8x4_row_rect_neon, // V_FLIPADST
+ fadst8x4_row_rect_neon // H_FLIPADST
};
-static const transform_1d_lbd_neon row_txfm4x4_arr[TX_TYPES] = {
- av1_fdct4x4_neon, // DCT_DCT
- av1_fdct4x4_neon, // ADST_DCT
- av1_fadst4x4_neon, // DCT_ADST
- av1_fadst4x4_neon, // ADST_ADST
- av1_fdct4x4_neon, // FLIPADST_DCT
- av1_fadst4x4_neon, // DCT_FLIPADST
- av1_fadst4x4_neon, // FLIPADST_FLIPADST
- av1_fadst4x4_neon, // ADST_FLIPADST
- av1_fadst4x4_neon, // FLIPADST_ADST
- av1_fidentity4x4_neon, // IDTX
- av1_fidentity4x4_neon, // V_DCT
- av1_fdct4x4_neon, // H_DCT
- av1_fidentity4x4_neon, // V_ADST
- av1_fadst4x4_neon, // H_ADST
- av1_fidentity4x4_neon, // V_FLIPADST
- av1_fadst4x4_neon // H_FLIPADST
+static const col_transform_1d_lbd_8_neon col_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_col_neon, // DCT_DCT
+ fadst8x4_col_neon, // ADST_DCT
+ fdct8x4_col_neon, // DCT_ADST
+ fadst8x4_col_neon, // ADST_ADST
+ fadst8x4_col_neon, // FLIPADST_DCT
+ fdct8x4_col_neon, // DCT_FLIPADST
+ fadst8x4_col_neon, // FLIPADST_FLIPADST
+ fadst8x4_col_neon, // ADST_FLIPADST
+ fadst8x4_col_neon, // FLIPADST_ADST
+ fidentity8x4_col_neon, // IDTX
+ fdct8x4_col_neon, // V_DCT
+ fidentity8x4_col_neon, // H_DCT
+ fadst8x4_col_neon, // V_ADST
+ fidentity8x4_col_neon, // H_ADST
+ fadst8x4_col_neon, // V_FLIPADST
+ fidentity8x4_col_neon // H_FLIPADST
};
-static const transform_1d_lbd_neon col_txfm4x8_arr[TX_TYPES] = {
- fdct4x8_neon, // DCT_DCT
- fadst4x8_neon, // ADST_DCT
- fdct4x8_neon, // DCT_ADST
- fadst4x8_neon, // ADST_ADST
- fadst4x8_neon, // FLIPADST_DCT
- fdct4x8_neon, // DCT_FLIPADST
- fadst4x8_neon, // FLIPADST_FLIPADST
- fadst4x8_neon, // ADST_FLIPADST
- fadst4x8_neon, // FLIPADST_ADST
- fidentity8x8_neon, // IDTX
- fdct4x8_neon, // V_DCT
- fidentity8x8_neon, // H_DCT
- fadst4x8_neon, // V_ADST
- fidentity8x8_neon, // H_ADST
- fadst4x8_neon, // V_FLIPADST
- fidentity8x8_neon // H_FLIPADST
+static const row_transform_1d_lbd_4_neon row_rect_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_row_rect_neon, // DCT_DCT
+ fdct4x8_row_rect_neon, // ADST_DCT
+ fadst4x8_row_rect_neon, // DCT_ADST
+ fadst4x8_row_rect_neon, // ADST_ADST
+ fdct4x8_row_rect_neon, // FLIPADST_DCT
+ fadst4x8_row_rect_neon, // DCT_FLIPADST
+ fadst4x8_row_rect_neon, // FLIPADST_FLIPADST
+ fadst4x8_row_rect_neon, // ADST_FLIPADST
+ fadst4x8_row_rect_neon, // FLIPADST_ADST
+ fidentity4x8_row_rect_neon, // IDTX
+ fidentity4x8_row_rect_neon, // V_DCT
+ fdct4x8_row_rect_neon, // H_DCT
+ fidentity4x8_row_rect_neon, // V_ADST
+ fadst4x8_row_rect_neon, // H_ADST
+ fidentity4x8_row_rect_neon, // V_FLIPADST
+ fadst4x8_row_rect_neon // H_FLIPADST
};
-static const transform_1d_lbd_neon row_txfm8x4_arr[TX_TYPES] = {
- fdct8x4_neon, // DCT_DCT
- fdct8x4_neon, // ADST_DCT
- fadst8x4_neon, // DCT_ADST
- fadst8x4_neon, // ADST_ADST
- fdct8x4_neon, // FLIPADST_DCT
- fadst8x4_neon, // DCT_FLIPADST
- fadst8x4_neon, // FLIPADST_FLIPADST
- fadst8x4_neon, // ADST_FLIPADST
- fadst8x4_neon, // FLIPADST_ADST
- fidentity8x4_neon, // IDTX
- fidentity8x4_neon, // V_DCT
- fdct8x4_neon, // H_DCT
- fidentity8x4_neon, // V_ADST
- fadst8x4_neon, // H_ADST
- fidentity8x4_neon, // V_FLIPADST
- fadst8x4_neon // H_FLIPADST
+static const col_transform_1d_lbd_8_neon col_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_col_neon, // DCT_DCT
+ fadst8x8_col_neon, // ADST_DCT
+ fdct8x8_col_neon, // DCT_ADST
+ fadst8x8_col_neon, // ADST_ADST
+ fadst8x8_col_neon, // FLIPADST_DCT
+ fdct8x8_col_neon, // DCT_FLIPADST
+ fadst8x8_col_neon, // FLIPADST_FLIPADST
+ fadst8x8_col_neon, // ADST_FLIPADST
+ fadst8x8_col_neon, // FLIPADST_ADST
+ fidentity8x8_col_neon, // IDTX
+ fdct8x8_col_neon, // V_DCT
+ fidentity8x8_col_neon, // H_DCT
+ fadst8x8_col_neon, // V_ADST
+ fidentity8x8_col_neon, // H_ADST
+ fadst8x8_col_neon, // V_FLIPADST
+ fidentity8x8_col_neon, // H_FLIPADST
};
-static const transform_1d_lbd_neon col_txfm8x4_arr[TX_TYPES] = {
- fdct8x4_neon, // DCT_DCT
- fadst8x4_neon, // ADST_DCT
- fdct8x4_neon, // DCT_ADST
- fadst8x4_neon, // ADST_ADST
- fadst8x4_neon, // FLIPADST_DCT
- fdct8x4_neon, // DCT_FLIPADST
- fadst8x4_neon, // FLIPADST_FLIPADST
- fadst8x4_neon, // ADST_FLIPADST
- fadst8x4_neon, // FLIPADST_ADST
- fidentity8x4_neon, // IDTX
- fdct8x4_neon, // V_DCT
- fidentity8x4_neon, // H_DCT
- fadst8x4_neon, // V_ADST
- fidentity8x4_neon, // H_ADST
- fadst8x4_neon, // V_FLIPADST
- fidentity8x4_neon // H_FLIPADST
+static const row_transform_1d_lbd_8_neon row_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_row_neon, // DCT_DCT
+ fdct8x8_row_neon, // ADST_DCT
+ fadst8x8_row_neon, // DCT_ADST
+ fadst8x8_row_neon, // ADST_ADST
+ fdct8x8_row_neon, // FLIPADST_DCT
+ fadst8x8_row_neon, // DCT_FLIPADST
+ fadst8x8_row_neon, // FLIPADST_FLIPADST
+ fadst8x8_row_neon, // ADST_FLIPADST
+ fadst8x8_row_neon, // FLIPADST_ADST
+ fidentity8x8_row_neon, // IDTX
+ fidentity8x8_row_neon, // V_DCT
+ fdct8x8_row_neon, // H_DCT
+ fidentity8x8_row_neon, // V_ADST
+ fadst8x8_row_neon, // H_ADST
+ fidentity8x8_row_neon, // V_FLIPADST
+ fadst8x8_row_neon // H_FLIPADST
};
-static const transform_1d_lbd_neon row_txfm4x8_arr[TX_TYPES] = {
- fdct4x8_neon, // DCT_DCT
- fdct4x8_neon, // ADST_DCT
- fadst4x8_neon, // DCT_ADST
- fadst4x8_neon, // ADST_ADST
- fdct4x8_neon, // FLIPADST_DCT
- fadst4x8_neon, // DCT_FLIPADST
- fadst4x8_neon, // FLIPADST_FLIPADST
- fadst4x8_neon, // ADST_FLIPADST
- fadst4x8_neon, // FLIPADST_ADST
- fidentity8x8_neon, // IDTX
- fidentity8x8_neon, // V_DCT
- fdct4x8_neon, // H_DCT
- fidentity8x8_neon, // V_ADST
- fadst4x8_neon, // H_ADST
- fidentity8x8_neon, // V_FLIPADST
- fadst4x8_neon // H_FLIPADST
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_row_rect_neon, // DCT_DCT
+ fdct8x8_row_rect_neon, // ADST_DCT
+ fadst8x8_row_rect_neon, // DCT_ADST
+ fadst8x8_row_rect_neon, // ADST_ADST
+ fdct8x8_row_rect_neon, // FLIPADST_DCT
+ fadst8x8_row_rect_neon, // DCT_FLIPADST
+ fadst8x8_row_rect_neon, // FLIPADST_FLIPADST
+ fadst8x8_row_rect_neon, // ADST_FLIPADST
+ fadst8x8_row_rect_neon, // FLIPADST_ADST
+ fidentity8x8_row_rect_neon, // IDTX
+ fidentity8x8_row_rect_neon, // V_DCT
+ fdct8x8_row_rect_neon, // H_DCT
+ fidentity8x8_row_rect_neon, // V_ADST
+ fadst8x8_row_rect_neon, // H_ADST
+ fidentity8x8_row_rect_neon, // V_FLIPADST
+ fadst8x8_row_rect_neon // H_FLIPADST
};
-static const transform_1d_lbd_neon col_txfm8x8_arr[TX_TYPES] = {
- fdct8x8_neon, // DCT_DCT
- fadst_8x8_neon, // ADST_DCT
- fdct8x8_neon, // DCT_ADST
- fadst_8x8_neon, // ADST_ADST
- fadst_8x8_neon, // FLIPADST_DCT
- fdct8x8_neon, // DCT_FLIPADST
- fadst_8x8_neon, // FLIPADST_FLIPADST
- fadst_8x8_neon, // ADST_FLIPADST
- fadst_8x8_neon, // FLIPADST_ADST
- fidentity8x8_neon, // IDTX
- fdct8x8_neon, // V_DCT
- fidentity8x8_neon, // H_DCT
- fadst_8x8_neon, // V_ADST
- fidentity8x8_neon, // H_ADST
- fadst_8x8_neon, // V_FLIPADST
- fidentity8x8_neon, // H_FLIPADST
+static const col_transform_1d_lbd_4_neon col_txfm4x16_arr[TX_TYPES] = {
+ fdct4x16_col_neon, // DCT_DCT
+ fadst4x16_col_neon, // ADST_DCT
+ fdct4x16_col_neon, // DCT_ADST
+ fadst4x16_col_neon, // ADST_ADST
+ fadst4x16_col_neon, // FLIPADST_DCT
+ fdct4x16_col_neon, // DCT_FLIPADST
+ fadst4x16_col_neon, // FLIPADST_FLIPADST
+ fadst4x16_col_neon, // ADST_FLIPADST
+ fadst4x16_col_neon, // FLIPADST_ADST
+ fidentity4x16_col_neon, // IDTX
+ fdct4x16_col_neon, // V_DCT
+ fidentity4x16_col_neon, // H_DCT
+ fadst4x16_col_neon, // V_ADST
+ fidentity4x16_col_neon, // H_ADST
+ fadst4x16_col_neon, // V_FLIPADST
+ fidentity4x16_col_neon // H_FLIPADST
};
-static const transform_1d_lbd_neon row_txfm8x8_arr[TX_TYPES] = {
- fdct8x8_neon, // DCT_DCT
- fdct8x8_neon, // ADST_DCT
- fadst_8x8_neon, // DCT_ADST
- fadst_8x8_neon, // ADST_ADST
- fdct8x8_neon, // FLIPADST_DCT
- fadst_8x8_neon, // DCT_FLIPADST
- fadst_8x8_neon, // FLIPADST_FLIPADST
- fadst_8x8_neon, // ADST_FLIPADST
- fadst_8x8_neon, // FLIPADST_ADST
- fidentity8x8_neon, // IDTX
- fidentity8x8_neon, // V_DCT
- fdct8x8_neon, // H_DCT
- fidentity8x8_neon, // V_ADST
- fadst_8x8_neon, // H_ADST
- fidentity8x8_neon, // V_FLIPADST
- fadst_8x8_neon // H_FLIPADST
+static const row_transform_1d_lbd_4_neon row_txfm4x16_arr[TX_TYPES] = {
+ fdct4x16_row_neon, // DCT_DCT
+ fdct4x16_row_neon, // ADST_DCT
+ fadst4x16_row_neon, // DCT_ADST
+ fadst4x16_row_neon, // ADST_ADST
+ fdct4x16_row_neon, // FLIPADST_DCT
+ fadst4x16_row_neon, // DCT_FLIPADST
+ fadst4x16_row_neon, // FLIPADST_FLIPADST
+ fadst4x16_row_neon, // ADST_FLIPADST
+ fadst4x16_row_neon, // FLIPADST_ADST
+ fidentity4x16_row_neon, // IDTX
+ fidentity4x16_row_neon, // V_DCT
+ fdct4x16_row_neon, // H_DCT
+ fidentity4x16_row_neon, // V_ADST
+ fadst4x16_row_neon, // H_ADST
+ fidentity4x16_row_neon, // V_FLIPADST
+ fadst4x16_row_neon // H_FLIPADST
};
-static const transform_1d_lbd_neon col_txfm8x16_arr[TX_TYPES] = {
- fdct8x16_neon, // DCT_DCT
- fadst8x16_neon, // ADST_DCT
- fdct8x16_neon, // DCT_ADST
- fadst8x16_neon, // ADST_ADST
- fadst8x16_neon, // FLIPADST_DCT
- fdct8x16_neon, // DCT_FLIPADST
- fadst8x16_neon, // FLIPADST_FLIPADST
- fadst8x16_neon, // ADST_FLIPADST
- fadst8x16_neon, // FLIPADST_ADST
- fidentity8x16_neon, // IDTX
- fdct8x16_neon, // V_DCT
- fidentity8x16_neon, // H_DCT
- fadst8x16_neon, // V_ADST
- fidentity8x16_neon, // H_ADST
- fadst8x16_neon, // V_FLIPADST
- fidentity8x16_neon // H_FLIPADST
+static const col_transform_1d_lbd_8_neon col_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_col_neon, // DCT_DCT
+ fadst8x16_col_neon, // ADST_DCT
+ fdct8x16_col_neon, // DCT_ADST
+ fadst8x16_col_neon, // ADST_ADST
+ fadst8x16_col_neon, // FLIPADST_DCT
+ fdct8x16_col_neon, // DCT_FLIPADST
+ fadst8x16_col_neon, // FLIPADST_FLIPADST
+ fadst8x16_col_neon, // ADST_FLIPADST
+ fadst8x16_col_neon, // FLIPADST_ADST
+ fidentity8x16_col_neon, // IDTX
+ fdct8x16_col_neon, // V_DCT
+ fidentity8x16_col_neon, // H_DCT
+ fadst8x16_col_neon, // V_ADST
+ fidentity8x16_col_neon, // H_ADST
+ fadst8x16_col_neon, // V_FLIPADST
+ fidentity8x16_col_neon // H_FLIPADST
};
-static const transform_1d_lbd_neon row_txfm8x16_arr[TX_TYPES] = {
- fdct8x16_neon, // DCT_DCT
- fdct8x16_neon, // ADST_DCT
- fadst8x16_neon, // DCT_ADST
- fadst8x16_neon, // ADST_ADST
- fdct8x16_neon, // FLIPADST_DCT
- fadst8x16_neon, // DCT_FLIPADST
- fadst8x16_neon, // FLIPADST_FLIPADST
- fadst8x16_neon, // ADST_FLIPADST
- fadst8x16_neon, // FLIPADST_ADST
- fidentity8x16_neon, // IDTX
- fidentity8x16_neon, // V_DCT
- fdct8x16_neon, // H_DCT
- fidentity8x16_neon, // V_ADST
- fadst8x16_neon, // H_ADST
- fidentity8x16_neon, // V_FLIPADST
- fadst8x16_neon // H_FLIPADST
+static const row_transform_1d_lbd_8_neon row_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_row_neon, // DCT_DCT
+ fdct8x16_row_neon, // ADST_DCT
+ fadst8x16_row_neon, // DCT_ADST
+ fadst8x16_row_neon, // ADST_ADST
+ fdct8x16_row_neon, // FLIPADST_DCT
+ fadst8x16_row_neon, // DCT_FLIPADST
+ fadst8x16_row_neon, // FLIPADST_FLIPADST
+ fadst8x16_row_neon, // ADST_FLIPADST
+ fadst8x16_row_neon, // FLIPADST_ADST
+ fidentity8x16_row_neon, // IDTX
+ fidentity8x16_row_neon, // V_DCT
+ fdct8x16_row_neon, // H_DCT
+ fidentity8x16_row_neon, // V_ADST
+ fadst8x16_row_neon, // H_ADST
+ fidentity8x16_row_neon, // V_FLIPADST
+ fadst8x16_row_neon // H_FLIPADST
};
-static const transform_1d_lbd_neon row_txfm8x32_arr[TX_TYPES] = {
- av1_fdct8x32_neon, // DCT_DCT
- NULL, // ADST_DCT
- NULL, // DCT_ADST
- NULL, // ADST_ADST
- NULL, // FLIPADST_DCT
- NULL, // DCT_FLIPADST
- NULL, // FLIPADST_FLIPADST
- NULL, // ADST_FLIPADST
- NULL, // FLIPADST_ADST
- fidentity8x32_neon, // IDTX
- fidentity8x32_neon, // V_DCT
- av1_fdct8x32_neon, // H_DCT
- NULL, // V_ADST
- NULL, // H_ADST
- NULL, // V_FLIPADST
- NULL // H_FLIPADST
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_row_rect_neon, // DCT_DCT
+ fdct8x16_row_rect_neon, // ADST_DCT
+ fadst8x16_row_rect_neon, // DCT_ADST
+ fadst8x16_row_rect_neon, // ADST_ADST
+ fdct8x16_row_rect_neon, // FLIPADST_DCT
+ fadst8x16_row_rect_neon, // DCT_FLIPADST
+ fadst8x16_row_rect_neon, // FLIPADST_FLIPADST
+ fadst8x16_row_rect_neon, // ADST_FLIPADST
+ fadst8x16_row_rect_neon, // FLIPADST_ADST
+ fidentity8x16_row_rect_neon, // IDTX
+ fidentity8x16_row_rect_neon, // V_DCT
+ fdct8x16_row_rect_neon, // H_DCT
+ fidentity8x16_row_rect_neon, // V_ADST
+ fadst8x16_row_rect_neon, // H_ADST
+ fidentity8x16_row_rect_neon, // V_FLIPADST
+ fadst8x16_row_rect_neon // H_FLIPADST
};
-static const transform_1d_lbd_neon col_txfm8x32_arr[TX_TYPES] = {
- av1_fdct8x32_neon, // DCT_DCT
- NULL, // ADST_DCT
- NULL, // DCT_ADST
- NULL, // ADST_ADST
- NULL, // FLIPADST_DCT
- NULL, // DCT_FLIPADST
- NULL, // FLIPADST_FLIPADST
- NULL, // ADST_FLIPADST
- NULL, // FLIPADST_ADST
- fidentity8x32_neon, // IDTX
- av1_fdct8x32_neon, // V_DCT
- fidentity8x32_neon, // H_DCT
- NULL, // V_ADST
- NULL, // H_ADST
- NULL, // V_FLIPADST
- NULL // H_FLIPADST
+static const row_transform_1d_lbd_8_neon row_txfm8x32_arr[TX_TYPES] = {
+ fdct8x32_row_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_row_neon, // IDTX
+ fidentity8x32_row_neon, // V_DCT
+ fdct8x32_row_neon, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
};
-void av1_lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output,
- int stride, TX_TYPE tx_type, int bd) {
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x32_arr[TX_TYPES] = {
+ fdct8x32_row_rect_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_row_rect_neon, // IDTX
+ fidentity8x32_row_rect_neon, // V_DCT
+ fdct8x32_row_rect_neon, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+static const col_transform_1d_lbd_8_neon col_txfm8x32_arr[TX_TYPES] = {
+ fdct8x32_col_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_col_neon, // IDTX
+ fdct8x32_col_neon, // V_DCT
+ fidentity8x32_col_neon, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+static void lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
(void)bd;
- int16x8_t buf0[4], buf1[4], *buf;
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
- const int txw_idx = get_txw_idx(TX_4X4);
- const int txh_idx = get_txh_idx(TX_4X4);
- const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const int width = 4;
- const int height = 4;
- const transform_1d_lbd_neon col_txfm = col_txfm4x4_arr[tx_type];
- const transform_1d_lbd_neon row_txfm = row_txfm4x4_arr[tx_type];
+ int16x4_t buf0[4], buf1[4];
+ const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x4_arr[tx_type];
+ const row_transform_1d_lbd_4_neon row_txfm = row_txfm4x4_arr[tx_type];
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
- const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
- const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
- if (ud_flip) {
- load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
- } else {
- load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
- }
- round_shift_16bit_vector(buf0, height, &v_shift0);
- col_txfm(buf0, buf0, cos_bit_col, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift1);
- transpose_16bit_4x4(buf0, buf1);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
+ col_txfm(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
if (lr_flip) {
- buf = buf0;
- flip_buf_neon(buf1, buf, width);
+ flip_buf_4_neon(buf1, buf0, 4);
+ row_txfm(buf0, output, 4, 13);
} else {
- buf = buf1;
+ row_txfm(buf1, output, 4, 13);
}
- row_txfm(buf, buf, cos_bit_row, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift2);
-
- store_buffer_16bit_to_32bit_w4(buf, output, height, width);
}
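
This rewritten 4x4 function is the template for every 2D path below: repoint the input for an up-down flip, run the fused column transform (the cos_bit arguments are now inlined constants, 13 here, matching the old av1_fwd_cos_bit_col/row table entries), transpose, reverse vector order for a left-right flip, and let the row transform store straight into the int32 output. The 4x4 case alone has no mid-stage shift because its shift[1] is 0. A scalar sketch of the control flow, one lane at a time, with hypothetical tx1d callbacks; ud_adjust_input_and_stride is assumed to flip by repointing at the last row and negating the stride, which is what its name and the removed *_flip loaders suggest:

#include <stdint.h>

typedef void (*tx1d_fn)(const int16_t *in, int16_t *out, int n, int cos_bit);

/* Scalar model of the 2D flow: column pass -> transpose -> row pass.
 * The real code emits output in strided tiles; this writes row-major. */
static void fwd_txfm2d_model(const int16_t *input, int32_t *output, int stride,
                             int w, int h, int ud_flip, int lr_flip,
                             tx1d_fn col_tx, tx1d_fn row_tx, int cos_bit) {
  int16_t col_out[32 * 32], t[32 * 32], row_in[32], row_out[32];
  if (ud_flip) {               /* read rows bottom-up, no buffer copy */
    input += (h - 1) * stride;
    stride = -stride;
  }
  for (int c = 0; c < w; ++c) {
    int16_t col_in[32];
    for (int r = 0; r < h; ++r)
      col_in[r] = (int16_t)(input[r * stride + c] << 2);  /* stage-0 shift */
    col_tx(col_in, col_out + c * h, h, cos_bit);
  }
  for (int r = 0; r < h; ++r)  /* transpose so rows become contiguous */
    for (int c = 0; c < w; ++c) t[r * w + c] = col_out[c * h + r];
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c)  /* lr flip = reverse each row */
      row_in[c] = t[r * w + (lr_flip ? w - 1 - c : c)];
    row_tx(row_in, row_out, w, cos_bit);
    for (int c = 0; c < w; ++c) output[r * w + c] = row_out[c];
  }
}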
-void av1_lowbd_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *output,
- int stride, TX_TYPE tx_type, int bd) {
- (void)stride;
+static void lowbd_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
(void)bd;
- int16x8_t buf0[8], buf1[8], *buf;
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8];
- const int txw_idx = get_txw_idx(TX_4X8);
- const int txh_idx = get_txh_idx(TX_4X8);
- const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const int width = 4;
- const int height = 8;
- const transform_1d_lbd_neon col_txfm = col_txfm4x8_arr[tx_type];
- const transform_1d_lbd_neon row_txfm = row_txfm8x4_arr[tx_type];
- int ud_flip, lr_flip;
+ int16x4_t buf0[8];
+ int16x8_t buf1[8];
+ const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x8_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x4_arr[tx_type];
+ int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
- const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
- const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
- if (ud_flip) {
- load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
- } else {
- load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
- }
- round_shift_16bit_vector(buf0, height, &v_shift0);
- col_txfm(buf0, buf0, cos_bit_col, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift1);
- transpose_16bit_4x8(buf0, buf1);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+ col_txfm(input, buf0, stride, 13);
+ shift_right_1_round_s16_x4(buf0, buf0, 8);
+ transpose_arrays_s16_4x8(buf0, buf1);
if (lr_flip) {
- buf = buf0;
- flip_buf_neon(buf1, buf, width);
+ int16x8_t buf2[8];
+ flip_buf_8_neon(buf1, buf2, 4);
+ row_txfm(buf2, output, 8, 13);
} else {
- buf = buf1;
+ row_txfm(buf1, output, 8, 13);
}
- row_txfm(buf, buf, cos_bit_row, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift2);
- store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width);
}
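
Between the column pass and the transpose, the taller blocks now apply the mid-stage shift as an explicit rounding right shift (shift_right_1_round here, shift_right_2/4 for the larger sizes), replacing the old shift[1] table lookup and vqrshlq by a runtime vector. The scalar equivalent, matching the rounding of NEON's vrshr family:

#include <stdint.h>

/* y = (x + 2^(k-1)) >> k, the round-to-nearest right shift applied between
 * the column and row passes (k = 1, 2 or 4 depending on block size). */
static int16_t shift_right_round_model(int16_t x, int k) {
  return (int16_t)((x + (1 << (k - 1))) >> k);
}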
-void av1_lowbd_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *output,
- int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
(void)bd;
- int16x8_t buf0[16], buf1[16];
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16];
- const int txw_idx = get_txw_idx(TX_4X16);
- const int txh_idx = get_txh_idx(TX_4X16);
- const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const int width = 4;
- const int height = 16;
- const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
- const transform_1d_lbd_neon row_txfm = row_txfm8x4_arr[tx_type];
+ int16x4_t buf0[16];
+ int16x8_t buf1[16];
+ const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x16_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x4_arr[tx_type];
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
- const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
- const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
- if (ud_flip) {
- load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
- } else {
- load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
- }
- round_shift_16bit_vector(buf0, height, &v_shift0);
- col_txfm(buf0, buf0, cos_bit_col, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift1);
- transpose_16bit_4x8(buf0, buf1);
- transpose_16bit_4x8(buf0 + 8, buf1 + 8);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+ col_txfm(input, buf0, stride, 13);
+ shift_right_1_round_s16_x4(buf0, buf0, 16);
+ transpose_arrays_s16_4x8(buf0, buf1);
+ transpose_arrays_s16_4x8(buf0 + 8, buf1 + 8);
for (int i = 0; i < 2; i++) {
- int16x8_t *buf;
if (lr_flip) {
- buf = buf0;
- flip_buf_neon(buf1 + 8 * i, buf, width);
+ int16x8_t buf2[16];
+ flip_buf_8_neon(buf1 + 8 * i, buf2, 4);
+ row_txfm(buf2, output + 8 * i, 16, 12);
} else {
- buf = buf1 + 8 * i;
+ int16x8_t *buf = buf1 + 8 * i;
+ row_txfm(buf, output + 8 * i, 16, 12);
}
- row_txfm(buf, buf, cos_bit_row, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift2);
- store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
}
}
-void av1_lowbd_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *output,
- int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
(void)bd;
- int16x8_t buf0[8], buf1[8], *buf;
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4];
- const int txw_idx = get_txw_idx(TX_8X4);
- const int txh_idx = get_txh_idx(TX_8X4);
- const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const int width = 8;
- const int height = 4;
- const transform_1d_lbd_neon col_txfm = col_txfm8x4_arr[tx_type];
- const transform_1d_lbd_neon row_txfm = row_txfm4x8_arr[tx_type];
+ int16x8_t buf0[8];
+ int16x4_t buf1[8];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type];
+ const row_transform_1d_lbd_4_neon row_txfm = row_rect_txfm4x8_arr[tx_type];
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
- const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
- const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
- if (ud_flip)
- load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
- else
- load_buffer_16bit_to_16bit(input, stride, buf0, height);
- round_shift_16bit_vector(buf0, height, &v_shift0);
- col_txfm(buf0, buf0, cos_bit_col, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift1);
- transpose_16bit_8x8(buf0, buf1);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
+ col_txfm(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 4);
+ transpose_arrays_s16_8x4(buf0, buf1);
if (lr_flip) {
- buf = buf0;
- flip_buf_neon(buf1, buf, width);
+ int16x4_t buf2[8];
+ flip_buf_4_neon(buf1, buf2, 8);
+ row_txfm(buf2, output, 4, 13);
} else {
- buf = buf1;
+ row_txfm(buf1, output, 4, 13);
}
- row_txfm(buf, buf, cos_bit_row, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift2);
- store_rect_buffer_16bit_to_32bit_w4(buf, output, height, width);
}
-void av1_lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output,
- int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
(void)bd;
- int16x8_t buf0[8], buf1[8], *buf;
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
- const int txw_idx = get_txw_idx(TX_8X8);
- const int txh_idx = get_txh_idx(TX_8X8);
- const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const int width = 8;
- const int height = 8;
- const transform_1d_lbd_neon col_txfm = col_txfm8x8_arr[tx_type];
- const transform_1d_lbd_neon row_txfm = row_txfm8x8_arr[tx_type];
+ int16x8_t buf0[8], buf1[8];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type];
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
- const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
- const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
- if (ud_flip)
- load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
- else
- load_buffer_16bit_to_16bit(input, stride, buf0, height);
- round_shift_16bit_vector(buf0, height, &v_shift0);
- col_txfm(buf0, buf0, cos_bit_col, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift1);
- transpose_16bit_8x8(buf0, buf1);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+ col_txfm(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
if (lr_flip) {
- buf = buf0;
- flip_buf_neon(buf1, buf, width);
+ flip_buf_8_neon(buf1, buf0, 8);
+ row_txfm(buf0, output, 8, 13);
} else {
- buf = buf1;
+ row_txfm(buf1, output, 8, 13);
}
- row_txfm(buf, buf, cos_bit_row, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift2);
- store_buffer_16bit_to_32bit_w8(buf, output, height, width);
}
-void av1_lowbd_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *output,
- int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
(void)bd;
int16x8_t buf0[16], buf1[16];
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
- const int txw_idx = get_txw_idx(TX_8X16);
- const int txh_idx = get_txh_idx(TX_8X16);
- const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const int width = 8;
- const int height = 16;
- const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
- const transform_1d_lbd_neon row_txfm = row_txfm8x8_arr[tx_type];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x8_arr[tx_type];
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
- const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
- const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
- if (ud_flip) {
- load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
- } else {
- load_buffer_16bit_to_16bit(input, stride, buf0, height);
- }
- round_shift_16bit_vector(buf0, height, &v_shift0);
- col_txfm(buf0, buf0, cos_bit_col, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift1);
- transpose_16bit_8x8(buf0, buf1);
- transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+ col_txfm(input, buf0, stride, 13);
+ shift_right_2_round_s16_x8(buf0, buf0, 16);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8);
for (int i = 0; i < 2; i++) {
- int16x8_t *buf;
if (lr_flip) {
- buf = buf0;
- flip_buf_neon(buf1 + width * i, buf, width);
+ flip_buf_8_neon(buf1 + 8 * i, buf0, 8);
+ row_txfm(buf0, output + 8 * i, 16, 13);
} else {
- buf = buf1 + width * i;
+ int16x8_t *buf = buf1 + 8 * i;
+ row_txfm(buf, output + 8 * i, 16, 13);
}
- row_txfm(buf, buf, cos_bit_row, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift2);
- store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, 8);
}
}
-void av1_lowbd_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *output,
- int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
(void)bd;
int16x8_t buf0[32], buf1[32];
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
- const int txw_idx = get_txw_idx(TX_8X32);
- const int txh_idx = get_txh_idx(TX_8X32);
- const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const int width = 8;
- const int height = 32;
- const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
- const transform_1d_lbd_neon row_txfm = row_txfm8x8_arr[tx_type];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type];
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
- const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
- const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
- if (ud_flip) {
- load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
- } else {
- load_buffer_16bit_to_16bit(input, stride, buf0, height);
- }
- round_shift_16bit_vector(buf0, height, &v_shift0);
- col_txfm(buf0, buf0, cos_bit_col, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift1);
- transpose_16bit_8x8(buf0, buf1);
- transpose_16bit_8x8(buf0 + 8, buf1 + 8);
- transpose_16bit_8x8(buf0 + 16, buf1 + 16);
- transpose_16bit_8x8(buf0 + 24, buf1 + 24);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
+ col_txfm(input, buf0, stride, 12);
+ shift_right_2_round_s16_x8(buf0, buf0, 32);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8);
+ transpose_arrays_s16_8x8(buf0 + 16, buf1 + 16);
+ transpose_arrays_s16_8x8(buf0 + 24, buf1 + 24);
for (int i = 0; i < 4; i++) {
- int16x8_t *buf;
if (lr_flip) {
- buf = buf0;
- flip_buf_neon(buf1 + width * i, buf, width);
+ flip_buf_8_neon(buf1 + 8 * i, buf0, 8);
+ row_txfm(buf0, output + 8 * i, 32, 12);
} else {
- buf = buf1 + width * i;
+ int16x8_t *buf = buf1 + 8 * i;
+ row_txfm(buf, output + 8 * i, 32, 12);
}
- row_txfm(buf, buf, cos_bit_row, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift2);
- store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
}
}
-void av1_lowbd_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *output,
- int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
(void)bd;
- int16x8_t buf0[16], buf1[16];
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4];
- const int txw_idx = get_txw_idx(TX_16X4);
- const int txh_idx = get_txh_idx(TX_16X4);
- const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const int width = 16;
- const int height = 4;
- const transform_1d_lbd_neon col_txfm = col_txfm8x4_arr[tx_type];
- const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
- int16x8_t *buf;
+ int16x8_t buf0[16];
+ int16x4_t buf1[16];
+ int16x4_t buf2[16];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type];
+ const row_transform_1d_lbd_4_neon row_txfm = row_txfm4x16_arr[tx_type];
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
- const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
- const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
for (int i = 0; i < 2; i++) {
- if (ud_flip) {
- load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
- } else {
- load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
- }
- round_shift_16bit_vector(buf0, height, &v_shift0);
- col_txfm(buf0, buf0, cos_bit_col, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift1);
- transpose_16bit_8x4(buf0, buf1 + 8 * i);
+ col_txfm(input + 8 * i, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 4);
+ transpose_arrays_s16_8x4(buf0, buf1 + 8 * i);
}
if (lr_flip) {
- buf = buf0;
- flip_buf_neon(buf1, buf, width);
+ flip_buf_4_neon(buf1, buf2, 16);
+ row_txfm(buf2, output, 4, 13);
} else {
- buf = buf1;
+ row_txfm(buf1, output, 4, 13);
}
- row_txfm(buf, buf, cos_bit_row, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift2);
- store_buffer_16bit_to_32bit_w4(buf, output, height, width);
}
-void av1_lowbd_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *output,
- int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
(void)bd;
int16x8_t buf0[16], buf1[16];
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
- const int txw_idx = get_txw_idx(TX_16X8);
- const int txh_idx = get_txh_idx(TX_16X8);
- const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const int width = 16;
- const int height = 8;
- const transform_1d_lbd_neon col_txfm = col_txfm8x8_arr[tx_type];
- const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
- int16x8_t *buf;
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type];
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
- const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
- const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
for (int i = 0; i < 2; i++) {
- if (ud_flip) {
- load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
- } else {
- load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
- }
- round_shift_16bit_vector(buf0, height, &v_shift0);
- col_txfm(buf0, buf0, cos_bit_col, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift1);
- transpose_16bit_8x8(buf0, buf1 + 8 * i);
+ col_txfm(input + 8 * i, buf0, stride, 13);
+ shift_right_2_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1 + 8 * i);
}
if (lr_flip) {
- buf = buf0;
- flip_buf_neon(buf1, buf, width);
+ flip_buf_8_neon(buf1, buf0, 16);
+ row_txfm(buf0, output, 8, 13);
} else {
- buf = buf1;
+ row_txfm(buf1, output, 8, 13);
}
- row_txfm(buf, buf, cos_bit_row, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift2);
- store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width);
}
-void av1_lowbd_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *output,
- int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
(void)bd;
int16x8_t buf0[16], buf1[32];
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16];
- const int txw_idx = get_txw_idx(TX_16X16);
- const int txh_idx = get_txh_idx(TX_16X16);
- const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const int width = 16;
- const int height = 16;
- const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
- const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x16_arr[tx_type];
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
- const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
- const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
-
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
for (int i = 0; i < 2; i++) {
- if (ud_flip) {
- load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
- } else {
- load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
- }
- round_shift_16bit_vector(buf0, height, &v_shift0);
- col_txfm(buf0, buf0, cos_bit_col, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift1);
- transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
- transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
+ col_txfm(input + 8 * i, buf0, stride, 13);
+ shift_right_2_round_s16_x8(buf0, buf0, 16);
+ transpose_arrays_s16_8x8(buf0, buf1 + 0 * 16 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 16 + 8 * i);
}
for (int i = 0; i < 2; i++) {
- int16x8_t *buf;
if (lr_flip) {
- buf = buf0;
- flip_buf_neon(buf1 + width * i, buf, width);
+ flip_buf_8_neon(buf1 + 16 * i, buf0, 16);
+ row_txfm(buf0, output + 8 * i, 16, 12);
} else {
- buf = buf1 + width * i;
+ int16x8_t *buf = buf1 + 16 * i;
+ row_txfm(buf, output + 8 * i, 16, 12);
}
- row_txfm(buf, buf, cos_bit_row, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift2);
- store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
}
}
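
For blocks wider than eight, the intermediate buffer is tiled: each column pass emits an 8-wide strip whose 8x8 sub-blocks are transposed and scattered into buf1 so the row pass can consume full-width groups of eight rows; the buf1 + ty * 16 + 8 * i arithmetic above is that scatter for 16x16 (32-wide blocks use ty * 32). A scalar sketch of just the index mapping, flattening each 8-lane vector as eight consecutive int16 values; the helper name is illustrative:

#include <stdint.h>

/* Strip i from the column pass holds h8*8 vectors (vector v, lane l at
 * strip[v * 8 + l]); its 8x8 tile ty lands transposed at vector index
 * ty * width + 8 * i of buf1, giving the row pass contiguous rows. */
static void transpose_scatter_model(const int16_t *strip, int16_t *buf1,
                                    int width, int h8, int i) {
  for (int ty = 0; ty < h8; ++ty)
    for (int j = 0; j < 8; ++j)
      for (int k = 0; k < 8; ++k)
        buf1[(ty * width + 8 * i + j) * 8 + k] = strip[(ty * 8 + k) * 8 + j];
}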
-void av1_lowbd_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *output,
- int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
(void)bd;
int16x8_t buf0[32], buf1[64];
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32];
- const int txw_idx = get_txw_idx(TX_16X32);
- const int txh_idx = get_txh_idx(TX_16X32);
- const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const int width = 16;
- const int height = 32;
- const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
- const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
-
- if (col_txfm != NULL && row_txfm != NULL) {
- int ud_flip, lr_flip;
- get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
- const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
- const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
-
- for (int i = 0; i < 2; i++) {
- if (ud_flip) {
- load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
- } else {
- load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
- }
- round_shift_16bit_vector(buf0, height, &v_shift0);
- col_txfm(buf0, buf0, cos_bit_col, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift1);
- transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
- transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
- transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
- transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
- }
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type];
- for (int i = 0; i < 4; i++) {
- int16x8_t *buf;
- if (lr_flip) {
- buf = buf0;
- flip_buf_neon(buf1 + width * i, buf, width);
- } else {
- buf = buf1 + width * i;
- }
- row_txfm(buf, buf, cos_bit_row, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift2);
- store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
- }
- } else {
+ if (col_txfm == NULL || row_txfm == NULL) {
av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
+ return;
+ }
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
+ for (int i = 0; i < 2; i++) {
+ col_txfm(input + 8 * i, buf0, stride, 12);
+ shift_right_4_round_s16_x8(buf0, buf0, 32);
+ transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 16 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 16 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 16 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 16 + 8 * i);
+ }
+
+ for (int i = 0; i < 4; i++) {
+ if (lr_flip) {
+ flip_buf_8_neon(buf1 + 16 * i, buf0, 16);
+ row_txfm(buf0, output + 8 * i, 32, 13);
+ } else {
+ int16x8_t *buf = buf1 + 16 * i;
+ row_txfm(buf, output + 8 * i, 32, 13);
+ }
}
}
-void av1_lowbd_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *output,
- int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
(void)bd;
int16x8_t buf0[32], buf1[32];
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
- const int txw_idx = get_txw_idx(TX_32X8);
- const int txh_idx = get_txh_idx(TX_32X8);
- const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const int width = 32;
- const int height = 8;
- const transform_1d_lbd_neon col_txfm = col_txfm8x8_arr[tx_type];
- const transform_1d_lbd_neon row_txfm = row_txfm8x32_arr[tx_type];
-
- if (col_txfm != NULL && row_txfm != NULL) {
- int ud_flip, lr_flip;
- get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
- const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
- const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
-
- for (int i = 0; i < 4; i++) {
- if (ud_flip) {
- load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
- } else {
- load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
- }
- round_shift_16bit_vector(buf0, height, &v_shift0);
- col_txfm(buf0, buf0, cos_bit_col, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift1);
- transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
- }
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];
- for (int i = 0; i < 1; i++) {
- int16x8_t *buf;
- if (lr_flip) {
- buf = buf0;
- flip_buf_neon(buf1 + width * i, buf, width);
- } else {
- buf = buf1 + width * i;
- }
- row_txfm(buf, buf, cos_bit_row, NULL);
- round_shift_16bit_vector(buf, width, &v_shift2);
- store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
- }
- } else {
+ if (col_txfm == NULL || row_txfm == NULL) {
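+    /* This fallback appears to reuse the 32x16 C path for a 32x8 block,
+     * presumably a copy-paste from the 32x16 function below; it should be
+     * unreachable, since the NULL table entries only cover tx types the
+     * encoder never enables at this size. */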
av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
+ return;
+ }
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+ for (int i = 0; i < 4; i++) {
+ col_txfm(input + 8 * i, buf0, stride, 13);
+ shift_right_2_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i);
+ }
+
+ if (lr_flip) {
+ flip_buf_8_neon(buf1, buf0, 32);
+ row_txfm(buf0, output, 8, 12);
+ } else {
+ row_txfm(buf1, output, 8, 12);
}
}
-void av1_lowbd_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *output,
- int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
(void)bd;
int16x8_t buf0[32], buf1[64];
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
- const int txw_idx = get_txw_idx(TX_32X16);
- const int txh_idx = get_txh_idx(TX_32X16);
- const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const int width = 32;
- const int height = 16;
- const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
- const transform_1d_lbd_neon row_txfm = row_txfm8x32_arr[tx_type];
-
- if (col_txfm != NULL && row_txfm != NULL) {
- const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
- const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
- const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
- int ud_flip, lr_flip;
- get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-
- for (int i = 0; i < 4; i++) {
- if (ud_flip) {
- load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
- } else {
- load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
- }
- round_shift_16bit_vector(buf0, height, &v_shift0);
- col_txfm(buf0, buf0, cos_bit_col, NULL);
- round_shift_16bit_vector(buf0, height, &v_shift1);
- transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
- transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
- }
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x32_arr[tx_type];
- for (int i = 0; i < 2; i++) {
- int16x8_t *buf;
- if (lr_flip) {
- buf = buf0;
- flip_buf_neon(buf1 + width * i, buf, width);
- } else {
- buf = buf1 + width * i;
- }
- row_txfm(buf, buf, cos_bit_row, NULL);
- round_shift_16bit_vector(buf, width, &v_shift2);
- store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
- }
- } else {
+ if (col_txfm == NULL || row_txfm == NULL) {
av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
+ return;
+ }
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+ for (int i = 0; i < 4; i++) {
+ col_txfm(input + 8 * i, buf0, stride, 13);
+ shift_right_4_round_s16_x8(buf0, buf0, 16);
+ transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 32 + 8 * i);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ if (lr_flip) {
+ flip_buf_8_neon(buf1 + 32 * i, buf0, 32);
+ row_txfm(buf0, output + 8 * i, 16, 13);
+ } else {
+ int16x8_t *buf = buf1 + 32 * i;
+ row_txfm(buf, output + 8 * i, 16, 13);
+ }
}
}
-void av1_lowbd_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
- int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
(void)bd;
int16x8_t buf0[32], buf1[128];
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X32];
- const int txw_idx = get_txw_idx(TX_32X32);
- const int txh_idx = get_txh_idx(TX_32X32);
- const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const int width = 32;
- const int height = 32;
- const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
- const transform_1d_lbd_neon row_txfm = row_txfm8x32_arr[tx_type];
-
- if (col_txfm != NULL && row_txfm != NULL) {
- int ud_flip, lr_flip;
- get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-
- for (int i = 0; i < 4; i++) {
- if (ud_flip) {
- load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
- } else {
- load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
- }
- round_shift_16bit(buf0, height, shift[0]);
- col_txfm(buf0, buf0, cos_bit_col, NULL);
- round_shift_16bit(buf0, height, shift[1]);
- transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
- transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
- transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
- transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
- }
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];

- for (int i = 0; i < 4; i++) {
- int16x8_t *buf;
- if (lr_flip) {
- buf = buf0;
- flip_buf_neon(buf1 + width * i, buf, width);
- } else {
- buf = buf1 + width * i;
- }
- row_txfm(buf, buf, cos_bit_row, NULL);
- round_shift_16bit(buf, width, shift[2]);
- store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
- }
- } else {
+ if (col_txfm == NULL || row_txfm == NULL) {
av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
+ return;
+ }
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
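+ // Process the 32 columns in four strips of eight; each strip yields four
+ // 8x8 blocks that are transposed into the 32-wide rows of buf1.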
+ for (int i = 0; i < 4; i++) {
+ col_txfm(input + 8 * i, buf0, stride, 12);
+ shift_right_4_round_s16_x8(buf0, buf0, 32);
+ transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 32 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 32 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 32 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 32 + 8 * i);
+ }
+
+ for (int i = 0; i < 4; i++) {
+ if (lr_flip) {
+ flip_buf_8_neon(buf1 + 32 * i, buf0, 32);
+ row_txfm(buf0, output + 8 * i, 32, 12);
+ } else {
+ int16x8_t *buf = buf1 + 32 * i;
+ row_txfm(buf, output + 8 * i, 32, 12);
+ }
}
}
-void av1_lowbd_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *output,
- int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
(void)bd;
(void)tx_type;
assert(tx_type == DCT_DCT);
- const TX_SIZE tx_size = TX_64X16;
int16x8_t buf0[64], buf1[128];
- const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
- const int txw_idx = get_txw_idx(tx_size);
- const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const int width = tx_size_wide[tx_size];
- const int height = tx_size_high[tx_size];
- const transform_1d_lbd_neon col_txfm = fdct8x16_neon;
- const transform_1d_lbd_neon row_txfm = av1_fdct8x64_neon;
- const int width_div8 = (width >> 3);
- const int height_div8 = (height >> 3);
-
- for (int i = 0; i < width_div8; i++) {
- load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
- round_shift_16bit(buf0, height, shift[0]);
- col_txfm(buf0, buf0, cos_bit_col, NULL);
- round_shift_16bit(buf0, height, shift[1]);
- for (int j = 0; j < height_div8; ++j) {
- transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ const transform_1d_lbd_8_neon col_txfm = fdct8x16_neon;
+ const transform_1d_lbd_8_neon row_txfm = fdct8x64_neon;
+
+ for (int i = 0; i < 8; i++) {
+ load_buffer_s16_x8(input + 8 * i, stride, buf0, 16);
+ shift_left_2_s16_x8(buf0, buf0, 16);
+ col_txfm(buf0, buf0, 13);
+ shift_right_4_round_s16_x8(buf0, buf0, 16);
+ for (int j = 0; j < 2; ++j) {
+ transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
}
}
- for (int i = 0; i < height_div8; i++) {
- int16x8_t *buf = buf1 + width * i;
- row_txfm(buf, buf, cos_bit_row, NULL);
- round_shift_16bit(buf, width, shift[2]);
- store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 16, 32);
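+ // Each of the two 8-row strips gets a 64-point row transform, but only the
+ // low-frequency 32x16 quadrant of the result is kept; the remainder is
+ // zeroed below.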
+ for (int i = 0; i < 2; i++) {
+ int16x8_t *buf = buf1 + 64 * i;
+ row_txfm(buf, buf, 12);
+ store_buffer_s16_x8(buf, output + 8 * i, 16, 32);
}
// Zero out the bottom 16x32 area.
memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
}
-void av1_lowbd_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *output,
- int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
(void)bd;
(void)tx_type;
assert(tx_type == DCT_DCT);
- const TX_SIZE tx_size = TX_16X64;
int16x8_t buf0[64], buf1[128];
- const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
- const int txw_idx = get_txw_idx(tx_size);
- const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const int width = tx_size_wide[tx_size];
- const int height = tx_size_high[tx_size];
- const transform_1d_lbd_neon col_txfm = av1_fdct8x64_neon;
- const transform_1d_lbd_neon row_txfm = fdct8x16_neon;
- const int width_div8 = (width >> 3);
- const int height_div8 = (height >> 3);
-
- for (int i = 0; i < width_div8; i++) {
- load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
- round_shift_16bit(buf0, height, shift[0]);
- col_txfm(buf0, buf0, cos_bit_col, NULL);
- round_shift_16bit(buf0, height, shift[1]);
- for (int j = 0; j < height_div8; ++j) {
- transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
+ const transform_1d_lbd_8_neon row_txfm = fdct8x16_neon;
+
+ for (int i = 0; i < 2; i++) {
+ load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
+ col_txfm(buf0, buf0, 13);
+ shift_right_2_round_s16_x8(buf0, buf0, 64);
+ for (int j = 0; j < 8; ++j) {
+ transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 16 + 8 * i);
}
}
- for (int i = 0; i < AOMMIN(4, height_div8); i++) {
- int16x8_t *buf = buf1 + width * i;
- row_txfm(buf, buf, cos_bit_row, NULL);
- round_shift_16bit(buf, width, shift[2]);
- store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 32, 16);
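+ // Only the top 32 of the 64 rows are row-transformed and stored; AV1 does
+ // not code the remaining coefficients of 64-point transforms.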
+ for (int i = 0; i < 4; i++) {
+ int16x8_t *buf = buf1 + 16 * i;
+ row_txfm(buf, buf, 12);
+ store_buffer_s16_x8(buf, output + 8 * i, 32, 16);
}
}
-#define TRANSPOSE_4X4_L32(x0, x1, x2, x3, y0, y1, y2, y3) \
- do { \
- int32x4x2_t temp01 = vzipq_s32(x0, x1); \
- int32x4x2_t temp23 = vzipq_s32(x2, x3); \
- int32x4x2_t y01 = vzipq_s32(temp01.val[0], temp23.val[0]); \
- int32x4x2_t y23 = vzipq_s32(temp01.val[1], temp23.val[1]); \
- y0 = y01.val[0]; \
- y1 = y01.val[1]; \
- y2 = y23.val[0]; \
- y3 = y23.val[1]; \
- } while (0)
+static void fdct32_new_neon(const int32x4_t *input, int32x4_t *output,
+ int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
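+ // The cosine table is in Q13 int16 format; each 8-lane load below carries
+ // two adjacent 4-lane constant sets, split out with vget_low/vget_high.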
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);

-static void av1_fdct32_new_neon(int32x4_t *input, int32x4_t *output,
- int cos_bit, const int stride,
- const int8_t *stage_range) {
- (void)stage_range;
int32x4_t buf0[32];
int32x4_t buf1[32];
- const int32_t *cospi;
- cospi = cospi_arr(cos_bit);
- const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
- int startidx = 0 * stride;
- int endidx = 31 * stride;
- // stage 0
// stage 1
- buf1[0] = vaddq_s32(input[startidx], input[endidx]);
- buf1[31] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[1] = vaddq_s32(input[startidx], input[endidx]);
- buf1[30] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[2] = vaddq_s32(input[startidx], input[endidx]);
- buf1[29] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[3] = vaddq_s32(input[startidx], input[endidx]);
- buf1[28] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[4] = vaddq_s32(input[startidx], input[endidx]);
- buf1[27] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[5] = vaddq_s32(input[startidx], input[endidx]);
- buf1[26] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[6] = vaddq_s32(input[startidx], input[endidx]);
- buf1[25] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[7] = vaddq_s32(input[startidx], input[endidx]);
- buf1[24] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[8] = vaddq_s32(input[startidx], input[endidx]);
- buf1[23] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[9] = vaddq_s32(input[startidx], input[endidx]);
- buf1[22] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[10] = vaddq_s32(input[startidx], input[endidx]);
- buf1[21] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[11] = vaddq_s32(input[startidx], input[endidx]);
- buf1[20] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[12] = vaddq_s32(input[startidx], input[endidx]);
- buf1[19] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[13] = vaddq_s32(input[startidx], input[endidx]);
- buf1[18] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[14] = vaddq_s32(input[startidx], input[endidx]);
- buf1[17] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[15] = vaddq_s32(input[startidx], input[endidx]);
- buf1[16] = vsubq_s32(input[startidx], input[endidx]);
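+ // butterfly_dct_pre_s32_x4(in, out, n) computes out[i] = in[i] + in[n-1-i]
+ // and out[n-1-i] = in[i] - in[n-1-i], i.e. the unrolled ladder above.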
+ butterfly_dct_pre_s32_x4(input, buf1, 32);
// stage 2
- buf0[0] = vaddq_s32(buf1[0], buf1[15]);
- buf0[15] = vsubq_s32(buf1[0], buf1[15]);
- buf0[1] = vaddq_s32(buf1[1], buf1[14]);
- buf0[14] = vsubq_s32(buf1[1], buf1[14]);
- buf0[2] = vaddq_s32(buf1[2], buf1[13]);
- buf0[13] = vsubq_s32(buf1[2], buf1[13]);
- buf0[3] = vaddq_s32(buf1[3], buf1[12]);
- buf0[12] = vsubq_s32(buf1[3], buf1[12]);
- buf0[4] = vaddq_s32(buf1[4], buf1[11]);
- buf0[11] = vsubq_s32(buf1[4], buf1[11]);
- buf0[5] = vaddq_s32(buf1[5], buf1[10]);
- buf0[10] = vsubq_s32(buf1[5], buf1[10]);
- buf0[6] = vaddq_s32(buf1[6], buf1[9]);
- buf0[9] = vsubq_s32(buf1[6], buf1[9]);
- buf0[7] = vaddq_s32(buf1[7], buf1[8]);
- buf0[8] = vsubq_s32(buf1[7], buf1[8]);
+ butterfly_dct_pre_s32_x4(buf1, buf0, 16);
buf0[16] = buf1[16];
buf0[17] = buf1[17];
buf0[18] = buf1[18];
buf0[19] = buf1[19];
- btf_32_neon_mode0(cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
- buf0[27], v_cos_bit);
- btf_32_neon_mode0(cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
- buf0[26], v_cos_bit);
- btf_32_neon_mode0(cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
- buf0[25], v_cos_bit);
- btf_32_neon_mode0(cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
- buf0[24], v_cos_bit);
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf1[27], buf1[20], &buf0[27],
+ &buf0[20]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf1[26], buf1[21], &buf0[26],
+ &buf0[21]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf1[25], buf1[22], &buf0[25],
+ &buf0[22]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf1[24], buf1[23], &buf0[24],
+ &buf0[23]);
buf0[28] = buf1[28];
buf0[29] = buf1[29];
buf0[30] = buf1[30];
buf0[31] = buf1[31];
// stage 3
- cospi = cospi_arr(cos_bit);
- buf1[0] = vaddq_s32(buf0[0], buf0[7]);
- buf1[7] = vsubq_s32(buf0[0], buf0[7]);
- buf1[1] = vaddq_s32(buf0[1], buf0[6]);
- buf1[6] = vsubq_s32(buf0[1], buf0[6]);
- buf1[2] = vaddq_s32(buf0[2], buf0[5]);
- buf1[5] = vsubq_s32(buf0[2], buf0[5]);
- buf1[3] = vaddq_s32(buf0[3], buf0[4]);
- buf1[4] = vsubq_s32(buf0[3], buf0[4]);
+ butterfly_dct_pre_s32_x4(buf0, buf1, 8);
buf1[8] = buf0[8];
buf1[9] = buf0[9];
- btf_32_neon_mode0(cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
- buf1[13], v_cos_bit);
- btf_32_neon_mode0(cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
- buf1[12], v_cos_bit);
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf0[13], buf0[10], &buf1[13],
+ &buf1[10]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf0[12], buf0[11], &buf1[12],
+ &buf1[11]);
buf1[14] = buf0[14];
buf1[15] = buf0[15];
- buf1[16] = vaddq_s32(buf0[16], buf0[23]);
- buf1[23] = vsubq_s32(buf0[16], buf0[23]);
- buf1[17] = vaddq_s32(buf0[17], buf0[22]);
- buf1[22] = vsubq_s32(buf0[17], buf0[22]);
- buf1[18] = vaddq_s32(buf0[18], buf0[21]);
- buf1[21] = vsubq_s32(buf0[18], buf0[21]);
- buf1[19] = vaddq_s32(buf0[19], buf0[20]);
- buf1[20] = vsubq_s32(buf0[19], buf0[20]);
- buf1[24] = vsubq_s32(buf0[31], buf0[24]);
- buf1[31] = vaddq_s32(buf0[31], buf0[24]);
- buf1[25] = vsubq_s32(buf0[30], buf0[25]);
- buf1[30] = vaddq_s32(buf0[30], buf0[25]);
- buf1[26] = vsubq_s32(buf0[29], buf0[26]);
- buf1[29] = vaddq_s32(buf0[29], buf0[26]);
- buf1[27] = vsubq_s32(buf0[28], buf0[27]);
- buf1[28] = vaddq_s32(buf0[28], buf0[27]);
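+ // butterfly_dct_post_s32_x4 applies the mirrored cross add/sub over an
+ // n-element span, matching the unrolled buf1[16..31] updates above.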
+ butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 16);
// stage 4
- cospi = cospi_arr(cos_bit);
- buf0[0] = vaddq_s32(buf1[0], buf1[3]);
- buf0[3] = vsubq_s32(buf1[0], buf1[3]);
- buf0[1] = vaddq_s32(buf1[1], buf1[2]);
- buf0[2] = vsubq_s32(buf1[1], buf1[2]);
+ butterfly_dct_pre_s32_x4(buf1, buf0, 4);
buf0[4] = buf1[4];
- btf_32_neon_mode0(cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
- v_cos_bit);
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf1[6], buf1[5], &buf0[6], &buf0[5]);
buf0[7] = buf1[7];
- buf0[8] = vaddq_s32(buf1[8], buf1[11]);
- buf0[11] = vsubq_s32(buf1[8], buf1[11]);
- buf0[9] = vaddq_s32(buf1[9], buf1[10]);
- buf0[10] = vsubq_s32(buf1[9], buf1[10]);
- buf0[12] = vsubq_s32(buf1[15], buf1[12]);
- buf0[15] = vaddq_s32(buf1[15], buf1[12]);
- buf0[13] = vsubq_s32(buf1[14], buf1[13]);
- buf0[14] = vaddq_s32(buf1[14], buf1[13]);
+ butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 8);
buf0[16] = buf1[16];
buf0[17] = buf1[17];
- btf_32_neon_mode0(cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
- buf0[29], v_cos_bit);
- btf_32_neon_mode0(cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
- buf0[28], v_cos_bit);
- btf_32_neon_mode01(cospi[48], cospi[16], buf1[20], buf1[27], buf0[20],
- buf0[27], v_cos_bit);
- btf_32_neon_mode01(cospi[48], cospi[16], buf1[21], buf1[26], buf0[21],
- buf0[26], v_cos_bit);
+ butterfly_s32_s32_x4_0112_neon(cospi16, buf1[29], buf1[18], &buf0[29],
+ &buf0[18]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, buf1[28], buf1[19], &buf0[28],
+ &buf0[19]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, buf1[27], buf1[20], &buf0[27],
+ &buf0[20]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, buf1[26], buf1[21], &buf0[26],
+ &buf0[21]);
buf0[22] = buf1[22];
buf0[23] = buf1[23];
buf0[24] = buf1[24];
@@ -3202,69 +2467,40 @@ static void av1_fdct32_new_neon(int32x4_t *input, int32x4_t *output,
buf0[31] = buf1[31];
// stage 5
- cospi = cospi_arr(cos_bit);
- btf_32_neon(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
- v_cos_bit);
- btf_32_type1_neon(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3],
- v_cos_bit);
- buf1[4] = vaddq_s32(buf0[4], buf0[5]);
- buf1[5] = vsubq_s32(buf0[4], buf0[5]);
- buf1[6] = vsubq_s32(buf0[7], buf0[6]);
- buf1[7] = vaddq_s32(buf0[7], buf0[6]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf0[0], buf0[1], &buf1[0], &buf1[1]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, buf0[3], buf0[2], &buf1[2], &buf1[3]);
+ butterfly_dct_post_s32_x4(buf0 + 4, buf0 + 4, buf1 + 4, 4);
buf1[8] = buf0[8];
- btf_32_neon_mode0(cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14],
- v_cos_bit);
- btf_32_neon_mode01(cospi[48], cospi[16], buf0[10], buf0[13], buf1[10],
- buf1[13], v_cos_bit);
+ butterfly_s32_s32_x4_0112_neon(cospi16, buf0[14], buf0[9], &buf1[14],
+ &buf1[9]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, buf0[13], buf0[10], &buf1[13],
+ &buf1[10]);
buf1[11] = buf0[11];
buf1[12] = buf0[12];
buf1[15] = buf0[15];
- buf1[16] = vaddq_s32(buf0[16], buf0[19]);
- buf1[19] = vsubq_s32(buf0[16], buf0[19]);
- buf1[17] = vaddq_s32(buf0[17], buf0[18]);
- buf1[18] = vsubq_s32(buf0[17], buf0[18]);
- buf1[20] = vsubq_s32(buf0[23], buf0[20]);
- buf1[23] = vaddq_s32(buf0[23], buf0[20]);
- buf1[21] = vsubq_s32(buf0[22], buf0[21]);
- buf1[22] = vaddq_s32(buf0[22], buf0[21]);
- buf1[24] = vaddq_s32(buf0[24], buf0[27]);
- buf1[27] = vsubq_s32(buf0[24], buf0[27]);
- buf1[25] = vaddq_s32(buf0[25], buf0[26]);
- buf1[26] = vsubq_s32(buf0[25], buf0[26]);
- buf1[28] = vsubq_s32(buf0[31], buf0[28]);
- buf1[31] = vaddq_s32(buf0[31], buf0[28]);
- buf1[29] = vsubq_s32(buf0[30], buf0[29]);
- buf1[30] = vaddq_s32(buf0[30], buf0[29]);
+ butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 8);
+ butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 8);
// stage 6
- cospi = cospi_arr(cos_bit);
buf0[0] = buf1[0];
buf0[1] = buf1[1];
buf0[2] = buf1[2];
buf0[3] = buf1[3];
- btf_32_type1_neon(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
- v_cos_bit);
- btf_32_type1_neon(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6],
- v_cos_bit);
- buf0[8] = vaddq_s32(buf1[8], buf1[9]);
- buf0[9] = vsubq_s32(buf1[8], buf1[9]);
- buf0[10] = vsubq_s32(buf1[11], buf1[10]);
- buf0[11] = vaddq_s32(buf1[11], buf1[10]);
- buf0[12] = vaddq_s32(buf1[12], buf1[13]);
- buf0[13] = vsubq_s32(buf1[12], buf1[13]);
- buf0[14] = vsubq_s32(buf1[15], buf1[14]);
- buf0[15] = vaddq_s32(buf1[15], buf1[14]);
+ butterfly_s32_s32_x4_0112_neon(cospi8, buf1[7], buf1[4], &buf0[4], &buf0[7]);
+ butterfly_s32_s32_x4_1003_neon(cospi24, buf1[6], buf1[5], &buf0[5], &buf0[6]);
+ butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 4);
+ butterfly_dct_post_s32_x4(buf1 + 12, buf1 + 12, buf0 + 12, 4);
buf0[16] = buf1[16];
- btf_32_neon_mode0(cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], buf0[30],
- v_cos_bit);
- btf_32_neon_mode01(cospi[56], cospi[8], buf1[18], buf1[29], buf0[18],
- buf0[29], v_cos_bit);
+ butterfly_s32_s32_x4_0112_neon(cospi8, buf1[30], buf1[17], &buf0[30],
+ &buf0[17]);
+ butterfly_s32_s32_x4_1223_neon(cospi8, buf1[29], buf1[18], &buf0[29],
+ &buf0[18]);
buf0[19] = buf1[19];
buf0[20] = buf1[20];
- btf_32_neon_mode0(cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
- buf0[26], v_cos_bit);
- btf_32_neon_mode01(cospi[24], cospi[40], buf1[22], buf1[25], buf0[22],
- buf0[25], v_cos_bit);
+ butterfly_s32_s32_x4_1003_neon(cospi24, buf1[26], buf1[21], &buf0[26],
+ &buf0[21]);
+ butterfly_s32_s32_x4_0332_neon(cospi24, buf1[25], buf1[22], &buf0[25],
+ &buf0[22]);
buf0[23] = buf1[23];
buf0[24] = buf1[24];
buf0[27] = buf1[27];
@@ -3272,7 +2508,6 @@ static void av1_fdct32_new_neon(int32x4_t *input, int32x4_t *output,
buf0[31] = buf1[31];
// stage 7
- cospi = cospi_arr(cos_bit);
buf1[0] = buf0[0];
buf1[1] = buf0[1];
buf1[2] = buf0[2];
@@ -3281,34 +2516,20 @@ static void av1_fdct32_new_neon(int32x4_t *input, int32x4_t *output,
buf1[5] = buf0[5];
buf1[6] = buf0[6];
buf1[7] = buf0[7];
-
- btf_32_type1_neon(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15],
- v_cos_bit);
- btf_32_type1_neon(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], buf1[14],
- v_cos_bit);
- btf_32_type1_neon(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
- buf1[13], v_cos_bit);
- btf_32_type1_neon(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
- buf1[12], v_cos_bit);
- buf1[16] = vaddq_s32(buf0[16], buf0[17]);
- buf1[17] = vsubq_s32(buf0[16], buf0[17]);
- buf1[18] = vsubq_s32(buf0[19], buf0[18]);
- buf1[19] = vaddq_s32(buf0[19], buf0[18]);
- buf1[20] = vaddq_s32(buf0[20], buf0[21]);
- buf1[21] = vsubq_s32(buf0[20], buf0[21]);
- buf1[22] = vsubq_s32(buf0[23], buf0[22]);
- buf1[23] = vaddq_s32(buf0[23], buf0[22]);
- buf1[24] = vaddq_s32(buf0[24], buf0[25]);
- buf1[25] = vsubq_s32(buf0[24], buf0[25]);
- buf1[26] = vsubq_s32(buf0[27], buf0[26]);
- buf1[27] = vaddq_s32(buf0[27], buf0[26]);
- buf1[28] = vaddq_s32(buf0[28], buf0[29]);
- buf1[29] = vsubq_s32(buf0[28], buf0[29]);
- buf1[30] = vsubq_s32(buf0[31], buf0[30]);
- buf1[31] = vaddq_s32(buf0[31], buf0[30]);
+ butterfly_s32_s32_x4_0112_neon(cospi4, buf0[15], buf0[8], &buf1[8],
+ &buf1[15]);
+ butterfly_s32_s32_x4_1003_neon(cospi28, buf0[14], buf0[9], &buf1[9],
+ &buf1[14]);
+ butterfly_s32_s32_x4_0112_neon(cospi20, buf0[13], buf0[10], &buf1[10],
+ &buf1[13]);
+ butterfly_s32_s32_x4_1003_neon(cospi12, buf0[12], buf0[11], &buf1[11],
+ &buf1[12]);
+ butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 4);
+ butterfly_dct_post_s32_x4(buf0 + 20, buf0 + 20, buf1 + 20, 4);
+ butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 4);
+ butterfly_dct_post_s32_x4(buf0 + 28, buf0 + 28, buf1 + 28, 4);
// stage 8
- cospi = cospi_arr(cos_bit);
buf0[0] = buf1[0];
buf0[1] = buf1[1];
buf0[2] = buf1[2];
@@ -3325,988 +2546,408 @@ static void av1_fdct32_new_neon(int32x4_t *input, int32x4_t *output,
buf0[13] = buf1[13];
buf0[14] = buf1[14];
buf0[15] = buf1[15];
+ butterfly_s32_s32_x4_0112_neon(cospi2, buf1[31], buf1[16], &buf0[16],
+ &buf0[31]);
+ butterfly_s32_s32_x4_1003_neon(cospi30, buf1[30], buf1[17], &buf0[17],
+ &buf0[30]);
+ butterfly_s32_s32_x4_0112_neon(cospi18, buf1[29], buf1[18], &buf0[18],
+ &buf0[29]);
+ butterfly_s32_s32_x4_1003_neon(cospi14, buf1[28], buf1[19], &buf0[19],
+ &buf0[28]);
+ butterfly_s32_s32_x4_0112_neon(cospi10, buf1[27], buf1[20], &buf0[20],
+ &buf0[27]);
+ butterfly_s32_s32_x4_1003_neon(cospi22, buf1[26], buf1[21], &buf0[21],
+ &buf0[26]);
+ butterfly_s32_s32_x4_0112_neon(cospi26, buf1[25], buf1[22], &buf0[22],
+ &buf0[25]);
+ butterfly_s32_s32_x4_1003_neon(cospi6, buf1[24], buf1[23], &buf0[23],
+ &buf0[24]);

- btf_32_type1_neon(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], buf0[31],
- v_cos_bit);
- btf_32_type1_neon(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
- buf0[30], v_cos_bit);
- btf_32_type1_neon(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
- buf0[29], v_cos_bit);
- btf_32_type1_neon(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
- buf0[28], v_cos_bit);
- btf_32_type1_neon(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
- buf0[27], v_cos_bit);
- btf_32_type1_neon(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
- buf0[26], v_cos_bit);
- btf_32_type1_neon(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
- buf0[25], v_cos_bit);
- btf_32_type1_neon(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], buf0[24],
- v_cos_bit);
-
- startidx = 0 * stride;
- endidx = 31 * stride;
// stage 9
- output[startidx] = buf0[0];
- output[endidx] = buf0[31];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[16];
- output[endidx] = buf0[15];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[8];
- output[endidx] = buf0[23];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[24];
- output[endidx] = buf0[7];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[4];
- output[endidx] = buf0[27];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[20];
- output[endidx] = buf0[11];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[12];
- output[endidx] = buf0[19];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[28];
- output[endidx] = buf0[3];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[2];
- output[endidx] = buf0[29];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[18];
- output[endidx] = buf0[13];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[10];
- output[endidx] = buf0[21];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[26];
- output[endidx] = buf0[5];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[6];
- output[endidx] = buf0[25];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[22];
- output[endidx] = buf0[9];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[14];
- output[endidx] = buf0[17];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[30];
- output[endidx] = buf0[1];
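+ // The write-out order is the 5-bit bit-reversal permutation of the natural
+ // index: output[k] = buf0[bitrev5(k)].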
+ output[0] = buf0[0];
+ output[1] = buf0[16];
+ output[2] = buf0[8];
+ output[3] = buf0[24];
+ output[4] = buf0[4];
+ output[5] = buf0[20];
+ output[6] = buf0[12];
+ output[7] = buf0[28];
+ output[8] = buf0[2];
+ output[9] = buf0[18];
+ output[10] = buf0[10];
+ output[11] = buf0[26];
+ output[12] = buf0[6];
+ output[13] = buf0[22];
+ output[14] = buf0[14];
+ output[15] = buf0[30];
+ output[16] = buf0[1];
+ output[17] = buf0[17];
+ output[18] = buf0[9];
+ output[19] = buf0[25];
+ output[20] = buf0[5];
+ output[21] = buf0[21];
+ output[22] = buf0[13];
+ output[23] = buf0[29];
+ output[24] = buf0[3];
+ output[25] = buf0[19];
+ output[26] = buf0[11];
+ output[27] = buf0[27];
+ output[28] = buf0[7];
+ output[29] = buf0[23];
+ output[30] = buf0[15];
+ output[31] = buf0[31];
}
-static void av1_fdct64_new_stage1234_neon(int32x4_t *input, const int instride,
- int32x4_t *x3, int32x4_t *x4,
- const int32_t *cospi,
- const int32x4_t *v_cos_bit,
- int *startidx, int *endidx) {
+static void fdct64_new_neon(const int32x4_t *input, int32x4_t *output,
+ int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+ const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]);
+ const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]);
+ const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]);
+ const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]);
+ const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]);
+ const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]);
+ const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]);
+ const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+ const int16x4_t cospi1 = vget_low_s16(cospi1_3);
+ const int16x4_t cospi3 = vget_high_s16(cospi1_3);
+ const int16x4_t cospi5 = vget_low_s16(cospi5_7);
+ const int16x4_t cospi7 = vget_high_s16(cospi5_7);
+ const int16x4_t cospi9 = vget_low_s16(cospi9_11);
+ const int16x4_t cospi11 = vget_high_s16(cospi9_11);
+ const int16x4_t cospi13 = vget_low_s16(cospi13_15);
+ const int16x4_t cospi15 = vget_high_s16(cospi13_15);
+ const int16x4_t cospi17 = vget_low_s16(cospi17_19);
+ const int16x4_t cospi19 = vget_high_s16(cospi17_19);
+ const int16x4_t cospi21 = vget_low_s16(cospi21_23);
+ const int16x4_t cospi23 = vget_high_s16(cospi21_23);
+ const int16x4_t cospi25 = vget_low_s16(cospi25_27);
+ const int16x4_t cospi27 = vget_high_s16(cospi25_27);
+ const int16x4_t cospi29 = vget_low_s16(cospi29_31);
+ const int16x4_t cospi31 = vget_high_s16(cospi29_31);
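+ // The _0112/_1223/_1003/_0332 suffixes on the butterfly helpers encode the
+ // lane/sign pattern each variant applies to its cospi pair (they replace
+ // the btf_32_neon_mode0/mode01/type1 variants).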
+
// stage 1
int32x4_t x1[64];
- x1[0] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[63] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[1] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[62] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[2] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[61] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[3] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[60] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[4] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[59] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[5] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[58] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[6] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[57] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[7] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[56] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[8] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[55] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[9] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[54] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[10] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[53] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[11] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[52] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[12] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[51] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[13] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[50] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[14] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[49] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[15] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[48] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[16] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[47] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[17] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[46] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[18] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[45] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[19] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[44] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[20] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[43] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[21] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[42] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[22] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[41] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[23] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[40] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[24] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[39] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[25] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[38] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[26] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[37] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[27] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[36] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[28] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[35] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[29] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[34] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[30] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[33] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[31] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[32] = vsubq_s32(input[*startidx], input[*endidx]);
+ butterfly_dct_pre_s32_x4(input, x1, 64);
// stage 2
int32x4_t x2[64];
- x2[0] = vaddq_s32(x1[0], x1[31]);
- x2[31] = vsubq_s32(x1[0], x1[31]);
- x2[1] = vaddq_s32(x1[1], x1[30]);
- x2[30] = vsubq_s32(x1[1], x1[30]);
- x2[2] = vaddq_s32(x1[2], x1[29]);
- x2[29] = vsubq_s32(x1[2], x1[29]);
- x2[3] = vaddq_s32(x1[3], x1[28]);
- x2[28] = vsubq_s32(x1[3], x1[28]);
- x2[4] = vaddq_s32(x1[4], x1[27]);
- x2[27] = vsubq_s32(x1[4], x1[27]);
- x2[5] = vaddq_s32(x1[5], x1[26]);
- x2[26] = vsubq_s32(x1[5], x1[26]);
- x2[6] = vaddq_s32(x1[6], x1[25]);
- x2[25] = vsubq_s32(x1[6], x1[25]);
- x2[7] = vaddq_s32(x1[7], x1[24]);
- x2[24] = vsubq_s32(x1[7], x1[24]);
- x2[8] = vaddq_s32(x1[8], x1[23]);
- x2[23] = vsubq_s32(x1[8], x1[23]);
- x2[9] = vaddq_s32(x1[9], x1[22]);
- x2[22] = vsubq_s32(x1[9], x1[22]);
- x2[10] = vaddq_s32(x1[10], x1[21]);
- x2[21] = vsubq_s32(x1[10], x1[21]);
- x2[11] = vaddq_s32(x1[11], x1[20]);
- x2[20] = vsubq_s32(x1[11], x1[20]);
- x2[12] = vaddq_s32(x1[12], x1[19]);
- x2[19] = vsubq_s32(x1[12], x1[19]);
- x2[13] = vaddq_s32(x1[13], x1[18]);
- x2[18] = vsubq_s32(x1[13], x1[18]);
- x2[14] = vaddq_s32(x1[14], x1[17]);
- x2[17] = vsubq_s32(x1[14], x1[17]);
- x2[15] = vaddq_s32(x1[15], x1[16]);
- x2[16] = vsubq_s32(x1[15], x1[16]);
-
- btf_32_neon_mode0(cospi[32], cospi[32], x1[40], x1[55], x2[40], x2[55],
- *v_cos_bit);
- btf_32_neon_mode0(cospi[32], cospi[32], x1[41], x1[54], x2[41], x2[54],
- *v_cos_bit);
- btf_32_neon_mode0(cospi[32], cospi[32], x1[42], x1[53], x2[42], x2[53],
- *v_cos_bit);
- btf_32_neon_mode0(cospi[32], cospi[32], x1[43], x1[52], x2[43], x2[52],
- *v_cos_bit);
- btf_32_neon_mode0(cospi[32], cospi[32], x1[44], x1[51], x2[44], x2[51],
- *v_cos_bit);
- btf_32_neon_mode0(cospi[32], cospi[32], x1[45], x1[50], x2[45], x2[50],
- *v_cos_bit);
- btf_32_neon_mode0(cospi[32], cospi[32], x1[46], x1[49], x2[46], x2[49],
- *v_cos_bit);
- btf_32_neon_mode0(cospi[32], cospi[32], x1[47], x1[48], x2[47], x2[48],
- *v_cos_bit);
+ butterfly_dct_pre_s32_x4(x1, x2, 32);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]);
// stage 3
- x3[0] = vaddq_s32(x2[0], x2[15]);
- x3[15] = vsubq_s32(x2[0], x2[15]);
- x3[1] = vaddq_s32(x2[1], x2[14]);
- x3[14] = vsubq_s32(x2[1], x2[14]);
- x3[2] = vaddq_s32(x2[2], x2[13]);
- x3[13] = vsubq_s32(x2[2], x2[13]);
- x3[3] = vaddq_s32(x2[3], x2[12]);
- x3[12] = vsubq_s32(x2[3], x2[12]);
- x3[4] = vaddq_s32(x2[4], x2[11]);
- x3[11] = vsubq_s32(x2[4], x2[11]);
- x3[5] = vaddq_s32(x2[5], x2[10]);
- x3[10] = vsubq_s32(x2[5], x2[10]);
- x3[6] = vaddq_s32(x2[6], x2[9]);
- x3[9] = vsubq_s32(x2[6], x2[9]);
- x3[7] = vaddq_s32(x2[7], x2[8]);
- x3[8] = vsubq_s32(x2[7], x2[8]);
-
- btf_32_neon_mode0(cospi[32], cospi[32], x2[20], x2[27], x3[20], x3[27],
- *v_cos_bit);
- btf_32_neon_mode0(cospi[32], cospi[32], x2[21], x2[26], x3[21], x3[26],
- *v_cos_bit);
- btf_32_neon_mode0(cospi[32], cospi[32], x2[22], x2[25], x3[22], x3[25],
- *v_cos_bit);
- btf_32_neon_mode0(cospi[32], cospi[32], x2[23], x2[24], x3[23], x3[24],
- *v_cos_bit);
-
- x3[32] = vaddq_s32(x1[32], x2[47]);
- x3[47] = vsubq_s32(x1[32], x2[47]);
- x3[33] = vaddq_s32(x1[33], x2[46]);
- x3[46] = vsubq_s32(x1[33], x2[46]);
- x3[34] = vaddq_s32(x1[34], x2[45]);
- x3[45] = vsubq_s32(x1[34], x2[45]);
- x3[35] = vaddq_s32(x1[35], x2[44]);
- x3[44] = vsubq_s32(x1[35], x2[44]);
- x3[36] = vaddq_s32(x1[36], x2[43]);
- x3[43] = vsubq_s32(x1[36], x2[43]);
- x3[37] = vaddq_s32(x1[37], x2[42]);
- x3[42] = vsubq_s32(x1[37], x2[42]);
- x3[38] = vaddq_s32(x1[38], x2[41]);
- x3[41] = vsubq_s32(x1[38], x2[41]);
- x3[39] = vaddq_s32(x1[39], x2[40]);
- x3[40] = vsubq_s32(x1[39], x2[40]);
- x3[48] = vsubq_s32(x1[63], x2[48]);
- x3[63] = vaddq_s32(x1[63], x2[48]);
- x3[49] = vsubq_s32(x1[62], x2[49]);
- x3[62] = vaddq_s32(x1[62], x2[49]);
- x3[50] = vsubq_s32(x1[61], x2[50]);
- x3[61] = vaddq_s32(x1[61], x2[50]);
- x3[51] = vsubq_s32(x1[60], x2[51]);
- x3[60] = vaddq_s32(x1[60], x2[51]);
- x3[52] = vsubq_s32(x1[59], x2[52]);
- x3[59] = vaddq_s32(x1[59], x2[52]);
- x3[53] = vsubq_s32(x1[58], x2[53]);
- x3[58] = vaddq_s32(x1[58], x2[53]);
- x3[54] = vsubq_s32(x1[57], x2[54]);
- x3[57] = vaddq_s32(x1[57], x2[54]);
- x3[55] = vsubq_s32(x1[56], x2[55]);
- x3[56] = vaddq_s32(x1[56], x2[55]);
+ int32x4_t x3[64];
+ butterfly_dct_pre_s32_x4(x2, x3, 16);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]);
+ butterfly_dct_post_s32_x4(x1 + 32, x2 + 32, x3 + 32, 32);
// stage 4
- x4[0] = vaddq_s32(x3[0], x3[7]);
- x4[7] = vsubq_s32(x3[0], x3[7]);
- x4[1] = vaddq_s32(x3[1], x3[6]);
- x4[6] = vsubq_s32(x3[1], x3[6]);
- x4[2] = vaddq_s32(x3[2], x3[5]);
- x4[5] = vsubq_s32(x3[2], x3[5]);
- x4[3] = vaddq_s32(x3[3], x3[4]);
- x4[4] = vsubq_s32(x3[3], x3[4]);
-
- btf_32_neon_mode0(cospi[32], cospi[32], x3[10], x3[13], x4[10], x4[13],
- *v_cos_bit);
- btf_32_neon_mode0(cospi[32], cospi[32], x3[11], x3[12], x4[11], x4[12],
- *v_cos_bit);
-
- x4[16] = vaddq_s32(x2[16], x3[23]);
- x4[23] = vsubq_s32(x2[16], x3[23]);
- x4[17] = vaddq_s32(x2[17], x3[22]);
- x4[22] = vsubq_s32(x2[17], x3[22]);
- x4[18] = vaddq_s32(x2[18], x3[21]);
- x4[21] = vsubq_s32(x2[18], x3[21]);
- x4[19] = vaddq_s32(x2[19], x3[20]);
- x4[20] = vsubq_s32(x2[19], x3[20]);
- x4[24] = vsubq_s32(x2[31], x3[24]);
- x4[31] = vaddq_s32(x2[31], x3[24]);
- x4[25] = vsubq_s32(x2[30], x3[25]);
- x4[30] = vaddq_s32(x2[30], x3[25]);
- x4[26] = vsubq_s32(x2[29], x3[26]);
- x4[29] = vaddq_s32(x2[29], x3[26]);
- x4[27] = vsubq_s32(x2[28], x3[27]);
- x4[28] = vaddq_s32(x2[28], x3[27]);
-
- btf_32_neon_mode0(cospi[16], cospi[48], x3[36], x3[59], x4[36], x4[59],
- *v_cos_bit);
- btf_32_neon_mode0(cospi[16], cospi[48], x3[37], x3[58], x4[37], x4[58],
- *v_cos_bit);
- btf_32_neon_mode0(cospi[16], cospi[48], x3[38], x3[57], x4[38], x4[57],
- *v_cos_bit);
- btf_32_neon_mode0(cospi[16], cospi[48], x3[39], x3[56], x4[39], x4[56],
- *v_cos_bit);
- btf_32_neon_mode01(cospi[48], cospi[16], x3[40], x3[55], x4[40], x4[55],
- *v_cos_bit);
- btf_32_neon_mode01(cospi[48], cospi[16], x3[41], x3[54], x4[41], x4[54],
- *v_cos_bit);
- btf_32_neon_mode01(cospi[48], cospi[16], x3[42], x3[53], x4[42], x4[53],
- *v_cos_bit);
- btf_32_neon_mode01(cospi[48], cospi[16], x3[43], x3[52], x4[43], x4[52],
- *v_cos_bit);
-}
-
-static void av1_fdct64_new_neon(int32x4_t *input, int32x4_t *output,
- int8_t cos_bit, const int instride,
- const int outstride,
- const int8_t *stage_range) {
- (void)stage_range;
- const int32_t *cospi = cospi_arr(cos_bit);
- const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
-
- int startidx = 0 * instride;
- int endidx = 63 * instride;
-
- // stage 1-2-3-4
- int32x4_t x3[64], x4[64];
- av1_fdct64_new_stage1234_neon(input, instride, x3, x4, cospi, &v_cos_bit,
- &startidx, &endidx);
+ int32x4_t x4[64];
+ butterfly_dct_pre_s32_x4(x3, x4, 8);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]);
+ butterfly_dct_post_s32_x4(x2 + 16, x3 + 16, x4 + 16, 16);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]);
// stage 5
int32x4_t x5[64];
- x5[0] = vaddq_s32(x4[0], x4[3]);
- x5[3] = vsubq_s32(x4[0], x4[3]);
- x5[1] = vaddq_s32(x4[1], x4[2]);
- x5[2] = vsubq_s32(x4[1], x4[2]);
-
- btf_32_neon_mode0(cospi[32], cospi[32], x4[5], x4[6], x5[5], x5[6],
- v_cos_bit);
-
- x5[8] = vaddq_s32(x3[8], x4[11]);
- x5[11] = vsubq_s32(x3[8], x4[11]);
- x5[9] = vaddq_s32(x3[9], x4[10]);
- x5[10] = vsubq_s32(x3[9], x4[10]);
- x5[12] = vsubq_s32(x3[15], x4[12]);
- x5[15] = vaddq_s32(x3[15], x4[12]);
- x5[13] = vsubq_s32(x3[14], x4[13]);
- x5[14] = vaddq_s32(x3[14], x4[13]);
-
- btf_32_neon_mode0(cospi[16], cospi[48], x4[18], x4[29], x5[18], x5[29],
- v_cos_bit);
- btf_32_neon_mode0(cospi[16], cospi[48], x4[19], x4[28], x5[19], x5[28],
- v_cos_bit);
- btf_32_neon_mode01(cospi[48], cospi[16], x4[20], x4[27], x5[20], x5[27],
- v_cos_bit);
- btf_32_neon_mode01(cospi[48], cospi[16], x4[21], x4[26], x5[21], x5[26],
- v_cos_bit);
-
- x5[32] = vaddq_s32(x3[32], x4[39]);
- x5[39] = vsubq_s32(x3[32], x4[39]);
- x5[33] = vaddq_s32(x3[33], x4[38]);
- x5[38] = vsubq_s32(x3[33], x4[38]);
- x5[34] = vaddq_s32(x3[34], x4[37]);
- x5[37] = vsubq_s32(x3[34], x4[37]);
- x5[35] = vaddq_s32(x3[35], x4[36]);
- x5[36] = vsubq_s32(x3[35], x4[36]);
- x5[40] = vsubq_s32(x3[47], x4[40]);
- x5[47] = vaddq_s32(x3[47], x4[40]);
- x5[41] = vsubq_s32(x3[46], x4[41]);
- x5[46] = vaddq_s32(x3[46], x4[41]);
- x5[42] = vsubq_s32(x3[45], x4[42]);
- x5[45] = vaddq_s32(x3[45], x4[42]);
- x5[43] = vsubq_s32(x3[44], x4[43]);
- x5[44] = vaddq_s32(x3[44], x4[43]);
- x5[48] = vaddq_s32(x3[48], x4[55]);
- x5[55] = vsubq_s32(x3[48], x4[55]);
- x5[49] = vaddq_s32(x3[49], x4[54]);
- x5[54] = vsubq_s32(x3[49], x4[54]);
- x5[50] = vaddq_s32(x3[50], x4[53]);
- x5[53] = vsubq_s32(x3[50], x4[53]);
- x5[51] = vaddq_s32(x3[51], x4[52]);
- x5[52] = vsubq_s32(x3[51], x4[52]);
- x5[56] = vsubq_s32(x3[63], x4[56]);
- x5[63] = vaddq_s32(x3[63], x4[56]);
- x5[57] = vsubq_s32(x3[62], x4[57]);
- x5[62] = vaddq_s32(x3[62], x4[57]);
- x5[58] = vsubq_s32(x3[61], x4[58]);
- x5[61] = vaddq_s32(x3[61], x4[58]);
- x5[59] = vsubq_s32(x3[60], x4[59]);
- x5[60] = vaddq_s32(x3[60], x4[59]);
+ butterfly_dct_pre_s32_x4(x4, x5, 4);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]);
+ butterfly_dct_post_s32_x4(x3 + 8, x4 + 8, x5 + 8, 8);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]);
+ butterfly_dct_post_s32_x4(x3 + 32, x4 + 32, x5 + 32, 16);
+ butterfly_dct_post_s32_x4(x3 + 48, x4 + 48, x5 + 48, 16);
// stage 6
int32x4_t x6[64];
- btf_32_neon(cospi[32], cospi[32], x5[0], x5[1], x6[0], x6[1], v_cos_bit);
- btf_32_type1_neon(cospi[48], cospi[16], x5[2], x5[3], x6[2], x6[3],
- v_cos_bit);
- x6[4] = vaddq_s32(x4[4], x5[5]);
- x6[5] = vsubq_s32(x4[4], x5[5]);
- x6[6] = vsubq_s32(x4[7], x5[6]);
- x6[7] = vaddq_s32(x4[7], x5[6]);
- btf_32_neon_mode0(cospi[16], cospi[48], x5[9], x5[14], x6[9], x6[14],
- v_cos_bit);
- btf_32_neon_mode01(cospi[48], cospi[16], x5[10], x5[13], x6[10], x6[13],
- v_cos_bit);
-
- x6[16] = vaddq_s32(x4[16], x5[19]);
- x6[19] = vsubq_s32(x4[16], x5[19]);
- x6[17] = vaddq_s32(x4[17], x5[18]);
- x6[18] = vsubq_s32(x4[17], x5[18]);
- x6[20] = vsubq_s32(x4[23], x5[20]);
- x6[23] = vaddq_s32(x4[23], x5[20]);
- x6[21] = vsubq_s32(x4[22], x5[21]);
- x6[22] = vaddq_s32(x4[22], x5[21]);
- x6[24] = vaddq_s32(x4[24], x5[27]);
- x6[27] = vsubq_s32(x4[24], x5[27]);
- x6[25] = vaddq_s32(x4[25], x5[26]);
- x6[26] = vsubq_s32(x4[25], x5[26]);
- x6[28] = vsubq_s32(x4[31], x5[28]);
- x6[31] = vaddq_s32(x4[31], x5[28]);
- x6[29] = vsubq_s32(x4[30], x5[29]);
- x6[30] = vaddq_s32(x4[30], x5[29]);
-
- btf_32_neon_mode0(cospi[8], cospi[56], x5[34], x5[61], x6[34], x6[61],
- v_cos_bit);
- btf_32_neon_mode0(cospi[8], cospi[56], x5[35], x5[60], x6[35], x6[60],
- v_cos_bit);
- btf_32_neon_mode01(cospi[56], cospi[8], x5[36], x5[59], x6[36], x6[59],
- v_cos_bit);
- btf_32_neon_mode01(cospi[56], cospi[8], x5[37], x5[58], x6[37], x6[58],
- v_cos_bit);
- btf_32_neon_mode0(cospi[40], cospi[24], x5[42], x5[53], x6[42], x6[53],
- v_cos_bit);
- btf_32_neon_mode0(cospi[40], cospi[24], x5[43], x5[52], x6[43], x6[52],
- v_cos_bit);
- btf_32_neon_mode01(cospi[24], cospi[40], x5[44], x5[51], x6[44], x6[51],
- v_cos_bit);
- btf_32_neon_mode01(cospi[24], cospi[40], x5[45], x5[50], x6[45], x6[50],
- v_cos_bit);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x5[0], x5[1], &x6[0], &x6[1]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]);
+ butterfly_dct_post_s32_x4(x4 + 4, x5 + 4, x6 + 4, 4);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]);
+ butterfly_dct_post_s32_x4(x4 + 16, x5 + 16, x6 + 16, 8);
+ butterfly_dct_post_s32_x4(x4 + 24, x5 + 24, x6 + 24, 8);
+ butterfly_s32_s32_x4_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]);
+ butterfly_s32_s32_x4_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]);
+ butterfly_s32_s32_x4_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]);
+ butterfly_s32_s32_x4_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]);
+ butterfly_s32_s32_x4_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]);
+ butterfly_s32_s32_x4_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]);
+ butterfly_s32_s32_x4_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]);
+ butterfly_s32_s32_x4_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]);
// stage 7
int32x4_t x7[64];
-
- btf_32_type1_neon(cospi[56], cospi[8], x6[4], x6[7], x7[4], x7[7], v_cos_bit);
- btf_32_type1_neon(cospi[24], cospi[40], x6[5], x6[6], x7[5], x7[6],
- v_cos_bit);
- x7[8] = vaddq_s32(x5[8], x6[9]);
- x7[9] = vsubq_s32(x5[8], x6[9]);
- x7[10] = vsubq_s32(x5[11], x6[10]);
- x7[11] = vaddq_s32(x5[11], x6[10]);
- x7[12] = vaddq_s32(x5[12], x6[13]);
- x7[13] = vsubq_s32(x5[12], x6[13]);
- x7[14] = vsubq_s32(x5[15], x6[14]);
- x7[15] = vaddq_s32(x5[15], x6[14]);
-
- btf_32_neon_mode0(cospi[8], cospi[56], x6[17], x6[30], x7[17], x7[30],
- v_cos_bit);
- btf_32_neon_mode01(cospi[56], cospi[8], x6[18], x6[29], x7[18], x7[29],
- v_cos_bit);
-
- btf_32_neon_mode0(cospi[40], cospi[24], x6[21], x6[26], x7[21], x7[26],
- v_cos_bit);
- btf_32_neon_mode01(cospi[24], cospi[40], x6[22], x6[25], x7[22], x7[25],
- v_cos_bit);
-
- x7[32] = vaddq_s32(x5[32], x6[35]);
- x7[35] = vsubq_s32(x5[32], x6[35]);
- x7[33] = vaddq_s32(x5[33], x6[34]);
- x7[34] = vsubq_s32(x5[33], x6[34]);
- x7[36] = vsubq_s32(x5[39], x6[36]);
- x7[39] = vaddq_s32(x5[39], x6[36]);
- x7[37] = vsubq_s32(x5[38], x6[37]);
- x7[38] = vaddq_s32(x5[38], x6[37]);
- x7[40] = vaddq_s32(x5[40], x6[43]);
- x7[43] = vsubq_s32(x5[40], x6[43]);
- x7[41] = vaddq_s32(x5[41], x6[42]);
- x7[42] = vsubq_s32(x5[41], x6[42]);
- x7[44] = vsubq_s32(x5[47], x6[44]);
- x7[47] = vaddq_s32(x5[47], x6[44]);
- x7[45] = vsubq_s32(x5[46], x6[45]);
- x7[46] = vaddq_s32(x5[46], x6[45]);
- x7[48] = vaddq_s32(x5[48], x6[51]);
- x7[51] = vsubq_s32(x5[48], x6[51]);
- x7[49] = vaddq_s32(x5[49], x6[50]);
- x7[50] = vsubq_s32(x5[49], x6[50]);
- x7[52] = vsubq_s32(x5[55], x6[52]);
- x7[55] = vaddq_s32(x5[55], x6[52]);
- x7[53] = vsubq_s32(x5[54], x6[53]);
- x7[54] = vaddq_s32(x5[54], x6[53]);
- x7[56] = vaddq_s32(x5[56], x6[59]);
- x7[59] = vsubq_s32(x5[56], x6[59]);
- x7[57] = vaddq_s32(x5[57], x6[58]);
- x7[58] = vsubq_s32(x5[57], x6[58]);
- x7[60] = vsubq_s32(x5[63], x6[60]);
- x7[63] = vaddq_s32(x5[63], x6[60]);
- x7[61] = vsubq_s32(x5[62], x6[61]);
- x7[62] = vaddq_s32(x5[62], x6[61]);
+ butterfly_s32_s32_x4_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]);
+ butterfly_s32_s32_x4_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]);
+ butterfly_dct_post_s32_x4(x5 + 8, x6 + 8, x7 + 8, 4);
+ butterfly_dct_post_s32_x4(x5 + 12, x6 + 12, x7 + 12, 4);
+ butterfly_s32_s32_x4_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]);
+ butterfly_s32_s32_x4_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]);
+ butterfly_s32_s32_x4_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]);
+ butterfly_s32_s32_x4_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]);
+ butterfly_dct_post_s32_x4(x5 + 32, x6 + 32, x7 + 32, 8);
+ butterfly_dct_post_s32_x4(x5 + 40, x6 + 40, x7 + 40, 8);
+ butterfly_dct_post_s32_x4(x5 + 48, x6 + 48, x7 + 48, 8);
+ butterfly_dct_post_s32_x4(x5 + 56, x6 + 56, x7 + 56, 8);
// stage 8
int32x4_t x8[64];
-
- btf_32_type1_neon(cospi[60], cospi[4], x7[8], x7[15], x8[8], x8[15],
- v_cos_bit);
- btf_32_type1_neon(cospi[28], cospi[36], x7[9], x7[14], x8[9], x8[14],
- v_cos_bit);
- btf_32_type1_neon(cospi[44], cospi[20], x7[10], x7[13], x8[10], x8[13],
- v_cos_bit);
- btf_32_type1_neon(cospi[12], cospi[52], x7[11], x7[12], x8[11], x8[12],
- v_cos_bit);
- x8[16] = vaddq_s32(x6[16], x7[17]);
- x8[17] = vsubq_s32(x6[16], x7[17]);
- x8[18] = vsubq_s32(x6[19], x7[18]);
- x8[19] = vaddq_s32(x6[19], x7[18]);
- x8[20] = vaddq_s32(x6[20], x7[21]);
- x8[21] = vsubq_s32(x6[20], x7[21]);
- x8[22] = vsubq_s32(x6[23], x7[22]);
- x8[23] = vaddq_s32(x6[23], x7[22]);
- x8[24] = vaddq_s32(x6[24], x7[25]);
- x8[25] = vsubq_s32(x6[24], x7[25]);
- x8[26] = vsubq_s32(x6[27], x7[26]);
- x8[27] = vaddq_s32(x6[27], x7[26]);
- x8[28] = vaddq_s32(x6[28], x7[29]);
- x8[29] = vsubq_s32(x6[28], x7[29]);
- x8[30] = vsubq_s32(x6[31], x7[30]);
- x8[31] = vaddq_s32(x6[31], x7[30]);
-
- btf_32_neon_mode0(cospi[4], cospi[60], x7[33], x7[62], x8[33], x8[62],
- v_cos_bit);
- btf_32_neon_mode01(cospi[60], cospi[4], x7[34], x7[61], x8[34], x8[61],
- v_cos_bit);
- btf_32_neon_mode0(cospi[36], cospi[28], x7[37], x7[58], x8[37], x8[58],
- v_cos_bit);
- btf_32_neon_mode01(cospi[28], cospi[36], x7[38], x7[57], x8[38], x8[57],
- v_cos_bit);
- btf_32_neon_mode0(cospi[20], cospi[44], x7[41], x7[54], x8[41], x8[54],
- v_cos_bit);
- btf_32_neon_mode01(cospi[44], cospi[20], x7[42], x7[53], x8[42], x8[53],
- v_cos_bit);
- btf_32_neon_mode0(cospi[52], cospi[12], x7[45], x7[50], x8[45], x8[50],
- v_cos_bit);
- btf_32_neon_mode01(cospi[12], cospi[52], x7[46], x7[49], x8[46], x8[49],
- v_cos_bit);
+ butterfly_s32_s32_x4_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]);
+ butterfly_s32_s32_x4_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]);
+ butterfly_s32_s32_x4_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]);
+ butterfly_s32_s32_x4_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]);
+ butterfly_dct_post_s32_x4(x6 + 16, x7 + 16, x8 + 16, 4);
+ butterfly_dct_post_s32_x4(x6 + 20, x7 + 20, x8 + 20, 4);
+ butterfly_dct_post_s32_x4(x6 + 24, x7 + 24, x8 + 24, 4);
+ butterfly_dct_post_s32_x4(x6 + 28, x7 + 28, x8 + 28, 4);
+ butterfly_s32_s32_x4_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]);
+ butterfly_s32_s32_x4_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]);
+ butterfly_s32_s32_x4_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]);
+ butterfly_s32_s32_x4_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]);
+ butterfly_s32_s32_x4_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]);
+ butterfly_s32_s32_x4_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]);
+ butterfly_s32_s32_x4_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]);
+ butterfly_s32_s32_x4_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]);
// stage 9
int32x4_t x9[64];
-
- btf_32_type1_neon(cospi[62], cospi[2], x8[16], x8[31], x9[16], x9[31],
- v_cos_bit);
- btf_32_type1_neon(cospi[30], cospi[34], x8[17], x8[30], x9[17], x9[30],
- v_cos_bit);
- btf_32_type1_neon(cospi[46], cospi[18], x8[18], x8[29], x9[18], x9[29],
- v_cos_bit);
- btf_32_type1_neon(cospi[14], cospi[50], x8[19], x8[28], x9[19], x9[28],
- v_cos_bit);
- btf_32_type1_neon(cospi[54], cospi[10], x8[20], x8[27], x9[20], x9[27],
- v_cos_bit);
- btf_32_type1_neon(cospi[22], cospi[42], x8[21], x8[26], x9[21], x9[26],
- v_cos_bit);
- btf_32_type1_neon(cospi[38], cospi[26], x8[22], x8[25], x9[22], x9[25],
- v_cos_bit);
- btf_32_type1_neon(cospi[6], cospi[58], x8[23], x8[24], x9[23], x9[24],
- v_cos_bit);
- x9[32] = vaddq_s32(x7[32], x8[33]);
- x9[33] = vsubq_s32(x7[32], x8[33]);
- x9[34] = vsubq_s32(x7[35], x8[34]);
- x9[35] = vaddq_s32(x7[35], x8[34]);
- x9[36] = vaddq_s32(x7[36], x8[37]);
- x9[37] = vsubq_s32(x7[36], x8[37]);
- x9[38] = vsubq_s32(x7[39], x8[38]);
- x9[39] = vaddq_s32(x7[39], x8[38]);
- x9[40] = vaddq_s32(x7[40], x8[41]);
- x9[41] = vsubq_s32(x7[40], x8[41]);
- x9[42] = vsubq_s32(x7[43], x8[42]);
- x9[43] = vaddq_s32(x7[43], x8[42]);
- x9[44] = vaddq_s32(x7[44], x8[45]);
- x9[45] = vsubq_s32(x7[44], x8[45]);
- x9[46] = vsubq_s32(x7[47], x8[46]);
- x9[47] = vaddq_s32(x7[47], x8[46]);
- x9[48] = vaddq_s32(x7[48], x8[49]);
- x9[49] = vsubq_s32(x7[48], x8[49]);
- x9[50] = vsubq_s32(x7[51], x8[50]);
- x9[51] = vaddq_s32(x7[51], x8[50]);
- x9[52] = vaddq_s32(x7[52], x8[53]);
- x9[53] = vsubq_s32(x7[52], x8[53]);
- x9[54] = vsubq_s32(x7[55], x8[54]);
- x9[55] = vaddq_s32(x7[55], x8[54]);
- x9[56] = vaddq_s32(x7[56], x8[57]);
- x9[57] = vsubq_s32(x7[56], x8[57]);
- x9[58] = vsubq_s32(x7[59], x8[58]);
- x9[59] = vaddq_s32(x7[59], x8[58]);
- x9[60] = vaddq_s32(x7[60], x8[61]);
- x9[61] = vsubq_s32(x7[60], x8[61]);
- x9[62] = vsubq_s32(x7[63], x8[62]);
- x9[63] = vaddq_s32(x7[63], x8[62]);
+ butterfly_s32_s32_x4_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]);
+ butterfly_s32_s32_x4_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]);
+ butterfly_s32_s32_x4_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]);
+ butterfly_s32_s32_x4_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]);
+ butterfly_s32_s32_x4_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]);
+ butterfly_s32_s32_x4_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]);
+ butterfly_s32_s32_x4_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]);
+ butterfly_s32_s32_x4_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]);
+ butterfly_dct_post_s32_x4(x7 + 32, x8 + 32, x9 + 32, 4);
+ butterfly_dct_post_s32_x4(x7 + 36, x8 + 36, x9 + 36, 4);
+ butterfly_dct_post_s32_x4(x7 + 40, x8 + 40, x9 + 40, 4);
+ butterfly_dct_post_s32_x4(x7 + 44, x8 + 44, x9 + 44, 4);
+ butterfly_dct_post_s32_x4(x7 + 48, x8 + 48, x9 + 48, 4);
+ butterfly_dct_post_s32_x4(x7 + 52, x8 + 52, x9 + 52, 4);
+ butterfly_dct_post_s32_x4(x7 + 56, x8 + 56, x9 + 56, 4);
+ butterfly_dct_post_s32_x4(x7 + 60, x8 + 60, x9 + 60, 4);
// stage 10
int32x4_t x10[64];
-
- btf_32_type1_neon(cospi[63], cospi[1], x9[32], x9[63], x10[32], x10[63],
- v_cos_bit);
- btf_32_type1_neon(cospi[31], cospi[33], x9[33], x9[62], x10[33], x10[62],
- v_cos_bit);
- btf_32_type1_neon(cospi[47], cospi[17], x9[34], x9[61], x10[34], x10[61],
- v_cos_bit);
- btf_32_type1_neon(cospi[15], cospi[49], x9[35], x9[60], x10[35], x10[60],
- v_cos_bit);
- btf_32_type1_neon(cospi[55], cospi[9], x9[36], x9[59], x10[36], x10[59],
- v_cos_bit);
- btf_32_type1_neon(cospi[23], cospi[41], x9[37], x9[58], x10[37], x10[58],
- v_cos_bit);
- btf_32_type1_neon(cospi[39], cospi[25], x9[38], x9[57], x10[38], x10[57],
- v_cos_bit);
- btf_32_type1_neon(cospi[7], cospi[57], x9[39], x9[56], x10[39], x10[56],
- v_cos_bit);
- btf_32_type1_neon(cospi[59], cospi[5], x9[40], x9[55], x10[40], x10[55],
- v_cos_bit);
- btf_32_type1_neon(cospi[27], cospi[37], x9[41], x9[54], x10[41], x10[54],
- v_cos_bit);
- btf_32_type1_neon(cospi[43], cospi[21], x9[42], x9[53], x10[42], x10[53],
- v_cos_bit);
- btf_32_type1_neon(cospi[11], cospi[53], x9[43], x9[52], x10[43], x10[52],
- v_cos_bit);
- btf_32_type1_neon(cospi[51], cospi[13], x9[44], x9[51], x10[44], x10[51],
- v_cos_bit);
- btf_32_type1_neon(cospi[19], cospi[45], x9[45], x9[50], x10[45], x10[50],
- v_cos_bit);
- btf_32_type1_neon(cospi[35], cospi[29], x9[46], x9[49], x10[46], x10[49],
- v_cos_bit);
- btf_32_type1_neon(cospi[3], cospi[61], x9[47], x9[48], x10[47], x10[48],
- v_cos_bit);
-
- startidx = 0 * outstride;
- endidx = 63 * outstride;
- // stage 11
- output[startidx] = x6[0];
- output[endidx] = x10[63];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[32];
- output[endidx] = x9[31];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x9[16];
- output[endidx] = x10[47];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[48];
- output[endidx] = x8[15];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x8[8];
- output[endidx] = x10[55];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[40];
- output[endidx] = x9[23];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x9[24];
- output[endidx] = x10[39];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[56];
- output[endidx] = x7[7];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x7[4];
- output[endidx] = x10[59];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[36];
- output[endidx] = x9[27];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x9[20];
- output[endidx] = x10[43];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[52];
- output[endidx] = x8[11];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x8[12];
- output[endidx] = x10[51];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[44];
- output[endidx] = x9[19];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x9[28];
- output[endidx] = x10[35];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[60];
- output[endidx] = x6[3];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x6[2];
- output[endidx] = x10[61];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[34];
- output[endidx] = x9[29];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x9[18];
- output[endidx] = x10[45];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[50];
- output[endidx] = x8[13];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x8[10];
- output[endidx] = x10[53];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[42];
- output[endidx] = x9[21];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x9[26];
- output[endidx] = x10[37];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[58];
- output[endidx] = x7[5];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x7[6];
- output[endidx] = x10[57];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[38];
- output[endidx] = x9[25];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x9[22];
- output[endidx] = x10[41];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[54];
- output[endidx] = x8[9];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x8[14];
- output[endidx] = x10[49];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[46];
- output[endidx] = x9[17];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x9[30];
- output[endidx] = x10[33];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[62];
- output[endidx] = x6[1];
+ butterfly_s32_s32_x4_0112_neon(cospi1, x9[63], x9[32], &x10[32], &x10[63]);
+ butterfly_s32_s32_x4_1003_neon(cospi31, x9[62], x9[33], &x10[33], &x10[62]);
+ butterfly_s32_s32_x4_0112_neon(cospi17, x9[61], x9[34], &x10[34], &x10[61]);
+ butterfly_s32_s32_x4_1003_neon(cospi15, x9[60], x9[35], &x10[35], &x10[60]);
+ butterfly_s32_s32_x4_0112_neon(cospi9, x9[59], x9[36], &x10[36], &x10[59]);
+ butterfly_s32_s32_x4_1003_neon(cospi23, x9[58], x9[37], &x10[37], &x10[58]);
+ butterfly_s32_s32_x4_0112_neon(cospi25, x9[57], x9[38], &x10[38], &x10[57]);
+ butterfly_s32_s32_x4_1003_neon(cospi7, x9[56], x9[39], &x10[39], &x10[56]);
+ butterfly_s32_s32_x4_0112_neon(cospi5, x9[55], x9[40], &x10[40], &x10[55]);
+ butterfly_s32_s32_x4_1003_neon(cospi27, x9[54], x9[41], &x10[41], &x10[54]);
+ butterfly_s32_s32_x4_0112_neon(cospi21, x9[53], x9[42], &x10[42], &x10[53]);
+ butterfly_s32_s32_x4_1003_neon(cospi11, x9[52], x9[43], &x10[43], &x10[52]);
+ butterfly_s32_s32_x4_0112_neon(cospi13, x9[51], x9[44], &x10[44], &x10[51]);
+ butterfly_s32_s32_x4_1003_neon(cospi19, x9[50], x9[45], &x10[45], &x10[50]);
+ butterfly_s32_s32_x4_0112_neon(cospi29, x9[49], x9[46], &x10[46], &x10[49]);
+ butterfly_s32_s32_x4_1003_neon(cospi3, x9[48], x9[47], &x10[47], &x10[48]);
+
+  // stage 11: only store into the low 32 output indices.
+ output[0] = x6[0];
+ output[1] = x10[32];
+ output[2] = x9[16];
+ output[3] = x10[48];
+ output[4] = x8[8];
+ output[5] = x10[40];
+ output[6] = x9[24];
+ output[7] = x10[56];
+ output[8] = x7[4];
+ output[9] = x10[36];
+ output[10] = x9[20];
+ output[11] = x10[52];
+ output[12] = x8[12];
+ output[13] = x10[44];
+ output[14] = x9[28];
+ output[15] = x10[60];
+ output[16] = x6[2];
+ output[17] = x10[34];
+ output[18] = x9[18];
+ output[19] = x10[50];
+ output[20] = x8[10];
+ output[21] = x10[42];
+ output[22] = x9[26];
+ output[23] = x10[58];
+ output[24] = x7[6];
+ output[25] = x10[38];
+ output[26] = x9[22];
+ output[27] = x10[54];
+ output[28] = x8[14];
+ output[29] = x10[46];
+ output[30] = x9[30];
+ output[31] = x10[62];
}
-static void av1_lowbd_fwd_txfm2d_64x64_neon(const int16_t *input,
- int32_t *output, int stride,
- TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
(void)bd;
(void)tx_type;
assert(tx_type == DCT_DCT);
- const TX_SIZE tx_size = TX_64X64;
int16x8_t buf0[64], buf1[512];
- const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
- const int txw_idx = get_txw_idx(tx_size);
- const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const int width = tx_size_wide[tx_size];
- const int height = tx_size_high[tx_size];
- const transform_1d_lbd_neon col_txfm = av1_fdct8x64_neon;
- const int width_div8 = (width >> 3);
- const int height_div8 = (height >> 3);
-
- for (int i = 0; i < width_div8; i++) {
- load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
- round_shift_16bit(buf0, height, shift[0]);
- col_txfm(buf0, buf0, cos_bit_col, NULL);
- round_shift_16bit(buf0, height, shift[1]);
- for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
- transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
+
+ for (int i = 0; i < 8; i++) {
+ load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
+ col_txfm(buf0, buf0, 13);
+ shift_right_2_round_s16_x8(buf0, buf0, 64);
+ for (int j = 0; j < 4; ++j) {
+ transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
}
}
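+  // Only the top-left 32x32 quadrant of the 64x64 output is computed and
+  // stored; the remaining coefficients are defined to be zero for 64-point
+  // transforms.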
- for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ for (int i = 0; i < 4; i++) {
int32x4_t bufA[64];
int32x4_t bufB[64];
- int16x8_t *buf = buf1 + width * i;
- for (int j = 0; j < width; ++j) {
+ int16x8_t *buf = buf1 + 64 * i;
+ for (int j = 0; j < 64; ++j) {
bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
}
- av1_fdct64_new_neon(bufA, bufA, cos_bit_row, 1, 1, NULL);
- av1_fdct64_new_neon(bufB, bufB, cos_bit_row, 1, 1, NULL);
- av1_round_shift_array_32_neon(bufA, bufA, 32);
- av1_round_shift_array_32_neon(bufB, bufB, 32);
-
- store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32);
+ fdct64_new_neon(bufA, bufA, 10);
+ fdct64_new_neon(bufB, bufB, 10);
+ shift_right_2_round_s32_x4(bufA, bufA, 32);
+ shift_right_2_round_s32_x4(bufB, bufB, 32);
+ store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
}
}
-static void av1_lowbd_fwd_txfm2d_64x32_neon(const int16_t *input,
- int32_t *output, int stride,
- TX_TYPE tx_type, int bd) {
+
+static void lowbd_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
(void)bd;
- const TX_SIZE tx_size = TX_64X32;
int16x8_t buf0[64], buf1[256];
- const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
- const int txw_idx = get_txw_idx(tx_size);
- const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const int width = tx_size_wide[tx_size];
- const int height = tx_size_high[tx_size];
- const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
- const int width_div8 = (width >> 3);
- const int height_div8 = (height >> 3);
-
- for (int i = 0; i < width_div8; i++) {
- load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
- round_shift_16bit(buf0, height, shift[0]);
- col_txfm(buf0, buf0, cos_bit_col, NULL);
- round_shift_16bit(buf0, height, shift[1]);
- for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
- transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+
+ for (int i = 0; i < 8; i++) {
+ col_txfm(input + 8 * i, buf0, stride, 12);
+ shift_right_4_round_s16_x8(buf0, buf0, 32);
+ for (int j = 0; j < 4; ++j) {
+ transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
}
}
assert(tx_type == DCT_DCT);
- for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ for (int i = 0; i < 4; i++) {
int32x4_t bufA[64];
int32x4_t bufB[64];
- int16x8_t *buf = buf1 + width * i;
- for (int j = 0; j < width; ++j) {
+ int16x8_t *buf = buf1 + 64 * i;
+ for (int j = 0; j < 64; ++j) {
bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
}
- av1_fdct64_new_neon(bufA, bufA, cos_bit_row, 1, 1, NULL);
- av1_fdct64_new_neon(bufB, bufB, cos_bit_row, 1, 1, NULL);
- av1_round_shift_rect_array_32_neon(bufA, bufA, 32);
- av1_round_shift_rect_array_32_neon(bufB, bufB, 32);
-
- store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32);
+ fdct64_new_neon(bufA, bufA, 11);
+ fdct64_new_neon(bufB, bufB, 11);
+ shift_right_2_round_s32_x4(bufA, bufA, 32);
+ shift_right_2_round_s32_x4(bufB, bufB, 32);
+ round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32);
+ round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32);
+ store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
}
}
-static void av1_lowbd_fwd_txfm2d_32x64_neon(const int16_t *input,
- int32_t *output, int stride,
- TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
(void)bd;
(void)tx_type;
assert(tx_type == DCT_DCT);
- const TX_SIZE tx_size = TX_32X64;
int16x8_t buf0[64], buf1[256];
- const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
- const int txw_idx = get_txw_idx(tx_size);
- const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const int width = tx_size_wide[tx_size];
- const int height = tx_size_high[tx_size];
- const transform_1d_lbd_neon col_txfm = av1_fdct8x64_neon;
- const int width_div8 = (width >> 3);
- const int height_div8 = (height >> 3);
-
- for (int i = 0; i < width_div8; i++) {
- load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
- round_shift_16bit(buf0, height, shift[0]);
- col_txfm(buf0, buf0, cos_bit_col, NULL);
- round_shift_16bit(buf0, height, shift[1]);
- for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
- transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
+
+ for (int i = 0; i < 4; i++) {
+ load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
+ col_txfm(buf0, buf0, 13);
+ shift_right_2_round_s16_x8(buf0, buf0, 64);
+ for (int j = 0; j < 4; ++j) {
+ transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 32 + 8 * i);
}
}
- for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ for (int i = 0; i < 4; i++) {
int32x4_t bufA[32];
int32x4_t bufB[32];
- int16x8_t *buf = buf1 + width * i;
- for (int j = 0; j < width; ++j) {
+ int16x8_t *buf = buf1 + 32 * i;
+ for (int j = 0; j < 32; ++j) {
bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
}
- av1_fdct32_new_neon(bufA, bufA, cos_bit_row, 1, NULL);
- av1_fdct32_new_neon(bufB, bufB, cos_bit_row, 1, NULL);
- av1_round_shift_rect_array_32_neon(bufA, bufA, 32);
- av1_round_shift_rect_array_32_neon(bufB, bufB, 32);
-
- store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32);
+ fdct32_new_neon(bufA, bufA, 11);
+ fdct32_new_neon(bufB, bufB, 11);
+ shift_right_2_round_s32_x4(bufA, bufA, 32);
+ shift_right_2_round_s32_x4(bufB, bufB, 32);
+ round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32);
+ round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32);
+ store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
}
}
static FwdTxfm2dFunc lowbd_fwd_txfm_func_ls[TX_SIZES_ALL] = {
- av1_lowbd_fwd_txfm2d_4x4_neon, // 4x4 transform
- av1_lowbd_fwd_txfm2d_8x8_neon, // 8x8 transform
- av1_lowbd_fwd_txfm2d_16x16_neon, // 16x16 transform
- av1_lowbd_fwd_txfm2d_32x32_neon, // 32x32 transform
- av1_lowbd_fwd_txfm2d_64x64_neon, // 64x64 transform
- av1_lowbd_fwd_txfm2d_4x8_neon, // 4x8 transform
- av1_lowbd_fwd_txfm2d_8x4_neon, // 8x4 transform
- av1_lowbd_fwd_txfm2d_8x16_neon, // 8x16 transform
- av1_lowbd_fwd_txfm2d_16x8_neon, // 16x8 transform
- av1_lowbd_fwd_txfm2d_16x32_neon, // 16x32 transform
- av1_lowbd_fwd_txfm2d_32x16_neon, // 32x16 transform
- av1_lowbd_fwd_txfm2d_32x64_neon, // 32x64 transform
- av1_lowbd_fwd_txfm2d_64x32_neon, // 64x32 transform
- av1_lowbd_fwd_txfm2d_4x16_neon, // 4x16 transform
- av1_lowbd_fwd_txfm2d_16x4_neon, // 16x4 transform
- av1_lowbd_fwd_txfm2d_8x32_neon, // 8x32 transform
- av1_lowbd_fwd_txfm2d_32x8_neon, // 32x8 transform
- av1_lowbd_fwd_txfm2d_16x64_neon, // 16x64 transform
- av1_lowbd_fwd_txfm2d_64x16_neon, // 64x16 transform
+ lowbd_fwd_txfm2d_4x4_neon, // 4x4 transform
+ lowbd_fwd_txfm2d_8x8_neon, // 8x8 transform
+ lowbd_fwd_txfm2d_16x16_neon, // 16x16 transform
+ lowbd_fwd_txfm2d_32x32_neon, // 32x32 transform
+ lowbd_fwd_txfm2d_64x64_neon, // 64x64 transform
+ lowbd_fwd_txfm2d_4x8_neon, // 4x8 transform
+ lowbd_fwd_txfm2d_8x4_neon, // 8x4 transform
+ lowbd_fwd_txfm2d_8x16_neon, // 8x16 transform
+ lowbd_fwd_txfm2d_16x8_neon, // 16x8 transform
+ lowbd_fwd_txfm2d_16x32_neon, // 16x32 transform
+ lowbd_fwd_txfm2d_32x16_neon, // 32x16 transform
+ lowbd_fwd_txfm2d_32x64_neon, // 32x64 transform
+ lowbd_fwd_txfm2d_64x32_neon, // 64x32 transform
+ lowbd_fwd_txfm2d_4x16_neon, // 4x16 transform
+ lowbd_fwd_txfm2d_16x4_neon, // 16x4 transform
+ lowbd_fwd_txfm2d_8x32_neon, // 8x32 transform
+ lowbd_fwd_txfm2d_32x8_neon, // 32x8 transform
+ lowbd_fwd_txfm2d_16x64_neon, // 16x64 transform
+ lowbd_fwd_txfm2d_64x16_neon, // 64x16 transform
};
void av1_lowbd_fwd_txfm_neon(const int16_t *src_diff, tran_low_t *coeff,
diff --git a/av1/encoder/arm/neon/encodetxb_neon.c b/av1/encoder/arm/neon/encodetxb_neon.c
index ee936088f..582863a27 100644
--- a/av1/encoder/arm/neon/encodetxb_neon.c
+++ b/av1/encoder/arm/neon/encodetxb_neon.c
@@ -57,10 +57,7 @@ void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width,
} while (i < width);
} else if (height == 8) {
do {
- const int32x4_t coeffA = vld1q_s32(cf);
- const int32x4_t coeffB = vld1q_s32(cf + 4);
- const int16x8_t coeffAB =
- vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
+ const int16x8_t coeffAB = load_tran_low_to_s16q(cf);
const int16x8_t absAB = vqabsq_s16(coeffAB);
const uint8x16_t absAB8 = vreinterpretq_u8_s8(vcombine_s8(
vqmovn_s16(absAB), vreinterpret_s8_s32(vget_low_s32(zeros))));
@@ -73,14 +70,8 @@ void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width,
do {
int j = 0;
do {
- const int32x4_t coeffA = vld1q_s32(cf);
- const int32x4_t coeffB = vld1q_s32(cf + 4);
- const int32x4_t coeffC = vld1q_s32(cf + 8);
- const int32x4_t coeffD = vld1q_s32(cf + 12);
- const int16x8_t coeffAB =
- vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
- const int16x8_t coeffCD =
- vcombine_s16(vqmovn_s32(coeffC), vqmovn_s32(coeffD));
+ const int16x8_t coeffAB = load_tran_low_to_s16q(cf);
+ const int16x8_t coeffCD = load_tran_low_to_s16q(cf + 8);
const int16x8_t absAB = vqabsq_s16(coeffAB);
const int16x8_t absCD = vqabsq_s16(coeffCD);
const uint8x16_t absABCD = vreinterpretq_u8_s8(
@@ -282,7 +273,7 @@ static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
const uint8x16_t pos_to_offset_large = vdupq_n_u8(21);
uint8x16_t pos_to_offset =
- vld1q_u8((width == 4) ? c_4_po_2d[0] : c_4_po_2d[1]);
+ (width == 4) ? vld1q_u8(c_4_po_2d[0]) : vld1q_u8(c_4_po_2d[1]);
uint8x16_t count;
uint8x16_t level[5];
diff --git a/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c b/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c
index 15d375a25..aa64a3890 100644
--- a/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c
+++ b/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c
@@ -12,2112 +12,1586 @@
#include <arm_neon.h>
#include <assert.h>
-#include "av1/common/av1_txfm.h"
-#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "aom_dsp/arm/transpose_neon.h"
#include "aom_dsp/txfm_common.h"
#include "aom_ports/mem.h"
-#include "config/av1_rtcd.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "config/aom_config.h"
-
-static INLINE void store_output_w4(int32_t *const out,
- const int32x4_t *const in, const int stride,
- const int out_size) {
- for (int i = 0; i < out_size; ++i) {
- vst1q_s32(out + i * stride, in[i]);
+#include "config/av1_rtcd.h"
+#include "shift_neon.h"
+#include "txfm_neon.h"
+
+static AOM_FORCE_INLINE void transpose_arrays_s32_64x64(const int32x4_t *in,
+ int32x4_t *out) {
+ // This is not quite the same as the other transposes defined in
+ // transpose_neon.h: We only write the low 64x32 sub-matrix since the rest is
+ // unused by the following row transform.
+ for (int j = 0; j < 8; ++j) {
+ for (int i = 0; i < 16; ++i) {
+ transpose_arrays_s32_4x4(in + 64 * i + 4 * j, out + 64 * j + 4 * i);
+ }
}
}
-static INLINE int32x4_t half_btf_neon(const int32_t *w0, const int32x4_t *n0,
- const int32_t *w1, const int32x4_t *n1,
- const int32x4_t v_bit) {
- int32x4_t x;
- x = vmulq_n_s32(*n0, *w0);
- x = vmlaq_n_s32(x, *n1, *w1);
- x = vrshlq_s32(x, v_bit);
- return x;
-}
-
-static INLINE int32x4_t half_btf_neon_m(const int32_t *w0, const int32x4_t *n0,
- const int32_t *w1, const int32x4_t *n1,
- const int32x4_t v_bit) {
- int32x4_t x;
- x = vmulq_n_s32(*n0, *w0);
- x = vmlsq_n_s32(x, *n1, *w1);
- x = vrshlq_s32(x, v_bit);
- return x;
-}
-
-#if AOM_ARCH_AARCH64
-#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
- do { \
- int32x4x2_t swap_low = vtrnq_s32(x0, x1); \
- int32x4x2_t swap_high = vtrnq_s32(x2, x3); \
- y0 = vreinterpretq_s32_s64( \
- vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[0]), \
- vreinterpretq_s64_s32(swap_high.val[0]))); \
- y1 = vreinterpretq_s32_s64( \
- vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[1]), \
- vreinterpretq_s64_s32(swap_high.val[1]))); \
- y2 = vreinterpretq_s32_s64( \
- vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[0]), \
- vreinterpretq_s64_s32(swap_high.val[0]))); \
- y3 = vreinterpretq_s32_s64( \
- vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[1]), \
- vreinterpretq_s64_s32(swap_high.val[1]))); \
- } while (0)
-#else
-#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
- do { \
- int32x4x2_t swap_low = vtrnq_s32(x0, x1); \
- int32x4x2_t swap_high = vtrnq_s32(x2, x3); \
- y0 = vextq_s32(vextq_s32(swap_low.val[0], swap_low.val[0], 2), \
- swap_high.val[0], 2); \
- y1 = vextq_s32(vextq_s32(swap_low.val[1], swap_low.val[1], 2), \
- swap_high.val[1], 2); \
- y2 = vextq_s32(swap_low.val[0], \
- vextq_s32(swap_high.val[0], swap_high.val[0], 2), 2); \
- y3 = vextq_s32(swap_low.val[1], \
- vextq_s32(swap_high.val[1], swap_high.val[1], 2), 2); \
- } while (0)
-#endif // AOM_ARCH_AARCH64
-
-static INLINE void transpose_4x4(const int32x4_t *in, int32x4_t *out) {
- TRANSPOSE_4X4(in[0], in[1], in[2], in[3], out[0], out[1], out[2], out[3]);
-}
-
-static INLINE void transpose_8x8(const int32x4_t *in, int32x4_t *out) {
- TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]);
- TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]);
- TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]);
- TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13],
- out[15]);
-}
-
-static INLINE void transpose_16x16(const int32x4_t *in, int32x4_t *out) {
- // Upper left 8x8
- TRANSPOSE_4X4(in[0], in[4], in[8], in[12], out[0], out[4], out[8], out[12]);
- TRANSPOSE_4X4(in[1], in[5], in[9], in[13], out[16], out[20], out[24],
- out[28]);
- TRANSPOSE_4X4(in[16], in[20], in[24], in[28], out[1], out[5], out[9],
- out[13]);
- TRANSPOSE_4X4(in[17], in[21], in[25], in[29], out[17], out[21], out[25],
- out[29]);
-
- // Upper right 8x8
- TRANSPOSE_4X4(in[2], in[6], in[10], in[14], out[32], out[36], out[40],
- out[44]);
- TRANSPOSE_4X4(in[3], in[7], in[11], in[15], out[48], out[52], out[56],
- out[60]);
- TRANSPOSE_4X4(in[18], in[22], in[26], in[30], out[33], out[37], out[41],
- out[45]);
- TRANSPOSE_4X4(in[19], in[23], in[27], in[31], out[49], out[53], out[57],
- out[61]);
-
- // Lower left 8x8
- TRANSPOSE_4X4(in[32], in[36], in[40], in[44], out[2], out[6], out[10],
- out[14]);
- TRANSPOSE_4X4(in[33], in[37], in[41], in[45], out[18], out[22], out[26],
- out[30]);
- TRANSPOSE_4X4(in[48], in[52], in[56], in[60], out[3], out[7], out[11],
- out[15]);
- TRANSPOSE_4X4(in[49], in[53], in[57], in[61], out[19], out[23], out[27],
- out[31]);
- // Lower right 8x8
- TRANSPOSE_4X4(in[34], in[38], in[42], in[46], out[34], out[38], out[42],
- out[46]);
- TRANSPOSE_4X4(in[35], in[39], in[43], in[47], out[50], out[54], out[58],
- out[62]);
- TRANSPOSE_4X4(in[50], in[54], in[58], in[62], out[35], out[39], out[43],
- out[47]);
- TRANSPOSE_4X4(in[51], in[55], in[59], in[63], out[51], out[55], out[59],
- out[63]);
+// A note on butterfly helper naming:
+//
+// butterfly_[weight_indices]_neon
+// e.g. butterfly_0321_neon
+// ^ Weights are applied as indices 0, 3, 2, 1
+// (see more detail below)
+//
+// Each weight index is treated as an index into a 4-tuple consisting of the
+// weight itself plus related and negated constants: w=(w0, 1-w0, -w0, w0-1).
+// This is then represented in the helper naming by referring to the lane index
+// in the loaded tuple that each multiply is performed with:
+//
+// in0 in1
+// /------------
+// out0 | w[0] w[1] ==> out0 = in0 * w[0] + in1 * w[1]
+// out1 | w[2] w[3] ==> out1 = in0 * w[2] + in1 * w[3]
+//
+// So for indices 0321 from the earlier example, we end up with:
+//
+// in0 in1
+// /------------------
+// out0 | (lane 0) (lane 3) ==> out0 = in0 * w0 + in1 * (w0-1)
+// out1 | (lane 2) (lane 1) ==> out1 = in0 * -w0 + in1 * (1-w0)
+
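+// In butterfly_half_neon below, wvecs.val[0] holds the loaded weight pair and
+// wvecs.val[1] its negation, so lane index n (0..3) selects lane n % 2 of
+// wvecs.val[n / 2] -- exactly the 4-tuple ordering described above. For
+// example, butterfly_0112_neon computes out0 = in0 * w[0] + in1 * w[1] and
+// out1 = in0 * w[1] + in1 * w[2].
+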
+#define butterfly_half_neon(wvec, lane0, lane1, in0, in1, out, v_bit) \
+ do { \
+ int32x2x2_t wvecs = { { wvec, vneg_s32(wvec) } }; \
+ int32x4_t x = vmulq_lane_s32(in0, wvecs.val[lane0 / 2], lane0 % 2); \
+ x = vmlaq_lane_s32(x, in1, wvecs.val[lane1 / 2], lane1 % 2); \
+ *out = vrshlq_s32(x, v_bit); \
+ } while (false)
+
+static AOM_FORCE_INLINE void butterfly_0112_neon(
+ const int32_t *cospi, const int widx0, const int32x4_t n0,
+ const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+ const int32x4_t v_bit) {
+ int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
+ butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit);
+ butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit);
+}
+
+static AOM_FORCE_INLINE void butterfly_2312_neon(
+ const int32_t *cospi, const int widx0, const int32x4_t n0,
+ const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+ const int32x4_t v_bit) {
+ int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
+ butterfly_half_neon(w01, 2, 3, n0, n1, out0, v_bit);
+ butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit);
+}
+
+static AOM_FORCE_INLINE void butterfly_0332_neon(
+ const int32_t *cospi, const int widx0, const int32x4_t n0,
+ const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+ const int32x4_t v_bit) {
+ int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
+ butterfly_half_neon(w01, 0, 3, n0, n1, out0, v_bit);
+ butterfly_half_neon(w01, 3, 2, n0, n1, out1, v_bit);
+}
+
+static AOM_FORCE_INLINE void butterfly_0130_neon(
+ const int32_t *cospi, const int widx0, const int32x4_t n0,
+ const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+ const int32x4_t v_bit) {
+ int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
+ butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit);
+ butterfly_half_neon(w01, 3, 0, n0, n1, out1, v_bit);
+}
+
+static AOM_FORCE_INLINE void butterfly_cospi32_0002_neon(
+ const int32_t *cospi, const int32x4_t n0, const int32x4_t n1,
+ int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) {
+ int32x2_t w01 = vld1_s32(cospi + 2 * 32);
+ butterfly_half_neon(w01, 0, 0, n0, n1, out0, v_bit);
+ butterfly_half_neon(w01, 0, 2, n0, n1, out1, v_bit);
+}
+
+static AOM_FORCE_INLINE void butterfly_cospi32_0222_neon(
+ const int32_t *cospi, const int32x4_t n0, const int32x4_t n1,
+ int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) {
+ int32x2_t w01 = vld1_s32(cospi + 2 * 32);
+ butterfly_half_neon(w01, 0, 2, n0, n1, out0, v_bit);
+ butterfly_half_neon(w01, 2, 2, n0, n1, out1, v_bit);
+}
+
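+// The helpers below multiply by NewSqrt2 -- sqrt(2) in Q(NewSqrt2Bits) fixed
+// point -- with rounding, compensating the non-unit scaling of rectangular
+// transform sizes.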
+static AOM_FORCE_INLINE void round_rect_array_s32_neon(const int32x4_t *input,
+ int32x4_t *output,
+ const int size) {
+ const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2);
+ int i = 0;
+ do {
+ const int32x4_t r1 = vmulq_s32(input[i], sqrt2);
+ output[i] = vrshrq_n_s32(r1, NewSqrt2Bits);
+ } while (++i < size);
}
-static INLINE void av1_round_shift_rect_array_32_neon(int32x4_t *input,
- int32x4_t *output,
- const int size,
- const int bit,
- const int val) {
- const int32x4_t sqrt2 = vdupq_n_s32(val);
- const int32x4_t v_bit = vdupq_n_s32(-bit);
- int i;
- for (i = 0; i < size; i++) {
- const int32x4_t r0 = vrshlq_s32(input[i], v_bit);
- const int32x4_t r1 = vmulq_s32(sqrt2, r0);
+static AOM_FORCE_INLINE void round_shift2_rect_array_s32_neon(
+ const int32x4_t *input, int32x4_t *output, const int size) {
+ const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2);
+ int i = 0;
+ do {
+ const int32x4_t r0 = vrshrq_n_s32(input[i], 2);
+ const int32x4_t r1 = vmulq_s32(r0, sqrt2);
output[i] = vrshrq_n_s32(r1, NewSqrt2Bits);
+ } while (++i < size);
+}
+
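+// The loaders below widen 16-bit input to 32 bits and fold the forward
+// transform's initial upshift into the widening shift.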
+#define LOAD_BUFFER_4XH(h) \
+ static AOM_FORCE_INLINE void load_buffer_4x##h( \
+ const int16_t *input, int32x4_t *in, int stride, int fliplr) { \
+ if (fliplr) { \
+ for (int i = 0; i < (h); ++i) { \
+ int16x4_t a = vld1_s16(input + i * stride); \
+ a = vrev64_s16(a); \
+ in[i] = vshll_n_s16(a, 2); \
+ } \
+ } else { \
+ for (int i = 0; i < (h); ++i) { \
+ int16x4_t a = vld1_s16(input + i * stride); \
+ in[i] = vshll_n_s16(a, 2); \
+ } \
+ } \
}
-}
-#define btf_32_neon_type0(w0, w1, in0, in1, out0, out1, v_cos_bit) \
- do { \
- out0 = vmulq_n_s32(in0, w0); \
- out0 = vmlaq_n_s32(out0, in1, w1); \
- out0 = vrshlq_s32(out0, v_cos_bit); \
- out1 = vmulq_n_s32(in0, w1); \
- out1 = vmlsq_n_s32(out1, in1, w0); \
- out1 = vrshlq_s32(out1, v_cos_bit); \
- } while (0)
-
-#define btf_32_neon_type1(w0, w1, in0, in1, out0, out1, bit) \
- do { \
- btf_32_neon_type0(w1, w0, in1, in0, out0, out1, bit); \
- } while (0)
-
-static INLINE void load_buffer_4x4(const int16_t *input, int32x4_t *in,
- int stride, int flipud, int fliplr,
- const int32x4_t *v_shift) {
- int16x4_t v0, v1, v2, v3;
-
- if (!flipud) {
- v0 = vld1_s16(input + 0 * stride);
- v1 = vld1_s16(input + 1 * stride);
- v2 = vld1_s16(input + 2 * stride);
- v3 = vld1_s16(input + 3 * stride);
- } else {
- v0 = vld1_s16(input + 3 * stride);
- v1 = vld1_s16(input + 2 * stride);
- v2 = vld1_s16(input + 1 * stride);
- v3 = vld1_s16(input + 0 * stride);
+// AArch32 does not permit the shift argument to vshll_n_s16 to be zero, so we
+// need to avoid the expression even though the compiler can prove that the
+// code path is never taken if `shift == 0`.
+#define shift_left_long_s16(a, shift) \
+ ((shift) == 0 ? vmovl_s16(a) : vshll_n_s16((a), (shift) == 0 ? 1 : (shift)))
+
+#define LOAD_BUFFER_WXH(w, h, shift) \
+ static AOM_FORCE_INLINE void load_buffer_##w##x##h( \
+ const int16_t *input, int32x4_t *in, int stride, int fliplr) { \
+ assert(w >= 8); \
+ if (fliplr) { \
+ for (int i = 0; i < (h); ++i) { \
+ for (int j = 0; j < (w) / 8; ++j) { \
+ int16x8_t a = vld1q_s16(input + i * stride + j * 8); \
+ a = vrev64q_s16(a); \
+ int j2 = (w) / 8 - j - 1; \
+ in[i + (h) * (2 * j2 + 0)] = \
+ shift_left_long_s16(vget_high_s16(a), (shift)); \
+ in[i + (h) * (2 * j2 + 1)] = \
+ shift_left_long_s16(vget_low_s16(a), (shift)); \
+ } \
+ } \
+ } else { \
+ for (int i = 0; i < (h); ++i) { \
+ for (int j = 0; j < (w) / 8; ++j) { \
+ int16x8_t a = vld1q_s16(input + i * stride + j * 8); \
+ in[i + (h) * (2 * j + 0)] = \
+ shift_left_long_s16(vget_low_s16(a), (shift)); \
+ in[i + (h) * (2 * j + 1)] = \
+ shift_left_long_s16(vget_high_s16(a), (shift)); \
+ } \
+ } \
+ } \
}
- if (fliplr) {
- v0 = vrev64_s16(v0);
- v1 = vrev64_s16(v1);
- v2 = vrev64_s16(v2);
- v3 = vrev64_s16(v3);
+LOAD_BUFFER_4XH(4)
+LOAD_BUFFER_4XH(8)
+LOAD_BUFFER_4XH(16)
+LOAD_BUFFER_4XH(32)
+LOAD_BUFFER_WXH(8, 8, 2)
+LOAD_BUFFER_WXH(16, 16, 2)
+LOAD_BUFFER_WXH(32, 64, 0)
+LOAD_BUFFER_WXH(64, 32, 2)
+LOAD_BUFFER_WXH(64, 64, 0)
+
+#if !CONFIG_REALTIME_ONLY
+LOAD_BUFFER_WXH(16, 64, 0)
+LOAD_BUFFER_WXH(64, 16, 2)
+#endif // !CONFIG_REALTIME_ONLY
+
+#define STORE_BUFFER_WXH(w, h) \
+ static AOM_FORCE_INLINE void store_buffer_##w##x##h( \
+ const int32x4_t *in, int32_t *out, int stride) { \
+ for (int i = 0; i < (w); ++i) { \
+ for (int j = 0; j < (h) / 4; ++j) { \
+ vst1q_s32(&out[i * stride + j * 4], in[i + j * (w)]); \
+ } \
+ } \
}
- in[0] = vshlq_s32(vmovl_s16(v0), *v_shift);
- in[1] = vshlq_s32(vmovl_s16(v1), *v_shift);
- in[2] = vshlq_s32(vmovl_s16(v2), *v_shift);
- in[3] = vshlq_s32(vmovl_s16(v3), *v_shift);
-}
-static void fdct4x4_neon(int32x4_t *in, int32x4_t *out, int bit,
- const int num_col) {
- const int32_t *cospi = cospi_arr(bit);
- const int32x4_t cospi32 = vdupq_n_s32(cospi[32]);
- const int32x4_t cospi48 = vdupq_n_s32(cospi[48]);
- const int32x4_t cospi16 = vdupq_n_s32(cospi[16]);
- int32x4_t s0, s1, s2, s3;
- int32x4_t u0, u1, u2, u3;
- int32x4_t v0, v2;
-
- int endidx = 3 * num_col;
- s0 = vaddq_s32(in[0], in[endidx]);
- s3 = vsubq_s32(in[0], in[endidx]);
- endidx -= num_col;
- s1 = vaddq_s32(in[num_col], in[endidx]);
- s2 = vsubq_s32(in[num_col], in[endidx]);
-
- u0 = vmulq_s32(s0, cospi32);
- u1 = vmulq_s32(s1, cospi32);
- u2 = vaddq_s32(u0, u1);
- v0 = vsubq_s32(u0, u1);
+STORE_BUFFER_WXH(4, 4)
+STORE_BUFFER_WXH(8, 4)
+STORE_BUFFER_WXH(8, 8)
+STORE_BUFFER_WXH(16, 4)
+STORE_BUFFER_WXH(16, 16)
+STORE_BUFFER_WXH(32, 4)
+STORE_BUFFER_WXH(32, 32)
+STORE_BUFFER_WXH(64, 32)
+
+#if !CONFIG_REALTIME_ONLY
+STORE_BUFFER_WXH(16, 32)
+STORE_BUFFER_WXH(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+static AOM_FORCE_INLINE void highbd_fdct4_x4_neon(const int32x4_t *in,
+ int32x4_t *out, int bit) {
+ const int32_t *const cospi = cospi_arr_s32(bit);
+ const int32x4_t cospi32 = vdupq_n_s32(cospi[2 * 32]);
+ const int32x2_t cospi16_48 = vld1_s32(&cospi[2 * 16]);
+
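+  // Butterflies on the mirrored input pairs (in0, in3) and (in1, in2).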
+ const int32x4_t a0 = vaddq_s32(in[0], in[3]);
+ const int32x4_t a1 = vsubq_s32(in[0], in[3]);
+ const int32x4_t a2 = vaddq_s32(in[1], in[2]);
+ const int32x4_t a3 = vsubq_s32(in[1], in[2]);
+
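+  // Even outputs come from cospi32 sums and differences; odd outputs from a
+  // rotation of (a1, a3) by the (cospi16, cospi48) pair.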
+ const int32x4_t b0 = vmulq_s32(a0, cospi32);
+ const int32x4_t b1 = vmulq_lane_s32(a1, cospi16_48, 1);
+ const int32x4_t b2 = vmulq_s32(a2, cospi32);
+ const int32x4_t b3 = vmulq_lane_s32(a3, cospi16_48, 1);
+
+ const int32x4_t c0 = vaddq_s32(b0, b2);
+ const int32x4_t c1 = vsubq_s32(b0, b2);
+ const int32x4_t c2 = vmlaq_lane_s32(b3, a1, cospi16_48, 0);
+ const int32x4_t c3 = vmlsq_lane_s32(b1, a3, cospi16_48, 0);
+
const int32x4_t v_bit = vdupq_n_s32(-bit);
- u0 = vrshlq_s32(u2, v_bit);
- u2 = vrshlq_s32(v0, v_bit);
+ const int32x4_t d0 = vrshlq_s32(c0, v_bit);
+ const int32x4_t d1 = vrshlq_s32(c1, v_bit);
+ const int32x4_t d2 = vrshlq_s32(c2, v_bit);
+ const int32x4_t d3 = vrshlq_s32(c3, v_bit);
- v0 = vmulq_s32(s2, cospi48);
- v2 = vmlaq_s32(v0, s3, cospi16);
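+  // Return the results in bit-reversed output order: {d0, d2, d1, d3}.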
+ out[0] = d0;
+ out[1] = d2;
+ out[2] = d1;
+ out[3] = d3;
+}
- u1 = vrshlq_s32(v2, v_bit);
+static AOM_FORCE_INLINE void highbd_fadst4_x4_neon(const int32x4_t *in,
+ int32x4_t *out, int bit) {
+ const int32x4_t sinpi = vld1q_s32(sinpi_arr(bit) + 1);
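+  // The sinpi lanes hold { sinpi_1, sinpi_2, sinpi_3, sinpi_4 }.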
- v0 = vmulq_s32(s3, cospi48);
- v2 = vmlsq_s32(v0, s2, cospi16);
+ const int32x4_t a0 = vaddq_s32(in[0], in[1]);
+ const int32x4_t a1 = vmulq_lane_s32(in[0], vget_low_s32(sinpi), 0);
+ const int32x4_t a2 = vmulq_lane_s32(in[0], vget_high_s32(sinpi), 1);
+ const int32x4_t a3 = vmulq_lane_s32(in[2], vget_high_s32(sinpi), 0);
- u3 = vrshlq_s32(v2, v_bit);
+ const int32x4_t b0 = vmlaq_lane_s32(a1, in[1], vget_low_s32(sinpi), 1);
+ const int32x4_t b1 = vmlsq_lane_s32(a2, in[1], vget_low_s32(sinpi), 0);
+ const int32x4_t b2 = vsubq_s32(a0, in[3]);
- out[0] = u0;
- out[1] = u1;
- out[2] = u2;
- out[3] = u3;
-}
+ const int32x4_t c0 = vmlaq_lane_s32(b0, in[3], vget_high_s32(sinpi), 1);
+ const int32x4_t c1 = vmlaq_lane_s32(b1, in[3], vget_low_s32(sinpi), 1);
+ const int32x4_t c2 = vmulq_lane_s32(b2, vget_high_s32(sinpi), 0);
-static INLINE void write_buffer_4x4(int32x4_t *res, int32_t *output) {
- vst1q_s32((output + 0 * 4), res[0]);
- vst1q_s32((output + 1 * 4), res[1]);
- vst1q_s32((output + 2 * 4), res[2]);
- vst1q_s32((output + 3 * 4), res[3]);
-}
+ const int32x4_t d0 = vaddq_s32(c0, a3);
+ const int32x4_t d1 = vsubq_s32(c1, a3);
+ const int32x4_t d2 = vsubq_s32(c1, c0);
-static void fadst4x4_neon(int32x4_t *in, int32x4_t *out, int bit,
- const int num_col) {
- const int32_t *sinpi = sinpi_arr(bit);
- const int32x4_t sinpi4x = vld1q_s32(&sinpi[1]);
-
- const int32x4_t sinpi1 = vdupq_lane_s32(vget_low_s32(sinpi4x), 0);
- const int32x4_t sinpi2 = vdupq_lane_s32(vget_low_s32(sinpi4x), 1);
- const int32x4_t sinpi3 = vdupq_lane_s32(vget_high_s32(sinpi4x), 0);
- const int32x4_t sinpi4 = vdupq_lane_s32(vget_high_s32(sinpi4x), 1);
- int32x4_t t;
- int32x4_t s0, s1, s2, s3, s7;
- int32x4_t x0, x1, x2, x3;
-
- int idx = 0 * num_col;
- s0 = vmulq_s32(in[idx], sinpi1);
- s1 = vmulq_s32(in[idx], sinpi4);
- t = vaddq_s32(in[idx], in[idx + num_col]);
- idx += 2 * num_col;
- x3 = vmulq_s32(in[idx], sinpi3);
- idx += num_col;
- s7 = vsubq_s32(t, in[idx]);
-
- t = vmlaq_s32(s0, in[idx - 2 * num_col], sinpi2);
- x0 = vmlaq_s32(t, in[idx], sinpi4);
- x1 = vmulq_s32(s7, sinpi3);
- t = vmlsq_s32(s1, in[idx - 2 * num_col], sinpi1);
- x2 = vmlaq_s32(t, in[idx], sinpi2);
-
- s0 = vaddq_s32(x0, x3);
- s1 = x1;
- s2 = vsubq_s32(x2, x3);
- t = vsubq_s32(x2, x0);
- s3 = vaddq_s32(t, x3);
+ const int32x4_t e0 = vaddq_s32(d2, a3);
const int32x4_t v_bit = vdupq_n_s32(-bit);
- out[0] = vrshlq_s32(s0, v_bit);
- out[1] = vrshlq_s32(s1, v_bit);
- out[2] = vrshlq_s32(s2, v_bit);
- out[3] = vrshlq_s32(s3, v_bit);
+ out[0] = vrshlq_s32(d0, v_bit);
+ out[1] = vrshlq_s32(c2, v_bit);
+ out[2] = vrshlq_s32(d1, v_bit);
+ out[3] = vrshlq_s32(e0, v_bit);
}
-static void idtx4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int col_num) {
+
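+// The 4-point identity transform scales by sqrt(2): NewSqrt2 is sqrt(2) in
+// Q(NewSqrt2Bits) fixed point, hence the rounding right shift after the
+// multiply.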
+static AOM_FORCE_INLINE void highbd_fidentity4_x4_neon(const int32x4_t *in,
+ int32x4_t *out,
+ int bit) {
(void)bit;
int32x4_t fact = vdupq_n_s32(NewSqrt2);
- int32x4_t a_low;
- int i;
- for (i = 0; i < 4; i++) {
- a_low = vmulq_s32(in[i * col_num], fact);
+ for (int i = 0; i < 4; i++) {
+ const int32x4_t a_low = vmulq_s32(in[i], fact);
out[i] = vrshrq_n_s32(a_low, NewSqrt2Bits);
}
}
+
void av1_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *coeff,
int input_stride, TX_TYPE tx_type, int bd) {
- int32x4_t in[4];
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
- const int txw_idx = get_txw_idx(TX_4X4);
- const int txh_idx = get_txh_idx(TX_4X4);
- int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
+ (void)bd;
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &input_stride, 4);
+
+ // Workspace for column/row-wise transforms.
+ int32x4_t buf[4];
+
switch (tx_type) {
case DCT_DCT:
- load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
- fdct4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
- transpose_4x4(in, in);
- fdct4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
- write_buffer_4x4(in, coeff);
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
break;
case ADST_DCT:
- load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
- fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
- transpose_4x4(in, in);
- fdct4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
- write_buffer_4x4(in, coeff);
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
break;
case DCT_ADST:
- load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
- fdct4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
- transpose_4x4(in, in);
- fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
- write_buffer_4x4(in, coeff);
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
break;
case ADST_ADST:
- load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
- fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
- transpose_4x4(in, in);
- fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
- write_buffer_4x4(in, coeff);
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
break;
case FLIPADST_DCT:
- load_buffer_4x4(input, in, input_stride, 1, 0, &v_shift0);
- fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
- transpose_4x4(in, in);
- fdct4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
- write_buffer_4x4(in, coeff);
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
break;
case DCT_FLIPADST:
- load_buffer_4x4(input, in, input_stride, 0, 1, &v_shift0);
- fdct4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
- transpose_4x4(in, in);
- fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
- write_buffer_4x4(in, coeff);
+ load_buffer_4x4(input, buf, input_stride, 1);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
break;
case FLIPADST_FLIPADST:
- load_buffer_4x4(input, in, input_stride, 1, 1, &v_shift0);
- fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
- transpose_4x4(in, in);
- fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
- write_buffer_4x4(in, coeff);
+ load_buffer_4x4(input, buf, input_stride, 1);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
break;
case ADST_FLIPADST:
- load_buffer_4x4(input, in, input_stride, 0, 1, &v_shift0);
- fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
- transpose_4x4(in, in);
- fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
- write_buffer_4x4(in, coeff);
+ load_buffer_4x4(input, buf, input_stride, 1);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
break;
case FLIPADST_ADST:
- load_buffer_4x4(input, in, input_stride, 1, 0, &v_shift0);
- fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
- transpose_4x4(in, in);
- fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
- write_buffer_4x4(in, coeff);
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
break;
case IDTX:
- load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
- idtx4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
- transpose_4x4(in, in);
- idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
- write_buffer_4x4(in, coeff);
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
break;
case V_DCT:
- load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
- fdct4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
- transpose_4x4(in, in);
- idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
- write_buffer_4x4(in, coeff);
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
break;
case H_DCT:
- load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
- idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
- transpose_4x4(in, in);
- fdct4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
- write_buffer_4x4(in, coeff);
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
break;
case V_ADST:
- load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
- fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
- transpose_4x4(in, in);
- idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
- write_buffer_4x4(in, coeff);
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
break;
case H_ADST:
- load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
- idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
- transpose_4x4(in, in);
- fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
- write_buffer_4x4(in, coeff);
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
break;
case V_FLIPADST:
- load_buffer_4x4(input, in, input_stride, 1, 0, &v_shift0);
- fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
- transpose_4x4(in, in);
- idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
- write_buffer_4x4(in, coeff);
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
break;
case H_FLIPADST:
- load_buffer_4x4(input, in, input_stride, 0, 1, &v_shift0);
- idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
- transpose_4x4(in, in);
- fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
- write_buffer_4x4(in, coeff);
+ load_buffer_4x4(input, buf, input_stride, 1);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
break;
default: assert(0);
}
- (void)bd;
}
-static INLINE void load_buffer_8x8(const int16_t *input, int32x4_t *in,
- int stride, int flipud, int fliplr,
- const int shift) {
- if (!flipud) {
- in[0] = vreinterpretq_s32_s16(vld1q_s16((input + 0 * stride)));
- in[1] = vreinterpretq_s32_s16(vld1q_s16((input + 1 * stride)));
- in[2] = vreinterpretq_s32_s16(vld1q_s16((input + 2 * stride)));
- in[3] = vreinterpretq_s32_s16(vld1q_s16((input + 3 * stride)));
- in[4] = vreinterpretq_s32_s16(vld1q_s16((input + 4 * stride)));
- in[5] = vreinterpretq_s32_s16(vld1q_s16((input + 5 * stride)));
- in[6] = vreinterpretq_s32_s16(vld1q_s16((input + 6 * stride)));
- in[7] = vreinterpretq_s32_s16(vld1q_s16((input + 7 * stride)));
- } else {
- in[0] = vreinterpretq_s32_s16(vld1q_s16((input + 7 * stride)));
- in[1] = vreinterpretq_s32_s16(vld1q_s16((input + 6 * stride)));
- in[2] = vreinterpretq_s32_s16(vld1q_s16((input + 5 * stride)));
- in[3] = vreinterpretq_s32_s16(vld1q_s16((input + 4 * stride)));
- in[4] = vreinterpretq_s32_s16(vld1q_s16((input + 3 * stride)));
- in[5] = vreinterpretq_s32_s16(vld1q_s16((input + 2 * stride)));
- in[6] = vreinterpretq_s32_s16(vld1q_s16((input + 1 * stride)));
- in[7] = vreinterpretq_s32_s16(vld1q_s16((input + 0 * stride)));
- }
+// Butterfly pre-processing:
+// e.g. n=4:
+// out[0] = in[0] + in[3]
+// out[1] = in[1] + in[2]
+// out[2] = in[1] - in[2]
+// out[3] = in[0] - in[3]
- if (fliplr) {
- in[0] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[0])));
- in[0] = vextq_s32(in[0], in[0], 2);
- in[1] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[1])));
- in[1] = vextq_s32(in[1], in[1], 2);
- in[2] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[2])));
- in[2] = vextq_s32(in[2], in[2], 2);
- in[3] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[3])));
- in[3] = vextq_s32(in[3], in[3], 2);
- in[4] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[4])));
- in[4] = vextq_s32(in[4], in[4], 2);
- in[5] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[5])));
- in[5] = vextq_s32(in[5], in[5], 2);
- in[6] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[6])));
- in[6] = vextq_s32(in[6], in[6], 2);
- in[7] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[7])));
- in[7] = vextq_s32(in[7], in[7], 2);
+static AOM_FORCE_INLINE void butterfly_dct_pre(const int32x4_t *input,
+ int32x4_t *output, int n) {
+ for (int i = 0; i < n / 2; ++i) {
+ output[i] = vaddq_s32(input[i], input[n - i - 1]);
+ }
+ for (int i = 0; i < n / 2; ++i) {
+ output[n / 2 + i] = vsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]);
}
-
- int16x4_t u = vget_high_s16(vreinterpretq_s16_s32(in[4]));
- in[8] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[4])));
- in[9] = vmovl_s16(u);
-
- u = vget_high_s16(vreinterpretq_s16_s32(in[5]));
- in[10] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[5])));
- in[11] = vmovl_s16(u);
-
- u = vget_high_s16(vreinterpretq_s16_s32(in[6]));
- in[12] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[6])));
- in[13] = vmovl_s16(u);
-
- u = vget_high_s16(vreinterpretq_s16_s32(in[7]));
- in[14] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[7])));
- in[15] = vmovl_s16(u);
-
- u = vget_high_s16(vreinterpretq_s16_s32(in[3]));
- in[6] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[3])));
- in[7] = vmovl_s16(u);
-
- u = vget_high_s16(vreinterpretq_s16_s32(in[2]));
- in[4] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[2])));
- in[5] = vmovl_s16(u);
-
- u = vget_high_s16(vreinterpretq_s16_s32(in[1]));
- in[2] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[1])));
- in[3] = vmovl_s16(u);
-
- u = vget_high_s16(vreinterpretq_s16_s32(in[0]));
- in[0] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[0])));
- in[1] = vmovl_s16(u);
-
- const int32x4_t v_shift = vdupq_n_s32(shift);
-
- in[0] = vshlq_s32(in[0], v_shift);
- in[1] = vshlq_s32(in[1], v_shift);
- in[2] = vshlq_s32(in[2], v_shift);
- in[3] = vshlq_s32(in[3], v_shift);
- in[4] = vshlq_s32(in[4], v_shift);
- in[5] = vshlq_s32(in[5], v_shift);
- in[6] = vshlq_s32(in[6], v_shift);
- in[7] = vshlq_s32(in[7], v_shift);
-
- in[8] = vshlq_s32(in[8], v_shift);
- in[9] = vshlq_s32(in[9], v_shift);
- in[10] = vshlq_s32(in[10], v_shift);
- in[11] = vshlq_s32(in[11], v_shift);
- in[12] = vshlq_s32(in[12], v_shift);
- in[13] = vshlq_s32(in[13], v_shift);
- in[14] = vshlq_s32(in[14], v_shift);
- in[15] = vshlq_s32(in[15], v_shift);
}
-static INLINE void col_txfm_8x8_rounding(int32x4_t *in,
- const int32x4_t *v_shift) {
- in[0] = vrshlq_s32(in[0], *v_shift);
- in[1] = vrshlq_s32(in[1], *v_shift);
- in[2] = vrshlq_s32(in[2], *v_shift);
- in[3] = vrshlq_s32(in[3], *v_shift);
- in[4] = vrshlq_s32(in[4], *v_shift);
- in[5] = vrshlq_s32(in[5], *v_shift);
- in[6] = vrshlq_s32(in[6], *v_shift);
- in[7] = vrshlq_s32(in[7], *v_shift);
- in[8] = vrshlq_s32(in[8], *v_shift);
- in[9] = vrshlq_s32(in[9], *v_shift);
- in[10] = vrshlq_s32(in[10], *v_shift);
- in[11] = vrshlq_s32(in[11], *v_shift);
- in[12] = vrshlq_s32(in[12], *v_shift);
- in[13] = vrshlq_s32(in[13], *v_shift);
- in[14] = vrshlq_s32(in[14], *v_shift);
- in[15] = vrshlq_s32(in[15], *v_shift);
+// Butterfly post-processing:
+// e.g. n=8:
+// out[0] = in0[0] + in1[3];
+// out[1] = in0[1] + in1[2];
+// out[2] = in0[1] - in1[2];
+// out[3] = in0[0] - in1[3];
+// out[4] = in0[7] - in1[4];
+// out[5] = in0[6] - in1[5];
+// out[6] = in0[6] + in1[5];
+// out[7] = in0[7] + in1[4];
+
+static AOM_FORCE_INLINE void butterfly_dct_post(const int32x4_t *in0,
+ const int32x4_t *in1,
+ int32x4_t *output, int n) {
+ for (int i = 0; i < n / 4; ++i) {
+ output[i] = vaddq_s32(in0[i], in1[n / 2 - i - 1]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 4 + i] = vsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 2 + i] = vsubq_s32(in0[n - i - 1], in1[n / 2 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[(3 * n) / 4 + i] =
+ vaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
+ }
}
-static INLINE void col_txfm_4x8_rounding(int32x4_t *in,
- const int32x4_t *v_shift) {
- in[0] = vrshlq_s32(in[0], *v_shift);
- in[1] = vrshlq_s32(in[1], *v_shift);
- in[2] = vrshlq_s32(in[2], *v_shift);
- in[3] = vrshlq_s32(in[3], *v_shift);
- in[4] = vrshlq_s32(in[4], *v_shift);
- in[5] = vrshlq_s32(in[5], *v_shift);
- in[6] = vrshlq_s32(in[6], *v_shift);
- in[7] = vrshlq_s32(in[7], *v_shift);
-}
+static AOM_FORCE_INLINE void highbd_fdct8_x4_neon(const int32x4_t *in,
+ int32x4_t *out, int bit) {
+ const int32_t *const cospi = cospi_arr_s32(bit);
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
-static INLINE void write_buffer_8x8(const int32x4_t *res, int32_t *output) {
- vst1q_s32(output + 0 * 4, res[0]);
- vst1q_s32(output + 1 * 4, res[1]);
- vst1q_s32(output + 2 * 4, res[2]);
- vst1q_s32(output + 3 * 4, res[3]);
-
- vst1q_s32(output + 4 * 4, res[4]);
- vst1q_s32(output + 5 * 4, res[5]);
- vst1q_s32(output + 6 * 4, res[6]);
- vst1q_s32(output + 7 * 4, res[7]);
-
- vst1q_s32(output + 8 * 4, res[8]);
- vst1q_s32(output + 9 * 4, res[9]);
- vst1q_s32(output + 10 * 4, res[10]);
- vst1q_s32(output + 11 * 4, res[11]);
-
- vst1q_s32(output + 12 * 4, res[12]);
- vst1q_s32(output + 13 * 4, res[13]);
- vst1q_s32(output + 14 * 4, res[14]);
- vst1q_s32(output + 15 * 4, res[15]);
-}
+ // stage 1
+ int32x4_t a[8];
+ butterfly_dct_pre(in, a, 8);
+
+ // stage 2
+ int32x4_t b[8];
+ butterfly_dct_pre(a, b, 4);
+ butterfly_0130_neon(cospi, 32, a[5], a[6], &b[6], &b[5], v_bit);
+
+ // stage 3
+ int32x4_t c[8];
+ butterfly_0130_neon(cospi, 32, b[1], b[0], &c[0], &c[1], v_bit);
+ butterfly_0112_neon(cospi, 16, b[3], b[2], &c[2], &c[3], v_bit);
+ butterfly_dct_post(a + 4, b + 4, c + 4, 4);
-static INLINE void write_buffer_16x8(const int32x4_t *res, int32_t *output,
- const int stride) {
- vst1q_s32(output, res[0]);
- vst1q_s32(output + 4, res[1]);
- vst1q_s32(output + stride, res[2]);
- vst1q_s32(output + stride + 4, res[3]);
-
- vst1q_s32(output + (stride * 2), res[4]);
- vst1q_s32(output + (stride * 2) + 4, res[5]);
- vst1q_s32(output + (stride * 3), res[6]);
- vst1q_s32(output + (stride * 3) + 4, res[7]);
-
- vst1q_s32(output + (stride * 4), res[8]);
- vst1q_s32(output + (stride * 4) + 4, res[9]);
- vst1q_s32(output + (stride * 5), res[10]);
- vst1q_s32(output + (stride * 5) + 4, res[11]);
-
- vst1q_s32(output + (stride * 6), res[12]);
- vst1q_s32(output + (stride * 6) + 4, res[13]);
- vst1q_s32(output + (stride * 7), res[14]);
- vst1q_s32(output + (stride * 7) + 4, res[15]);
+ // stage 4-5
+ butterfly_0112_neon(cospi, 8, c[7], c[4], &out[1], &out[7], v_bit);
+ butterfly_0130_neon(cospi, 24, c[5], c[6], &out[5], &out[3], v_bit);
+
+ out[0] = c[0];
+ out[2] = c[2];
+ out[4] = c[1];
+ out[6] = c[3];
}
-static void fdct4x8_neon(int32x4_t *in, int32x4_t *out, int bit,
- const int col_num) {
- const int32_t *cospi = cospi_arr(bit);
+static AOM_FORCE_INLINE void highbd_fadst8_x4_neon(const int32x4_t *in,
+ int32x4_t *out, int bit) {
+ const int32_t *const cospi = cospi_arr_s32(bit);
const int32x4_t v_bit = vdupq_n_s32(-bit);
- int32x4_t u[8], v[8];
- int startidx = 0 * col_num;
- int endidx = 7 * col_num;
+ int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
+ int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;
+
// stage 0-1
- u[0] = vaddq_s32(in[startidx], in[endidx]);
- v[7] = vsubq_s32(in[startidx], in[endidx]);
- startidx += col_num;
- endidx -= col_num;
- u[1] = vaddq_s32(in[startidx], in[endidx]);
- u[6] = vsubq_s32(in[startidx], in[endidx]);
- startidx += col_num;
- endidx -= col_num;
- u[2] = vaddq_s32(in[startidx], in[endidx]);
- u[5] = vsubq_s32(in[startidx], in[endidx]);
- startidx += col_num;
- endidx -= col_num;
- u[3] = vaddq_s32(in[startidx], in[endidx]);
- v[4] = vsubq_s32(in[startidx], in[endidx]);
+ u0 = in[0];
+ u1 = in[7];
+ u2 = in[3];
+ u3 = in[4];
+ u4 = in[1];
+ u5 = in[6];
+ u6 = in[2];
+ u7 = in[5];
// stage 2
- v[0] = vaddq_s32(u[0], u[3]);
- v[3] = vsubq_s32(u[0], u[3]);
- v[1] = vaddq_s32(u[1], u[2]);
- v[2] = vsubq_s32(u[1], u[2]);
-
- v[5] = vmulq_n_s32(u[6], cospi[32]);
- v[5] = vmlsq_n_s32(v[5], u[5], cospi[32]);
- v[5] = vrshlq_s32(v[5], v_bit);
-
- u[0] = vmulq_n_s32(u[5], cospi[32]);
- v[6] = vmlaq_n_s32(u[0], u[6], cospi[32]);
- v[6] = vrshlq_s32(v[6], v_bit);
+ v0 = u0;
+ v1 = u1;
+ butterfly_cospi32_0222_neon(cospi, u3, u2, &v2, &v3, v_bit);
+ v4 = u4;
+ v5 = u5;
+ butterfly_cospi32_0002_neon(cospi, u6, u7, &v7, &v6, v_bit);
// stage 3
- // type 0
- v[0] = vmulq_n_s32(v[0], cospi[32]);
- v[1] = vmulq_n_s32(v[1], cospi[32]);
- u[0] = vaddq_s32(v[0], v[1]);
- u[0] = vrshlq_s32(u[0], v_bit);
-
- u[1] = vsubq_s32(v[0], v[1]);
- u[1] = vrshlq_s32(u[1], v_bit);
+ u0 = vaddq_s32(v0, v2);
+ u1 = vsubq_s32(v3, v1);
+ u2 = vsubq_s32(v0, v2);
+ u3 = vaddq_s32(v1, v3);
+ u4 = vsubq_s32(v6, v4);
+ u5 = vaddq_s32(v5, v7);
+ u6 = vaddq_s32(v4, v6);
+ u7 = vsubq_s32(v5, v7);
- // type 1
- v[0] = vmulq_n_s32(v[2], cospi[48]);
- u[2] = vmlaq_n_s32(v[0], v[3], cospi[16]);
- u[2] = vrshlq_s32(u[2], v_bit);
+ // stage 4
+ v0 = u0;
+ v1 = u1;
+ v2 = u2;
+ v3 = u3;
- v[1] = vmulq_n_s32(v[3], cospi[48]);
- u[3] = vmlsq_n_s32(v[1], v[2], cospi[16]);
- u[3] = vrshlq_s32(u[3], v_bit);
+ butterfly_0112_neon(cospi, 16, u4, u5, &v4, &v5, v_bit);
+ butterfly_0112_neon(cospi, 16, u7, u6, &v6, &v7, v_bit);
- u[4] = vaddq_s32(v[4], v[5]);
- u[5] = vsubq_s32(v[4], v[5]);
- u[6] = vsubq_s32(v[7], v[6]);
- u[7] = vaddq_s32(v[7], v[6]);
+ // stage 5
+ u0 = vaddq_s32(v0, v4);
+ u1 = vaddq_s32(v1, v5);
+ u2 = vaddq_s32(v2, v6);
+ u3 = vsubq_s32(v7, v3);
+ u4 = vsubq_s32(v0, v4);
+ u5 = vsubq_s32(v1, v5);
+ u6 = vsubq_s32(v2, v6);
+ u7 = vaddq_s32(v3, v7);
- // stage 4-5
- v[0] = vmulq_n_s32(u[4], cospi[56]);
- v[0] = vmlaq_n_s32(v[0], u[7], cospi[8]);
- out[1 * col_num] = vrshlq_s32(v[0], v_bit);
-
- v[1] = vmulq_n_s32(u[7], cospi[56]);
- v[0] = vmlsq_n_s32(v[1], u[4], cospi[8]);
- out[7 * col_num] = vrshlq_s32(v[0], v_bit);
-
- v[0] = vmulq_n_s32(u[5], cospi[24]);
- v[0] = vmlaq_n_s32(v[0], u[6], cospi[40]);
- out[5 * col_num] = vrshlq_s32(v[0], v_bit);
-
- v[1] = vmulq_n_s32(u[6], cospi[24]);
- v[0] = vmlsq_n_s32(v[1], u[5], cospi[40]);
- out[3 * col_num] = vrshlq_s32(v[0], v_bit);
-
- out[0 * col_num] = u[0];
- out[4 * col_num] = u[1];
- out[2 * col_num] = u[2];
- out[6 * col_num] = u[3];
-}
+ // stage 6
+ butterfly_0112_neon(cospi, 4, u0, u1, &v0, &v1, v_bit);
+ butterfly_0112_neon(cospi, 20, u2, u3, &v2, &v3, v_bit);
+ butterfly_0130_neon(cospi, 28, u5, u4, &v4, &v5, v_bit);
+ butterfly_0112_neon(cospi, 12, u6, u7, &v7, &v6, v_bit);
-static void fdct8x8_neon(int32x4_t *in, int32x4_t *out, int bit,
- const int col_num) {
- fdct4x8_neon(in, out, bit, col_num);
- fdct4x8_neon(in + 1, out + 1, bit, col_num);
+ // stage 7
+ out[0] = v1;
+ out[1] = v6;
+ out[2] = v3;
+ out[3] = v4;
+ out[4] = v5;
+ out[5] = v2;
+ out[6] = v7;
+ out[7] = v0;
+}
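
// The weighted butterflies use a digit naming scheme: with w0 = cospi[w] and
// w1 = cospi[64 - w], the four digits index the tuple { w0, w1, -w0, -w1 }
// and give the coefficients applied to (in0, in1) for out0 and out1. A scalar
// model of butterfly_0112_neon(), inferred by matching the calls above
// against the deleted fadst8x8_neon() arithmetic (the NEON helpers are
// defined earlier in this file):
static inline void butterfly_0112_scalar(const int32_t *cospi, int w,
                                         int32_t in0, int32_t in1,
                                         int32_t *out0, int32_t *out1,
                                         int bit) {
  const int64_t w0 = cospi[w], w1 = cospi[64 - w];
  // Digits (0,1): out0 = w0*in0 + w1*in1. Digits (1,2): out1 = w1*in0 - w0*in1.
  // The rounding shift mirrors vrshlq_s32() with a negated shift amount.
  *out0 = (int32_t)((w0 * in0 + w1 * in1 + (1LL << (bit - 1))) >> bit);
  *out1 = (int32_t)((w1 * in0 - w0 * in1 + (1LL << (bit - 1))) >> bit);
}
// Under the same scheme, butterfly_0130 gives out1 = w0*in1 - w1*in0 (matching
// the deleted cospi[24]/cospi[40] sequence for out[5]/out[3] in fdct8), and
// the butterfly_cospi32_* variants specialize w0 = w1 = cospi[32].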
+
+static AOM_FORCE_INLINE void highbd_fidentity8_x4_neon(const int32x4_t *in,
+ int32x4_t *out,
+ int bit) {
+ (void)bit;
+ out[0] = vshlq_n_s32(in[0], 1);
+ out[1] = vshlq_n_s32(in[1], 1);
+ out[2] = vshlq_n_s32(in[2], 1);
+ out[3] = vshlq_n_s32(in[3], 1);
+ out[4] = vshlq_n_s32(in[4], 1);
+ out[5] = vshlq_n_s32(in[5], 1);
+ out[6] = vshlq_n_s32(in[6], 1);
+ out[7] = vshlq_n_s32(in[7], 1);
+}
+
+static AOM_FORCE_INLINE void highbd_fdct8_xn_neon(const int32x4_t *in,
+ int32x4_t *out, int bit,
+ int howmany) {
+ const int stride = 8;
+ int i = 0;
+ do {
+ highbd_fdct8_x4_neon(in + i * stride, out + i * stride, bit);
+ } while (++i < howmany);
}
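
// Naming convention for the kernels in this file: an _x4 kernel transforms one
// strip of four columns held in int32x4_t registers, and the matching _xn
// wrapper repeats it howmany times at a fixed vector stride to cover the block
// width (two strips for the 8x8 paths below, four for 16x16).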
-static void fadst8x8_neon(int32x4_t *in, int32x4_t *out, int bit,
- const int col_num) {
- const int32_t *cospi = cospi_arr(bit);
-
- const int32x4_t v_bit = vdupq_n_s32(-bit);
- int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
- int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;
- int32x4_t x, y;
- int col;
-
- for (col = 0; col < col_num; ++col) {
- // stage 0-1
- u0 = in[col_num * 0 + col];
- u1 = vnegq_s32(in[col_num * 7 + col]);
- u2 = vnegq_s32(in[col_num * 3 + col]);
- u3 = in[col_num * 4 + col];
- u4 = vnegq_s32(in[col_num * 1 + col]);
- u5 = in[col_num * 6 + col];
- u6 = in[col_num * 2 + col];
- u7 = vnegq_s32(in[col_num * 5 + col]);
-
- // stage 2
- v0 = u0;
- v1 = u1;
-
- x = vmulq_n_s32(u2, cospi[32]);
- y = vmulq_n_s32(u3, cospi[32]);
- v2 = vaddq_s32(x, y);
- v2 = vrshlq_s32(v2, v_bit);
-
- v3 = vsubq_s32(x, y);
- v3 = vrshlq_s32(v3, v_bit);
-
- v4 = u4;
- v5 = u5;
-
- x = vmulq_n_s32(u6, cospi[32]);
- y = vmulq_n_s32(u7, cospi[32]);
- v6 = vaddq_s32(x, y);
- v6 = vrshlq_s32(v6, v_bit);
-
- v7 = vsubq_s32(x, y);
- v7 = vrshlq_s32(v7, v_bit);
-
- // stage 3
- u0 = vaddq_s32(v0, v2);
- u1 = vaddq_s32(v1, v3);
- u2 = vsubq_s32(v0, v2);
- u3 = vsubq_s32(v1, v3);
- u4 = vaddq_s32(v4, v6);
- u5 = vaddq_s32(v5, v7);
- u6 = vsubq_s32(v4, v6);
- u7 = vsubq_s32(v5, v7);
-
- // stage 4
- v0 = u0;
- v1 = u1;
- v2 = u2;
- v3 = u3;
-
- v4 = vmulq_n_s32(u4, cospi[16]);
- v4 = vmlaq_n_s32(v4, u5, cospi[48]);
- v4 = vrshlq_s32(v4, v_bit);
-
- v5 = vmulq_n_s32(u4, cospi[48]);
- v5 = vmlsq_n_s32(v5, u5, cospi[16]);
- v5 = vrshlq_s32(v5, v_bit);
-
- v6 = vmulq_n_s32(u7, cospi[16]);
- v6 = vmlsq_n_s32(v6, u6, cospi[48]);
- v6 = vrshlq_s32(v6, v_bit);
-
- v7 = vmulq_n_s32(u6, cospi[16]);
- v7 = vmlaq_n_s32(v7, u7, cospi[48]);
- v7 = vrshlq_s32(v7, v_bit);
-
- // stage 5
- u0 = vaddq_s32(v0, v4);
- u1 = vaddq_s32(v1, v5);
- u2 = vaddq_s32(v2, v6);
- u3 = vaddq_s32(v3, v7);
- u4 = vsubq_s32(v0, v4);
- u5 = vsubq_s32(v1, v5);
- u6 = vsubq_s32(v2, v6);
- u7 = vsubq_s32(v3, v7);
-
- // stage 6
- v0 = vmulq_n_s32(u0, cospi[4]);
- v0 = vmlaq_n_s32(v0, u1, cospi[60]);
- v0 = vrshlq_s32(v0, v_bit);
-
- v1 = vmulq_n_s32(u0, cospi[60]);
- v1 = vmlsq_n_s32(v1, u1, cospi[4]);
- v1 = vrshlq_s32(v1, v_bit);
-
- v2 = vmulq_n_s32(u2, cospi[20]);
- v2 = vmlaq_n_s32(v2, u3, cospi[44]);
- v2 = vrshlq_s32(v2, v_bit);
-
- v3 = vmulq_n_s32(u2, cospi[44]);
- v3 = vmlsq_n_s32(v3, u3, cospi[20]);
- v3 = vrshlq_s32(v3, v_bit);
-
- v4 = vmulq_n_s32(u4, cospi[36]);
- v4 = vmlaq_n_s32(v4, u5, cospi[28]);
- v4 = vrshlq_s32(v4, v_bit);
-
- v5 = vmulq_n_s32(u4, cospi[28]);
- v5 = vmlsq_n_s32(v5, u5, cospi[36]);
- v5 = vrshlq_s32(v5, v_bit);
-
- x = vmulq_n_s32(u6, cospi[52]);
- v6 = vmlaq_n_s32(x, u7, cospi[12]);
- v6 = vrshlq_s32(v6, v_bit);
-
- v7 = vmulq_n_s32(u6, cospi[12]);
- v7 = vmlsq_n_s32(v7, u7, cospi[52]);
- v7 = vrshlq_s32(v7, v_bit);
-
- // stage 7
- out[col_num * 0 + col] = v1;
- out[col_num * 1 + col] = v6;
- out[col_num * 2 + col] = v3;
- out[col_num * 3 + col] = v4;
- out[col_num * 4 + col] = v5;
- out[col_num * 5 + col] = v2;
- out[col_num * 6 + col] = v7;
- out[col_num * 7 + col] = v0;
- }
+static AOM_FORCE_INLINE void highbd_fadst8_xn_neon(const int32x4_t *in,
+ int32x4_t *out, int bit,
+ int howmany) {
+ const int stride = 8;
+ int i = 0;
+ do {
+ highbd_fadst8_x4_neon(in + i * stride, out + i * stride, bit);
+ } while (++i < howmany);
}
-static void idtx8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int col_num) {
- (void)bit;
- for (int i = 0; i < col_num; i += 1) {
- out[0 + 8 * i] = vshlq_n_s32(in[0 + 8 * i], 1);
- out[1 + 8 * i] = vshlq_n_s32(in[1 + 8 * i], 1);
- out[2 + 8 * i] = vshlq_n_s32(in[2 + 8 * i], 1);
- out[3 + 8 * i] = vshlq_n_s32(in[3 + 8 * i], 1);
- out[4 + 8 * i] = vshlq_n_s32(in[4 + 8 * i], 1);
- out[5 + 8 * i] = vshlq_n_s32(in[5 + 8 * i], 1);
- out[6 + 8 * i] = vshlq_n_s32(in[6 + 8 * i], 1);
- out[7 + 8 * i] = vshlq_n_s32(in[7 + 8 * i], 1);
- }
-}
-#if !CONFIG_REALTIME_ONLY
-static void idtx32x8_neon(int32x4_t *in, int32x4_t *out, int bit, int col_num) {
+static AOM_FORCE_INLINE void highbd_fidentity8_xn_neon(const int32x4_t *in,
+ int32x4_t *out, int bit,
+ int howmany) {
(void)bit;
- (void)col_num;
- for (int j = 0; j < 2; j++) {
- out[j + 8 * 0] = vshlq_n_s32(in[j + 8 * 0], 1);
- out[j + 8 * 1] = vshlq_n_s32(in[j + 8 * 1], 1);
- out[j + 8 * 2] = vshlq_n_s32(in[j + 8 * 2], 1);
- out[j + 8 * 3] = vshlq_n_s32(in[j + 8 * 3], 1);
- out[j + 8 * 4] = vshlq_n_s32(in[j + 8 * 4], 1);
- out[j + 8 * 5] = vshlq_n_s32(in[j + 8 * 5], 1);
- out[j + 8 * 6] = vshlq_n_s32(in[j + 8 * 6], 1);
- out[j + 8 * 7] = vshlq_n_s32(in[j + 8 * 7], 1);
- }
+ const int stride = 8;
+ int i = 0;
+ do {
+ highbd_fidentity8_x4_neon(in + i * stride, out + i * stride, bit);
+ } while (++i < howmany);
}
-#endif
+
void av1_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) {
- int32x4_t in[16], out[16];
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
- const int txw_idx = get_txw_idx(TX_8X8);
- const int txh_idx = get_txh_idx(TX_8X8);
- const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
+ (void)bd;
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+
+ // Workspaces for column/row-wise transforms.
+ int32x4_t buf0[16], buf1[16];
+
switch (tx_type) {
case DCT_DCT:
- load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
- fdct8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- col_txfm_8x8_rounding(out, &v_shift1);
- transpose_8x8(out, in);
- fdct8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
- write_buffer_8x8(out, coeff);
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
break;
case ADST_DCT:
- load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
- fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- col_txfm_8x8_rounding(out, &v_shift1);
- transpose_8x8(out, in);
- fdct8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
- write_buffer_8x8(out, coeff);
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
break;
case DCT_ADST:
- load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
- fdct8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- col_txfm_8x8_rounding(out, &v_shift1);
- transpose_8x8(out, in);
- fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
- write_buffer_8x8(out, coeff);
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
break;
case ADST_ADST:
- load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
- fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- col_txfm_8x8_rounding(out, &v_shift1);
- transpose_8x8(out, in);
- fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
- write_buffer_8x8(out, coeff);
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
break;
case FLIPADST_DCT:
- load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
- fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- col_txfm_8x8_rounding(out, &v_shift1);
- transpose_8x8(out, in);
- fdct8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
- write_buffer_8x8(out, coeff);
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
break;
case DCT_FLIPADST:
- load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
- fdct8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- col_txfm_8x8_rounding(out, &v_shift1);
- transpose_8x8(out, in);
- fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
- write_buffer_8x8(out, coeff);
+ load_buffer_8x8(input, buf0, stride, 1);
+ highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
break;
case FLIPADST_FLIPADST:
- load_buffer_8x8(input, in, stride, 1, 1, shift[0]);
- fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- col_txfm_8x8_rounding(out, &v_shift1);
- transpose_8x8(out, in);
- fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
- write_buffer_8x8(out, coeff);
+ load_buffer_8x8(input, buf0, stride, 1);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
break;
case ADST_FLIPADST:
- load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
- fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- col_txfm_8x8_rounding(out, &v_shift1);
- transpose_8x8(out, in);
- fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
- write_buffer_8x8(out, coeff);
+ load_buffer_8x8(input, buf0, stride, 1);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
break;
case FLIPADST_ADST:
- load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
- fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- col_txfm_8x8_rounding(out, &v_shift1);
- transpose_8x8(out, in);
- fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
- write_buffer_8x8(out, coeff);
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
break;
case IDTX:
- load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
- idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- col_txfm_8x8_rounding(out, &v_shift1);
- transpose_8x8(out, in);
- idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- write_buffer_8x8(out, coeff);
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
break;
case V_DCT:
- load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
- fdct8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- col_txfm_8x8_rounding(out, &v_shift1);
- transpose_8x8(out, in);
- idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- write_buffer_8x8(out, coeff);
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
break;
case H_DCT:
- load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
- idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- col_txfm_8x8_rounding(out, &v_shift1);
- transpose_8x8(out, in);
- fdct8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- write_buffer_8x8(out, coeff);
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
break;
case V_ADST:
- load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
- fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- col_txfm_8x8_rounding(out, &v_shift1);
- transpose_8x8(out, in);
- idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- write_buffer_8x8(out, coeff);
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
break;
case H_ADST:
- load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
- idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- col_txfm_8x8_rounding(out, &v_shift1);
- transpose_8x8(out, in);
- fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- write_buffer_8x8(out, coeff);
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
break;
case V_FLIPADST:
- load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
- fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- col_txfm_8x8_rounding(out, &v_shift1);
- transpose_8x8(out, in);
- idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- write_buffer_8x8(out, coeff);
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
break;
case H_FLIPADST:
- load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
- idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- col_txfm_8x8_rounding(out, &v_shift1);
- transpose_8x8(out, in);
- fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
- write_buffer_8x8(out, coeff);
+ load_buffer_8x8(input, buf0, stride, 1);
+ highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
break;
default: assert(0);
}
- (void)bd;
-}
-
-// Hybrid Transform 16x16
-
-static INLINE void convert_8x8_to_16x16(const int32x4_t *in, int32x4_t *out) {
- int row_index = 0;
- int dst_index = 0;
- int src_index = 0;
-
- // row 0, 1, .., 7
- do {
- out[dst_index] = in[src_index];
- out[dst_index + 1] = in[src_index + 1];
- out[dst_index + 2] = in[src_index + 16];
- out[dst_index + 3] = in[src_index + 17];
- dst_index += 4;
- src_index += 2;
- row_index += 1;
- } while (row_index < 8);
-
- // row 8, 9, ..., 15
- src_index += 16;
- do {
- out[dst_index] = in[src_index];
- out[dst_index + 1] = in[src_index + 1];
- out[dst_index + 2] = in[src_index + 16];
- out[dst_index + 3] = in[src_index + 17];
- dst_index += 4;
- src_index += 2;
- row_index += 1;
- } while (row_index < 16);
}
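
// Every branch above runs the same five-step pipeline and varies only the two
// 1D kernels plus the lr_flip flag passed to load_buffer_8x8():
//   1. load_buffer_8x8(): 16-bit input pixels into two 4-column strips
//   2. column transform with cos bit av1_fwd_cos_bit_col[1][1], howmany == 2
//   3. shift_right_1_round_s32_x4(): the TX_8X8 intermediate shift[1] == -1
//   4. transpose_arrays_s32_8x8()
//   5. row transform, then store_buffer_8x8() at output stride 8
// The hard-coded [1][1] indices are the get_txw_idx(TX_8X8)/get_txh_idx(TX_8X8)
// values the deleted code looked up at run time, and ud_flip is applied once up
// front by ud_adjust_input_and_stride() instead of inside every load.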
-static INLINE void load_buffer_16x16(const int16_t *input, int32x4_t *out,
- int stride, int flipud, int fliplr,
- int shift) {
- int32x4_t in[64];
- // Load 4 8x8 blocks
- const int16_t *topL = input;
- const int16_t *topR = input + 8;
- const int16_t *botL = input + 8 * stride;
- const int16_t *botR = input + 8 * stride + 8;
-
- const int16_t *tmp;
-
- if (flipud) {
- // Swap left columns
- tmp = topL;
- topL = botL;
- botL = tmp;
- // Swap right columns
- tmp = topR;
- topR = botR;
- botR = tmp;
- }
-
- if (fliplr) {
- // Swap top rows
- tmp = topL;
- topL = topR;
- topR = tmp;
- // Swap bottom rows
- tmp = botL;
- botL = botR;
- botR = tmp;
- }
-
- // load first 8 columns
- load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift);
- load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift);
+static void highbd_fdct16_x4_neon(const int32x4_t *in, int32x4_t *out,
+ int bit) {
+ const int32_t *const cospi = cospi_arr_s32(bit);
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
- // load second 8 columns
- load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift);
- load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift);
+ int32x4_t u[16], v[16];
- convert_8x8_to_16x16(in, out);
-}
+ // stage 1
+ butterfly_dct_pre(in, u, 16);
-static INLINE void load_buffer_8x16(const int16_t *input, int32x4_t *out,
- int stride, int flipud, int fliplr,
- int shift) {
- const int16_t *topL = input;
- const int16_t *botL = input + 8 * stride;
+ // stage 2
+ butterfly_dct_pre(u, v, 8);
+ v[8] = u[8];
+ v[9] = u[9];
+ butterfly_cospi32_0002_neon(cospi, u[13], u[10], &v[13], &v[10], v_bit);
+ butterfly_cospi32_0002_neon(cospi, u[12], u[11], &v[12], &v[11], v_bit);
+ v[14] = u[14];
+ v[15] = u[15];
- const int16_t *tmp;
+ // stage 3
+ butterfly_dct_pre(v, u, 4);
+ u[4] = v[4];
+ butterfly_cospi32_0002_neon(cospi, v[6], v[5], &u[6], &u[5], v_bit);
+ u[7] = v[7];
+ butterfly_dct_post(v + 8, v + 8, u + 8, 8);
- if (flipud) {
- tmp = topL;
- topL = botL;
- botL = tmp;
- }
+ // stage 4
+ butterfly_cospi32_0002_neon(cospi, u[0], u[1], &v[0], &v[1], v_bit);
+ butterfly_0112_neon(cospi, 16, u[3], u[2], &v[2], &v[3], v_bit);
+ butterfly_dct_post(u + 4, u + 4, v + 4, 4);
+ v[8] = u[8];
+ butterfly_0112_neon(cospi, 16, u[14], u[9], &v[14], &v[9], v_bit);
+ butterfly_2312_neon(cospi, 16, u[13], u[10], &v[10], &v[13], v_bit);
+ v[11] = u[11];
+ v[12] = u[12];
+ v[15] = u[15];
- load_buffer_8x8(topL, out, stride, flipud, fliplr, shift);
- load_buffer_8x8(botL, out + 16, stride, flipud, fliplr, shift);
-}
+ // stage 5
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+ butterfly_0112_neon(cospi, 8, v[7], v[4], &u[4], &u[7], v_bit);
+ butterfly_0130_neon(cospi, 24, v[5], v[6], &u[5], &u[6], v_bit);
+ butterfly_dct_post(v + 8, v + 8, u + 8, 4);
+ butterfly_dct_post(v + 12, v + 12, u + 12, 4);
-static INLINE void load_buffer_8x4(const int16_t *input, int32x4_t *out,
- int stride, int flipud, int fliplr,
- const int32x4_t *v_shift) {
- const int16_t *topL = input;
- const int16_t *topR = input + 4;
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+ butterfly_0112_neon(cospi, 4, u[15], u[8], &v[8], &v[15], v_bit);
+ butterfly_0130_neon(cospi, 28, u[9], u[14], &v[9], &v[14], v_bit);
+ butterfly_0112_neon(cospi, 20, u[13], u[10], &v[10], &v[13], v_bit);
+ butterfly_0130_neon(cospi, 12, u[11], u[12], &v[11], &v[12], v_bit);
+
+ out[0] = v[0];
+ out[1] = v[8];
+ out[2] = v[4];
+ out[3] = v[12];
+ out[4] = v[2];
+ out[5] = v[10];
+ out[6] = v[6];
+ out[7] = v[14];
+ out[8] = v[1];
+ out[9] = v[9];
+ out[10] = v[5];
+ out[11] = v[13];
+ out[12] = v[3];
+ out[13] = v[11];
+ out[14] = v[7];
+ out[15] = v[15];
+}
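
// The final permutation above writes the stage-6 results in bit-reversed index
// order, out[k] = v[bitrev4(k)] (out[1] = v[8], out[2] = v[4], out[3] = v[12],
// ...), the natural output ordering of these decimation-in-frequency
// butterflies.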
+
+static void highbd_fadst16_x4_neon(const int32x4_t *in, int32x4_t *out,
+ int bit) {
+ const int32_t *const cospi = cospi_arr_s32(bit);
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
- const int16_t *tmp;
+ int32x4_t u[16], v[16];
- if (fliplr) {
- tmp = topL;
- topL = topR;
- topR = tmp;
- }
- load_buffer_4x4(topL, out, stride, flipud, fliplr, v_shift);
- load_buffer_4x4(topR, out + 4, stride, flipud, fliplr, v_shift);
-}
+ // stage 0-1
+ u[0] = in[0];
+ u[1] = in[15];
+ u[2] = in[7];
+ u[3] = in[8];
+ u[4] = in[3];
+ u[5] = in[12];
+ u[6] = in[4];
+ u[7] = in[11];
+ u[8] = in[1];
+ u[9] = in[14];
+ u[10] = in[6];
+ u[11] = in[9];
+ u[12] = in[2];
+ u[13] = in[13];
+ u[14] = in[5];
+ u[15] = in[10];
-static INLINE void load_buffer_16x4(const int16_t *input, int32x4_t *out,
- int stride, int flipud, int fliplr,
- const int32x4_t *v_shift) {
- const int16_t *topL = input;
- const int16_t *topR = input + 8;
+ // stage 2
+ v[0] = u[0];
+ v[1] = u[1];
+ butterfly_cospi32_0222_neon(cospi, u[3], u[2], &v[2], &v[3], v_bit);
+ v[4] = u[4];
+ v[5] = u[5];
+ butterfly_cospi32_0002_neon(cospi, u[6], u[7], &v[7], &v[6], v_bit);
+ v[8] = u[8];
+ v[9] = u[9];
+ butterfly_cospi32_0002_neon(cospi, u[10], u[11], &v[11], &v[10], v_bit);
+ v[12] = u[12];
+ v[13] = u[13];
+ butterfly_cospi32_0222_neon(cospi, u[15], u[14], &v[14], &v[15], v_bit);
- const int16_t *tmp;
+ // stage 3
+ u[0] = vaddq_s32(v[0], v[2]);
+ u[1] = vsubq_s32(v[3], v[1]);
+ u[2] = vsubq_s32(v[0], v[2]);
+ u[3] = vaddq_s32(v[1], v[3]);
+ u[4] = vsubq_s32(v[6], v[4]);
+ u[5] = vaddq_s32(v[5], v[7]);
+ u[6] = vaddq_s32(v[4], v[6]);
+ u[7] = vsubq_s32(v[5], v[7]);
+ u[8] = vsubq_s32(v[10], v[8]);
+ u[9] = vaddq_s32(v[9], v[11]);
+ u[10] = vaddq_s32(v[8], v[10]);
+ u[11] = vsubq_s32(v[9], v[11]);
+ u[12] = vaddq_s32(v[12], v[14]);
+ u[13] = vsubq_s32(v[15], v[13]);
+ u[14] = vsubq_s32(v[12], v[14]);
+ u[15] = vaddq_s32(v[13], v[15]);
- if (fliplr) {
- tmp = topL;
- topL = topR;
- topR = tmp;
- }
+ // stage 4
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ butterfly_0112_neon(cospi, 16, u[4], u[5], &v[4], &v[5], v_bit);
+ butterfly_0112_neon(cospi, 16, u[7], u[6], &v[6], &v[7], v_bit);
- load_buffer_8x4(topL, out, stride, flipud, fliplr, v_shift);
- load_buffer_8x4(topR, out + 8, stride, flipud, fliplr, v_shift);
-}
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
-static INLINE void load_buffer_4x8(const int16_t *input, int32x4_t *out,
- int stride, int flipud, int fliplr,
- const int32x4_t *v_shift) {
- const int16_t *topL = input;
- const int16_t *botL = input + 4 * stride;
+ butterfly_0112_neon(cospi, 16, u[12], u[13], &v[12], &v[13], v_bit);
+ butterfly_0332_neon(cospi, 16, u[14], u[15], &v[15], &v[14], v_bit);
- const int16_t *tmp;
+ // stage 5
+ u[0] = vaddq_s32(v[0], v[4]);
+ u[1] = vaddq_s32(v[1], v[5]);
+ u[2] = vaddq_s32(v[2], v[6]);
+ u[3] = vsubq_s32(v[7], v[3]);
+ u[4] = vsubq_s32(v[0], v[4]);
+ u[5] = vsubq_s32(v[1], v[5]);
+ u[6] = vsubq_s32(v[2], v[6]);
+ u[7] = vaddq_s32(v[3], v[7]);
+ u[8] = vaddq_s32(v[8], v[12]);
+ u[9] = vaddq_s32(v[9], v[13]);
+ u[10] = vsubq_s32(v[14], v[10]);
+ u[11] = vaddq_s32(v[11], v[15]);
+ u[12] = vsubq_s32(v[8], v[12]);
+ u[13] = vsubq_s32(v[9], v[13]);
+ u[14] = vaddq_s32(v[10], v[14]);
+ u[15] = vsubq_s32(v[11], v[15]);
- if (flipud) {
- tmp = topL;
- topL = botL;
- botL = tmp;
- }
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ butterfly_0112_neon(cospi, 8, u[8], u[9], &v[8], &v[9], v_bit);
+ butterfly_0130_neon(cospi, 8, u[12], u[13], &v[13], &v[12], v_bit);
+ butterfly_0130_neon(cospi, 24, u[11], u[10], &v[10], &v[11], v_bit);
+ butterfly_0130_neon(cospi, 24, u[14], u[15], &v[14], &v[15], v_bit);
- load_buffer_4x4(topL, out, stride, flipud, fliplr, v_shift);
- load_buffer_4x4(botL, out + 4, stride, flipud, fliplr, v_shift);
-}
+ // stage 7
+ u[0] = vaddq_s32(v[0], v[8]);
+ u[1] = vaddq_s32(v[1], v[9]);
+ u[2] = vaddq_s32(v[2], v[10]);
+ u[3] = vaddq_s32(v[3], v[11]);
+ u[4] = vaddq_s32(v[4], v[12]);
+ u[5] = vaddq_s32(v[5], v[13]);
+ u[6] = vaddq_s32(v[6], v[14]);
+ u[7] = vsubq_s32(v[15], v[7]);
+ u[8] = vsubq_s32(v[0], v[8]);
+ u[9] = vsubq_s32(v[1], v[9]);
+ u[10] = vsubq_s32(v[2], v[10]);
+ u[11] = vsubq_s32(v[3], v[11]);
+ u[12] = vsubq_s32(v[4], v[12]);
+ u[13] = vsubq_s32(v[5], v[13]);
+ u[14] = vsubq_s32(v[6], v[14]);
+ u[15] = vaddq_s32(v[7], v[15]);
-#if !CONFIG_REALTIME_ONLY
-static INLINE void load_buffer_4x16(const int16_t *input, int32x4_t *out,
- const int stride, const int flipud,
- const int fliplr,
- const int32x4_t *v_shift) {
- const int16_t *topL = input;
- const int16_t *botL = input + 8 * stride;
-
- const int16_t *tmp;
-
- if (flipud) {
- tmp = topL;
- topL = botL;
- botL = tmp;
- }
- load_buffer_4x8(topL, out, stride, flipud, fliplr, v_shift);
- load_buffer_4x8(botL, out + 8, stride, flipud, fliplr, v_shift);
-}
-#endif
+ // stage 8
+ butterfly_0112_neon(cospi, 2, u[0], u[1], &v[0], &v[1], v_bit);
+ butterfly_0112_neon(cospi, 10, u[2], u[3], &v[2], &v[3], v_bit);
+ butterfly_0112_neon(cospi, 18, u[4], u[5], &v[4], &v[5], v_bit);
+ butterfly_0112_neon(cospi, 26, u[6], u[7], &v[6], &v[7], v_bit);
+ butterfly_0130_neon(cospi, 30, u[9], u[8], &v[8], &v[9], v_bit);
+ butterfly_0130_neon(cospi, 22, u[11], u[10], &v[10], &v[11], v_bit);
+ butterfly_0130_neon(cospi, 14, u[13], u[12], &v[12], &v[13], v_bit);
+ butterfly_0112_neon(cospi, 6, u[14], u[15], &v[15], &v[14], v_bit);
-static INLINE void load_buffer_32x8n(const int16_t *input, int32x4_t *out,
- int stride, int flipud, int fliplr,
- int shift, const int height) {
- const int16_t *in = input;
- int32x4_t *output = out;
- for (int col = 0; col < height; col++) {
- in = input + col * stride;
- output = out + col * 8;
- int32x4_t v_shift = vdupq_n_s32(shift);
- load_buffer_4x4(in, output, 4, flipud, fliplr, &v_shift);
- load_buffer_4x4((in + 16), (output + 4), 4, flipud, fliplr, &v_shift);
- }
-}
+ // stage 9
+ out[0] = v[1];
+ out[1] = v[14];
+ out[2] = v[3];
+ out[3] = v[12];
+ out[4] = v[5];
+ out[5] = v[10];
+ out[6] = v[7];
+ out[7] = v[8];
+ out[8] = v[9];
+ out[9] = v[6];
+ out[10] = v[11];
+ out[11] = v[4];
+ out[12] = v[13];
+ out[13] = v[2];
+ out[14] = v[15];
+ out[15] = v[0];
+}
+
+static void highbd_fidentity16_x4_neon(const int32x4_t *in, int32x4_t *out,
+ int bit) {
+ (void)bit;
+ const int32x4_t fact = vdupq_n_s32(2 * NewSqrt2);
+ const int32x4_t offset = vdupq_n_s32(1 << (NewSqrt2Bits - 1));
-static void fdct16x16_neon(int32x4_t *in, int32x4_t *out, int bit,
- const int col_num) {
- const int32_t *cospi = cospi_arr(bit);
- const int32x4_t v_bit = vdupq_n_s32(-bit);
- int32x4_t u[16], v[16];
- int col;
-
- // Calculate the column 0, 1, 2, 3
- for (col = 0; col < col_num; ++col) {
- // stage 0
- // stage 1
- u[0] = vaddq_s32(in[0 * col_num + col], in[15 * col_num + col]);
- u[15] = vsubq_s32(in[0 * col_num + col], in[15 * col_num + col]);
- u[1] = vaddq_s32(in[1 * col_num + col], in[14 * col_num + col]);
- u[14] = vsubq_s32(in[1 * col_num + col], in[14 * col_num + col]);
- u[2] = vaddq_s32(in[2 * col_num + col], in[13 * col_num + col]);
- u[13] = vsubq_s32(in[2 * col_num + col], in[13 * col_num + col]);
- u[3] = vaddq_s32(in[3 * col_num + col], in[12 * col_num + col]);
- u[12] = vsubq_s32(in[3 * col_num + col], in[12 * col_num + col]);
- u[4] = vaddq_s32(in[4 * col_num + col], in[11 * col_num + col]);
- u[11] = vsubq_s32(in[4 * col_num + col], in[11 * col_num + col]);
- u[5] = vaddq_s32(in[5 * col_num + col], in[10 * col_num + col]);
- u[10] = vsubq_s32(in[5 * col_num + col], in[10 * col_num + col]);
- u[6] = vaddq_s32(in[6 * col_num + col], in[9 * col_num + col]);
- u[9] = vsubq_s32(in[6 * col_num + col], in[9 * col_num + col]);
- u[7] = vaddq_s32(in[7 * col_num + col], in[8 * col_num + col]);
- u[8] = vsubq_s32(in[7 * col_num + col], in[8 * col_num + col]);
-
- // stage 2
- v[0] = vaddq_s32(u[0], u[7]);
- v[7] = vsubq_s32(u[0], u[7]);
- v[1] = vaddq_s32(u[1], u[6]);
- v[6] = vsubq_s32(u[1], u[6]);
- v[2] = vaddq_s32(u[2], u[5]);
- v[5] = vsubq_s32(u[2], u[5]);
- v[3] = vaddq_s32(u[3], u[4]);
- v[4] = vsubq_s32(u[3], u[4]);
- v[8] = u[8];
- v[9] = u[9];
-
- v[10] = vmulq_n_s32(u[13], cospi[32]);
- v[10] = vmlsq_n_s32(v[10], u[10], cospi[32]);
- v[10] = vrshlq_s32(v[10], v_bit);
-
- v[13] = vmulq_n_s32(u[10], cospi[32]);
- v[13] = vmlaq_n_s32(v[13], u[13], cospi[32]);
- v[13] = vrshlq_s32(v[13], v_bit);
-
- v[11] = vmulq_n_s32(u[12], cospi[32]);
- v[11] = vmlsq_n_s32(v[11], u[11], cospi[32]);
- v[11] = vrshlq_s32(v[11], v_bit);
-
- v[12] = vmulq_n_s32(u[11], cospi[32]);
- v[12] = vmlaq_n_s32(v[12], u[12], cospi[32]);
- v[12] = vrshlq_s32(v[12], v_bit);
- v[14] = u[14];
- v[15] = u[15];
-
- // stage 3
- u[0] = vaddq_s32(v[0], v[3]);
- u[3] = vsubq_s32(v[0], v[3]);
- u[1] = vaddq_s32(v[1], v[2]);
- u[2] = vsubq_s32(v[1], v[2]);
- u[4] = v[4];
-
- u[5] = vmulq_n_s32(v[6], cospi[32]);
- u[5] = vmlsq_n_s32(u[5], v[5], cospi[32]);
- u[5] = vrshlq_s32(u[5], v_bit);
-
- u[6] = vmulq_n_s32(v[5], cospi[32]);
- u[6] = vmlaq_n_s32(u[6], v[6], cospi[32]);
- u[6] = vrshlq_s32(u[6], v_bit);
-
- u[7] = v[7];
- u[8] = vaddq_s32(v[8], v[11]);
- u[11] = vsubq_s32(v[8], v[11]);
- u[9] = vaddq_s32(v[9], v[10]);
- u[10] = vsubq_s32(v[9], v[10]);
- u[12] = vsubq_s32(v[15], v[12]);
- u[15] = vaddq_s32(v[15], v[12]);
- u[13] = vsubq_s32(v[14], v[13]);
- u[14] = vaddq_s32(v[14], v[13]);
-
- // stage 4
- u[0] = vmulq_n_s32(u[0], cospi[32]);
- u[1] = vmulq_n_s32(u[1], cospi[32]);
- v[0] = vaddq_s32(u[0], u[1]);
- v[0] = vrshlq_s32(v[0], v_bit);
-
- v[1] = vsubq_s32(u[0], u[1]);
- v[1] = vrshlq_s32(v[1], v_bit);
-
- v[2] = vmulq_n_s32(u[2], cospi[48]);
- v[2] = vmlaq_n_s32(v[2], u[3], cospi[16]);
- v[2] = vrshlq_s32(v[2], v_bit);
-
- v[3] = vmulq_n_s32(u[3], cospi[48]);
- v[3] = vmlsq_n_s32(v[3], u[2], cospi[16]);
- v[3] = vrshlq_s32(v[3], v_bit);
-
- v[4] = vaddq_s32(u[4], u[5]);
- v[5] = vsubq_s32(u[4], u[5]);
- v[6] = vsubq_s32(u[7], u[6]);
- v[7] = vaddq_s32(u[7], u[6]);
- v[8] = u[8];
-
- v[9] = vmulq_n_s32(u[14], cospi[48]);
- v[9] = vmlsq_n_s32(v[9], u[9], cospi[16]);
- v[9] = vrshlq_s32(v[9], v_bit);
-
- v[14] = vmulq_n_s32(u[9], cospi[48]);
- v[14] = vmlaq_n_s32(v[14], u[14], cospi[16]);
- v[14] = vrshlq_s32(v[14], v_bit);
-
- v[10] = vmulq_n_s32(u[13], -cospi[16]);
- v[10] = vmlsq_n_s32(v[10], u[10], cospi[48]);
- v[10] = vrshlq_s32(v[10], v_bit);
-
- v[13] = vmulq_n_s32(u[10], -cospi[16]);
- v[13] = vmlaq_n_s32(v[13], u[13], cospi[48]);
- v[13] = vrshlq_s32(v[13], v_bit);
-
- v[11] = u[11];
- v[12] = u[12];
- v[15] = u[15];
-
- // stage 5
- u[0] = v[0];
- u[1] = v[1];
- u[2] = v[2];
- u[3] = v[3];
-
- u[4] = vmulq_n_s32(v[4], cospi[56]);
- u[4] = vmlaq_n_s32(u[4], v[7], cospi[8]);
- u[4] = vrshlq_s32(u[4], v_bit);
-
- u[7] = vmulq_n_s32(v[7], cospi[56]);
- u[7] = vmlsq_n_s32(u[7], v[4], cospi[8]);
- u[7] = vrshlq_s32(u[7], v_bit);
-
- u[5] = vmulq_n_s32(v[5], cospi[24]);
- u[5] = vmlaq_n_s32(u[5], v[6], cospi[40]);
- u[5] = vrshlq_s32(u[5], v_bit);
-
- u[6] = vmulq_n_s32(v[6], cospi[24]);
- u[6] = vmlsq_n_s32(u[6], v[5], cospi[40]);
- u[6] = vrshlq_s32(u[6], v_bit);
-
- u[8] = vaddq_s32(v[8], v[9]);
- u[9] = vsubq_s32(v[8], v[9]);
- u[10] = vsubq_s32(v[11], v[10]);
- u[11] = vaddq_s32(v[11], v[10]);
- u[12] = vaddq_s32(v[12], v[13]);
- u[13] = vsubq_s32(v[12], v[13]);
- u[14] = vsubq_s32(v[15], v[14]);
- u[15] = vaddq_s32(v[15], v[14]);
-
- // stage 6
- v[0] = u[0];
- v[1] = u[1];
- v[2] = u[2];
- v[3] = u[3];
- v[4] = u[4];
- v[5] = u[5];
- v[6] = u[6];
- v[7] = u[7];
-
- v[8] = vmulq_n_s32(u[8], cospi[60]);
- v[8] = vmlaq_n_s32(v[8], u[15], cospi[4]);
- v[8] = vrshlq_s32(v[8], v_bit);
-
- v[15] = vmulq_n_s32(u[15], cospi[60]);
- v[15] = vmlsq_n_s32(v[15], u[8], cospi[4]);
- v[15] = vrshlq_s32(v[15], v_bit);
-
- v[9] = vmulq_n_s32(u[9], cospi[28]);
- v[9] = vmlaq_n_s32(v[9], u[14], cospi[36]);
- v[9] = vrshlq_s32(v[9], v_bit);
-
- v[14] = vmulq_n_s32(u[14], cospi[28]);
- v[14] = vmlsq_n_s32(v[14], u[9], cospi[36]);
- v[14] = vrshlq_s32(v[14], v_bit);
-
- v[10] = vmulq_n_s32(u[10], cospi[44]);
- v[10] = vmlaq_n_s32(v[10], u[13], cospi[20]);
- v[10] = vrshlq_s32(v[10], v_bit);
-
- v[13] = vmulq_n_s32(u[13], cospi[44]);
- v[13] = vmlsq_n_s32(v[13], u[10], cospi[20]);
- v[13] = vrshlq_s32(v[13], v_bit);
-
- v[11] = vmulq_n_s32(u[11], cospi[12]);
- v[11] = vmlaq_n_s32(v[11], u[12], cospi[52]);
- v[11] = vrshlq_s32(v[11], v_bit);
-
- v[12] = vmulq_n_s32(u[12], cospi[12]);
- v[12] = vmlsq_n_s32(v[12], u[11], cospi[52]);
- v[12] = vrshlq_s32(v[12], v_bit);
-
- out[0 * col_num + col] = v[0];
- out[1 * col_num + col] = v[8];
- out[2 * col_num + col] = v[4];
- out[3 * col_num + col] = v[12];
- out[4 * col_num + col] = v[2];
- out[5 * col_num + col] = v[10];
- out[6 * col_num + col] = v[6];
- out[7 * col_num + col] = v[14];
- out[8 * col_num + col] = v[1];
- out[9 * col_num + col] = v[9];
- out[10 * col_num + col] = v[5];
- out[11 * col_num + col] = v[13];
- out[12 * col_num + col] = v[3];
- out[13 * col_num + col] = v[11];
- out[14 * col_num + col] = v[7];
- out[15 * col_num + col] = v[15];
+ for (int i = 0; i < 16; i++) {
+ int32x4_t a = vmulq_s32(in[i], fact);
+ a = vaddq_s32(a, offset);
+ out[i] = vshrq_n_s32(a, NewSqrt2Bits);
}
}
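
// A 16-point identity transform scales by 2 * sqrt(2), which has no exact
// integer form. Assuming the usual av1_txfm.h constants NewSqrt2 == 5793
// (about sqrt(2) * 2^12) and NewSqrt2Bits == 12, each lane above computes
//   out = (in * 2 * 5793 + 2048) >> 12
// so in == 100 gives (1158600 + 2048) >> 12 == 283, i.e. round(100 * 2.8284).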
-static void fadst16x16_neon(int32x4_t *in, int32x4_t *out, int bit,
- const int num_cols) {
- const int32_t *cospi = cospi_arr(bit);
-
- const int32x4_t v_bit = vdupq_n_s32(-bit);
-
- int32x4_t u[16], v[16], x, y;
- int col;
-
- for (col = 0; col < num_cols; ++col) {
- // stage 0-1
- u[0] = in[0 * num_cols + col];
- u[1] = vnegq_s32(in[15 * num_cols + col]);
- u[2] = vnegq_s32(in[7 * num_cols + col]);
- u[3] = in[8 * num_cols + col];
- u[4] = vnegq_s32(in[3 * num_cols + col]);
- u[5] = in[12 * num_cols + col];
- u[6] = in[4 * num_cols + col];
- u[7] = vnegq_s32(in[11 * num_cols + col]);
- u[8] = vnegq_s32(in[1 * num_cols + col]);
- u[9] = in[14 * num_cols + col];
- u[10] = in[6 * num_cols + col];
- u[11] = vnegq_s32(in[9 * num_cols + col]);
- u[12] = in[2 * num_cols + col];
- u[13] = vnegq_s32(in[13 * num_cols + col]);
- u[14] = vnegq_s32(in[5 * num_cols + col]);
- u[15] = in[10 * num_cols + col];
-
- // stage 2
- v[0] = u[0];
- v[1] = u[1];
-
- x = vmulq_n_s32(u[2], cospi[32]);
- y = vmulq_n_s32(u[3], cospi[32]);
- v[2] = vaddq_s32(x, y);
- v[2] = vrshlq_s32(v[2], v_bit);
-
- v[3] = vsubq_s32(x, y);
- v[3] = vrshlq_s32(v[3], v_bit);
-
- v[4] = u[4];
- v[5] = u[5];
-
- x = vmulq_n_s32(u[6], cospi[32]);
- y = vmulq_n_s32(u[7], cospi[32]);
- v[6] = vaddq_s32(x, y);
- v[6] = vrshlq_s32(v[6], v_bit);
-
- v[7] = vsubq_s32(x, y);
- v[7] = vrshlq_s32(v[7], v_bit);
-
- v[8] = u[8];
- v[9] = u[9];
-
- x = vmulq_n_s32(u[10], cospi[32]);
- y = vmulq_n_s32(u[11], cospi[32]);
- v[10] = vaddq_s32(x, y);
- v[10] = vrshlq_s32(v[10], v_bit);
-
- v[11] = vsubq_s32(x, y);
- v[11] = vrshlq_s32(v[11], v_bit);
-
- v[12] = u[12];
- v[13] = u[13];
-
- x = vmulq_n_s32(u[14], cospi[32]);
- y = vmulq_n_s32(u[15], cospi[32]);
- v[14] = vaddq_s32(x, y);
- v[14] = vrshlq_s32(v[14], v_bit);
-
- v[15] = vsubq_s32(x, y);
- v[15] = vrshlq_s32(v[15], v_bit);
-
- // stage 3
- u[0] = vaddq_s32(v[0], v[2]);
- u[1] = vaddq_s32(v[1], v[3]);
- u[2] = vsubq_s32(v[0], v[2]);
- u[3] = vsubq_s32(v[1], v[3]);
- u[4] = vaddq_s32(v[4], v[6]);
- u[5] = vaddq_s32(v[5], v[7]);
- u[6] = vsubq_s32(v[4], v[6]);
- u[7] = vsubq_s32(v[5], v[7]);
- u[8] = vaddq_s32(v[8], v[10]);
- u[9] = vaddq_s32(v[9], v[11]);
- u[10] = vsubq_s32(v[8], v[10]);
- u[11] = vsubq_s32(v[9], v[11]);
- u[12] = vaddq_s32(v[12], v[14]);
- u[13] = vaddq_s32(v[13], v[15]);
- u[14] = vsubq_s32(v[12], v[14]);
- u[15] = vsubq_s32(v[13], v[15]);
-
- // stage 4
- v[0] = u[0];
- v[1] = u[1];
- v[2] = u[2];
- v[3] = u[3];
- v[4] = half_btf_neon(&cospi[16], &u[4], &cospi[48], &u[5], v_bit);
- v[7] = half_btf_neon(&cospi[16], &u[6], &cospi[48], &u[7], v_bit);
- v[5] = half_btf_neon_m(&cospi[48], &u[4], &cospi[16], &u[5], v_bit);
- v[6] = half_btf_neon_m(&cospi[16], &u[7], &cospi[48], &u[6], v_bit);
-
- v[8] = u[8];
- v[9] = u[9];
- v[10] = u[10];
- v[11] = u[11];
-
- v[12] = half_btf_neon(&cospi[16], &u[12], &cospi[48], &u[13], v_bit);
- v[15] = half_btf_neon(&cospi[16], &u[14], &cospi[48], &u[15], v_bit);
- v[13] = half_btf_neon_m(&cospi[48], &u[12], &cospi[16], &u[13], v_bit);
- v[14] = half_btf_neon_m(&cospi[16], &u[15], &cospi[48], &u[14], v_bit);
-
- // stage 5
- u[0] = vaddq_s32(v[0], v[4]);
- u[1] = vaddq_s32(v[1], v[5]);
- u[2] = vaddq_s32(v[2], v[6]);
- u[3] = vaddq_s32(v[3], v[7]);
- u[4] = vsubq_s32(v[0], v[4]);
- u[5] = vsubq_s32(v[1], v[5]);
- u[6] = vsubq_s32(v[2], v[6]);
- u[7] = vsubq_s32(v[3], v[7]);
- u[8] = vaddq_s32(v[8], v[12]);
- u[9] = vaddq_s32(v[9], v[13]);
- u[10] = vaddq_s32(v[10], v[14]);
- u[11] = vaddq_s32(v[11], v[15]);
- u[12] = vsubq_s32(v[8], v[12]);
- u[13] = vsubq_s32(v[9], v[13]);
- u[14] = vsubq_s32(v[10], v[14]);
- u[15] = vsubq_s32(v[11], v[15]);
-
- // stage 6
- v[0] = u[0];
- v[1] = u[1];
- v[2] = u[2];
- v[3] = u[3];
- v[4] = u[4];
- v[5] = u[5];
- v[6] = u[6];
- v[7] = u[7];
-
- v[8] = half_btf_neon(&cospi[8], &u[8], &cospi[56], &u[9], v_bit);
- v[13] = half_btf_neon(&cospi[8], &u[12], &cospi[56], &u[13], v_bit);
- v[9] = half_btf_neon_m(&cospi[56], &u[8], &cospi[8], &u[9], v_bit);
- v[12] = half_btf_neon_m(&cospi[8], &u[13], &cospi[56], &u[12], v_bit);
-
- v[10] = half_btf_neon(&cospi[40], &u[10], &cospi[24], &u[11], v_bit);
- v[15] = half_btf_neon(&cospi[40], &u[14], &cospi[24], &u[15], v_bit);
- v[11] = half_btf_neon_m(&cospi[24], &u[10], &cospi[40], &u[11], v_bit);
- v[14] = half_btf_neon_m(&cospi[40], &u[15], &cospi[24], &u[14], v_bit);
-
- // stage 7
- u[0] = vaddq_s32(v[0], v[8]);
- u[1] = vaddq_s32(v[1], v[9]);
- u[2] = vaddq_s32(v[2], v[10]);
- u[3] = vaddq_s32(v[3], v[11]);
- u[4] = vaddq_s32(v[4], v[12]);
- u[5] = vaddq_s32(v[5], v[13]);
- u[6] = vaddq_s32(v[6], v[14]);
- u[7] = vaddq_s32(v[7], v[15]);
- u[8] = vsubq_s32(v[0], v[8]);
- u[9] = vsubq_s32(v[1], v[9]);
- u[10] = vsubq_s32(v[2], v[10]);
- u[11] = vsubq_s32(v[3], v[11]);
- u[12] = vsubq_s32(v[4], v[12]);
- u[13] = vsubq_s32(v[5], v[13]);
- u[14] = vsubq_s32(v[6], v[14]);
- u[15] = vsubq_s32(v[7], v[15]);
-
- // stage 8
- v[0] = half_btf_neon(&cospi[2], &u[0], &cospi[62], &u[1], v_bit);
- v[1] = half_btf_neon_m(&cospi[62], &u[0], &cospi[2], &u[1], v_bit);
- v[2] = half_btf_neon(&cospi[10], &u[2], &cospi[54], &u[3], v_bit);
- v[3] = half_btf_neon_m(&cospi[54], &u[2], &cospi[10], &u[3], v_bit);
- v[4] = half_btf_neon(&cospi[18], &u[4], &cospi[46], &u[5], v_bit);
- v[5] = half_btf_neon_m(&cospi[46], &u[4], &cospi[18], &u[5], v_bit);
- v[6] = half_btf_neon(&cospi[26], &u[6], &cospi[38], &u[7], v_bit);
- v[7] = half_btf_neon_m(&cospi[38], &u[6], &cospi[26], &u[7], v_bit);
- v[8] = half_btf_neon(&cospi[34], &u[8], &cospi[30], &u[9], v_bit);
- v[9] = half_btf_neon_m(&cospi[30], &u[8], &cospi[34], &u[9], v_bit);
- v[10] = half_btf_neon(&cospi[42], &u[10], &cospi[22], &u[11], v_bit);
- v[11] = half_btf_neon_m(&cospi[22], &u[10], &cospi[42], &u[11], v_bit);
- v[12] = half_btf_neon(&cospi[50], &u[12], &cospi[14], &u[13], v_bit);
- v[13] = half_btf_neon_m(&cospi[14], &u[12], &cospi[50], &u[13], v_bit);
- v[14] = half_btf_neon(&cospi[58], &u[14], &cospi[6], &u[15], v_bit);
- v[15] = half_btf_neon_m(&cospi[6], &u[14], &cospi[58], &u[15], v_bit);
-
- // stage 9
- out[0 * num_cols + col] = v[1];
- out[1 * num_cols + col] = v[14];
- out[2 * num_cols + col] = v[3];
- out[3 * num_cols + col] = v[12];
- out[4 * num_cols + col] = v[5];
- out[5 * num_cols + col] = v[10];
- out[6 * num_cols + col] = v[7];
- out[7 * num_cols + col] = v[8];
- out[8 * num_cols + col] = v[9];
- out[9 * num_cols + col] = v[6];
- out[10 * num_cols + col] = v[11];
- out[11 * num_cols + col] = v[4];
- out[12 * num_cols + col] = v[13];
- out[13 * num_cols + col] = v[2];
- out[14 * num_cols + col] = v[15];
- out[15 * num_cols + col] = v[0];
- }
+static void highbd_fdct16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit,
+ const int howmany) {
+ const int stride = 16;
+ int i = 0;
+ do {
+ highbd_fdct16_x4_neon(in + i * stride, out + i * stride, bit);
+ } while (++i < howmany);
}
-static void col_txfm_16x16_rounding(int32x4_t *in, const int32x4_t *v_shift) {
- // Note:
- // We split 16x16 rounding into 4 sections of 8x8 rounding,
- // instead of 4 columns
- col_txfm_8x8_rounding(&in[0], v_shift);
- col_txfm_8x8_rounding(&in[16], v_shift);
- col_txfm_8x8_rounding(&in[32], v_shift);
- col_txfm_8x8_rounding(&in[48], v_shift);
+static void highbd_fadst16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit,
+ int howmany) {
+ const int stride = 16;
+ int i = 0;
+ do {
+ highbd_fadst16_x4_neon(in + i * stride, out + i * stride, bit);
+ } while (++i < howmany);
}
-static void col_txfm_8x16_rounding(int32x4_t *in, const int32x4_t *v_shift) {
- col_txfm_8x8_rounding(&in[0], v_shift);
- col_txfm_8x8_rounding(&in[16], v_shift);
+static void highbd_fidentity16_xn_neon(const int32x4_t *in, int32x4_t *out,
+ int bit, int howmany) {
+ const int stride = 16;
+ int i = 0;
+ do {
+ highbd_fidentity16_x4_neon(in + i * stride, out + i * stride, bit);
+ } while (++i < howmany);
}
-static void write_buffer_16x16(const int32x4_t *in, int32_t *output) {
- const int size_8x8 = 16 * 4;
- write_buffer_8x8(&in[0], output);
- output += size_8x8;
- write_buffer_8x8(&in[16], output);
- output += size_8x8;
- write_buffer_8x8(&in[32], output);
- output += size_8x8;
- write_buffer_8x8(&in[48], output);
-}
-static void idtx16x16_neon(int32x4_t *in, int32x4_t *out, int bit,
- int col_num) {
- (void)bit;
- int32x4_t fact = vdupq_n_s32(2 * NewSqrt2);
- int32x4_t offset = vdupq_n_s32(1 << (NewSqrt2Bits - 1));
- int32x4_t a_low;
-
- int num_iters = 16 * col_num;
- for (int i = 0; i < num_iters; i++) {
- a_low = vmulq_s32(in[i], fact);
- a_low = vaddq_s32(a_low, offset);
- out[i] = vshrq_n_s32(a_low, NewSqrt2Bits);
- }
-}
void av1_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) {
- int32x4_t in[64], out[64];
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16];
- const int txw_idx = get_txw_idx(TX_16X16);
- const int txh_idx = get_txh_idx(TX_16X16);
- const int col_num = 4;
- const int32x4_t v_shift = vdupq_n_s32(shift[1]);
+ (void)bd;
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+
+ // Workspaces for column/row-wise transforms.
+ int32x4_t buf0[64], buf1[64];
+
switch (tx_type) {
case DCT_DCT:
- load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
- fdct16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
- col_txfm_16x16_rounding(out, &v_shift);
- transpose_16x16(out, in);
- fdct16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
- write_buffer_16x16(out, coeff);
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
break;
case ADST_DCT:
- load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
- fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
- col_txfm_16x16_rounding(out, &v_shift);
- transpose_16x16(out, in);
- fdct16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
- write_buffer_16x16(out, coeff);
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
break;
case DCT_ADST:
- load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
- fdct16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
- col_txfm_16x16_rounding(out, &v_shift);
- transpose_16x16(out, in);
- fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
- write_buffer_16x16(out, coeff);
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
break;
case ADST_ADST:
- load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
- fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
- col_txfm_16x16_rounding(out, &v_shift);
- transpose_16x16(out, in);
- fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
- write_buffer_16x16(out, coeff);
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
break;
case FLIPADST_DCT:
- load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
- fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
- col_txfm_16x16_rounding(out, &v_shift);
- transpose_16x16(out, in);
- fdct16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
- write_buffer_16x16(out, coeff);
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
break;
case DCT_FLIPADST:
- load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
- fdct16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
- col_txfm_16x16_rounding(out, &v_shift);
- transpose_16x16(out, in);
- fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
- write_buffer_16x16(out, coeff);
+ load_buffer_16x16(input, buf0, stride, 1);
+ highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
break;
case FLIPADST_FLIPADST:
- load_buffer_16x16(input, in, stride, 1, 1, shift[0]);
- fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
- col_txfm_16x16_rounding(out, &v_shift);
- transpose_16x16(out, in);
- fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
- write_buffer_16x16(out, coeff);
+ load_buffer_16x16(input, buf0, stride, 1);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
break;
case ADST_FLIPADST:
- load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
- fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
- col_txfm_16x16_rounding(out, &v_shift);
- transpose_16x16(out, in);
- fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
- write_buffer_16x16(out, coeff);
+ load_buffer_16x16(input, buf0, stride, 1);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
break;
case FLIPADST_ADST:
- load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
- fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
- col_txfm_16x16_rounding(out, &v_shift);
- transpose_16x16(out, in);
- fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
- write_buffer_16x16(out, coeff);
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
break;
case IDTX:
- load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
- idtx16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
- col_txfm_16x16_rounding(out, &v_shift);
- transpose_16x16(out, in);
- idtx16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
- write_buffer_16x16(out, coeff);
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
break;
case V_DCT:
- load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
- fdct16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
- col_txfm_16x16_rounding(out, &v_shift);
- transpose_16x16(out, in);
- idtx16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
- write_buffer_16x16(out, coeff);
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
break;
case H_DCT:
- load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
- idtx16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
- col_txfm_16x16_rounding(out, &v_shift);
- transpose_16x16(out, in);
- fdct16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
- write_buffer_16x16(out, coeff);
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
break;
case V_ADST:
- load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
- fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
- col_txfm_16x16_rounding(out, &v_shift);
- transpose_16x16(out, in);
- idtx16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
- write_buffer_16x16(out, coeff);
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
break;
case H_ADST:
- load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
- idtx16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
- col_txfm_16x16_rounding(out, &v_shift);
- transpose_16x16(out, in);
- fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
- write_buffer_16x16(out, coeff);
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
break;
case V_FLIPADST:
- load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
- fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
- col_txfm_16x16_rounding(out, &v_shift);
- transpose_16x16(out, in);
- idtx16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
- write_buffer_16x16(out, coeff);
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
break;
case H_FLIPADST:
- load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
- idtx16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
- col_txfm_16x16_rounding(out, &v_shift);
- transpose_16x16(out, in);
- fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
- write_buffer_16x16(out, coeff);
+ load_buffer_16x16(input, buf0, stride, 1);
+ highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
break;
default: assert(0);
}
- (void)bd;
}
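
// Same five-step pipeline as the 8x8 case above, with TX_16X16 parameters:
// four 4-column strips per pass (howmany == 4), a 2-bit intermediate rounding
// shift (shift_right_2_round_s32_x4(), matching shift[1] == -2 for TX_16X16),
// and cos-bit indices [2][2] == get_txw_idx(TX_16X16)/get_txh_idx(TX_16X16).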
-static INLINE void flip_buf_neon(int32x4_t *in, int32x4_t *out, int size) {
- for (int i = 0; i < size; i += 2) in[30 - i] = out[i];
- for (int i = 1; i < size; i += 2) in[size - i] = out[i];
-}
+typedef void (*fwd_transform_1d_col_neon)(const int16_t *in, int32x4_t *out,
+ int stride, int bit, int lr_flip);
+typedef void (*fwd_transform_1d_col_many_neon)(const int16_t *in,
+ int32x4_t *out, int stride,
+ int bit, int lr_flip,
+ int howmany, int hm_stride);
+
+typedef void (*fwd_transform_1d_row_neon)(const int32x4_t *in, int32_t *out,
+ int bit, int stride);
+typedef void (*fwd_transform_1d_row_many_neon)(const int32x4_t *in,
+ int32_t *out, int bit,
+ int howmany, int hm_stride,
+ int stride);
+
+// Construct component kernels that include the load_buffer and store_buffer
+// stages to avoid the need to spill loaded data to the stack between these and
+// the txfm kernel calls.
+// The TRANSFORM_*_ONE variants are only ever called where the howmany
+// parameter would be one, so they omit the loop entirely; an expansion of one
+// of the _MANY variants is sketched after the instantiations below.
+
+#define TRANSFORM_COL_ONE(name, n) \
+ static void highbd_##name##_col_neon(const int16_t *input, \
+ int32x4_t *output, int stride, \
+ int cos_bit, int lr_flip) { \
+ int32x4_t buf0[n]; \
+ load_buffer_4x##n(input, buf0, stride, lr_flip); \
+ highbd_##name##_x4_neon(buf0, output, cos_bit); \
+ }
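+
+// For reference, TRANSFORM_COL_ONE(fdct8, 8) expands to a kernel equivalent
+// to:
+//
+//   static void highbd_fdct8_col_neon(const int16_t *input,
+//                                     int32x4_t *output, int stride,
+//                                     int cos_bit, int lr_flip) {
+//     int32x4_t buf0[8];
+//     load_buffer_4x8(input, buf0, stride, lr_flip);
+//     highbd_fdct8_x4_neon(buf0, output, cos_bit);
+//   }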
+
+#define TRANSFORM_COL_MANY(name, n) \
+ static void highbd_##name##_col_many_neon( \
+ const int16_t *input, int32x4_t *output, int stride, int cos_bit, \
+ int lr_flip, int howmany, int hm_stride) { \
+ int i = 0; \
+ do { \
+ int32x4_t buf0[n]; \
+ load_buffer_4x##n(input + 4 * i, buf0, stride, lr_flip); \
+ highbd_##name##_x4_neon(buf0, output + i * hm_stride, cos_bit); \
+ } while (++i < howmany); \
+ }
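+
+// In the *_MANY variants, howmany is the number of adjacent 4-wide groups to
+// process and hm_stride is the int32x4_t offset between consecutive groups:
+// the column kernels step the input by 4 samples and the output by
+// hm_stride, while the row kernels below do the reverse.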
+
+#define TRANSFORM_ROW_ONE(name, n) \
+ static void highbd_##name##_row_neon( \
+ const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \
+ int32x4_t buf0[n]; \
+ highbd_##name##_x4_neon(input, buf0, cos_bit); \
+ store_buffer_##n##x4(buf0, output, stride); \
+ }
+
+#define TRANSFORM_ROW_RECT_ONE(name, n) \
+ static void highbd_##name##_row_rect_neon( \
+ const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \
+ int32x4_t buf0[n]; \
+ highbd_##name##_x4_neon(input, buf0, cos_bit); \
+ round_rect_array_s32_neon(buf0, buf0, (n)); \
+ store_buffer_##n##x4(buf0, output, stride); \
+ }
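+
+// The *_RECT variants also call round_rect_array_s32_neon, which applies the
+// NewSqrt2-based rounding multiply that, combined with the stage shifts,
+// gives rectangular (non-square) blocks their 1/sqrt(2) scaling correction.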
+
+#define TRANSFORM_ROW_MANY(name, n) \
+ static void highbd_##name##_row_many_neon( \
+ const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \
+ int hm_stride, int stride) { \
+ int i = 0; \
+ do { \
+ int32x4_t buf0[n]; \
+ highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit); \
+ store_buffer_##n##x4(buf0, output + 4 * i, stride); \
+ } while (++i < howmany); \
+ }
+
+#define TRANSFORM_ROW_RECT_MANY(name, n) \
+ static void highbd_##name##_row_rect_many_neon( \
+ const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \
+ int hm_stride, int stride) { \
+ int i = 0; \
+ do { \
+ int32x4_t buf0[n]; \
+ highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit); \
+ round_rect_array_s32_neon(buf0, buf0, (n)); \
+ store_buffer_##n##x4(buf0, output + 4 * i, stride); \
+ } while (++i < howmany); \
+ }
+
+TRANSFORM_COL_ONE(fdct8, 8)
+TRANSFORM_COL_ONE(fadst8, 8)
+TRANSFORM_COL_ONE(fidentity8, 8)
+
+TRANSFORM_COL_MANY(fdct4, 4)
+TRANSFORM_COL_MANY(fdct8, 8)
+TRANSFORM_COL_MANY(fdct16, 16)
+TRANSFORM_COL_MANY(fadst4, 4)
+TRANSFORM_COL_MANY(fadst8, 8)
+TRANSFORM_COL_MANY(fadst16, 16)
+TRANSFORM_COL_MANY(fidentity4, 4)
+TRANSFORM_COL_MANY(fidentity8, 8)
+TRANSFORM_COL_MANY(fidentity16, 16)
+
+TRANSFORM_ROW_ONE(fdct16, 16)
+TRANSFORM_ROW_ONE(fadst16, 16)
+TRANSFORM_ROW_ONE(fidentity16, 16)
+
+TRANSFORM_ROW_RECT_ONE(fdct8, 8)
+TRANSFORM_ROW_RECT_ONE(fadst8, 8)
+TRANSFORM_ROW_RECT_ONE(fidentity8, 8)
-typedef void (*fwd_transform_1d_neon)(int32x4_t *in, int32x4_t *out, int bit,
- const int num_cols);
-
-static const fwd_transform_1d_neon col_highbd_txfm8x8_arr[TX_TYPES] = {
- fdct8x8_neon, // DCT_DCT
- fadst8x8_neon, // ADST_DCT
- fdct8x8_neon, // DCT_ADST
- fadst8x8_neon, // ADST_ADST
- fadst8x8_neon, // FLIPADST_DCT
- fdct8x8_neon, // DCT_FLIPADST
- fadst8x8_neon, // FLIPADST_FLIPADST
- fadst8x8_neon, // ADST_FLIPADST
- fadst8x8_neon, // FLIPADST_ADST
- idtx8x8_neon, // IDTX
- fdct8x8_neon, // V_DCT
- idtx8x8_neon, // H_DCT
- fadst8x8_neon, // V_ADST
- idtx8x8_neon, // H_ADST
- fadst8x8_neon, // V_FLIPADST
- idtx8x8_neon // H_FLIPADST
-};
#if !CONFIG_REALTIME_ONLY
-static const fwd_transform_1d_neon row_highbd_txfm32x8_arr[TX_TYPES] = {
- fdct8x8_neon, // DCT_DCT
- NULL, // ADST_DCT
- NULL, // DCT_ADST
- NULL, // ADST_ADST
- NULL, // FLIPADST_DCT
- NULL, // DCT_FLIPADST
- NULL, // FLIPADST_FLIPADST
- NULL, // ADST_FLIPADST
- NULL, // FLIPADST_ADST
- idtx32x8_neon, // IDTX
- NULL, // V_DCT
- NULL, // H_DCT
- NULL, // V_ADST
- NULL, // H_ADST
- NULL, // V_FLIPADST
- NULL, // H_FLIPADST
-};
+TRANSFORM_ROW_MANY(fdct4, 4)
+TRANSFORM_ROW_MANY(fdct8, 8)
+TRANSFORM_ROW_MANY(fadst4, 4)
+TRANSFORM_ROW_MANY(fadst8, 8)
+TRANSFORM_ROW_MANY(fidentity4, 4)
+TRANSFORM_ROW_MANY(fidentity8, 8)
#endif
-static const fwd_transform_1d_neon col_highbd_txfm4x8_arr[TX_TYPES] = {
- fdct4x8_neon, // DCT_DCT
- fadst8x8_neon, // ADST_DCT
- fdct4x8_neon, // DCT_ADST
- fadst8x8_neon, // ADST_ADST
- fadst8x8_neon, // FLIPADST_DCT
- fdct4x8_neon, // DCT_FLIPADST
- fadst8x8_neon, // FLIPADST_FLIPADST
- fadst8x8_neon, // ADST_FLIPADST
- fadst8x8_neon, // FLIPADST_ADST
- idtx8x8_neon, // IDTX
- fdct4x8_neon, // V_DCT
- idtx8x8_neon, // H_DCT
- fadst8x8_neon, // V_ADST
- idtx8x8_neon, // H_ADST
- fadst8x8_neon, // V_FLIPADST
- idtx8x8_neon // H_FLIPADST
-};
-static const fwd_transform_1d_neon row_highbd_txfm8x16_arr[TX_TYPES] = {
- fdct16x16_neon, // DCT_DCT
- fdct16x16_neon, // ADST_DCT
- fadst16x16_neon, // DCT_ADST
- fadst16x16_neon, // ADST_ADST
- fdct16x16_neon, // FLIPADST_DCT
- fadst16x16_neon, // DCT_FLIPADST
- fadst16x16_neon, // FLIPADST_FLIPADST
- fadst16x16_neon, // ADST_FLIPADST
- fadst16x16_neon, // FLIPADST_ADST
- idtx16x16_neon, // IDTX
- idtx16x16_neon, // V_DCT
- fdct16x16_neon, // H_DCT
- idtx16x16_neon, // V_ADST
- fadst16x16_neon, // H_ADST
- idtx16x16_neon, // V_FLIPADST
- fadst16x16_neon // H_FLIPADST
+TRANSFORM_ROW_RECT_MANY(fdct4, 4)
+TRANSFORM_ROW_RECT_MANY(fdct8, 8)
+TRANSFORM_ROW_RECT_MANY(fdct16, 16)
+TRANSFORM_ROW_RECT_MANY(fadst4, 4)
+TRANSFORM_ROW_RECT_MANY(fadst8, 8)
+TRANSFORM_ROW_RECT_MANY(fadst16, 16)
+TRANSFORM_ROW_RECT_MANY(fidentity4, 4)
+TRANSFORM_ROW_RECT_MANY(fidentity8, 8)
+TRANSFORM_ROW_RECT_MANY(fidentity16, 16)
+
+static const fwd_transform_1d_col_many_neon
+ col_highbd_txfm8_xn_arr[TX_TYPES] = {
+ highbd_fdct8_col_many_neon, // DCT_DCT
+ highbd_fadst8_col_many_neon, // ADST_DCT
+ highbd_fdct8_col_many_neon, // DCT_ADST
+ highbd_fadst8_col_many_neon, // ADST_ADST
+ highbd_fadst8_col_many_neon, // FLIPADST_DCT
+ highbd_fdct8_col_many_neon, // DCT_FLIPADST
+ highbd_fadst8_col_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst8_col_many_neon, // ADST_FLIPADST
+ highbd_fadst8_col_many_neon, // FLIPADST_ADST
+ highbd_fidentity8_col_many_neon, // IDTX
+ highbd_fdct8_col_many_neon, // V_DCT
+ highbd_fidentity8_col_many_neon, // H_DCT
+ highbd_fadst8_col_many_neon, // V_ADST
+ highbd_fidentity8_col_many_neon, // H_ADST
+ highbd_fadst8_col_many_neon, // V_FLIPADST
+ highbd_fidentity8_col_many_neon // H_FLIPADST
+ };
+
+static const fwd_transform_1d_col_neon col_highbd_txfm8_x4_arr[TX_TYPES] = {
+ highbd_fdct8_col_neon, // DCT_DCT
+ highbd_fadst8_col_neon, // ADST_DCT
+ highbd_fdct8_col_neon, // DCT_ADST
+ highbd_fadst8_col_neon, // ADST_ADST
+ highbd_fadst8_col_neon, // FLIPADST_DCT
+ highbd_fdct8_col_neon, // DCT_FLIPADST
+ highbd_fadst8_col_neon, // FLIPADST_FLIPADST
+ highbd_fadst8_col_neon, // ADST_FLIPADST
+ highbd_fadst8_col_neon, // FLIPADST_ADST
+ highbd_fidentity8_col_neon, // IDTX
+ highbd_fdct8_col_neon, // V_DCT
+ highbd_fidentity8_col_neon, // H_DCT
+ highbd_fadst8_col_neon, // V_ADST
+ highbd_fidentity8_col_neon, // H_ADST
+ highbd_fadst8_col_neon, // V_FLIPADST
+ highbd_fidentity8_col_neon // H_FLIPADST
};
-static const fwd_transform_1d_neon col_highbd_txfm8x16_arr[TX_TYPES] = {
- fdct16x16_neon, // DCT_DCT
- fadst16x16_neon, // ADST_DCT
- fdct16x16_neon, // DCT_ADST
- fadst16x16_neon, // ADST_ADST
- fadst16x16_neon, // FLIPADST_DCT
- fdct16x16_neon, // DCT_FLIPADST
- fadst16x16_neon, // FLIPADST_FLIPADST
- fadst16x16_neon, // ADST_FLIPADST
- fadst16x16_neon, // FLIPADST_ADST
- idtx16x16_neon, // IDTX
- fdct16x16_neon, // V_DCT
- idtx16x16_neon, // H_DCT
- fadst16x16_neon, // V_ADST
- idtx16x16_neon, // H_ADST
- fadst16x16_neon, // V_FLIPADST
- idtx16x16_neon // H_FLIPADST
-};
-static const fwd_transform_1d_neon row_highbd_txfm8x8_arr[TX_TYPES] = {
- fdct8x8_neon, // DCT_DCT
- fdct8x8_neon, // ADST_DCT
- fadst8x8_neon, // DCT_ADST
- fadst8x8_neon, // ADST_ADST
- fdct8x8_neon, // FLIPADST_DCT
- fadst8x8_neon, // DCT_FLIPADST
- fadst8x8_neon, // FLIPADST_FLIPADST
- fadst8x8_neon, // ADST_FLIPADST
- fadst8x8_neon, // FLIPADST_ADST
- idtx8x8_neon, // IDTX
- idtx8x8_neon, // V_DCT
- fdct8x8_neon, // H_DCT
- idtx8x8_neon, // V_ADST
- fadst8x8_neon, // H_ADST
- idtx8x8_neon, // V_FLIPADST
- fadst8x8_neon // H_FLIPADST
+static const fwd_transform_1d_col_many_neon
+ col_highbd_txfm16_xn_arr[TX_TYPES] = {
+ highbd_fdct16_col_many_neon, // DCT_DCT
+ highbd_fadst16_col_many_neon, // ADST_DCT
+ highbd_fdct16_col_many_neon, // DCT_ADST
+ highbd_fadst16_col_many_neon, // ADST_ADST
+ highbd_fadst16_col_many_neon, // FLIPADST_DCT
+ highbd_fdct16_col_many_neon, // DCT_FLIPADST
+ highbd_fadst16_col_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst16_col_many_neon, // ADST_FLIPADST
+ highbd_fadst16_col_many_neon, // FLIPADST_ADST
+ highbd_fidentity16_col_many_neon, // IDTX
+ highbd_fdct16_col_many_neon, // V_DCT
+ highbd_fidentity16_col_many_neon, // H_DCT
+ highbd_fadst16_col_many_neon, // V_ADST
+ highbd_fidentity16_col_many_neon, // H_ADST
+ highbd_fadst16_col_many_neon, // V_FLIPADST
+ highbd_fidentity16_col_many_neon // H_FLIPADST
+ };
+
+static const fwd_transform_1d_col_many_neon
+ col_highbd_txfm4_xn_arr[TX_TYPES] = {
+ highbd_fdct4_col_many_neon, // DCT_DCT
+ highbd_fadst4_col_many_neon, // ADST_DCT
+ highbd_fdct4_col_many_neon, // DCT_ADST
+ highbd_fadst4_col_many_neon, // ADST_ADST
+ highbd_fadst4_col_many_neon, // FLIPADST_DCT
+ highbd_fdct4_col_many_neon, // DCT_FLIPADST
+ highbd_fadst4_col_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst4_col_many_neon, // ADST_FLIPADST
+ highbd_fadst4_col_many_neon, // FLIPADST_ADST
+ highbd_fidentity4_col_many_neon, // IDTX
+ highbd_fdct4_col_many_neon, // V_DCT
+ highbd_fidentity4_col_many_neon, // H_DCT
+ highbd_fadst4_col_many_neon, // V_ADST
+ highbd_fidentity4_col_many_neon, // H_ADST
+ highbd_fadst4_col_many_neon, // V_FLIPADST
+ highbd_fidentity4_col_many_neon // H_FLIPADST
+ };
+
+static const fwd_transform_1d_row_neon row_highbd_txfm16_xn_arr[TX_TYPES] = {
+ highbd_fdct16_row_neon, // DCT_DCT
+ highbd_fdct16_row_neon, // ADST_DCT
+ highbd_fadst16_row_neon, // DCT_ADST
+ highbd_fadst16_row_neon, // ADST_ADST
+ highbd_fdct16_row_neon, // FLIPADST_DCT
+ highbd_fadst16_row_neon, // DCT_FLIPADST
+ highbd_fadst16_row_neon, // FLIPADST_FLIPADST
+ highbd_fadst16_row_neon, // ADST_FLIPADST
+ highbd_fadst16_row_neon, // FLIPADST_ADST
+ highbd_fidentity16_row_neon, // IDTX
+ highbd_fidentity16_row_neon, // V_DCT
+ highbd_fdct16_row_neon, // H_DCT
+ highbd_fidentity16_row_neon, // V_ADST
+ highbd_fadst16_row_neon, // H_ADST
+ highbd_fidentity16_row_neon, // V_FLIPADST
+ highbd_fadst16_row_neon // H_FLIPADST
};
-static const fwd_transform_1d_neon row_highbd_txfm4x8_arr[TX_TYPES] = {
- fdct4x8_neon, // DCT_DCT
- fdct4x8_neon, // ADST_DCT
- fadst8x8_neon, // DCT_ADST
- fadst8x8_neon, // ADST_ADST
- fdct4x8_neon, // FLIPADST_DCT
- fadst8x8_neon, // DCT_FLIPADST
- fadst8x8_neon, // FLIPADST_FLIPADST
- fadst8x8_neon, // ADST_FLIPADST
- fadst8x8_neon, // FLIPADST_ADST
- idtx8x8_neon, // IDTX
- idtx8x8_neon, // V_DCT
- fdct4x8_neon, // H_DCT
- idtx8x8_neon, // V_ADST
- fadst8x8_neon, // H_ADST
- idtx8x8_neon, // V_FLIPADST
- fadst8x8_neon // H_FLIPADST
-};
+static const fwd_transform_1d_row_many_neon
+ row_rect_highbd_txfm16_xn_arr[TX_TYPES] = {
+ highbd_fdct16_row_rect_many_neon, // DCT_DCT
+ highbd_fdct16_row_rect_many_neon, // ADST_DCT
+ highbd_fadst16_row_rect_many_neon, // DCT_ADST
+ highbd_fadst16_row_rect_many_neon, // ADST_ADST
+ highbd_fdct16_row_rect_many_neon, // FLIPADST_DCT
+ highbd_fadst16_row_rect_many_neon, // DCT_FLIPADST
+ highbd_fadst16_row_rect_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst16_row_rect_many_neon, // ADST_FLIPADST
+ highbd_fadst16_row_rect_many_neon, // FLIPADST_ADST
+ highbd_fidentity16_row_rect_many_neon, // IDTX
+ highbd_fidentity16_row_rect_many_neon, // V_DCT
+ highbd_fdct16_row_rect_many_neon, // H_DCT
+ highbd_fidentity16_row_rect_many_neon, // V_ADST
+ highbd_fadst16_row_rect_many_neon, // H_ADST
+ highbd_fidentity16_row_rect_many_neon, // V_FLIPADST
+ highbd_fadst16_row_rect_many_neon // H_FLIPADST
+ };
-static const fwd_transform_1d_neon row_highbd_txfm4x4_arr[TX_TYPES] = {
- fdct4x4_neon, // DCT_DCT
- fdct4x4_neon, // ADST_DCT
- fadst4x4_neon, // DCT_ADST
- fadst4x4_neon, // ADST_ADST
- fdct4x4_neon, // FLIPADST_DCT
- fadst4x4_neon, // DCT_FLIPADST
- fadst4x4_neon, // FLIPADST_FLIPADST
- fadst4x4_neon, // ADST_FLIPADST
- fadst4x4_neon, // FLIPADST_ADST
- idtx4x4_neon, // IDTX
- idtx4x4_neon, // V_DCT
- fdct4x4_neon, // H_DCT
- idtx4x4_neon, // V_ADST
- fadst4x4_neon, // H_ADST
- idtx4x4_neon, // V_FLIPADST
- fadst4x4_neon // H_FLIPADST
-};
+#if !CONFIG_REALTIME_ONLY
+static const fwd_transform_1d_row_many_neon
+ row_highbd_txfm8_xn_arr[TX_TYPES] = {
+ highbd_fdct8_row_many_neon, // DCT_DCT
+ highbd_fdct8_row_many_neon, // ADST_DCT
+ highbd_fadst8_row_many_neon, // DCT_ADST
+ highbd_fadst8_row_many_neon, // ADST_ADST
+ highbd_fdct8_row_many_neon, // FLIPADST_DCT
+ highbd_fadst8_row_many_neon, // DCT_FLIPADST
+ highbd_fadst8_row_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst8_row_many_neon, // ADST_FLIPADST
+ highbd_fadst8_row_many_neon, // FLIPADST_ADST
+ highbd_fidentity8_row_many_neon, // IDTX
+ highbd_fidentity8_row_many_neon, // V_DCT
+ highbd_fdct8_row_many_neon, // H_DCT
+ highbd_fidentity8_row_many_neon, // V_ADST
+ highbd_fadst8_row_many_neon, // H_ADST
+ highbd_fidentity8_row_many_neon, // V_FLIPADST
+ highbd_fadst8_row_many_neon // H_FLIPADST
+ };
+#endif
-static const fwd_transform_1d_neon col_highbd_txfm4x4_arr[TX_TYPES] = {
- fdct4x4_neon, // DCT_DCT
- fadst4x4_neon, // ADST_DCT
- fdct4x4_neon, // DCT_ADST
- fadst4x4_neon, // ADST_ADST
- fadst4x4_neon, // FLIPADST_DCT
- fdct4x4_neon, // DCT_FLIPADST
- fadst4x4_neon, // FLIPADST_FLIPADST
- fadst4x4_neon, // ADST_FLIPADST
- fadst4x4_neon, // FLIPADST_ADST
- idtx4x4_neon, // IDTX
- fdct4x4_neon, // V_DCT
- idtx4x4_neon, // H_DCT
- fadst4x4_neon, // V_ADST
- idtx4x4_neon, // H_ADST
- fadst4x4_neon, // V_FLIPADST
- idtx4x4_neon // H_FLIPADST
+static const fwd_transform_1d_row_many_neon
+ row_rect_highbd_txfm8_xn_arr[TX_TYPES] = {
+ highbd_fdct8_row_rect_many_neon, // DCT_DCT
+ highbd_fdct8_row_rect_many_neon, // ADST_DCT
+ highbd_fadst8_row_rect_many_neon, // DCT_ADST
+ highbd_fadst8_row_rect_many_neon, // ADST_ADST
+ highbd_fdct8_row_rect_many_neon, // FLIPADST_DCT
+ highbd_fadst8_row_rect_many_neon, // DCT_FLIPADST
+ highbd_fadst8_row_rect_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst8_row_rect_many_neon, // ADST_FLIPADST
+ highbd_fadst8_row_rect_many_neon, // FLIPADST_ADST
+ highbd_fidentity8_row_rect_many_neon, // IDTX
+ highbd_fidentity8_row_rect_many_neon, // V_DCT
+ highbd_fdct8_row_rect_many_neon, // H_DCT
+ highbd_fidentity8_row_rect_many_neon, // V_ADST
+ highbd_fadst8_row_rect_many_neon, // H_ADST
+ highbd_fidentity8_row_rect_many_neon, // V_FLIPADST
+ highbd_fadst8_row_rect_many_neon // H_FLIPADST
+ };
+
+static const fwd_transform_1d_row_neon row_highbd_txfm8_x4_arr[TX_TYPES] = {
+ highbd_fdct8_row_rect_neon, // DCT_DCT
+ highbd_fdct8_row_rect_neon, // ADST_DCT
+ highbd_fadst8_row_rect_neon, // DCT_ADST
+ highbd_fadst8_row_rect_neon, // ADST_ADST
+ highbd_fdct8_row_rect_neon, // FLIPADST_DCT
+ highbd_fadst8_row_rect_neon, // DCT_FLIPADST
+ highbd_fadst8_row_rect_neon, // FLIPADST_FLIPADST
+ highbd_fadst8_row_rect_neon, // ADST_FLIPADST
+ highbd_fadst8_row_rect_neon, // FLIPADST_ADST
+ highbd_fidentity8_row_rect_neon, // IDTX
+ highbd_fidentity8_row_rect_neon, // V_DCT
+ highbd_fdct8_row_rect_neon, // H_DCT
+ highbd_fidentity8_row_rect_neon, // V_ADST
+ highbd_fadst8_row_rect_neon, // H_ADST
+ highbd_fidentity8_row_rect_neon, // V_FLIPADST
+ highbd_fadst8_row_rect_neon // H_FLIPADST
};
-void av1_fdct32_new_neon(int32x4_t *input, int32x4_t *output, int cos_bit,
- const int stride) {
+#if !CONFIG_REALTIME_ONLY
+static const fwd_transform_1d_row_many_neon
+ row_highbd_txfm4_xn_arr[TX_TYPES] = {
+ highbd_fdct4_row_many_neon, // DCT_DCT
+ highbd_fdct4_row_many_neon, // ADST_DCT
+ highbd_fadst4_row_many_neon, // DCT_ADST
+ highbd_fadst4_row_many_neon, // ADST_ADST
+ highbd_fdct4_row_many_neon, // FLIPADST_DCT
+ highbd_fadst4_row_many_neon, // DCT_FLIPADST
+ highbd_fadst4_row_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst4_row_many_neon, // ADST_FLIPADST
+ highbd_fadst4_row_many_neon, // FLIPADST_ADST
+ highbd_fidentity4_row_many_neon, // IDTX
+ highbd_fidentity4_row_many_neon, // V_DCT
+ highbd_fdct4_row_many_neon, // H_DCT
+ highbd_fidentity4_row_many_neon, // V_ADST
+ highbd_fadst4_row_many_neon, // H_ADST
+ highbd_fidentity4_row_many_neon, // V_FLIPADST
+ highbd_fadst4_row_many_neon // H_FLIPADST
+ };
+#endif
+
+static const fwd_transform_1d_row_many_neon
+ row_rect_highbd_txfm4_xn_arr[TX_TYPES] = {
+ highbd_fdct4_row_rect_many_neon, // DCT_DCT
+ highbd_fdct4_row_rect_many_neon, // ADST_DCT
+ highbd_fadst4_row_rect_many_neon, // DCT_ADST
+ highbd_fadst4_row_rect_many_neon, // ADST_ADST
+ highbd_fdct4_row_rect_many_neon, // FLIPADST_DCT
+ highbd_fadst4_row_rect_many_neon, // DCT_FLIPADST
+ highbd_fadst4_row_rect_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst4_row_rect_many_neon, // ADST_FLIPADST
+ highbd_fadst4_row_rect_many_neon, // FLIPADST_ADST
+ highbd_fidentity4_row_rect_many_neon, // IDTX
+ highbd_fidentity4_row_rect_many_neon, // V_DCT
+ highbd_fdct4_row_rect_many_neon, // H_DCT
+ highbd_fidentity4_row_rect_many_neon, // V_ADST
+ highbd_fadst4_row_rect_many_neon, // H_ADST
+ highbd_fidentity4_row_rect_many_neon, // V_FLIPADST
+ highbd_fadst4_row_rect_many_neon // H_FLIPADST
+ };
+
+static void highbd_fdct32_x4_neon(const int32x4_t *input, int32x4_t *output,
+ int cos_bit) {
+ const int32_t *const cospi = cospi_arr_s32(cos_bit);
+ const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+ // Workspaces for intermediate transform steps.
int32x4_t buf0[32];
int32x4_t buf1[32];
- const int32_t *cospi;
- const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
- int startidx = 0 * stride;
- int endidx = 31 * stride;
- // stage 0
// stage 1
- buf1[0] = vaddq_s32(input[startidx], input[endidx]);
- buf1[31] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[1] = vaddq_s32(input[startidx], input[endidx]);
- buf1[30] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[2] = vaddq_s32(input[startidx], input[endidx]);
- buf1[29] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[3] = vaddq_s32(input[startidx], input[endidx]);
- buf1[28] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[4] = vaddq_s32(input[startidx], input[endidx]);
- buf1[27] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[5] = vaddq_s32(input[startidx], input[endidx]);
- buf1[26] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[6] = vaddq_s32(input[startidx], input[endidx]);
- buf1[25] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[7] = vaddq_s32(input[startidx], input[endidx]);
- buf1[24] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[8] = vaddq_s32(input[startidx], input[endidx]);
- buf1[23] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[9] = vaddq_s32(input[startidx], input[endidx]);
- buf1[22] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[10] = vaddq_s32(input[startidx], input[endidx]);
- buf1[21] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[11] = vaddq_s32(input[startidx], input[endidx]);
- buf1[20] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[12] = vaddq_s32(input[startidx], input[endidx]);
- buf1[19] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[13] = vaddq_s32(input[startidx], input[endidx]);
- buf1[18] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[14] = vaddq_s32(input[startidx], input[endidx]);
- buf1[17] = vsubq_s32(input[startidx], input[endidx]);
- startidx += stride;
- endidx -= stride;
- buf1[15] = vaddq_s32(input[startidx], input[endidx]);
- buf1[16] = vsubq_s32(input[startidx], input[endidx]);
+ butterfly_dct_pre(input, buf1, 32);
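+ // butterfly_dct_pre(in, out, n) forms the mirrored sum/difference pairs
+ // out[i] = in[i] + in[n - 1 - i], out[n - 1 - i] = in[i] - in[n - 1 - i]
+ // for i in [0, n / 2).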
// stage 2
- cospi = cospi_arr(cos_bit);
- buf0[0] = vaddq_s32(buf1[0], buf1[15]);
- buf0[15] = vsubq_s32(buf1[0], buf1[15]);
- buf0[1] = vaddq_s32(buf1[1], buf1[14]);
- buf0[14] = vsubq_s32(buf1[1], buf1[14]);
- buf0[2] = vaddq_s32(buf1[2], buf1[13]);
- buf0[13] = vsubq_s32(buf1[2], buf1[13]);
- buf0[3] = vaddq_s32(buf1[3], buf1[12]);
- buf0[12] = vsubq_s32(buf1[3], buf1[12]);
- buf0[4] = vaddq_s32(buf1[4], buf1[11]);
- buf0[11] = vsubq_s32(buf1[4], buf1[11]);
- buf0[5] = vaddq_s32(buf1[5], buf1[10]);
- buf0[10] = vsubq_s32(buf1[5], buf1[10]);
- buf0[6] = vaddq_s32(buf1[6], buf1[9]);
- buf0[9] = vsubq_s32(buf1[6], buf1[9]);
- buf0[7] = vaddq_s32(buf1[7], buf1[8]);
- buf0[8] = vsubq_s32(buf1[7], buf1[8]);
+ butterfly_dct_pre(buf1, buf0, 16);
buf0[16] = buf1[16];
buf0[17] = buf1[17];
buf0[18] = buf1[18];
buf0[19] = buf1[19];
- btf_32_neon_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
- buf0[27], v_cos_bit);
- btf_32_neon_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
- buf0[26], v_cos_bit);
- btf_32_neon_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
- buf0[25], v_cos_bit);
- btf_32_neon_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
- buf0[24], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, buf1[27], buf1[20], &buf0[27], &buf0[20],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 32, buf1[26], buf1[21], &buf0[26], &buf0[21],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 32, buf1[25], buf1[22], &buf0[25], &buf0[22],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 32, buf1[24], buf1[23], &buf0[24], &buf0[23],
+ v_cos_bit);
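+ // The digit suffix on the butterfly_*_neon helpers encodes the sign/lane
+ // arrangement of the cospi weight pair applied to each output.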
buf0[28] = buf1[28];
buf0[29] = buf1[29];
buf0[30] = buf1[30];
buf0[31] = buf1[31];
// stage 3
- cospi = cospi_arr(cos_bit);
- buf1[0] = vaddq_s32(buf0[0], buf0[7]);
- buf1[7] = vsubq_s32(buf0[0], buf0[7]);
- buf1[1] = vaddq_s32(buf0[1], buf0[6]);
- buf1[6] = vsubq_s32(buf0[1], buf0[6]);
- buf1[2] = vaddq_s32(buf0[2], buf0[5]);
- buf1[5] = vsubq_s32(buf0[2], buf0[5]);
- buf1[3] = vaddq_s32(buf0[3], buf0[4]);
- buf1[4] = vsubq_s32(buf0[3], buf0[4]);
+ butterfly_dct_pre(buf0, buf1, 8);
buf1[8] = buf0[8];
buf1[9] = buf0[9];
- btf_32_neon_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
- buf1[13], v_cos_bit);
- btf_32_neon_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
- buf1[12], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, buf0[13], buf0[10], &buf1[13], &buf1[10],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 32, buf0[12], buf0[11], &buf1[12], &buf1[11],
+ v_cos_bit);
buf1[14] = buf0[14];
buf1[15] = buf0[15];
- buf1[16] = vaddq_s32(buf0[16], buf0[23]);
- buf1[23] = vsubq_s32(buf0[16], buf0[23]);
- buf1[17] = vaddq_s32(buf0[17], buf0[22]);
- buf1[22] = vsubq_s32(buf0[17], buf0[22]);
- buf1[18] = vaddq_s32(buf0[18], buf0[21]);
- buf1[21] = vsubq_s32(buf0[18], buf0[21]);
- buf1[19] = vaddq_s32(buf0[19], buf0[20]);
- buf1[20] = vsubq_s32(buf0[19], buf0[20]);
- buf1[24] = vsubq_s32(buf0[31], buf0[24]);
- buf1[31] = vaddq_s32(buf0[31], buf0[24]);
- buf1[25] = vsubq_s32(buf0[30], buf0[25]);
- buf1[30] = vaddq_s32(buf0[30], buf0[25]);
- buf1[26] = vsubq_s32(buf0[29], buf0[26]);
- buf1[29] = vaddq_s32(buf0[29], buf0[26]);
- buf1[27] = vsubq_s32(buf0[28], buf0[27]);
- buf1[28] = vaddq_s32(buf0[28], buf0[27]);
+ butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 16);
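+ // butterfly_dct_post(in0, in1, out, n) is the matching post-stage butterfly
+ // (called here with in0 == in1): for i in [0, n / 4) it computes
+ // out[i] = in[i] + in[n / 2 - 1 - i] and out[n / 2 - 1 - i] =
+ // in[i] - in[n / 2 - 1 - i] in the lower half, with the mirrored
+ // arrangement (sums at the outer indices) in the upper half.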
// stage 4
- cospi = cospi_arr(cos_bit);
- buf0[0] = vaddq_s32(buf1[0], buf1[3]);
- buf0[3] = vsubq_s32(buf1[0], buf1[3]);
- buf0[1] = vaddq_s32(buf1[1], buf1[2]);
- buf0[2] = vsubq_s32(buf1[1], buf1[2]);
+ butterfly_dct_pre(buf1, buf0, 4);
buf0[4] = buf1[4];
- btf_32_neon_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
- v_cos_bit);
+ butterfly_0112_neon(cospi, 32, buf1[6], buf1[5], &buf0[6], &buf0[5],
+ v_cos_bit);
buf0[7] = buf1[7];
- buf0[8] = vaddq_s32(buf1[8], buf1[11]);
- buf0[11] = vsubq_s32(buf1[8], buf1[11]);
- buf0[9] = vaddq_s32(buf1[9], buf1[10]);
- buf0[10] = vsubq_s32(buf1[9], buf1[10]);
- buf0[12] = vsubq_s32(buf1[15], buf1[12]);
- buf0[15] = vaddq_s32(buf1[15], buf1[12]);
- buf0[13] = vsubq_s32(buf1[14], buf1[13]);
- buf0[14] = vaddq_s32(buf1[14], buf1[13]);
+ butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 8);
buf0[16] = buf1[16];
buf0[17] = buf1[17];
-
- btf_32_neon_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
- buf0[29], v_cos_bit);
- btf_32_neon_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
- buf0[28], v_cos_bit);
-
- btf_32_neon_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
- buf0[27], v_cos_bit);
- btf_32_neon_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
- buf0[26], v_cos_bit);
-
+ butterfly_0112_neon(cospi, 16, buf1[29], buf1[18], &buf0[29], &buf0[18],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 16, buf1[28], buf1[19], &buf0[28], &buf0[19],
+ v_cos_bit);
+ butterfly_2312_neon(cospi, 16, buf1[27], buf1[20], &buf0[20], &buf0[27],
+ v_cos_bit);
+ butterfly_2312_neon(cospi, 16, buf1[26], buf1[21], &buf0[21], &buf0[26],
+ v_cos_bit);
buf0[22] = buf1[22];
buf0[23] = buf1[23];
buf0[24] = buf1[24];
@@ -2126,72 +1600,46 @@ void av1_fdct32_new_neon(int32x4_t *input, int32x4_t *output, int cos_bit,
buf0[31] = buf1[31];
// stage 5
- btf_32_neon_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
- v_cos_bit);
-
- btf_32_neon_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3],
- v_cos_bit);
- buf1[4] = vaddq_s32(buf0[4], buf0[5]);
- buf1[5] = vsubq_s32(buf0[4], buf0[5]);
- buf1[6] = vsubq_s32(buf0[7], buf0[6]);
- buf1[7] = vaddq_s32(buf0[7], buf0[6]);
+ butterfly_0112_neon(cospi, 32, buf0[0], buf0[1], &buf1[0], &buf1[1],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 16, buf0[3], buf0[2], &buf1[2], &buf1[3],
+ v_cos_bit);
+ butterfly_dct_post(buf0 + 4, buf0 + 4, buf1 + 4, 4);
buf1[8] = buf0[8];
- btf_32_neon_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14],
- v_cos_bit);
- btf_32_neon_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
- buf1[13], v_cos_bit);
+ butterfly_0112_neon(cospi, 16, buf0[14], buf0[9], &buf1[14], &buf1[9],
+ v_cos_bit);
+ butterfly_2312_neon(cospi, 16, buf0[13], buf0[10], &buf1[10], &buf1[13],
+ v_cos_bit);
buf1[11] = buf0[11];
buf1[12] = buf0[12];
buf1[15] = buf0[15];
- buf1[16] = vaddq_s32(buf0[16], buf0[19]);
- buf1[19] = vsubq_s32(buf0[16], buf0[19]);
- buf1[17] = vaddq_s32(buf0[17], buf0[18]);
- buf1[18] = vsubq_s32(buf0[17], buf0[18]);
- buf1[20] = vsubq_s32(buf0[23], buf0[20]);
- buf1[23] = vaddq_s32(buf0[23], buf0[20]);
- buf1[21] = vsubq_s32(buf0[22], buf0[21]);
- buf1[22] = vaddq_s32(buf0[22], buf0[21]);
- buf1[24] = vaddq_s32(buf0[24], buf0[27]);
- buf1[27] = vsubq_s32(buf0[24], buf0[27]);
- buf1[25] = vaddq_s32(buf0[25], buf0[26]);
- buf1[26] = vsubq_s32(buf0[25], buf0[26]);
- buf1[28] = vsubq_s32(buf0[31], buf0[28]);
- buf1[31] = vaddq_s32(buf0[31], buf0[28]);
- buf1[29] = vsubq_s32(buf0[30], buf0[29]);
- buf1[30] = vaddq_s32(buf0[30], buf0[29]);
+ butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 8);
+ butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 8);
// stage 6
- cospi = cospi_arr(cos_bit);
buf0[0] = buf1[0];
buf0[1] = buf1[1];
buf0[2] = buf1[2];
buf0[3] = buf1[3];
- btf_32_neon_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
- v_cos_bit);
- btf_32_neon_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
- buf0[30], v_cos_bit);
- btf_32_neon_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
- buf0[29], v_cos_bit);
-
- buf0[8] = vaddq_s32(buf1[8], buf1[9]);
- buf0[9] = vsubq_s32(buf1[8], buf1[9]);
- buf0[10] = vsubq_s32(buf1[11], buf1[10]);
- buf0[11] = vaddq_s32(buf1[11], buf1[10]);
- buf0[12] = vaddq_s32(buf1[12], buf1[13]);
- buf0[13] = vsubq_s32(buf1[12], buf1[13]);
- buf0[14] = vsubq_s32(buf1[15], buf1[14]);
- buf0[15] = vaddq_s32(buf1[15], buf1[14]);
+ butterfly_0112_neon(cospi, 8, buf1[7], buf1[4], &buf0[4], &buf0[7],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 8, buf1[30], buf1[17], &buf0[30], &buf0[17],
+ v_cos_bit);
+ butterfly_2312_neon(cospi, 8, buf1[29], buf1[18], &buf0[18], &buf0[29],
+ v_cos_bit);
+ butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 4);
+ butterfly_dct_post(buf1 + 12, buf1 + 12, buf0 + 12, 4);
buf0[16] = buf1[16];
buf0[19] = buf1[19];
buf0[20] = buf1[20];
- btf_32_neon_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6],
- v_cos_bit);
- btf_32_neon_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
- buf0[26], v_cos_bit);
- btf_32_neon_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
- buf0[25], v_cos_bit);
+ butterfly_0130_neon(cospi, 24, buf1[5], buf1[6], &buf0[5], &buf0[6],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 24, buf1[21], buf1[26], &buf0[26], &buf0[21],
+ v_cos_bit);
+ butterfly_0332_neon(cospi, 24, buf1[25], buf1[22], &buf0[25], &buf0[22],
+ v_cos_bit);
buf0[23] = buf1[23];
buf0[24] = buf1[24];
@@ -2200,7 +1648,6 @@ void av1_fdct32_new_neon(int32x4_t *input, int32x4_t *output, int cos_bit,
buf0[31] = buf1[31];
// stage 7
- cospi = cospi_arr(cos_bit);
buf1[0] = buf0[0];
buf1[1] = buf0[1];
buf1[2] = buf0[2];
@@ -2209,33 +1656,20 @@ void av1_fdct32_new_neon(int32x4_t *input, int32x4_t *output, int cos_bit,
buf1[5] = buf0[5];
buf1[6] = buf0[6];
buf1[7] = buf0[7];
- btf_32_neon_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15],
- v_cos_bit);
- btf_32_neon_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], buf1[14],
- v_cos_bit);
- btf_32_neon_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
- buf1[13], v_cos_bit);
- btf_32_neon_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
- buf1[12], v_cos_bit);
- buf1[16] = vaddq_s32(buf0[16], buf0[17]);
- buf1[17] = vsubq_s32(buf0[16], buf0[17]);
- buf1[18] = vsubq_s32(buf0[19], buf0[18]);
- buf1[19] = vaddq_s32(buf0[19], buf0[18]);
- buf1[20] = vaddq_s32(buf0[20], buf0[21]);
- buf1[21] = vsubq_s32(buf0[20], buf0[21]);
- buf1[22] = vsubq_s32(buf0[23], buf0[22]);
- buf1[23] = vaddq_s32(buf0[23], buf0[22]);
- buf1[24] = vaddq_s32(buf0[24], buf0[25]);
- buf1[25] = vsubq_s32(buf0[24], buf0[25]);
- buf1[26] = vsubq_s32(buf0[27], buf0[26]);
- buf1[27] = vaddq_s32(buf0[27], buf0[26]);
- buf1[28] = vaddq_s32(buf0[28], buf0[29]);
- buf1[29] = vsubq_s32(buf0[28], buf0[29]);
- buf1[30] = vsubq_s32(buf0[31], buf0[30]);
- buf1[31] = vaddq_s32(buf0[31], buf0[30]);
+ butterfly_0112_neon(cospi, 4, buf0[15], buf0[8], &buf1[8], &buf1[15],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 28, buf0[9], buf0[14], &buf1[9], &buf1[14],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 20, buf0[13], buf0[10], &buf1[10], &buf1[13],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 12, buf0[11], buf0[12], &buf1[11], &buf1[12],
+ v_cos_bit);
+ butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 4);
+ butterfly_dct_post(buf0 + 20, buf0 + 20, buf1 + 20, 4);
+ butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 4);
+ butterfly_dct_post(buf0 + 28, buf0 + 28, buf1 + 28, 4);
// stage 8
- cospi = cospi_arr(cos_bit);
buf0[0] = buf1[0];
buf0[1] = buf1[1];
buf0[2] = buf1[2];
@@ -2252,313 +1686,70 @@ void av1_fdct32_new_neon(int32x4_t *input, int32x4_t *output, int cos_bit,
buf0[13] = buf1[13];
buf0[14] = buf1[14];
buf0[15] = buf1[15];
- btf_32_neon_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], buf0[31],
- v_cos_bit);
- btf_32_neon_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
- buf0[30], v_cos_bit);
- btf_32_neon_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
- buf0[29], v_cos_bit);
- btf_32_neon_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
- buf0[28], v_cos_bit);
- btf_32_neon_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
- buf0[27], v_cos_bit);
- btf_32_neon_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
- buf0[26], v_cos_bit);
- btf_32_neon_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
- buf0[25], v_cos_bit);
- btf_32_neon_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], buf0[24],
- v_cos_bit);
-
- startidx = 0 * stride;
- endidx = 31 * stride;
- // stage 9
- output[startidx] = buf0[0];
- output[endidx] = buf0[31];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[16];
- output[endidx] = buf0[15];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[8];
- output[endidx] = buf0[23];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[24];
- output[endidx] = buf0[7];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[4];
- output[endidx] = buf0[27];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[20];
- output[endidx] = buf0[11];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[12];
- output[endidx] = buf0[19];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[28];
- output[endidx] = buf0[3];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[2];
- output[endidx] = buf0[29];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[18];
- output[endidx] = buf0[13];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[10];
- output[endidx] = buf0[21];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[26];
- output[endidx] = buf0[5];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[6];
- output[endidx] = buf0[25];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[22];
- output[endidx] = buf0[9];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[14];
- output[endidx] = buf0[17];
- startidx += stride;
- endidx -= stride;
- output[startidx] = buf0[30];
- output[endidx] = buf0[1];
-}
-
-void av1_fadst4_new_neon(const int32x4_t *input, int32x4_t *output,
- const int8_t cos_bit, const int8_t *stage_range) {
- const int txfm_size = 4;
- const int num_per_128 = 4;
- const int32_t *cospi;
- int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
- int32x4_t buf0[4];
- int32x4_t buf1[4];
- int col_num = txfm_size / num_per_128;
- int col;
- (void)stage_range;
- cospi = cospi_arr(cos_bit);
- for (col = 0; col < col_num; col++) {
- // stage 0;
- int j;
- for (j = 0; j < 4; ++j) {
- buf0[j] = input[j * col_num + col];
- }
-
- // stage 1
- buf1[0] = buf0[3];
- buf1[1] = buf0[0];
- buf1[2] = buf0[1];
- buf1[3] = buf0[2];
-
- // stage 2
- btf_32_neon_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
+ butterfly_0112_neon(cospi, 2, buf1[31], buf1[16], &buf0[16], &buf0[31],
v_cos_bit);
- btf_32_neon_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2], buf0[3],
+ butterfly_0130_neon(cospi, 30, buf1[17], buf1[30], &buf0[17], &buf0[30],
v_cos_bit);
-
- // stage 3
- buf1[0] = vaddq_s32(buf0[0], buf0[2]);
- buf1[2] = vsubq_s32(buf0[0], buf0[2]);
- buf1[1] = vaddq_s32(buf0[1], buf0[3]);
- buf1[3] = vsubq_s32(buf0[1], buf0[3]);
-
- // stage 4
- cospi = cospi_arr(cos_bit);
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
-
- btf_32_neon_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], buf0[3],
+ butterfly_0112_neon(cospi, 18, buf1[29], buf1[18], &buf0[18], &buf0[29],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 14, buf1[19], buf1[28], &buf0[19], &buf0[28],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 10, buf1[27], buf1[20], &buf0[20], &buf0[27],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 22, buf1[21], buf1[26], &buf0[21], &buf0[26],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 26, buf1[25], buf1[22], &buf0[22], &buf0[25],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 6, buf1[23], buf1[24], &buf0[23], &buf0[24],
v_cos_bit);
- // stage 5
- buf1[0] = buf0[0];
- buf1[1] = vnegq_s32(buf0[2]);
- buf1[2] = buf0[3];
- buf1[3] = vnegq_s32(buf0[1]);
-
- for (j = 0; j < 4; ++j) {
- output[j * col_num + col] = buf1[j];
- }
- }
-}
+ // stage 9
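+ // The coefficients are stored in bit-reversed index order:
+ // output[j] = buf0[r], where r is the 5-bit bit-reversal of j.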
+ output[0] = buf0[0];
+ output[1] = buf0[16];
+ output[2] = buf0[8];
+ output[3] = buf0[24];
+ output[4] = buf0[4];
+ output[5] = buf0[20];
+ output[6] = buf0[12];
+ output[7] = buf0[28];
+ output[8] = buf0[2];
+ output[9] = buf0[18];
+ output[10] = buf0[10];
+ output[11] = buf0[26];
+ output[12] = buf0[6];
+ output[13] = buf0[22];
+ output[14] = buf0[14];
+ output[15] = buf0[30];
+ output[16] = buf0[1];
+ output[17] = buf0[17];
+ output[18] = buf0[9];
+ output[19] = buf0[25];
+ output[20] = buf0[5];
+ output[21] = buf0[21];
+ output[22] = buf0[13];
+ output[23] = buf0[29];
+ output[24] = buf0[3];
+ output[25] = buf0[19];
+ output[26] = buf0[11];
+ output[27] = buf0[27];
+ output[28] = buf0[7];
+ output[29] = buf0[23];
+ output[30] = buf0[15];
+ output[31] = buf0[31];
+}
+
+static void highbd_fdct64_x4_neon(const int32x4_t *input, int32x4_t *output,
+ int8_t cos_bit) {
+ const int32_t *const cospi = cospi_arr_s32(cos_bit);
+ const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
-static void av1_fdct64_new_stage12345_neon(int32x4_t *input, const int instride,
- int32x4_t *x5, const int32_t *cospi,
- const int32x4_t *v_cos_bit,
- int *startidx, int *endidx) {
+ // stage 1
int32x4_t x1[64];
- x1[0] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[63] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[1] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[62] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[2] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[61] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[3] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[60] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[4] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[59] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[5] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[58] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[6] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[57] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[7] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[56] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[8] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[55] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[9] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[54] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[10] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[53] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[11] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[52] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[12] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[51] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[13] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[50] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[14] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[49] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[15] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[48] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[16] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[47] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[17] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[46] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[18] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[45] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[19] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[44] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[20] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[43] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[21] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[42] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[22] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[41] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[23] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[40] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[24] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[39] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[25] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[38] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[26] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[37] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[27] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[36] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[28] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[35] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[29] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[34] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[30] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[33] = vsubq_s32(input[*startidx], input[*endidx]);
- *startidx += instride;
- *endidx -= instride;
- x1[31] = vaddq_s32(input[*startidx], input[*endidx]);
- x1[32] = vsubq_s32(input[*startidx], input[*endidx]);
+ butterfly_dct_pre(input, x1, 64);
// stage 2
int32x4_t x2[64];
- x2[0] = vaddq_s32(x1[0], x1[31]);
- x2[31] = vsubq_s32(x1[0], x1[31]);
- x2[1] = vaddq_s32(x1[1], x1[30]);
- x2[30] = vsubq_s32(x1[1], x1[30]);
- x2[2] = vaddq_s32(x1[2], x1[29]);
- x2[29] = vsubq_s32(x1[2], x1[29]);
- x2[3] = vaddq_s32(x1[3], x1[28]);
- x2[28] = vsubq_s32(x1[3], x1[28]);
- x2[4] = vaddq_s32(x1[4], x1[27]);
- x2[27] = vsubq_s32(x1[4], x1[27]);
- x2[5] = vaddq_s32(x1[5], x1[26]);
- x2[26] = vsubq_s32(x1[5], x1[26]);
- x2[6] = vaddq_s32(x1[6], x1[25]);
- x2[25] = vsubq_s32(x1[6], x1[25]);
- x2[7] = vaddq_s32(x1[7], x1[24]);
- x2[24] = vsubq_s32(x1[7], x1[24]);
- x2[8] = vaddq_s32(x1[8], x1[23]);
- x2[23] = vsubq_s32(x1[8], x1[23]);
- x2[9] = vaddq_s32(x1[9], x1[22]);
- x2[22] = vsubq_s32(x1[9], x1[22]);
- x2[10] = vaddq_s32(x1[10], x1[21]);
- x2[21] = vsubq_s32(x1[10], x1[21]);
- x2[11] = vaddq_s32(x1[11], x1[20]);
- x2[20] = vsubq_s32(x1[11], x1[20]);
- x2[12] = vaddq_s32(x1[12], x1[19]);
- x2[19] = vsubq_s32(x1[12], x1[19]);
- x2[13] = vaddq_s32(x1[13], x1[18]);
- x2[18] = vsubq_s32(x1[13], x1[18]);
- x2[14] = vaddq_s32(x1[14], x1[17]);
- x2[17] = vsubq_s32(x1[14], x1[17]);
- x2[15] = vaddq_s32(x1[15], x1[16]);
- x2[16] = vsubq_s32(x1[15], x1[16]);
+ butterfly_dct_pre(x1, x2, 32);
x2[32] = x1[32];
x2[33] = x1[33];
x2[34] = x1[34];
@@ -2567,23 +1758,14 @@ static void av1_fdct64_new_stage12345_neon(int32x4_t *input, const int instride,
x2[37] = x1[37];
x2[38] = x1[38];
x2[39] = x1[39];
-
- btf_32_neon_type0(-cospi[32], cospi[32], x1[40], x1[55], x2[40], x2[55],
- *v_cos_bit);
- btf_32_neon_type0(-cospi[32], cospi[32], x1[41], x1[54], x2[41], x2[54],
- *v_cos_bit);
- btf_32_neon_type0(-cospi[32], cospi[32], x1[42], x1[53], x2[42], x2[53],
- *v_cos_bit);
- btf_32_neon_type0(-cospi[32], cospi[32], x1[43], x1[52], x2[43], x2[52],
- *v_cos_bit);
- btf_32_neon_type0(-cospi[32], cospi[32], x1[44], x1[51], x2[44], x2[51],
- *v_cos_bit);
- btf_32_neon_type0(-cospi[32], cospi[32], x1[45], x1[50], x2[45], x2[50],
- *v_cos_bit);
- btf_32_neon_type0(-cospi[32], cospi[32], x1[46], x1[49], x2[46], x2[49],
- *v_cos_bit);
- btf_32_neon_type0(-cospi[32], cospi[32], x1[47], x1[48], x2[47], x2[48],
- *v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[55], x1[40], &x2[55], &x2[40], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[54], x1[41], &x2[54], &x2[41], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[53], x1[42], &x2[53], &x2[42], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[52], x1[43], &x2[52], &x2[43], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[51], x1[44], &x2[51], &x2[44], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[50], x1[45], &x2[50], &x2[45], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[49], x1[46], &x2[49], &x2[46], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[48], x1[47], &x2[48], &x2[47], v_cos_bit);
x2[56] = x1[56];
x2[57] = x1[57];
x2[58] = x1[58];
@@ -2595,126 +1777,43 @@ static void av1_fdct64_new_stage12345_neon(int32x4_t *input, const int instride,
// stage 3
int32x4_t x3[64];
- x3[0] = vaddq_s32(x2[0], x2[15]);
- x3[15] = vsubq_s32(x2[0], x2[15]);
- x3[1] = vaddq_s32(x2[1], x2[14]);
- x3[14] = vsubq_s32(x2[1], x2[14]);
- x3[2] = vaddq_s32(x2[2], x2[13]);
- x3[13] = vsubq_s32(x2[2], x2[13]);
- x3[3] = vaddq_s32(x2[3], x2[12]);
- x3[12] = vsubq_s32(x2[3], x2[12]);
- x3[4] = vaddq_s32(x2[4], x2[11]);
- x3[11] = vsubq_s32(x2[4], x2[11]);
- x3[5] = vaddq_s32(x2[5], x2[10]);
- x3[10] = vsubq_s32(x2[5], x2[10]);
- x3[6] = vaddq_s32(x2[6], x2[9]);
- x3[9] = vsubq_s32(x2[6], x2[9]);
- x3[7] = vaddq_s32(x2[7], x2[8]);
- x3[8] = vsubq_s32(x2[7], x2[8]);
+ butterfly_dct_pre(x2, x3, 16);
x3[16] = x2[16];
x3[17] = x2[17];
x3[18] = x2[18];
x3[19] = x2[19];
- btf_32_neon_type0(-cospi[32], cospi[32], x2[20], x2[27], x3[20], x3[27],
- *v_cos_bit);
- btf_32_neon_type0(-cospi[32], cospi[32], x2[21], x2[26], x3[21], x3[26],
- *v_cos_bit);
- btf_32_neon_type0(-cospi[32], cospi[32], x2[22], x2[25], x3[22], x3[25],
- *v_cos_bit);
- btf_32_neon_type0(-cospi[32], cospi[32], x2[23], x2[24], x3[23], x3[24],
- *v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x2[27], x2[20], &x3[27], &x3[20], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x2[26], x2[21], &x3[26], &x3[21], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x2[25], x2[22], &x3[25], &x3[22], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x2[24], x2[23], &x3[24], &x3[23], v_cos_bit);
x3[28] = x2[28];
x3[29] = x2[29];
x3[30] = x2[30];
x3[31] = x2[31];
- x3[32] = vaddq_s32(x2[32], x2[47]);
- x3[47] = vsubq_s32(x2[32], x2[47]);
- x3[33] = vaddq_s32(x2[33], x2[46]);
- x3[46] = vsubq_s32(x2[33], x2[46]);
- x3[34] = vaddq_s32(x2[34], x2[45]);
- x3[45] = vsubq_s32(x2[34], x2[45]);
- x3[35] = vaddq_s32(x2[35], x2[44]);
- x3[44] = vsubq_s32(x2[35], x2[44]);
- x3[36] = vaddq_s32(x2[36], x2[43]);
- x3[43] = vsubq_s32(x2[36], x2[43]);
- x3[37] = vaddq_s32(x2[37], x2[42]);
- x3[42] = vsubq_s32(x2[37], x2[42]);
- x3[38] = vaddq_s32(x2[38], x2[41]);
- x3[41] = vsubq_s32(x2[38], x2[41]);
- x3[39] = vaddq_s32(x2[39], x2[40]);
- x3[40] = vsubq_s32(x2[39], x2[40]);
- x3[48] = vsubq_s32(x2[63], x2[48]);
- x3[63] = vaddq_s32(x2[63], x2[48]);
- x3[49] = vsubq_s32(x2[62], x2[49]);
- x3[62] = vaddq_s32(x2[62], x2[49]);
- x3[50] = vsubq_s32(x2[61], x2[50]);
- x3[61] = vaddq_s32(x2[61], x2[50]);
- x3[51] = vsubq_s32(x2[60], x2[51]);
- x3[60] = vaddq_s32(x2[60], x2[51]);
- x3[52] = vsubq_s32(x2[59], x2[52]);
- x3[59] = vaddq_s32(x2[59], x2[52]);
- x3[53] = vsubq_s32(x2[58], x2[53]);
- x3[58] = vaddq_s32(x2[58], x2[53]);
- x3[54] = vsubq_s32(x2[57], x2[54]);
- x3[57] = vaddq_s32(x2[57], x2[54]);
- x3[55] = vsubq_s32(x2[56], x2[55]);
- x3[56] = vaddq_s32(x2[56], x2[55]);
+ butterfly_dct_post(x2 + 32, x2 + 32, x3 + 32, 32);
// stage 4
int32x4_t x4[64];
- x4[0] = vaddq_s32(x3[0], x3[7]);
- x4[7] = vsubq_s32(x3[0], x3[7]);
- x4[1] = vaddq_s32(x3[1], x3[6]);
- x4[6] = vsubq_s32(x3[1], x3[6]);
- x4[2] = vaddq_s32(x3[2], x3[5]);
- x4[5] = vsubq_s32(x3[2], x3[5]);
- x4[3] = vaddq_s32(x3[3], x3[4]);
- x4[4] = vsubq_s32(x3[3], x3[4]);
+ butterfly_dct_pre(x3, x4, 8);
x4[8] = x3[8];
x4[9] = x3[9];
- btf_32_neon_type0(-cospi[32], cospi[32], x3[10], x3[13], x4[10], x4[13],
- *v_cos_bit);
- btf_32_neon_type0(-cospi[32], cospi[32], x3[11], x3[12], x4[11], x4[12],
- *v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x3[13], x3[10], &x4[13], &x4[10], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x3[12], x3[11], &x4[12], &x4[11], v_cos_bit);
x4[14] = x3[14];
x4[15] = x3[15];
- x4[16] = vaddq_s32(x3[16], x3[23]);
- x4[23] = vsubq_s32(x3[16], x3[23]);
- x4[17] = vaddq_s32(x3[17], x3[22]);
- x4[22] = vsubq_s32(x3[17], x3[22]);
- x4[18] = vaddq_s32(x3[18], x3[21]);
- x4[21] = vsubq_s32(x3[18], x3[21]);
- x4[19] = vaddq_s32(x3[19], x3[20]);
- x4[20] = vsubq_s32(x3[19], x3[20]);
- x4[24] = vsubq_s32(x3[31], x3[24]);
- x4[31] = vaddq_s32(x3[31], x3[24]);
- x4[25] = vsubq_s32(x3[30], x3[25]);
- x4[30] = vaddq_s32(x3[30], x3[25]);
- x4[26] = vsubq_s32(x3[29], x3[26]);
- x4[29] = vaddq_s32(x3[29], x3[26]);
- x4[27] = vsubq_s32(x3[28], x3[27]);
- x4[28] = vaddq_s32(x3[28], x3[27]);
+ butterfly_dct_post(x3 + 16, x3 + 16, x4 + 16, 16);
x4[32] = x3[32];
x4[33] = x3[33];
x4[34] = x3[34];
x4[35] = x3[35];
-
- btf_32_neon_type0(-cospi[16], cospi[48], x3[36], x3[59], x4[36], x4[59],
- *v_cos_bit);
- btf_32_neon_type0(-cospi[16], cospi[48], x3[37], x3[58], x4[37], x4[58],
- *v_cos_bit);
- btf_32_neon_type0(-cospi[16], cospi[48], x3[38], x3[57], x4[38], x4[57],
- *v_cos_bit);
- btf_32_neon_type0(-cospi[16], cospi[48], x3[39], x3[56], x4[39], x4[56],
- *v_cos_bit);
- btf_32_neon_type0(-cospi[48], -cospi[16], x3[40], x3[55], x4[40], x4[55],
- *v_cos_bit);
- btf_32_neon_type0(-cospi[48], -cospi[16], x3[41], x3[54], x4[41], x4[54],
- *v_cos_bit);
- btf_32_neon_type0(-cospi[48], -cospi[16], x3[42], x3[53], x4[42], x4[53],
- *v_cos_bit);
- btf_32_neon_type0(-cospi[48], -cospi[16], x3[43], x3[52], x4[43], x4[52],
- *v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x3[59], x3[36], &x4[59], &x4[36], v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x3[58], x3[37], &x4[58], &x4[37], v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x3[57], x3[38], &x4[57], &x4[38], v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x3[56], x3[39], &x4[56], &x4[39], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x3[55], x3[40], &x4[40], &x4[55], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x3[54], x3[41], &x4[41], &x4[54], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x3[53], x3[42], &x4[42], &x4[53], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x3[52], x3[43], &x4[43], &x4[52], v_cos_bit);
x4[44] = x3[44];
x4[45] = x3[45];
x4[46] = x3[46];
@@ -2729,134 +1828,54 @@ static void av1_fdct64_new_stage12345_neon(int32x4_t *input, const int instride,
x4[63] = x3[63];
// stage 5
- x5[0] = vaddq_s32(x4[0], x4[3]);
- x5[3] = vsubq_s32(x4[0], x4[3]);
- x5[1] = vaddq_s32(x4[1], x4[2]);
- x5[2] = vsubq_s32(x4[1], x4[2]);
+ int32x4_t x5[64];
+ butterfly_dct_pre(x4, x5, 4);
x5[4] = x4[4];
-
- btf_32_neon_type0(-cospi[32], cospi[32], x4[5], x4[6], x5[5], x5[6],
- *v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x4[6], x4[5], &x5[6], &x5[5], v_cos_bit);
x5[7] = x4[7];
- x5[8] = vaddq_s32(x4[8], x4[11]);
- x5[11] = vsubq_s32(x4[8], x4[11]);
- x5[9] = vaddq_s32(x4[9], x4[10]);
- x5[10] = vsubq_s32(x4[9], x4[10]);
- x5[12] = vsubq_s32(x4[15], x4[12]);
- x5[15] = vaddq_s32(x4[15], x4[12]);
- x5[13] = vsubq_s32(x4[14], x4[13]);
- x5[14] = vaddq_s32(x4[14], x4[13]);
+ butterfly_dct_post(x4 + 8, x4 + 8, x5 + 8, 8);
x5[16] = x4[16];
x5[17] = x4[17];
-
- btf_32_neon_type0(-cospi[16], cospi[48], x4[18], x4[29], x5[18], x5[29],
- *v_cos_bit);
- btf_32_neon_type0(-cospi[16], cospi[48], x4[19], x4[28], x5[19], x5[28],
- *v_cos_bit);
- btf_32_neon_type0(-cospi[48], -cospi[16], x4[20], x4[27], x5[20], x5[27],
- *v_cos_bit);
- btf_32_neon_type0(-cospi[48], -cospi[16], x4[21], x4[26], x5[21], x5[26],
- *v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x4[29], x4[18], &x5[29], &x5[18], v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x4[28], x4[19], &x5[28], &x5[19], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x4[27], x4[20], &x5[20], &x5[27], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x4[26], x4[21], &x5[21], &x5[26], v_cos_bit);
x5[22] = x4[22];
x5[23] = x4[23];
x5[24] = x4[24];
x5[25] = x4[25];
x5[30] = x4[30];
x5[31] = x4[31];
- x5[32] = vaddq_s32(x4[32], x4[39]);
- x5[39] = vsubq_s32(x4[32], x4[39]);
- x5[33] = vaddq_s32(x4[33], x4[38]);
- x5[38] = vsubq_s32(x4[33], x4[38]);
- x5[34] = vaddq_s32(x4[34], x4[37]);
- x5[37] = vsubq_s32(x4[34], x4[37]);
- x5[35] = vaddq_s32(x4[35], x4[36]);
- x5[36] = vsubq_s32(x4[35], x4[36]);
- x5[40] = vsubq_s32(x4[47], x4[40]);
- x5[47] = vaddq_s32(x4[47], x4[40]);
- x5[41] = vsubq_s32(x4[46], x4[41]);
- x5[46] = vaddq_s32(x4[46], x4[41]);
- x5[42] = vsubq_s32(x4[45], x4[42]);
- x5[45] = vaddq_s32(x4[45], x4[42]);
- x5[43] = vsubq_s32(x4[44], x4[43]);
- x5[44] = vaddq_s32(x4[44], x4[43]);
- x5[48] = vaddq_s32(x4[48], x4[55]);
- x5[55] = vsubq_s32(x4[48], x4[55]);
- x5[49] = vaddq_s32(x4[49], x4[54]);
- x5[54] = vsubq_s32(x4[49], x4[54]);
- x5[50] = vaddq_s32(x4[50], x4[53]);
- x5[53] = vsubq_s32(x4[50], x4[53]);
- x5[51] = vaddq_s32(x4[51], x4[52]);
- x5[52] = vsubq_s32(x4[51], x4[52]);
- x5[56] = vsubq_s32(x4[63], x4[56]);
- x5[63] = vaddq_s32(x4[63], x4[56]);
- x5[57] = vsubq_s32(x4[62], x4[57]);
- x5[62] = vaddq_s32(x4[62], x4[57]);
- x5[58] = vsubq_s32(x4[61], x4[58]);
- x5[61] = vaddq_s32(x4[61], x4[58]);
- x5[59] = vsubq_s32(x4[60], x4[59]);
- x5[60] = vaddq_s32(x4[60], x4[59]);
-}
-
-static void av1_fdct64_new_neon(int32x4_t *input, int32x4_t *output,
- int8_t cos_bit, const int instride,
- const int outstride) {
- const int32_t *cospi = cospi_arr(cos_bit);
- const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
-
- int startidx = 0 * instride;
- int endidx = 63 * instride;
-
- // stage 1-2-3-4-5
- int32x4_t x5[64];
- av1_fdct64_new_stage12345_neon(input, instride, x5, cospi, &v_cos_bit,
- &startidx, &endidx);
+ butterfly_dct_post(x4 + 32, x4 + 32, x5 + 32, 16);
+ butterfly_dct_post(x4 + 48, x4 + 48, x5 + 48, 16);
// stage 6
int32x4_t x6[64];
- btf_32_neon_type0(cospi[32], cospi[32], x5[0], x5[1], x6[0], x6[1],
- v_cos_bit);
- btf_32_neon_type1(cospi[48], cospi[16], x5[2], x5[3], x6[2], x6[3],
- v_cos_bit);
- x6[4] = vaddq_s32(x5[4], x5[5]);
- x6[5] = vsubq_s32(x5[4], x5[5]);
- x6[6] = vsubq_s32(x5[7], x5[6]);
- x6[7] = vaddq_s32(x5[7], x5[6]);
+ butterfly_0112_neon(cospi, 32, x5[0], x5[1], &x6[0], &x6[1], v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x5[3], x5[2], &x6[2], &x6[3], v_cos_bit);
+ butterfly_dct_post(x5 + 4, x5 + 4, x6 + 4, 4);
x6[8] = x5[8];
- btf_32_neon_type0(-cospi[16], cospi[48], x5[9], x5[14], x6[9], x6[14],
- v_cos_bit);
- btf_32_neon_type0(-cospi[48], -cospi[16], x5[10], x5[13], x6[10], x6[13],
- v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x5[14], x5[9], &x6[14], &x6[9], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x5[13], x5[10], &x6[10], &x6[13], v_cos_bit);
x6[11] = x5[11];
x6[12] = x5[12];
x6[15] = x5[15];
- x6[16] = vaddq_s32(x5[16], x5[19]);
- x6[19] = vsubq_s32(x5[16], x5[19]);
- x6[17] = vaddq_s32(x5[17], x5[18]);
- x6[18] = vsubq_s32(x5[17], x5[18]);
- x6[20] = vsubq_s32(x5[23], x5[20]);
- x6[23] = vaddq_s32(x5[23], x5[20]);
- x6[21] = vsubq_s32(x5[22], x5[21]);
- x6[22] = vaddq_s32(x5[22], x5[21]);
- x6[24] = vaddq_s32(x5[24], x5[27]);
- x6[27] = vsubq_s32(x5[24], x5[27]);
- x6[25] = vaddq_s32(x5[25], x5[26]);
- x6[26] = vsubq_s32(x5[25], x5[26]);
- x6[28] = vsubq_s32(x5[31], x5[28]);
- x6[31] = vaddq_s32(x5[31], x5[28]);
- x6[29] = vsubq_s32(x5[30], x5[29]);
- x6[30] = vaddq_s32(x5[30], x5[29]);
+ butterfly_dct_post(x5 + 16, x5 + 16, x6 + 16, 8);
+ butterfly_dct_post(x5 + 24, x5 + 24, x6 + 24, 8);
x6[32] = x5[32];
x6[33] = x5[33];
-
- btf_32_neon_type0(-cospi[40], cospi[24], x5[42], x5[53], x6[42], x6[53],
- v_cos_bit);
- btf_32_neon_type0(-cospi[40], cospi[24], x5[43], x5[52], x6[43], x6[52],
- v_cos_bit);
- btf_32_neon_type0(-cospi[24], -cospi[40], x5[44], x5[51], x6[44], x6[51],
- v_cos_bit);
- btf_32_neon_type0(-cospi[24], -cospi[40], x5[45], x5[50], x6[45], x6[50],
- v_cos_bit);
-
+ butterfly_0112_neon(cospi, 8, x5[61], x5[34], &x6[61], &x6[34], v_cos_bit);
+ butterfly_0112_neon(cospi, 8, x5[60], x5[35], &x6[60], &x6[35], v_cos_bit);
+ butterfly_2312_neon(cospi, 8, x5[59], x5[36], &x6[36], &x6[59], v_cos_bit);
+ butterfly_2312_neon(cospi, 8, x5[58], x5[37], &x6[37], &x6[58], v_cos_bit);
+ x6[38] = x5[38];
+ x6[39] = x5[39];
+ x6[40] = x5[40];
+ x6[41] = x5[41];
+ butterfly_0130_neon(cospi, 24, x5[42], x5[53], &x6[53], &x6[42], v_cos_bit);
+ butterfly_0130_neon(cospi, 24, x5[43], x5[52], &x6[52], &x6[43], v_cos_bit);
+ butterfly_0332_neon(cospi, 24, x5[51], x5[44], &x6[51], &x6[44], v_cos_bit);
+ butterfly_0332_neon(cospi, 24, x5[50], x5[45], &x6[50], &x6[45], v_cos_bit);
x6[46] = x5[46];
x6[47] = x5[47];
x6[48] = x5[48];
@@ -2874,82 +1893,26 @@ static void av1_fdct64_new_neon(int32x4_t *input, int32x4_t *output,
x7[1] = x6[1];
x7[2] = x6[2];
x7[3] = x6[3];
- btf_32_neon_type1(cospi[24], cospi[40], x6[5], x6[6], x7[5], x7[6],
- v_cos_bit);
-
- x7[8] = vaddq_s32(x6[8], x6[9]);
- x7[9] = vsubq_s32(x6[8], x6[9]);
- x7[10] = vsubq_s32(x6[11], x6[10]);
- x7[11] = vaddq_s32(x6[11], x6[10]);
- x7[12] = vaddq_s32(x6[12], x6[13]);
- x7[13] = vsubq_s32(x6[12], x6[13]);
- x7[14] = vsubq_s32(x6[15], x6[14]);
- x7[15] = vaddq_s32(x6[15], x6[14]);
+ butterfly_0112_neon(cospi, 8, x6[7], x6[4], &x7[4], &x7[7], v_cos_bit);
+ butterfly_0130_neon(cospi, 24, x6[5], x6[6], &x7[5], &x7[6], v_cos_bit);
+ butterfly_dct_post(x6 + 8, x6 + 8, x7 + 8, 4);
+ butterfly_dct_post(x6 + 12, x6 + 12, x7 + 12, 4);
x7[16] = x6[16];
-
- btf_32_neon_type0(-cospi[40], cospi[24], x6[21], x6[26], x7[21], x7[26],
- v_cos_bit);
- btf_32_neon_type0(-cospi[24], -cospi[40], x6[22], x6[25], x7[22], x7[25],
- v_cos_bit);
+ butterfly_0112_neon(cospi, 8, x6[30], x6[17], &x7[30], &x7[17], v_cos_bit);
+ butterfly_2312_neon(cospi, 8, x6[29], x6[18], &x7[18], &x7[29], v_cos_bit);
+ x7[19] = x6[19];
+ x7[20] = x6[20];
+ butterfly_0130_neon(cospi, 24, x6[21], x6[26], &x7[26], &x7[21], v_cos_bit);
+ butterfly_0332_neon(cospi, 24, x6[25], x6[22], &x7[25], &x7[22], v_cos_bit);
x7[23] = x6[23];
x7[24] = x6[24];
x7[27] = x6[27];
x7[28] = x6[28];
x7[31] = x6[31];
-
- btf_32_neon_type0(-cospi[8], cospi[56], x5[34], x5[61], x6[34], x6[61],
- v_cos_bit);
- btf_32_neon_type0(-cospi[8], cospi[56], x5[35], x5[60], x6[35], x6[60],
- v_cos_bit);
- btf_32_neon_type0(-cospi[56], -cospi[8], x5[36], x5[59], x6[36], x6[59],
- v_cos_bit);
- btf_32_neon_type0(-cospi[56], -cospi[8], x5[37], x5[58], x6[37], x6[58],
- v_cos_bit);
- x6[38] = x5[38];
- x6[39] = x5[39];
- x6[40] = x5[40];
- x6[41] = x5[41];
-
- btf_32_neon_type1(cospi[56], cospi[8], x6[4], x6[7], x7[4], x7[7], v_cos_bit);
- btf_32_neon_type0(-cospi[8], cospi[56], x6[17], x6[30], x7[17], x7[30],
- v_cos_bit);
- btf_32_neon_type0(-cospi[56], -cospi[8], x6[18], x6[29], x7[18], x7[29],
- v_cos_bit);
- x7[19] = x6[19];
- x7[20] = x6[20];
-
- x7[32] = vaddq_s32(x6[32], x6[35]);
- x7[35] = vsubq_s32(x6[32], x6[35]);
- x7[33] = vaddq_s32(x6[33], x6[34]);
- x7[34] = vsubq_s32(x6[33], x6[34]);
- x7[36] = vsubq_s32(x6[39], x6[36]);
- x7[39] = vaddq_s32(x6[39], x6[36]);
- x7[37] = vsubq_s32(x6[38], x6[37]);
- x7[38] = vaddq_s32(x6[38], x6[37]);
- x7[40] = vaddq_s32(x6[40], x6[43]);
- x7[43] = vsubq_s32(x6[40], x6[43]);
- x7[41] = vaddq_s32(x6[41], x6[42]);
- x7[42] = vsubq_s32(x6[41], x6[42]);
- x7[44] = vsubq_s32(x6[47], x6[44]);
- x7[47] = vaddq_s32(x6[47], x6[44]);
- x7[45] = vsubq_s32(x6[46], x6[45]);
- x7[46] = vaddq_s32(x6[46], x6[45]);
- x7[48] = vaddq_s32(x6[48], x6[51]);
- x7[51] = vsubq_s32(x6[48], x6[51]);
- x7[49] = vaddq_s32(x6[49], x6[50]);
- x7[50] = vsubq_s32(x6[49], x6[50]);
- x7[52] = vsubq_s32(x6[55], x6[52]);
- x7[55] = vaddq_s32(x6[55], x6[52]);
- x7[53] = vsubq_s32(x6[54], x6[53]);
- x7[54] = vaddq_s32(x6[54], x6[53]);
- x7[56] = vaddq_s32(x6[56], x6[59]);
- x7[59] = vsubq_s32(x6[56], x6[59]);
- x7[57] = vaddq_s32(x6[57], x6[58]);
- x7[58] = vsubq_s32(x6[57], x6[58]);
- x7[60] = vsubq_s32(x6[63], x6[60]);
- x7[63] = vaddq_s32(x6[63], x6[60]);
- x7[61] = vsubq_s32(x6[62], x6[61]);
- x7[62] = vaddq_s32(x6[62], x6[61]);
+ butterfly_dct_post(x6 + 32, x6 + 32, x7 + 32, 8);
+ butterfly_dct_post(x6 + 40, x6 + 40, x7 + 40, 8);
+ butterfly_dct_post(x6 + 48, x6 + 48, x7 + 48, 8);
+ butterfly_dct_post(x6 + 56, x6 + 56, x7 + 56, 8);
// stage 8
int32x4_t x8[64];
@@ -2962,54 +1925,29 @@ static void av1_fdct64_new_neon(int32x4_t *input, int32x4_t *output,
x8[6] = x7[6];
x8[7] = x7[7];
- btf_32_neon_type1(cospi[60], cospi[4], x7[8], x7[15], x8[8], x8[15],
- v_cos_bit);
- btf_32_neon_type1(cospi[28], cospi[36], x7[9], x7[14], x8[9], x8[14],
- v_cos_bit);
- btf_32_neon_type1(cospi[44], cospi[20], x7[10], x7[13], x8[10], x8[13],
- v_cos_bit);
- btf_32_neon_type1(cospi[12], cospi[52], x7[11], x7[12], x8[11], x8[12],
- v_cos_bit);
- x8[16] = vaddq_s32(x7[16], x7[17]);
- x8[17] = vsubq_s32(x7[16], x7[17]);
- x8[18] = vsubq_s32(x7[19], x7[18]);
- x8[19] = vaddq_s32(x7[19], x7[18]);
- x8[20] = vaddq_s32(x7[20], x7[21]);
- x8[21] = vsubq_s32(x7[20], x7[21]);
- x8[22] = vsubq_s32(x7[23], x7[22]);
- x8[23] = vaddq_s32(x7[23], x7[22]);
- x8[24] = vaddq_s32(x7[24], x7[25]);
- x8[25] = vsubq_s32(x7[24], x7[25]);
- x8[26] = vsubq_s32(x7[27], x7[26]);
- x8[27] = vaddq_s32(x7[27], x7[26]);
- x8[28] = vaddq_s32(x7[28], x7[29]);
- x8[29] = vsubq_s32(x7[28], x7[29]);
- x8[30] = vsubq_s32(x7[31], x7[30]);
- x8[31] = vaddq_s32(x7[31], x7[30]);
+ butterfly_0112_neon(cospi, 4, x7[15], x7[8], &x8[8], &x8[15], v_cos_bit);
+ butterfly_0130_neon(cospi, 28, x7[9], x7[14], &x8[9], &x8[14], v_cos_bit);
+ butterfly_0112_neon(cospi, 20, x7[13], x7[10], &x8[10], &x8[13], v_cos_bit);
+ butterfly_0130_neon(cospi, 12, x7[11], x7[12], &x8[11], &x8[12], v_cos_bit);
+ butterfly_dct_post(x7 + 16, x7 + 16, x8 + 16, 4);
+ butterfly_dct_post(x7 + 20, x7 + 20, x8 + 20, 4);
+ butterfly_dct_post(x7 + 24, x7 + 24, x8 + 24, 4);
+ butterfly_dct_post(x7 + 28, x7 + 28, x8 + 28, 4);
x8[32] = x7[32];
-
- btf_32_neon_type0(-cospi[4], cospi[60], x7[33], x7[62], x8[33], x8[62],
- v_cos_bit);
- btf_32_neon_type0(-cospi[60], -cospi[4], x7[34], x7[61], x8[34], x8[61],
- v_cos_bit);
+ butterfly_0112_neon(cospi, 4, x7[62], x7[33], &x8[62], &x8[33], v_cos_bit);
+ butterfly_2312_neon(cospi, 4, x7[61], x7[34], &x8[34], &x8[61], v_cos_bit);
x8[35] = x7[35];
x8[36] = x7[36];
- btf_32_neon_type0(-cospi[36], cospi[28], x7[37], x7[58], x8[37], x8[58],
- v_cos_bit);
- btf_32_neon_type0(-cospi[28], -cospi[36], x7[38], x7[57], x8[38], x8[57],
- v_cos_bit);
+ butterfly_0130_neon(cospi, 28, x7[37], x7[58], &x8[58], &x8[37], v_cos_bit);
+ butterfly_0332_neon(cospi, 28, x7[57], x7[38], &x8[57], &x8[38], v_cos_bit);
x8[39] = x7[39];
x8[40] = x7[40];
- btf_32_neon_type0(-cospi[20], cospi[44], x7[41], x7[54], x8[41], x8[54],
- v_cos_bit);
- btf_32_neon_type0(-cospi[44], -cospi[20], x7[42], x7[53], x8[42], x8[53],
- v_cos_bit);
+ butterfly_0112_neon(cospi, 20, x7[54], x7[41], &x8[54], &x8[41], v_cos_bit);
+ butterfly_2312_neon(cospi, 20, x7[53], x7[42], &x8[42], &x8[53], v_cos_bit);
x8[43] = x7[43];
x8[44] = x7[44];
- btf_32_neon_type0(-cospi[52], cospi[12], x7[45], x7[50], x8[45], x8[50],
- v_cos_bit);
- btf_32_neon_type0(-cospi[12], -cospi[52], x7[46], x7[49], x8[46], x8[49],
- v_cos_bit);
+ butterfly_0130_neon(cospi, 12, x7[45], x7[50], &x8[50], &x8[45], v_cos_bit);
+ butterfly_0332_neon(cospi, 12, x7[49], x7[46], &x8[49], &x8[46], v_cos_bit);
x8[47] = x7[47];
x8[48] = x7[48];
x8[51] = x7[51];
@@ -3038,56 +1976,22 @@ static void av1_fdct64_new_neon(int32x4_t *input, int32x4_t *output,
x9[13] = x8[13];
x9[14] = x8[14];
x9[15] = x8[15];
-
- btf_32_neon_type1(cospi[62], cospi[2], x8[16], x8[31], x9[16], x9[31],
- v_cos_bit);
- btf_32_neon_type1(cospi[30], cospi[34], x8[17], x8[30], x9[17], x9[30],
- v_cos_bit);
- btf_32_neon_type1(cospi[46], cospi[18], x8[18], x8[29], x9[18], x9[29],
- v_cos_bit);
- btf_32_neon_type1(cospi[14], cospi[50], x8[19], x8[28], x9[19], x9[28],
- v_cos_bit);
- btf_32_neon_type1(cospi[54], cospi[10], x8[20], x8[27], x9[20], x9[27],
- v_cos_bit);
- btf_32_neon_type1(cospi[22], cospi[42], x8[21], x8[26], x9[21], x9[26],
- v_cos_bit);
- btf_32_neon_type1(cospi[38], cospi[26], x8[22], x8[25], x9[22], x9[25],
- v_cos_bit);
- btf_32_neon_type1(cospi[6], cospi[58], x8[23], x8[24], x9[23], x9[24],
- v_cos_bit);
-
- x9[32] = vaddq_s32(x8[32], x8[33]);
- x9[33] = vsubq_s32(x8[32], x8[33]);
- x9[34] = vsubq_s32(x8[35], x8[34]);
- x9[35] = vaddq_s32(x8[35], x8[34]);
- x9[36] = vaddq_s32(x8[36], x8[37]);
- x9[37] = vsubq_s32(x8[36], x8[37]);
- x9[38] = vsubq_s32(x8[39], x8[38]);
- x9[39] = vaddq_s32(x8[39], x8[38]);
- x9[40] = vaddq_s32(x8[40], x8[41]);
- x9[41] = vsubq_s32(x8[40], x8[41]);
- x9[42] = vsubq_s32(x8[43], x8[42]);
- x9[43] = vaddq_s32(x8[43], x8[42]);
- x9[44] = vaddq_s32(x8[44], x8[45]);
- x9[45] = vsubq_s32(x8[44], x8[45]);
- x9[46] = vsubq_s32(x8[47], x8[46]);
- x9[47] = vaddq_s32(x8[47], x8[46]);
- x9[48] = vaddq_s32(x8[48], x8[49]);
- x9[49] = vsubq_s32(x8[48], x8[49]);
- x9[50] = vsubq_s32(x8[51], x8[50]);
- x9[51] = vaddq_s32(x8[51], x8[50]);
- x9[52] = vaddq_s32(x8[52], x8[53]);
- x9[53] = vsubq_s32(x8[52], x8[53]);
- x9[54] = vsubq_s32(x8[55], x8[54]);
- x9[55] = vaddq_s32(x8[55], x8[54]);
- x9[56] = vaddq_s32(x8[56], x8[57]);
- x9[57] = vsubq_s32(x8[56], x8[57]);
- x9[58] = vsubq_s32(x8[59], x8[58]);
- x9[59] = vaddq_s32(x8[59], x8[58]);
- x9[60] = vaddq_s32(x8[60], x8[61]);
- x9[61] = vsubq_s32(x8[60], x8[61]);
- x9[62] = vsubq_s32(x8[63], x8[62]);
- x9[63] = vaddq_s32(x8[63], x8[62]);
+ butterfly_0112_neon(cospi, 2, x8[31], x8[16], &x9[16], &x9[31], v_cos_bit);
+ butterfly_0130_neon(cospi, 30, x8[17], x8[30], &x9[17], &x9[30], v_cos_bit);
+ butterfly_0112_neon(cospi, 18, x8[29], x8[18], &x9[18], &x9[29], v_cos_bit);
+ butterfly_0130_neon(cospi, 14, x8[19], x8[28], &x9[19], &x9[28], v_cos_bit);
+ butterfly_0112_neon(cospi, 10, x8[27], x8[20], &x9[20], &x9[27], v_cos_bit);
+ butterfly_0130_neon(cospi, 22, x8[21], x8[26], &x9[21], &x9[26], v_cos_bit);
+ butterfly_0112_neon(cospi, 26, x8[25], x8[22], &x9[22], &x9[25], v_cos_bit);
+ butterfly_0130_neon(cospi, 6, x8[23], x8[24], &x9[23], &x9[24], v_cos_bit);
+ butterfly_dct_post(x8 + 32, x8 + 32, x9 + 32, 4);
+ butterfly_dct_post(x8 + 36, x8 + 36, x9 + 36, 4);
+ butterfly_dct_post(x8 + 40, x8 + 40, x9 + 40, 4);
+ butterfly_dct_post(x8 + 44, x8 + 44, x9 + 44, 4);
+ butterfly_dct_post(x8 + 48, x8 + 48, x9 + 48, 4);
+ butterfly_dct_post(x8 + 52, x8 + 52, x9 + 52, 4);
+ butterfly_dct_post(x8 + 56, x8 + 56, x9 + 56, 4);
+ butterfly_dct_post(x8 + 60, x8 + 60, x9 + 60, 4);
// stage 10
int32x4_t x10[64];
@@ -3123,903 +2027,593 @@ static void av1_fdct64_new_neon(int32x4_t *input, int32x4_t *output,
x10[29] = x9[29];
x10[30] = x9[30];
x10[31] = x9[31];
- btf_32_neon_type1(cospi[63], cospi[1], x9[32], x9[63], x10[32], x10[63],
- v_cos_bit);
- btf_32_neon_type1(cospi[31], cospi[33], x9[33], x9[62], x10[33], x10[62],
- v_cos_bit);
- btf_32_neon_type1(cospi[47], cospi[17], x9[34], x9[61], x10[34], x10[61],
- v_cos_bit);
- btf_32_neon_type1(cospi[15], cospi[49], x9[35], x9[60], x10[35], x10[60],
- v_cos_bit);
- btf_32_neon_type1(cospi[55], cospi[9], x9[36], x9[59], x10[36], x10[59],
- v_cos_bit);
- btf_32_neon_type1(cospi[23], cospi[41], x9[37], x9[58], x10[37], x10[58],
- v_cos_bit);
- btf_32_neon_type1(cospi[39], cospi[25], x9[38], x9[57], x10[38], x10[57],
- v_cos_bit);
- btf_32_neon_type1(cospi[7], cospi[57], x9[39], x9[56], x10[39], x10[56],
- v_cos_bit);
- btf_32_neon_type1(cospi[59], cospi[5], x9[40], x9[55], x10[40], x10[55],
- v_cos_bit);
- btf_32_neon_type1(cospi[27], cospi[37], x9[41], x9[54], x10[41], x10[54],
- v_cos_bit);
- btf_32_neon_type1(cospi[43], cospi[21], x9[42], x9[53], x10[42], x10[53],
- v_cos_bit);
- btf_32_neon_type1(cospi[11], cospi[53], x9[43], x9[52], x10[43], x10[52],
- v_cos_bit);
- btf_32_neon_type1(cospi[51], cospi[13], x9[44], x9[51], x10[44], x10[51],
- v_cos_bit);
- btf_32_neon_type1(cospi[19], cospi[45], x9[45], x9[50], x10[45], x10[50],
- v_cos_bit);
- btf_32_neon_type1(cospi[35], cospi[29], x9[46], x9[49], x10[46], x10[49],
- v_cos_bit);
- btf_32_neon_type1(cospi[3], cospi[61], x9[47], x9[48], x10[47], x10[48],
- v_cos_bit);
-
- startidx = 0 * outstride;
- endidx = 63 * outstride;
- // stage 11
- output[startidx] = x10[0];
- output[endidx] = x10[63];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[32];
- output[endidx] = x10[31];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[16];
- output[endidx] = x10[47];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[48];
- output[endidx] = x10[15];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[8];
- output[endidx] = x10[55];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[40];
- output[endidx] = x10[23];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[24];
- output[endidx] = x10[39];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[56];
- output[endidx] = x10[7];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[4];
- output[endidx] = x10[59];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[36];
- output[endidx] = x10[27];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[20];
- output[endidx] = x10[43];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[52];
- output[endidx] = x10[11];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[12];
- output[endidx] = x10[51];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[44];
- output[endidx] = x10[19];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[28];
- output[endidx] = x10[35];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[60];
- output[endidx] = x10[3];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[2];
- output[endidx] = x10[61];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[34];
- output[endidx] = x10[29];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[18];
- output[endidx] = x10[45];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[50];
- output[endidx] = x10[13];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[10];
- output[endidx] = x10[53];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[42];
- output[endidx] = x10[21];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[26];
- output[endidx] = x10[37];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[58];
- output[endidx] = x10[5];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[6];
- output[endidx] = x10[57];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[38];
- output[endidx] = x10[25];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[22];
- output[endidx] = x10[41];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[54];
- output[endidx] = x10[9];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[14];
- output[endidx] = x10[49];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[46];
- output[endidx] = x10[17];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[30];
- output[endidx] = x10[33];
- startidx += outstride;
- endidx -= outstride;
- output[startidx] = x10[62];
- output[endidx] = x10[1];
-}
+ butterfly_0112_neon(cospi, 1, x9[63], x9[32], &x10[32], &x10[63], v_cos_bit);
+ butterfly_0130_neon(cospi, 31, x9[33], x9[62], &x10[33], &x10[62], v_cos_bit);
+ butterfly_0112_neon(cospi, 17, x9[61], x9[34], &x10[34], &x10[61], v_cos_bit);
+ butterfly_0130_neon(cospi, 15, x9[35], x9[60], &x10[35], &x10[60], v_cos_bit);
+ butterfly_0112_neon(cospi, 9, x9[59], x9[36], &x10[36], &x10[59], v_cos_bit);
+ butterfly_0130_neon(cospi, 23, x9[37], x9[58], &x10[37], &x10[58], v_cos_bit);
+ butterfly_0112_neon(cospi, 25, x9[57], x9[38], &x10[38], &x10[57], v_cos_bit);
+ butterfly_0130_neon(cospi, 7, x9[39], x9[56], &x10[39], &x10[56], v_cos_bit);
+ butterfly_0112_neon(cospi, 5, x9[55], x9[40], &x10[40], &x10[55], v_cos_bit);
+ butterfly_0130_neon(cospi, 27, x9[41], x9[54], &x10[41], &x10[54], v_cos_bit);
+ butterfly_0112_neon(cospi, 21, x9[53], x9[42], &x10[42], &x10[53], v_cos_bit);
+ butterfly_0130_neon(cospi, 11, x9[43], x9[52], &x10[43], &x10[52], v_cos_bit);
+ butterfly_0112_neon(cospi, 13, x9[51], x9[44], &x10[44], &x10[51], v_cos_bit);
+ butterfly_0130_neon(cospi, 19, x9[45], x9[50], &x10[45], &x10[50], v_cos_bit);
+ butterfly_0112_neon(cospi, 29, x9[49], x9[46], &x10[46], &x10[49], v_cos_bit);
+ butterfly_0130_neon(cospi, 3, x9[47], x9[48], &x10[47], &x10[48], v_cos_bit);
-void av1_idtx32_new_neon(int32x4_t *input, int32x4_t *output, int cos_bit,
- const int col_num) {
+ // stage 11
+ output[0] = x10[0];
+ output[1] = x10[32];
+ output[2] = x10[16];
+ output[3] = x10[48];
+ output[4] = x10[8];
+ output[5] = x10[40];
+ output[6] = x10[24];
+ output[7] = x10[56];
+ output[8] = x10[4];
+ output[9] = x10[36];
+ output[10] = x10[20];
+ output[11] = x10[52];
+ output[12] = x10[12];
+ output[13] = x10[44];
+ output[14] = x10[28];
+ output[15] = x10[60];
+ output[16] = x10[2];
+ output[17] = x10[34];
+ output[18] = x10[18];
+ output[19] = x10[50];
+ output[20] = x10[10];
+ output[21] = x10[42];
+ output[22] = x10[26];
+ output[23] = x10[58];
+ output[24] = x10[6];
+ output[25] = x10[38];
+ output[26] = x10[22];
+ output[27] = x10[54];
+ output[28] = x10[14];
+ output[29] = x10[46];
+ output[30] = x10[30];
+ output[31] = x10[62];
+ output[32] = x10[1];
+ output[33] = x10[33];
+ output[34] = x10[17];
+ output[35] = x10[49];
+ output[36] = x10[9];
+ output[37] = x10[41];
+ output[38] = x10[25];
+ output[39] = x10[57];
+ output[40] = x10[5];
+ output[41] = x10[37];
+ output[42] = x10[21];
+ output[43] = x10[53];
+ output[44] = x10[13];
+ output[45] = x10[45];
+ output[46] = x10[29];
+ output[47] = x10[61];
+ output[48] = x10[3];
+ output[49] = x10[35];
+ output[50] = x10[19];
+ output[51] = x10[51];
+ output[52] = x10[11];
+ output[53] = x10[43];
+ output[54] = x10[27];
+ output[55] = x10[59];
+ output[56] = x10[7];
+ output[57] = x10[39];
+ output[58] = x10[23];
+ output[59] = x10[55];
+ output[60] = x10[15];
+ output[61] = x10[47];
+ output[62] = x10[31];
+ output[63] = x10[63];
+}
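
For reference, the stage-11 store above is the 6-bit bit-reversal permutation,
output[i] = x10[bitrev6(i)], now written out explicitly in place of the old
startidx/endidx strided walk. A minimal scalar sketch of the mapping
(illustration only, not part of the patch):

    /* bitrev6(1) == 32, bitrev6(2) == 16, bitrev6(3) == 48, matching the
       assignments output[1] = x10[32], output[2] = x10[16], output[3] = x10[48]. */
    static int bitrev6(int i) {
      int r = 0;
      for (int b = 0; b < 6; ++b) r |= ((i >> b) & 1) << (5 - b);
      return r;
    }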
+
+static void highbd_fidentity32_x4_neon(const int32x4_t *input,
+ int32x4_t *output, int cos_bit) {
(void)cos_bit;
for (int i = 0; i < 32; i++) {
- output[i * col_num] = vshlq_n_s32(input[i * col_num], 2);
+ output[i] = vshlq_n_s32(input[i], 2);
}
}
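
The scalar model of this loop (matching libaom's av1_fidentity32_c) is a plain
multiply: the 32-point identity transform scales by exactly 4, so a left shift
with no rounding is sufficient.

    static void fidentity32_scalar(const int32_t *input, int32_t *output) {
      for (int i = 0; i < 32; i++) output[i] = input[i] * 4;  // == input[i] << 2
    }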
-static const fwd_transform_1d_neon col_highbd_txfm8x32_arr[TX_TYPES] = {
- av1_fdct32_new_neon, // DCT_DCT
- NULL, // ADST_DCT
- NULL, // DCT_ADST
- NULL, // ADST_ADST
- NULL, // FLIPADST_DCT
- NULL, // DCT_FLIPADST
- NULL, // FLIPADST_FLIPADST
- NULL, // ADST_FLIPADST
- NULL, // FLIPADST_ADST
- av1_idtx32_new_neon, // IDTX
- NULL, // V_DCT
- NULL, // H_DCT
- NULL, // V_ADST
- NULL, // H_ADST
- NULL, // V_FLIPADST
- NULL // H_FLIPADST
-};
-
-static const fwd_transform_1d_neon row_highbd_txfm8x32_arr[TX_TYPES] = {
- fdct16x16_neon, // DCT_DCT
- NULL, // ADST_DCT
- NULL, // DCT_ADST
- NULL, // ADST_ADST
- NULL, // FLIPADST_DCT
- NULL, // DCT_FLIPADST
- NULL, // FLIPADST_FLIPADST
- NULL, // ADST_FLIPADST
- NULL, // FLIPADST_ADST
- idtx16x16_neon, // IDTX
- NULL, // V_DCT
- NULL, // H_DCT
- NULL, // V_ADST
- NULL, // H_ADST
- NULL, // V_FLIPADST
- NULL // H_FLIPADST
-};
+TRANSFORM_COL_MANY(fdct32, 32)
+TRANSFORM_COL_MANY(fidentity32, 32)
+
+static const fwd_transform_1d_col_many_neon
+ col_highbd_txfm32_x4_arr[TX_TYPES] = {
+ highbd_fdct32_col_many_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ highbd_fidentity32_col_many_neon, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+ };
+
+TRANSFORM_ROW_MANY(fdct32, 32)
+TRANSFORM_ROW_MANY(fidentity32, 32)
+
+static const fwd_transform_1d_row_many_neon
+ row_highbd_txfm32_x4_arr[TX_TYPES] = {
+ highbd_fdct32_row_many_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ highbd_fidentity32_row_many_neon, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+ };
+
+TRANSFORM_ROW_RECT_MANY(fdct32, 32)
+TRANSFORM_ROW_RECT_MANY(fidentity32, 32)
+
+static const fwd_transform_1d_row_many_neon
+ row_rect_highbd_txfm32_x4_arr[TX_TYPES] = {
+ highbd_fdct32_row_rect_many_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ highbd_fidentity32_row_rect_many_neon, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+ };
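
The TRANSFORM_COL_MANY/TRANSFORM_ROW_MANY macro bodies fall outside this hunk;
judging from the call sites below, the generated *_col_many_neon wrappers loop
a 4-lane 1-D kernel over `howmany` column groups. A rough sketch under that
assumption (the load helper name here is hypothetical):

    static void highbd_fdct32_col_many_neon(const int16_t *input,
                                            int32x4_t *output, int stride,
                                            int cos_bit, int lr_flip,
                                            int howmany, int hm_stride) {
      for (int i = 0; i < howmany; ++i) {
        // Load one group of 4 columns, then run the 32-point kernel in place.
        load_buffer_s16_4x32(input + 4 * i, stride, output + i * hm_stride,
                             lr_flip);
        highbd_fdct32_x4_neon(output + i * hm_stride, output + i * hm_stride,
                              cos_bit);
      }
    }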
void av1_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) {
(void)bd;
- int32x4_t in[32], out[32];
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
- const int txw_idx = get_txw_idx(TX_16X8);
- const int txh_idx = get_txh_idx(TX_16X8);
- const fwd_transform_1d_neon col_txfm = col_highbd_txfm8x8_arr[tx_type];
- const fwd_transform_1d_neon row_txfm = row_highbd_txfm8x16_arr[tx_type];
- int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm8_xn_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_rect_highbd_txfm16_xn_arr[tx_type];
+ int bit = av1_fwd_cos_bit_col[2][1];
+
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
- for (int i = 0; i < 2; i++) {
- load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
- col_txfm(in, in, bit, 2);
- col_txfm_8x8_rounding(in, &v_shift1);
- transpose_8x8(in, out + i * 16);
- }
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+ // Column-wise transform.
+ int32x4_t buf0[32];
if (lr_flip) {
- flip_buf_neon(in, out, 32);
- row_txfm(in, out, bit, 2);
+ col_txfm(input, buf0 + 3 * 8, stride, bit, /*lr_flip=*/1, /*howmany=*/4,
+ /*hm_stride=*/-8);
} else {
- row_txfm(out, out, bit, 2);
+ col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/4,
+ /*hm_stride=*/8);
}
+ shift_right_2_round_s32_x4(buf0, buf0, 32);
- for (int i = 0; i < 2; i++) {
- av1_round_shift_rect_array_32_neon(out + i * 16, in, 16, -shift[2],
- NewSqrt2);
- write_buffer_8x8(in, coeff + i * 64);
- }
+ int32x4_t buf1[32];
+ transpose_arrays_s32_16x8(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bit, /*howmany=*/2, /*hm_stride=*/16, /*stride=*/8);
}
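
Note how the lr_flip branch realizes the horizontal flip purely through
addressing: with base buf0 + 3 * 8 and hm_stride == -8, column group i is
written at offset (3 - i) * 8, so the four 4-wide groups of the 16 input
columns land in reverse order. The destination offset, as a scalar model
(illustration only):

    // i = 0..3 indexes the 4-column groups; 8 rows per group for a 16x8 block.
    int dst_offset(int i, int lr_flip) { return (lr_flip ? 3 - i : i) * 8; }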
void av1_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) {
(void)bd;
- int32x4_t in[32], out[32];
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
- const int txw_idx = get_txw_idx(TX_8X16);
- const int txh_idx = get_txh_idx(TX_8X16);
- const fwd_transform_1d_neon col_txfm = col_highbd_txfm8x16_arr[tx_type];
- const fwd_transform_1d_neon row_txfm = row_highbd_txfm8x8_arr[tx_type];
- int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm16_xn_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_rect_highbd_txfm8_xn_arr[tx_type];
+ int bit = av1_fwd_cos_bit_col[1][2];
+
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
- load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
- col_txfm(in, in, bit, 2);
- const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
- col_txfm_8x16_rounding(in, &v_shift1);
- transpose_8x8(in, out);
- transpose_8x8(in + 16, out + 16);
-
- for (int i = 0; i < 2; i++) {
- row_txfm(out + i * 16, out, bit, 2);
- av1_round_shift_rect_array_32_neon(out, out, 16, -shift[2], NewSqrt2);
- write_buffer_16x8(out, coeff + i * 8, 16);
+ // Column-wise transform.
+ int32x4_t buf0[32];
+ if (lr_flip) {
+ col_txfm(input, buf0 + 16, stride, bit, /*lr_flip=*/1, /*howmany=*/2,
+ /*hm_stride=*/-16);
+ } else {
+ col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/2,
+ /*hm_stride=*/16);
}
-}
+ shift_right_2_round_s32_x4(buf0, buf0, 32);
-static INLINE void transpose_8nx8n(const int32x4_t *input, int32x4_t *output,
- const int width, const int height) {
- const int numcol = height >> 2;
- const int numrow = width >> 2;
- for (int j = 0; j < numrow; j++) {
- for (int i = 0; i < numcol; i++) {
- TRANSPOSE_4X4(input[i * width + j + (numrow * 0)],
- input[i * width + j + (numrow * 1)],
- input[i * width + j + (numrow * 2)],
- input[i * width + j + (numrow * 3)],
- output[j * height + i + (numcol * 0)],
- output[j * height + i + (numcol * 1)],
- output[j * height + i + (numcol * 2)],
- output[j * height + i + (numcol * 3)]);
- }
- }
+ int32x4_t buf1[32];
+ transpose_arrays_s32_8x16(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bit, /*howmany=*/4, /*hm_stride=*/8, /*stride=*/16);
}
#if !CONFIG_REALTIME_ONLY
void av1_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) {
(void)bd;
-
- int32x4_t in[16];
- int32x4_t *outcoeff128 = (int32x4_t *)coeff;
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16];
- const int txw_idx = get_txw_idx(TX_4X16);
- const int txh_idx = get_txh_idx(TX_4X16);
- const int txfm_size_col = tx_size_wide[TX_4X16];
- const int txfm_size_row = tx_size_high[TX_4X16];
- int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const fwd_transform_1d_neon col_txfm = col_highbd_txfm8x16_arr[tx_type];
- const fwd_transform_1d_neon row_txfm = row_highbd_txfm4x4_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[0][2];
+ int bitrow = av1_fwd_cos_bit_row[0][2];
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm16_xn_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_highbd_txfm4_xn_arr[tx_type];
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- // col transform
- int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
- load_buffer_4x16(input, in, stride, ud_flip, lr_flip, &v_shift0);
- col_txfm(in, outcoeff128, bitcol, 1);
- const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
- col_txfm_8x8_rounding(outcoeff128, &v_shift1);
- transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
-
- // row transform
- for (int i = 0; i < txfm_size_col; i++) {
- int32x4_t tmp[4];
- row_txfm(in + i, tmp, bitrow, txfm_size_row >> 2);
- store_output_w4(coeff + i * 4, tmp, txfm_size_row, txfm_size_col);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+
+ // Column-wise transform.
+ int32x4_t buf0[16];
+ if (lr_flip) {
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/1, /*howmany=*/1,
+ /*hm_stride=*/0);
+ } else {
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/1,
+ /*hm_stride=*/0);
}
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+
+ int32x4_t buf1[16];
+ transpose_arrays_s32_4x16(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/4, /*stride=*/16);
}
#endif
void av1_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) {
(void)bd;
+ int bitcol = av1_fwd_cos_bit_col[2][0];
+ int bitrow = av1_fwd_cos_bit_row[2][0];
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm4_xn_arr[tx_type];
+ const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm16_xn_arr[tx_type];
- int32x4_t in[16];
- int32x4_t *outcoeff128 = (int32x4_t *)coeff;
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4];
- const int txw_idx = get_txw_idx(TX_16X4);
- const int txh_idx = get_txh_idx(TX_16X4);
- const int txfm_size_col = tx_size_wide[TX_16X4];
- const int txfm_size_row = tx_size_high[TX_16X4];
- int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const fwd_transform_1d_neon col_txfm = col_highbd_txfm4x4_arr[tx_type];
- const fwd_transform_1d_neon row_txfm = row_highbd_txfm8x16_arr[tx_type];
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
- // col transform
- const int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
- load_buffer_16x4(input, in, stride, ud_flip, lr_flip, &v_shift0);
-
- for (int i = 0; i < (txfm_size_col >> 2); i++) {
- int32x4_t *cur_in = &in[i * txfm_size_row];
- col_txfm(cur_in, cur_in, bitcol, 1);
- transpose_4x4(cur_in, cur_in);
+ // Column-wise transform.
+ int32x4_t buf0[16];
+ if (lr_flip) {
+ col_txfm(input, buf0 + 3 * 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/4,
+ /*hm_stride=*/-4);
+ } else {
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4,
+ /*hm_stride=*/4);
}
- const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
- col_txfm_8x8_rounding(in, &v_shift1);
- // row transform
- row_txfm(in, outcoeff128, bitrow, 1);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_4x16(buf0, buf0);
+
+ // Row-wise transform.
+ row_txfm(buf0, coeff, bitrow, /*stride=*/4);
}
void av1_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) {
(void)bd;
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm32_x4_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_rect_highbd_txfm16_xn_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[2][3];
+ int bitrow = av1_fwd_cos_bit_row[2][3];
- int32x4_t in[128];
- int32x4_t *outcoef128 = (int32x4_t *)coeff;
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32];
- const int txw_idx = get_txw_idx(TX_16X32);
- const int txh_idx = get_txh_idx(TX_16X32);
- const fwd_transform_1d_neon col_txfm = col_highbd_txfm8x32_arr[tx_type];
- const fwd_transform_1d_neon row_txfm = row_highbd_txfm8x32_arr[tx_type];
- int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ // Column-wise transform.
+ int32x4_t buf0[128];
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4,
+ /*hm_stride=*/32);
+ shift_right_4_round_s32_x4(buf0, buf0, 128);
- // column transform
- load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
- load_buffer_16x16(input + 16 * stride, in + 64, stride, 0, 0, shift[0]);
+ int32x4_t buf1[128];
+ transpose_arrays_s32_16x32(buf0, buf1);
- for (int i = 0; i < 4; i++) {
- col_txfm((in + i), (in + i), bitcol, 4);
- }
-
- const int32x4_t v_shift = vdupq_n_s32(shift[1]);
- col_txfm_16x16_rounding(&in[0], &v_shift);
- col_txfm_16x16_rounding(&in[64], &v_shift);
- transpose_8nx8n(in, outcoef128, 16, 32);
-
- // row transform
- row_txfm(outcoef128, in, bitrow, 8);
- av1_round_shift_rect_array_32_neon(in, outcoef128, 128, -shift[2], NewSqrt2);
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/16, /*stride=*/32);
}
void av1_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) {
- (void)tx_type;
(void)bd;
+ (void)tx_type;
+ int bitcol = av1_fwd_cos_bit_col[3][4];
+ int bitrow = av1_fwd_cos_bit_row[3][4];
- int32x4_t in[512];
- int32x4_t *outcoef128 = (int32x4_t *)coeff;
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X64];
- const int txw_idx = get_txw_idx(TX_32X64);
- const int txh_idx = get_txh_idx(TX_32X64);
- const int txfm_size_col = tx_size_wide[TX_32X64];
- const int txfm_size_row = tx_size_high[TX_32X64];
- int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const int num_row = txfm_size_row >> 2;
- const int num_col = txfm_size_col >> 2;
-
- // column transform
- load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
- for (int i = 0; i < num_col; i++) {
- av1_fdct64_new_neon((in + i), (in + i), bitcol, num_col, num_col);
+ // Column-wise transform.
+ int32x4_t buf0[512];
+ load_buffer_32x64(input, buf0, stride, 0);
+ for (int i = 0; i < 8; i++) {
+ highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol);
}
+ shift_right_2_round_s32_x4(buf0, buf0, 512);
- const int32x4_t v_shift = vdupq_n_s32(shift[1]);
- for (int i = 0; i < num_col; i++) {
- col_txfm_16x16_rounding((in + i * txfm_size_row), &v_shift);
- }
- transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+ int32x4_t buf1[512];
+ transpose_arrays_s32_32x64(buf0, buf1);
- // row transform
- for (int i = 0; i < num_row; i++) {
- av1_fdct32_new_neon((outcoef128 + i), (in + i), bitrow, num_row);
- }
- for (int i = 0; i < txfm_size_col; i++) {
- av1_round_shift_rect_array_32_neon(in + i * 16, outcoef128 + i * 8, 8,
- -shift[2], NewSqrt2);
+ // Row-wise transform.
+ for (int i = 0; i < 16; i++) {
+ highbd_fdct32_x4_neon(buf1 + i * 32, buf1 + i * 32, bitrow);
}
+ round_shift2_rect_array_s32_neon(buf1, buf1, 512);
+ store_buffer_32x32(buf1, coeff, /*stride=*/32);
}
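
round_shift2_rect_array_s32_neon presumably fuses the old -shift[2] rounding
with the NewSqrt2 rescale that 2:1 rectangular transforms need. Assuming
libaom's NewSqrt2 == 5793 and NewSqrt2Bits == 12, the per-element arithmetic
would be (a sketch, not the verified kernel):

    // y = round(round(x / 4) * 5793 / 4096)  ~=  x * sqrt(2) / 4
    static int32_t round_shift2_rect(int32_t x) {
      const int32_t v = (x + 2) >> 2;
      return (int32_t)(((int64_t)v * 5793 + 2048) >> 12);
    }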
void av1_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) {
+ (void)bd;
(void)tx_type;
- int32x4_t in[512];
- int32x4_t *outcoef128 = (int32x4_t *)coeff;
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X32];
- const int txw_idx = get_txw_idx(TX_64X32);
- const int txh_idx = get_txh_idx(TX_64X32);
- const int txfm_size_col = tx_size_wide[TX_64X32];
- const int txfm_size_row = tx_size_high[TX_64X32];
- int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const int num_row = txfm_size_row >> 2;
- const int num_col = txfm_size_col >> 2;
-
- // column transform
- const int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
- for (int i = 0; i < 32; i++) {
- load_buffer_4x4(input + 0 + i * stride, in + 0 + i * 16, 4, 0, 0,
- &v_shift0);
- load_buffer_4x4(input + 16 + i * stride, in + 4 + i * 16, 4, 0, 0,
- &v_shift0);
- load_buffer_4x4(input + 32 + i * stride, in + 8 + i * 16, 4, 0, 0,
- &v_shift0);
- load_buffer_4x4(input + 48 + i * stride, in + 12 + i * 16, 4, 0, 0,
- &v_shift0);
+ int bitcol = av1_fwd_cos_bit_col[4][3];
+ int bitrow = av1_fwd_cos_bit_row[4][3];
+
+ // Column-wise transform.
+ int32x4_t buf0[512];
+ load_buffer_64x32(input, buf0, stride, 0);
+ for (int i = 0; i < 16; i++) {
+ highbd_fdct32_x4_neon(buf0 + i * 32, buf0 + i * 32, bitcol);
}
+ shift_right_4_round_s32_x4(buf0, buf0, 512);
- for (int i = 0; i < num_col; i++) {
- av1_fdct32_new_neon((in + i), (in + i), bitcol, num_col);
- }
-
- const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
- for (int i = 0; i < num_row; i++) {
- col_txfm_16x16_rounding((in + i * txfm_size_col), &v_shift1);
- }
- transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+ int32x4_t buf1[512];
+ transpose_arrays_s32_64x32(buf0, buf1);
- // row transform
- for (int i = 0; i < num_row; i++) {
- av1_fdct64_new_neon((outcoef128 + i), (in + i), bitrow, num_row, num_row);
+ // Row-wise transform.
+ for (int i = 0; i < 8; i++) {
+ highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow);
}
- av1_round_shift_rect_array_32_neon(in, outcoef128, 512, -shift[2], NewSqrt2);
- (void)bd;
+ round_shift2_rect_array_s32_neon(buf1, buf1, 512);
+ store_buffer_64x32(buf1, coeff, /*stride=*/32);
}
void av1_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) {
- int32x4_t in[128];
- int32x4_t *outcoef128 = (int32x4_t *)coeff;
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
- const int txw_idx = get_txw_idx(TX_32X16);
- const int txh_idx = get_txh_idx(TX_32X16);
- const fwd_transform_1d_neon col_txfm = row_highbd_txfm8x32_arr[tx_type];
- const fwd_transform_1d_neon row_txfm = col_highbd_txfm8x32_arr[tx_type];
- int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-
- // column transform
- load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 16);
- col_txfm(in, in, bitcol, 8);
- const int32x4_t v_shift = vdupq_n_s32(shift[1]);
- col_txfm_16x16_rounding(&in[0], &v_shift);
- col_txfm_16x16_rounding(&in[64], &v_shift);
- transpose_8nx8n(in, outcoef128, 32, 16);
-
- // row transform
- for (int i = 0; i < 4; i++) {
- row_txfm((outcoef128 + i), (in + i), bitrow, 4);
- }
- av1_round_shift_rect_array_32_neon(in, outcoef128, 128, -shift[2], NewSqrt2);
(void)bd;
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm16_xn_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_rect_highbd_txfm32_x4_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[3][2];
+ int bitrow = av1_fwd_cos_bit_row[3][2];
+
+ // Column-wise transform.
+ int32x4_t buf0[128];
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8,
+ /*hm_stride=*/16);
+ shift_right_4_round_s32_x4(buf0, buf0, 128);
+
+ int32x4_t buf1[128];
+ transpose_arrays_s32_32x16(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/32, /*stride=*/16);
}
#if !CONFIG_REALTIME_ONLY
void av1_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) {
- int32x4_t in[64];
- int32x4_t *outcoef128 = (int32x4_t *)coeff;
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
- const int txw_idx = get_txw_idx(TX_8X32);
- const int txh_idx = get_txh_idx(TX_8X32);
- const fwd_transform_1d_neon col_txfm = col_highbd_txfm8x32_arr[tx_type];
- const fwd_transform_1d_neon row_txfm = row_highbd_txfm32x8_arr[tx_type];
- int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-
- const int txfm_size_col = tx_size_wide[TX_8X32];
- const int txfm_size_row = tx_size_high[TX_8X32];
- const int num_col = txfm_size_col >> 2;
-
- // column transform
- load_buffer_8x16(input, in, stride, 0, 0, shift[0]);
- load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + txfm_size_row,
- stride, 0, 0, shift[0]);
-
- for (int i = 0; i < num_col; i++) {
- col_txfm((in + i), (in + i), bitcol, num_col);
- }
+ (void)bd;
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm32_x4_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_highbd_txfm8_xn_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[1][3];
+ int bitrow = av1_fwd_cos_bit_row[1][3];
- const int32x4_t v_shift = vdupq_n_s32(shift[1]);
- col_txfm_16x16_rounding(in, &v_shift);
- transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+ // Column-wise transform.
+ int32x4_t buf0[64];
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2,
+ /*hm_stride=*/32);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
- // row transform
- for (int i = 0; i < txfm_size_col; i += 2) {
- row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, txfm_size_col);
- }
- (void)bd;
+ int32x4_t buf1[64];
+ transpose_arrays_s32_8x32(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/8, /*stride=*/32);
}
void av1_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) {
- int32x4_t in[64];
- int32x4_t *outcoef128 = (int32x4_t *)coeff;
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
- const int txw_idx = get_txw_idx(TX_32X8);
- const int txh_idx = get_txh_idx(TX_32X8);
- const fwd_transform_1d_neon col_txfm = row_highbd_txfm32x8_arr[tx_type];
- const fwd_transform_1d_neon row_txfm = col_highbd_txfm8x32_arr[tx_type];
- int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-
- const int txfm_size_col = tx_size_wide[TX_32X8];
- const int txfm_size_row = tx_size_high[TX_32X8];
- const int num_col = txfm_size_row >> 2;
-
- // column transform
- load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 8);
- for (int i = 0; i < txfm_size_row; i += 2) {
- col_txfm((in + i), (in + i), bitcol, txfm_size_row);
- }
+ (void)bd;
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm8_xn_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_highbd_txfm32_x4_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[3][1];
+ int bitrow = av1_fwd_cos_bit_row[3][1];
- const int32x4_t v_shift = vdupq_n_s32(shift[1]);
- col_txfm_16x16_rounding(&in[0], &v_shift);
- transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+ // Column-wise transform.
+ int32x4_t buf0[64];
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8,
+ /*hm_stride=*/8);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
- // row transform
- for (int i = 0; i < num_col; i++) {
- row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, num_col);
- }
- (void)bd;
+ int32x4_t buf1[64];
+ transpose_arrays_s32_32x8(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/32, /*stride=*/8);
}
#endif
void av1_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) {
- int32x4_t in[8];
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8];
- const int txw_idx = get_txw_idx(TX_4X8);
- const int txh_idx = get_txh_idx(TX_4X8);
- const int txfm_size_col = tx_size_wide[TX_4X8];
- const int txfm_size_row = tx_size_high[TX_4X8];
- int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const fwd_transform_1d_neon col_txfm = col_highbd_txfm4x8_arr[tx_type];
- const fwd_transform_1d_neon row_txfm = row_highbd_txfm4x4_arr[tx_type];
+ (void)bd;
+ int bitcol = av1_fwd_cos_bit_col[0][1];
+ int bitrow = av1_fwd_cos_bit_row[0][1];
+ const fwd_transform_1d_col_neon col_txfm = col_highbd_txfm8_x4_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_rect_highbd_txfm4_xn_arr[tx_type];
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
- load_buffer_4x8(input, in, stride, ud_flip, lr_flip, &v_shift0);
- col_txfm(in, in, bitcol, 1);
- int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
- col_txfm_4x8_rounding(in, &v_shift1);
-
- for (int i = 0; i < 2; i++) {
- int32x4_t *cur_in = &in[i * 4];
- transpose_4x4(cur_in, cur_in);
- row_txfm(cur_in, cur_in, bitrow, 1);
- av1_round_shift_rect_array_32_neon(cur_in, cur_in, txfm_size_col, -shift[2],
- NewSqrt2);
- store_output_w4(coeff + i * 4, cur_in, txfm_size_row, 4);
- }
- (void)bd;
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+
+ // Column-wise transform.
+ int32x4_t buf0[8];
+ col_txfm(input, buf0, stride, bitcol, lr_flip);
+ shift_right_1_round_s32_x4(buf0, buf0, 8);
+
+ int32x4_t buf1[8];
+ transpose_arrays_s32_4x8(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/4, /*stride=*/8);
}
void av1_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) {
- int32x4_t in[8];
- int32x4_t *outcoeff128 = (int32x4_t *)coeff;
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4];
- const int txw_idx = get_txw_idx(TX_8X4);
- const int txh_idx = get_txh_idx(TX_8X4);
- const int txfm_size_col = tx_size_wide[TX_8X4];
- const int txfm_size_row = tx_size_high[TX_8X4];
- int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
- const fwd_transform_1d_neon col_txfm = col_highbd_txfm4x4_arr[tx_type];
- const fwd_transform_1d_neon row_txfm = row_highbd_txfm4x8_arr[tx_type];
+ (void)bd;
+ const int bitcol = av1_fwd_cos_bit_col[1][0];
+ const int bitrow = av1_fwd_cos_bit_row[1][0];
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm4_xn_arr[tx_type];
+ const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm8_x4_arr[tx_type];
+
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  // col transform
- int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
- load_buffer_8x4(input, in, stride, ud_flip, lr_flip, &v_shift0);
- for (int i = 0; i < 2; i++) {
- int32x4_t *cur_in = &in[i * txfm_size_row];
- col_txfm(cur_in, cur_in, bitcol, 1);
- transpose_4x4(cur_in, cur_in);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
+
+ // Column-wise transform.
+ int32x4_t buf0[8];
+ if (lr_flip) {
+ col_txfm(input, buf0 + 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/2,
+ /*hm_stride=*/-4);
+ } else {
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2,
+ /*hm_stride=*/4);
}
- int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
- col_txfm_4x8_rounding(in, &v_shift1);
-  // row transform
- row_txfm(in, outcoeff128, bitrow, 1);
- av1_round_shift_rect_array_32_neon(outcoeff128, outcoeff128, txfm_size_col,
- -shift[2], NewSqrt2);
- (void)bd;
+ shift_right_1_round_s32_x4(buf0, buf0, 8);
+
+ int32x4_t buf1[8];
+ transpose_arrays_s32_8x4(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bitrow, /*stride=*/4);
}
#if !CONFIG_REALTIME_ONLY
void av1_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) {
- int32x4_t in[256];
- int32x4_t *outcoeff128 = (int32x4_t *)coeff;
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X64];
- const int txw_idx = get_txw_idx(TX_16X64);
- const int txh_idx = get_txh_idx(TX_16X64);
- const int txfm_size_col = tx_size_wide[TX_16X64];
- const int txfm_size_row = tx_size_high[TX_16X64];
- int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ (void)bd;
+ const int bitcol = av1_fwd_cos_bit_col[2][4];
+ const int bitrow = av1_fwd_cos_bit_row[2][4];
+
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- const int num_col = txfm_size_col >> 2;
-  // col transform
- const int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
- for (int i = 0; i < txfm_size_row; i += num_col) {
- load_buffer_4x4(input + (i + 0) * stride, in + (i + 0) * num_col, num_col,
- ud_flip, lr_flip, &v_shift0);
- load_buffer_4x4(input + (i + 1) * stride, in + (i + 1) * num_col, num_col,
- ud_flip, lr_flip, &v_shift0);
- load_buffer_4x4(input + (i + 2) * stride, in + (i + 2) * num_col, num_col,
- ud_flip, lr_flip, &v_shift0);
- load_buffer_4x4(input + (i + 3) * stride, in + (i + 3) * num_col, num_col,
- ud_flip, lr_flip, &v_shift0);
- }
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 64);
- for (int i = 0; i < num_col; i++) {
- av1_fdct64_new_neon(in + i, outcoeff128 + i, bitcol, num_col, num_col);
+ // Column-wise transform.
+ int32x4_t buf0[256];
+ load_buffer_16x64(input, buf0, stride, lr_flip);
+ for (int i = 0; i < 4; i++) {
+ highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol);
}
+ shift_right_2_round_s32_x4(buf0, buf0, 256);
- const int32x4_t v_shift = vdupq_n_s32(shift[1]);
- col_txfm_16x16_rounding(outcoeff128, &v_shift);
- col_txfm_16x16_rounding(outcoeff128 + 64, &v_shift);
- col_txfm_16x16_rounding(outcoeff128 + 128, &v_shift);
- col_txfm_16x16_rounding(outcoeff128 + 192, &v_shift);
+ int32x4_t buf1[256];
+ transpose_arrays_s32_16x64(buf0, buf1);
- transpose_8nx8n(outcoeff128, in, txfm_size_col, 32);
- fdct16x16_neon(in, outcoeff128, bitrow, 8);
- (void)bd;
+ // Row-wise transform.
+ highbd_fdct16_xn_neon(buf1, buf1, bitrow, 8);
+ store_buffer_16x32(buf1, coeff, /*stride=*/32);
}
void av1_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) {
- int32x4_t in[256];
- int32x4_t *outcoeff128 = (int32x4_t *)coeff;
- const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X16];
- const int txw_idx = get_txw_idx(TX_64X16);
- const int txh_idx = get_txh_idx(TX_64X16);
- const int txfm_size_col = tx_size_wide[TX_64X16];
- const int txfm_size_row = tx_size_high[TX_64X16];
- int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
- int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ (void)bd;
+ const int bitcol = av1_fwd_cos_bit_col[4][2];
+ const int bitrow = av1_fwd_cos_bit_row[4][2];
+
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  // col transform
- const int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
- for (int i = 0; i < txfm_size_row; i++) {
- load_buffer_4x4(input + 0 + i * stride, in + 0 + i * txfm_size_row, 4,
- ud_flip, lr_flip, &v_shift0);
- load_buffer_4x4(input + 16 + i * stride, in + 4 + i * txfm_size_row, 4,
- ud_flip, lr_flip, &v_shift0);
- load_buffer_4x4(input + 32 + i * stride, in + 8 + i * txfm_size_row, 4,
- ud_flip, lr_flip, &v_shift0);
- load_buffer_4x4(input + 48 + i * stride, in + 12 + i * txfm_size_row, 4,
- ud_flip, lr_flip, &v_shift0);
- }
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+
+ // Column-wise transform.
+ int32x4_t buf0[256];
+ load_buffer_64x16(input, buf0, stride, lr_flip);
+ highbd_fdct16_xn_neon(buf0, buf0, bitcol, 16);
+ shift_right_4_round_s32_x4(buf0, buf0, 256);
- fdct16x16_neon(in, outcoeff128, bitcol, txfm_size_row);
- const int32x4_t v_shift = vdupq_n_s32(shift[1]);
- col_txfm_16x16_rounding(outcoeff128, &v_shift);
- col_txfm_16x16_rounding(outcoeff128 + 64, &v_shift);
- col_txfm_16x16_rounding(outcoeff128 + 128, &v_shift);
- col_txfm_16x16_rounding(outcoeff128 + 192, &v_shift);
+ int32x4_t buf1[256];
+ transpose_arrays_s32_64x16(buf0, buf1);
- transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
+ // Row-wise transform.
for (int i = 0; i < 4; i++) {
- av1_fdct64_new_neon(in + i, outcoeff128 + i, bitrow, 4, 4);
+ highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow);
}
- memset(coeff + txfm_size_row * 32, 0, txfm_size_row * 32 * sizeof(*coeff));
- (void)bd;
+ store_buffer_64x16(buf1, coeff, /*stride=*/16);
+ memset(coeff + 16 * 32, 0, 16 * 32 * sizeof(*coeff));
}
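
The trailing memset is AV1's 64-point truncation at work: only the 32
lowest-frequency outputs of each 64-point row transform are coded, so the
first 16 * 32 == 512 coefficients are kept and the following 16 * 32 entries
of the buffer are cleared.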
#endif
-static void fdct64_new_neon(int32x4_t *input, int32x4_t *output,
- const int8_t cos_bit, const int8_t *stage_range) {
- const int txfm_size = 64;
- const int num_per_128 = 4;
- int col_num = txfm_size / num_per_128;
- (void)stage_range;
- for (int col = 0; col < col_num; col++) {
- av1_fdct64_new_neon((input + col), (output + col), cos_bit, col_num,
- col_num);
- }
-}
+void av1_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm32_x4_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_highbd_txfm32_x4_arr[tx_type];
-static void fdct32_new_neon(int32x4_t *input, int32x4_t *output,
- const int8_t cos_bit, const int8_t *stage_range) {
- const int txfm_size = 32;
- const int num_per_128 = 4;
- int col_num = txfm_size / num_per_128;
- int col;
- (void)stage_range;
- for (col = 0; col < col_num; col++) {
- av1_fdct32_new_neon((input + col), (output + col), cos_bit, col_num);
- }
-}
+ // Column-wise transform.
+ int32x4_t buf0[256];
+ col_txfm(input, buf0, stride, /*cos_bit=*/12, /*lr_flip=*/0, /*howmany=*/8,
+ /*hm_stride=*/32);
+ shift_right_4_round_s32_x4(buf0, buf0, 256);
-static void idtx32x32_neon(int32x4_t *input, int32x4_t *output,
- const int8_t cos_bit, const int8_t *stage_range) {
- (void)stage_range;
+ int32x4_t buf1[256];
+ transpose_arrays_s32_32x32(buf0, buf1);
- for (int i = 0; i < 8; i++) {
- av1_idtx32_new_neon(&input[i * 32], &output[i * 32], cos_bit, 1);
- }
+ // Row-wise transform.
+ row_txfm(buf1, output, /*cos_bit=*/12, /*howmany=*/8, /*hm_stride=*/32,
+ /*stride=*/32);
}
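
Both passes here run the same 32-point kernels with cos_bit hard-coded to 12,
presumably the value the av1_fwd_cos_bit_col/row tables yield for 32x32, so
the table lookup is skipped. The buffer arithmetic checks out: 8 groups * 4
lanes cover all 32 columns, and buf0[256] holds the full 32 * 32 / 4 vectors.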
-typedef void (*TxfmFuncNEON)(int32x4_t *input, int32x4_t *output,
- const int8_t cos_bit, const int8_t *stage_range);
+void av1_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
-static INLINE TxfmFuncNEON fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
- switch (txfm_type) {
- case TXFM_TYPE_DCT32: return fdct32_new_neon;
- case TXFM_TYPE_DCT64: return fdct64_new_neon;
- case TXFM_TYPE_IDENTITY32: return idtx32x32_neon;
- default: assert(0);
+ // Column-wise transform.
+ int32x4_t buf0[1024];
+ load_buffer_64x64(input, buf0, stride, 0);
+ for (int col = 0; col < 16; col++) {
+ highbd_fdct64_x4_neon(buf0 + col * 64, buf0 + col * 64, 13);
}
- return NULL;
-}
+ shift_right_2_round_s32_x4(buf0, buf0, 1024);
-static INLINE void int16_array_with_stride_to_int32_array_without_stride(
- const int16_t *input, int stride, int32_t *output, int txfm1d_size) {
- int r, c;
- for (r = 0; r < txfm1d_size; r++) {
- for (c = 0; c < txfm1d_size; c++) {
- output[r * txfm1d_size + c] = (int32_t)input[r * stride + c];
- }
- }
-}
+ int32x4_t buf1[1024];
+ transpose_arrays_s32_64x64(buf0, buf1);
-static INLINE void av1_round_shift_array_32_neon(int32x4_t *input,
- int32x4_t *output,
- const int size,
- const int bit) {
- const int32x4_t v_bit = vdupq_n_s32(-bit);
- for (int i = 0; i < size; i++) output[i] = vrshlq_s32(input[i], v_bit);
-}
-
-static INLINE void transpose_32_4x4(int stride, const int32x4_t *input,
- int32x4_t *output) {
- int32x4x2_t temp01 = vzipq_s32(input[0 * stride], input[2 * stride]);
- int32x4x2_t temp23 = vzipq_s32(input[1 * stride], input[3 * stride]);
-
- const int32x4x2_t output01 = vzipq_s32(temp01.val[0], temp23.val[0]);
- const int32x4x2_t output23 = vzipq_s32(temp01.val[1], temp23.val[1]);
-
- output[0 * stride] = output01.val[0];
- output[1 * stride] = output01.val[1];
- output[2 * stride] = output23.val[0];
- output[3 * stride] = output23.val[1];
-}
-
-static INLINE void transpose_32(int txfm_size, const int32x4_t *input,
- int32x4_t *output) {
- const int num_per_128 = 4;
- const int row_size = txfm_size;
- const int col_size = txfm_size / num_per_128;
- int r, c;
-
- // transpose each 4x4 block internally
- for (r = 0; r < row_size; r += 4) {
- for (c = 0; c < col_size; c++) {
- transpose_32_4x4(col_size, &input[r * col_size + c],
- &output[c * 4 * col_size + r / 4]);
- }
- }
-}
-
-static INLINE void fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
- const int stride,
- const TXFM_2D_FLIP_CFG *cfg,
- int32_t *txfm_buf) {
- assert(cfg->tx_size < TX_SIZES);
- const int txfm_size = tx_size_wide[cfg->tx_size];
- const int8_t *shift = cfg->shift;
- const int8_t *stage_range_col = cfg->stage_range_col;
- const int8_t cos_bit_col = cfg->cos_bit_col;
- const int8_t cos_bit_row = cfg->cos_bit_row;
- const TxfmFuncNEON txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
- int32x4_t *buf_128 = (int32x4_t *)txfm_buf;
- int32x4_t *out_128 = (int32x4_t *)output;
-
- const int num_per_128 = 4;
- int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
- int col_num = txfm_size / num_per_128;
-
- int16_array_with_stride_to_int32_array_without_stride(input, stride, output,
- txfm_size);
- /*col wise transform*/
- txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
- av1_round_shift_array_32_neon(buf_128, out_128, txfm2d_size_128, -shift[1]);
- transpose_32(txfm_size, out_128, buf_128);
-
- /*row wise transform*/
- for (int col = 0; col < (col_num >> 1); col++) {
- av1_fdct64_new_neon((buf_128 + col), (out_128 + col), cos_bit_row, col_num,
- (col_num >> 1));
+ // Row-wise transform.
+ for (int col = 0; col < 8; col++) {
+ highbd_fdct64_x4_neon(buf1 + col * 64, buf1 + col * 64, 10);
}
-
- txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1);
- av1_round_shift_array_32_neon(out_128, out_128, txfm2d_size_128, -shift[2]);
-}
-
-static INLINE void fwd_txfm2d_neon(const int16_t *input, int32_t *output,
- const int stride,
- const TXFM_2D_FLIP_CFG *cfg,
- int32_t *txfm_buf) {
- assert(cfg->tx_size < TX_SIZES);
- const int txfm_size = tx_size_wide[cfg->tx_size];
- const int8_t *shift = cfg->shift;
- const int8_t *stage_range_col = cfg->stage_range_col;
- const int8_t *stage_range_row = cfg->stage_range_row;
- const int8_t cos_bit_col = cfg->cos_bit_col;
- const int8_t cos_bit_row = cfg->cos_bit_row;
- const TxfmFuncNEON txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
- const TxfmFuncNEON txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
-
- int32x4_t *buf_128 = (int32x4_t *)txfm_buf;
- int32x4_t *out_128 = (int32x4_t *)output;
- int num_per_128 = 4;
- int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
-
- int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
- txfm_size);
- av1_round_shift_array_32_neon(buf_128, out_128, txfm2d_size_128, -shift[0]);
- txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
- av1_round_shift_array_32_neon(buf_128, out_128, txfm2d_size_128, -shift[1]);
- transpose_32(txfm_size, out_128, buf_128);
- txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row);
- av1_round_shift_array_32_neon(out_128, out_128, txfm2d_size_128, -shift[2]);
-}
-
-void av1_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
- int stride, TX_TYPE tx_type, int bd) {
- DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]);
- TXFM_2D_FLIP_CFG cfg;
- av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg);
- (void)bd;
- fwd_txfm2d_neon(input, output, stride, &cfg, txfm_buf);
-}
-
-void av1_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
- int stride, TX_TYPE tx_type, int bd) {
- DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]);
- TXFM_2D_FLIP_CFG cfg;
- av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg);
- (void)bd;
- fwd_txfm2d_64x64_neon(input, output, stride, &cfg, txfm_buf);
+ shift_right_2_round_s32_x4(buf1, buf1, 512);
+ store_buffer_64x32(buf1, output, /*stride=*/32);
}
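
The asymmetry between the two passes above is again the 64-point truncation:
the column pass covers all 16 four-lane groups (16 * 4 == 64 columns), but
after the transpose only 8 groups (32 lanes) are row-transformed, and only
512 of the 1024 vectors are rounded and stored, since AV1 keeps just the
low-frequency half of each 64-point dimension.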
diff --git a/av1/encoder/arm/neon/highbd_pickrst_neon.c b/av1/encoder/arm/neon/highbd_pickrst_neon.c
new file mode 100644
index 000000000..76e0344fa
--- /dev/null
+++ b/av1/encoder/arm/neon/highbd_pickrst_neon.c
@@ -0,0 +1,741 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <stdint.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "av1/encoder/arm/neon/pickrst_neon.h"
+#include "av1/encoder/pickrst.h"
+
+static INLINE void highbd_calc_proj_params_r0_r1_neon(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ assert(width % 8 == 0);
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+
+ int64x2_t h00_lo = vdupq_n_s64(0);
+ int64x2_t h00_hi = vdupq_n_s64(0);
+ int64x2_t h11_lo = vdupq_n_s64(0);
+ int64x2_t h11_hi = vdupq_n_s64(0);
+ int64x2_t h01_lo = vdupq_n_s64(0);
+ int64x2_t h01_hi = vdupq_n_s64(0);
+ int64x2_t c0_lo = vdupq_n_s64(0);
+ int64x2_t c0_hi = vdupq_n_s64(0);
+ int64x2_t c1_lo = vdupq_n_s64(0);
+ int64x2_t c1_hi = vdupq_n_s64(0);
+
+ do {
+ const uint16_t *src_ptr = src;
+ const uint16_t *dat_ptr = dat;
+ int32_t *flt0_ptr = flt0;
+ int32_t *flt1_ptr = flt1;
+ int w = width;
+
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr);
+ uint16x8_t d = vld1q_u16(dat_ptr);
+ int32x4_t f0_lo = vld1q_s32(flt0_ptr);
+ int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);
+ int32x4_t f1_lo = vld1q_s32(flt1_ptr);
+ int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
+
+ int32x4_t u_lo =
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t u_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t s_lo =
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS));
+ int32x4_t s_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS));
+ s_lo = vsubq_s32(s_lo, u_lo);
+ s_hi = vsubq_s32(s_hi, u_hi);
+
+ f0_lo = vsubq_s32(f0_lo, u_lo);
+ f0_hi = vsubq_s32(f0_hi, u_hi);
+ f1_lo = vsubq_s32(f1_lo, u_lo);
+ f1_hi = vsubq_s32(f1_hi, u_hi);
+
+ h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
+ h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
+ h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
+ h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
+
+ h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
+ h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
+ h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
+ h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
+
+ h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo));
+ h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo));
+ h01_hi = vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi));
+ h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi));
+
+ c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
+ c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
+ c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
+ c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
+
+ c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
+ c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
+ c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
+ c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
+
+ src_ptr += 8;
+ dat_ptr += 8;
+ flt0_ptr += 8;
+ flt1_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src += src_stride;
+ dat += dat_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ } while (--height != 0);
+
+ H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
+ H[0][1] = horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size;
+ H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
+ H[1][0] = H[0][1];
+ C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
+ C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
+}
+
+static INLINE void highbd_calc_proj_params_r0_neon(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ assert(width % 8 == 0);
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+
+ int64x2_t h00_lo = vdupq_n_s64(0);
+ int64x2_t h00_hi = vdupq_n_s64(0);
+ int64x2_t c0_lo = vdupq_n_s64(0);
+ int64x2_t c0_hi = vdupq_n_s64(0);
+
+ do {
+ const uint16_t *src_ptr = src;
+ const uint16_t *dat_ptr = dat;
+ int32_t *flt0_ptr = flt0;
+ int w = width;
+
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr);
+ uint16x8_t d = vld1q_u16(dat_ptr);
+ int32x4_t f0_lo = vld1q_s32(flt0_ptr);
+ int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);
+
+ int32x4_t u_lo =
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t u_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t s_lo =
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS));
+ int32x4_t s_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS));
+ s_lo = vsubq_s32(s_lo, u_lo);
+ s_hi = vsubq_s32(s_hi, u_hi);
+
+ f0_lo = vsubq_s32(f0_lo, u_lo);
+ f0_hi = vsubq_s32(f0_hi, u_hi);
+
+ h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
+ h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
+ h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
+ h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
+
+ c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
+ c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
+ c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
+ c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
+
+ src_ptr += 8;
+ dat_ptr += 8;
+ flt0_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src += src_stride;
+ dat += dat_stride;
+ flt0 += flt0_stride;
+ } while (--height != 0);
+
+ H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
+ C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
+}
+
+static INLINE void highbd_calc_proj_params_r1_neon(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ assert(width % 8 == 0);
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+
+ int64x2_t h11_lo = vdupq_n_s64(0);
+ int64x2_t h11_hi = vdupq_n_s64(0);
+ int64x2_t c1_lo = vdupq_n_s64(0);
+ int64x2_t c1_hi = vdupq_n_s64(0);
+
+ do {
+ const uint16_t *src_ptr = src;
+ const uint16_t *dat_ptr = dat;
+ int32_t *flt1_ptr = flt1;
+ int w = width;
+
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr);
+ uint16x8_t d = vld1q_u16(dat_ptr);
+ int32x4_t f1_lo = vld1q_s32(flt1_ptr);
+ int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
+
+ int32x4_t u_lo =
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t u_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t s_lo =
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS));
+ int32x4_t s_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS));
+ s_lo = vsubq_s32(s_lo, u_lo);
+ s_hi = vsubq_s32(s_hi, u_hi);
+
+ f1_lo = vsubq_s32(f1_lo, u_lo);
+ f1_hi = vsubq_s32(f1_hi, u_hi);
+
+ h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
+ h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
+ h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
+ h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
+
+ c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
+ c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
+ c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
+ c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
+
+ src_ptr += 8;
+ dat_ptr += 8;
+ flt1_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src += src_stride;
+ dat += dat_stride;
+ flt1 += flt1_stride;
+ } while (--height != 0);
+
+ H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
+ C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
+}
+
+// This function calls 3 subfunctions for the following cases (a scalar sketch
+// of the accumulated quantities follows the function below):
+// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements
+// of C and H need to be computed.
+// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+// non-zero and need to be computed.
+// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+// non-zero and need to be computed.
+void av1_calc_proj_params_high_bd_neon(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2],
+ const sgr_params_type *params) {
+ if ((params->r[0] > 0) && (params->r[1] > 0)) {
+ highbd_calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C);
+ } else if (params->r[0] > 0) {
+ highbd_calc_proj_params_r0_neon(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, H, C);
+ } else if (params->r[1] > 0) {
+ highbd_calc_proj_params_r1_neon(src8, width, height, src_stride, dat8,
+ dat_stride, flt1, flt1_stride, H, C);
+ }
+}
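As a reading aid, the quantities all three paths accumulate reduce to the following scalar sketch of the r[0] > 0 && r[1] > 0 case (an illustration under the same SGRPROJ_RST_BITS scaling as the vector code above, not the library's reference implementation):

static void calc_proj_params_r0_r1_sketch(const uint16_t *src, int width,
                                          int height, int src_stride,
                                          const uint16_t *dat, int dat_stride,
                                          const int32_t *flt0, int flt0_stride,
                                          const int32_t *flt1, int flt1_stride,
                                          int64_t H[2][2], int64_t C[2]) {
  const int size = width * height;
  int64_t h00 = 0, h01 = 0, h11 = 0, c0 = 0, c1 = 0;
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      // Upscale the reconstruction and source to the filter precision.
      const int32_t u = (int32_t)dat[i * dat_stride + j] << SGRPROJ_RST_BITS;
      const int32_t s =
          ((int32_t)src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
      const int32_t f0 = flt0[i * flt0_stride + j] - u;
      const int32_t f1 = flt1[i * flt1_stride + j] - u;
      h00 += (int64_t)f0 * f0;
      h11 += (int64_t)f1 * f1;
      h01 += (int64_t)f0 * f1;
      c0 += (int64_t)f0 * s;
      c1 += (int64_t)f1 * s;
    }
  }
  H[0][0] = h00 / size;
  H[0][1] = h01 / size;
  H[1][1] = h11 / size;
  H[1][0] = H[0][1];
  C[0] = c0 / size;
  C[1] = c1 / size;
}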
+
+static int16_t highbd_find_average_neon(const int16_t *src, int src_stride,
+ int width, int height) {
+ assert(width > 0);
+ assert(height > 0);
+
+ int64x2_t sum_s64 = vdupq_n_s64(0);
+ int64_t sum = 0;
+
+ int h = height;
+ do {
+ int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ int w = width;
+ const int16_t *row = src;
+ while (w >= 32) {
+ int16x8_t s0 = vld1q_s16(row + 0);
+ int16x8_t s1 = vld1q_s16(row + 8);
+ int16x8_t s2 = vld1q_s16(row + 16);
+ int16x8_t s3 = vld1q_s16(row + 24);
+
+ s0 = vaddq_s16(s0, s1);
+ s2 = vaddq_s16(s2, s3);
+ sum_s32[0] = vpadalq_s16(sum_s32[0], s0);
+ sum_s32[1] = vpadalq_s16(sum_s32[1], s2);
+
+ row += 32;
+ w -= 32;
+ }
+
+ if (w >= 16) {
+ int16x8_t s0 = vld1q_s16(row + 0);
+ int16x8_t s1 = vld1q_s16(row + 8);
+
+ s0 = vaddq_s16(s0, s1);
+ sum_s32[0] = vpadalq_s16(sum_s32[0], s0);
+
+ row += 16;
+ w -= 16;
+ }
+
+ if (w >= 8) {
+ int16x8_t s0 = vld1q_s16(row);
+ sum_s32[1] = vpadalq_s16(sum_s32[1], s0);
+
+ row += 8;
+ w -= 8;
+ }
+
+ if (w >= 4) {
+ int16x8_t s0 = vcombine_s16(vld1_s16(row), vdup_n_s16(0));
+ sum_s32[0] = vpadalq_s16(sum_s32[0], s0);
+
+ row += 4;
+ w -= 4;
+ }
+
+ while (w-- > 0) {
+ sum += *row++;
+ }
+
+ sum_s64 = vpadalq_s32(sum_s64, vaddq_s32(sum_s32[0], sum_s32[1]));
+
+ src += src_stride;
+ } while (--h != 0);
+ return (int16_t)((horizontal_add_s64x2(sum_s64) + sum) / (height * width));
+}
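The widening ladder above (16-bit loads, 32-bit partial sums, 64-bit total) computes nothing more than this scalar model:

// Scalar model of highbd_find_average_neon (a sketch for clarity).
static int16_t highbd_find_average_sketch(const int16_t *src, int src_stride,
                                          int width, int height) {
  int64_t sum = 0;
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) sum += src[i * src_stride + j];
  }
  return (int16_t)(sum / (height * width));
}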
+
+static INLINE void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H,
+ const int wiener_win,
+ const int wiener_win2) {
+ for (int row0 = 0; row0 < wiener_win; row0++) {
+ for (int row1 = row0; row1 < wiener_win; row1++) {
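+      // auto_cov_idx addresses element (row0, row1) of the upper triangle of
+      // the wiener_win x wiener_win block on H's diagonal for this column.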
+ int auto_cov_idx =
+ (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1;
+
+ int32x4_t auto_cov =
+ vmull_s16(vget_low_s16(dgd[row0]), vget_low_s16(dgd[row1]));
+ auto_cov = vmlal_s16(auto_cov, vget_high_s16(dgd[row0]),
+ vget_high_s16(dgd[row1]));
+
+ H[auto_cov_idx] += horizontal_long_add_s32x4(auto_cov);
+ }
+ }
+}
+
+// This function computes two matrices: the cross-correlation between the src
+// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
+//
+// M is of size 7 * 7. It needs to be filled such that multiplying one element
+// from src with each element of a row of the wiener window will fill one
+// column of M. However this is not very convenient in terms of memory
+// accesses, as it means we do contiguous loads of dgd but strided stores to M.
+// As a result, we use an intermediate matrix M_trn which is instead filled
+// such that one row of the wiener window gives one row of M_trn. Once fully
+// computed, M_trn is then transposed to return M.
+//
+// H is of size 49 * 49. It is filled by multiplying every pair of elements of
+// the wiener window together. Since it is a symmetric matrix, we only compute
+// the upper triangle, and then copy it down to the lower one. Here we fill it
+// by taking each different pair of columns, and multiplying all the elements of
+// the first one with all the elements of the second one, with a special case
+// when multiplying a column by itself. (A scalar sketch of these statistics
+// follows the function below.)
+static INLINE void highbd_compute_stats_win7_neon(
+ const int16_t *dgd, int dgd_stride, const int16_t *src, int src_stride,
+ int width, int height, int64_t *M, int64_t *H, int16_t avg, int bit_depth) {
+ const int wiener_win = 7;
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int16x8_t mask = vld1q_s16(&av1_neon_mask_16bit[8] - (width % 8));
+
+ // We use an intermediate matrix that will be transposed to get M.
+ int64_t M_trn[49];
+ memset(M_trn, 0, sizeof(M_trn));
+
+ int16x8_t vavg = vdupq_n_s16(avg);
+ do {
+ // Cross-correlation (M).
+ for (int row = 0; row < wiener_win; row++) {
+ int16x8_t dgd0 = vsubq_s16(vld1q_s16(dgd + row * dgd_stride), vavg);
+ int j = 0;
+ while (j <= width - 8) {
+ int16x8_t dgd1 =
+ vsubq_s16(vld1q_s16(dgd + row * dgd_stride + j + 8), vavg);
+ int16x8_t s = vsubq_s16(vld1q_s16(src + j), vavg);
+
+ // Compute all the elements of one row of M.
+ compute_M_one_row_win7(s, dgd0, dgd1, M_trn, wiener_win, row);
+
+ dgd0 = dgd1;
+ j += 8;
+ }
+ // Process remaining elements without Neon.
+ while (j < width) {
+ int16_t s = src[j] - avg;
+ int16_t d0 = dgd[row * dgd_stride + 0 + j] - avg;
+ int16_t d1 = dgd[row * dgd_stride + 1 + j] - avg;
+ int16_t d2 = dgd[row * dgd_stride + 2 + j] - avg;
+ int16_t d3 = dgd[row * dgd_stride + 3 + j] - avg;
+ int16_t d4 = dgd[row * dgd_stride + 4 + j] - avg;
+ int16_t d5 = dgd[row * dgd_stride + 5 + j] - avg;
+ int16_t d6 = dgd[row * dgd_stride + 6 + j] - avg;
+
+ M_trn[row * wiener_win + 0] += d0 * s;
+ M_trn[row * wiener_win + 1] += d1 * s;
+ M_trn[row * wiener_win + 2] += d2 * s;
+ M_trn[row * wiener_win + 3] += d3 * s;
+ M_trn[row * wiener_win + 4] += d4 * s;
+ M_trn[row * wiener_win + 5] += d5 * s;
+ M_trn[row * wiener_win + 6] += d6 * s;
+
+ j++;
+ }
+ }
+
+ // Auto-covariance (H).
+ int j = 0;
+ while (j <= width - 8) {
+ for (int col0 = 0; col0 < wiener_win; col0++) {
+ int16x8_t dgd0[7];
+ dgd0[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col0), vavg);
+ dgd0[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col0), vavg);
+ dgd0[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col0), vavg);
+ dgd0[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col0), vavg);
+ dgd0[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col0), vavg);
+ dgd0[5] = vsubq_s16(vld1q_s16(dgd + 5 * dgd_stride + j + col0), vavg);
+ dgd0[6] = vsubq_s16(vld1q_s16(dgd + 6 * dgd_stride + j + col0), vavg);
+
+ // Perform computation of the first column with itself (28 elements).
+ // For the first column this will fill the upper triangle of the 7x7
+ // matrix at the top left of the H matrix. For the next columns this
+ // will fill the upper triangle of the other 7x7 matrices around H's
+ // diagonal.
+ compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2);
+
+ // All computation next to the matrix diagonal has already been done.
+ for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+ // Load second column.
+ int16x8_t dgd1[7];
+ dgd1[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col1), vavg);
+ dgd1[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col1), vavg);
+ dgd1[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col1), vavg);
+ dgd1[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col1), vavg);
+ dgd1[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col1), vavg);
+ dgd1[5] = vsubq_s16(vld1q_s16(dgd + 5 * dgd_stride + j + col1), vavg);
+ dgd1[6] = vsubq_s16(vld1q_s16(dgd + 6 * dgd_stride + j + col1), vavg);
+
+ // Compute all elements from the combination of both columns (49
+ // elements).
+ compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+ wiener_win2);
+ }
+ }
+ j += 8;
+ }
+
+ if (j < width) {
+ // Process remaining columns using a mask to discard excess elements.
+ for (int col0 = 0; col0 < wiener_win; col0++) {
+ // Load first column.
+ int16x8_t dgd0[7];
+ dgd0[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col0), vavg);
+ dgd0[0] = vandq_s16(dgd0[0], mask);
+ dgd0[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col0), vavg);
+ dgd0[1] = vandq_s16(dgd0[1], mask);
+ dgd0[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col0), vavg);
+ dgd0[2] = vandq_s16(dgd0[2], mask);
+ dgd0[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col0), vavg);
+ dgd0[3] = vandq_s16(dgd0[3], mask);
+ dgd0[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col0), vavg);
+ dgd0[4] = vandq_s16(dgd0[4], mask);
+ dgd0[5] = vsubq_s16(vld1q_s16(dgd + 5 * dgd_stride + j + col0), vavg);
+ dgd0[5] = vandq_s16(dgd0[5], mask);
+ dgd0[6] = vsubq_s16(vld1q_s16(dgd + 6 * dgd_stride + j + col0), vavg);
+ dgd0[6] = vandq_s16(dgd0[6], mask);
+
+ // Perform computation of the first column with itself (28 elements).
+ // For the first column this will fill the upper triangle of the 7x7
+ // matrix at the top left of the H matrix. For the next columns this
+ // will fill the upper triangle of the other 7x7 matrices around H's
+ // diagonal.
+ compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2);
+
+ // All computation next to the matrix diagonal has already been done.
+ for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+ // Load second column.
+ int16x8_t dgd1[7];
+ dgd1[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col1), vavg);
+ dgd1[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col1), vavg);
+ dgd1[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col1), vavg);
+ dgd1[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col1), vavg);
+ dgd1[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col1), vavg);
+ dgd1[5] = vsubq_s16(vld1q_s16(dgd + 5 * dgd_stride + j + col1), vavg);
+ dgd1[6] = vsubq_s16(vld1q_s16(dgd + 6 * dgd_stride + j + col1), vavg);
+
+ // Compute all elements from the combination of both columns (49
+ // elements).
+ compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+ wiener_win2);
+ }
+ }
+ }
+ dgd += dgd_stride;
+ src += src_stride;
+ } while (--height != 0);
+
+ // Transpose M_trn.
+ transpose_M_win7(M, M_trn, 7);
+
+  // Copy the upper triangle of H into the lower one.
+ copy_upper_triangle(H, wiener_win2);
+
+ // Scaling the results.
+ uint8_t bit_depth_divider = 1;
+ if (bit_depth == AOM_BITS_12) {
+ bit_depth_divider = 16;
+ } else if (bit_depth == AOM_BITS_10) {
+ bit_depth_divider = 4;
+ }
+
+ for (int i = 0; i < wiener_win2; ++i) {
+ M[i] /= bit_depth_divider;
+ for (int j = 0; j < wiener_win2; ++j) {
+ H[i * wiener_win2 + j] /= bit_depth_divider;
+ }
+ }
+}
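For orientation, both this function and the win5 variant below accumulate the following scalar statistics (a sketch only; tap index p = col * wiener_win + row matches the layout of M and H after the transpose and triangle-copy steps, and the bit-depth scaling at the end is omitted):

static void compute_stats_sketch(const int16_t *dgd, int dgd_stride,
                                 const int16_t *src, int src_stride, int width,
                                 int height, int64_t *M, int64_t *H,
                                 int16_t avg, int wiener_win) {
  const int wiener_win2 = wiener_win * wiener_win;
  for (int i = 0; i < height; i++) {
    for (int j = 0; j < width; j++) {
      const int64_t s = src[i * src_stride + j] - avg;
      int64_t w[49];  // Window taps; 49 covers wiener_win <= 7.
      for (int col = 0; col < wiener_win; col++) {
        for (int row = 0; row < wiener_win; row++) {
          w[col * wiener_win + row] =
              dgd[(i + row) * dgd_stride + j + col] - avg;
        }
      }
      for (int p = 0; p < wiener_win2; p++) {
        M[p] += w[p] * s;  // Cross-correlation with the source pixel.
        for (int q = 0; q < wiener_win2; q++) {
          H[p * wiener_win2 + q] += w[p] * w[q];  // Auto-covariance.
        }
      }
    }
  }
}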
+
+// This function computes two matrices: the cross-correlation between the src
+// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
+//
+// M is of size 5 * 5. It needs to be filled such that multiplying one element
+// from src with each element of a row of the wiener window will fill one
+// column of M. However this is not very convenient in terms of memory
+// accesses, as it means we do contiguous loads of dgd but strided stores to M.
+// As a result, we use an intermediate matrix M_trn which is instead filled
+// such that one row of the wiener window gives one row of M_trn. Once fully
+// computed, M_trn is then transposed to return M.
+//
+// H is of size 25 * 25. It is filled by multiplying every pair of elements of
+// the wiener window together. Since it is a symmetric matrix, we only compute
+// the upper triangle, and then copy it down to the lower one. Here we fill it
+// by taking each different pair of columns, and multiplying all the elements of
+// the first one with all the elements of the second one, with a special case
+// when multiplying a column by itself.
+static INLINE void highbd_compute_stats_win5_neon(
+ const int16_t *dgd, int dgd_stride, const int16_t *src, int src_stride,
+ int width, int height, int64_t *M, int64_t *H, int16_t avg, int bit_depth) {
+ const int wiener_win = 5;
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int16x8_t mask = vld1q_s16(&av1_neon_mask_16bit[8] - (width % 8));
+
+ // We use an intermediate matrix that will be transposed to get M.
+ int64_t M_trn[25];
+ memset(M_trn, 0, sizeof(M_trn));
+
+ int16x8_t vavg = vdupq_n_s16(avg);
+ do {
+ // Cross-correlation (M).
+ for (int row = 0; row < wiener_win; row++) {
+ int16x8_t dgd0 = vsubq_s16(vld1q_s16(dgd + row * dgd_stride), vavg);
+ int j = 0;
+ while (j <= width - 8) {
+ int16x8_t dgd1 =
+ vsubq_s16(vld1q_s16(dgd + row * dgd_stride + j + 8), vavg);
+ int16x8_t s = vsubq_s16(vld1q_s16(src + j), vavg);
+
+ // Compute all the elements of one row of M.
+ compute_M_one_row_win5(s, dgd0, dgd1, M_trn, wiener_win, row);
+
+ dgd0 = dgd1;
+ j += 8;
+ }
+ // Process remaining elements without Neon.
+ while (j < width) {
+ int16_t s = src[j] - avg;
+ int16_t d0 = dgd[row * dgd_stride + 0 + j] - avg;
+ int16_t d1 = dgd[row * dgd_stride + 1 + j] - avg;
+ int16_t d2 = dgd[row * dgd_stride + 2 + j] - avg;
+ int16_t d3 = dgd[row * dgd_stride + 3 + j] - avg;
+ int16_t d4 = dgd[row * dgd_stride + 4 + j] - avg;
+
+ M_trn[row * wiener_win + 0] += d0 * s;
+ M_trn[row * wiener_win + 1] += d1 * s;
+ M_trn[row * wiener_win + 2] += d2 * s;
+ M_trn[row * wiener_win + 3] += d3 * s;
+ M_trn[row * wiener_win + 4] += d4 * s;
+
+ j++;
+ }
+ }
+
+ // Auto-covariance (H).
+ int j = 0;
+ while (j <= width - 8) {
+ for (int col0 = 0; col0 < wiener_win; col0++) {
+ // Load first column.
+ int16x8_t dgd0[5];
+ dgd0[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col0), vavg);
+ dgd0[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col0), vavg);
+ dgd0[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col0), vavg);
+ dgd0[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col0), vavg);
+ dgd0[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col0), vavg);
+
+ // Perform computation of the first column with itself (15 elements).
+ // For the first column this will fill the upper triangle of the 5x5
+ // matrix at the top left of the H matrix. For the next columns this
+ // will fill the upper triangle of the other 5x5 matrices around H's
+ // diagonal.
+ compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2);
+
+ // All computation next to the matrix diagonal has already been done.
+ for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+ // Load second column.
+ int16x8_t dgd1[5];
+ dgd1[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col1), vavg);
+ dgd1[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col1), vavg);
+ dgd1[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col1), vavg);
+ dgd1[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col1), vavg);
+ dgd1[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col1), vavg);
+
+ // Compute all elements from the combination of both columns (25
+ // elements).
+ compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+ wiener_win2);
+ }
+ }
+ j += 8;
+ }
+
+ if (j < width) {
+ // Process remaining columns using a mask to discard excess elements.
+ for (int col0 = 0; col0 < wiener_win; col0++) {
+ // Load first column.
+ int16x8_t dgd0[5];
+ dgd0[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col0), vavg);
+ dgd0[0] = vandq_s16(dgd0[0], mask);
+ dgd0[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col0), vavg);
+ dgd0[1] = vandq_s16(dgd0[1], mask);
+ dgd0[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col0), vavg);
+ dgd0[2] = vandq_s16(dgd0[2], mask);
+ dgd0[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col0), vavg);
+ dgd0[3] = vandq_s16(dgd0[3], mask);
+ dgd0[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col0), vavg);
+ dgd0[4] = vandq_s16(dgd0[4], mask);
+
+ // Perform computation of the first column with itself (15 elements).
+ // For the first column this will fill the upper triangle of the 5x5
+ // matrix at the top left of the H matrix. For the next columns this
+ // will fill the upper triangle of the other 5x5 matrices around H's
+ // diagonal.
+ compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2);
+
+ // All computation next to the matrix diagonal has already been done.
+ for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+ // Load second column.
+ int16x8_t dgd1[5];
+ dgd1[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col1), vavg);
+ dgd1[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col1), vavg);
+ dgd1[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col1), vavg);
+ dgd1[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col1), vavg);
+ dgd1[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col1), vavg);
+
+ // Compute all elements from the combination of both columns (25
+ // elements).
+ compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+ wiener_win2);
+ }
+ }
+ }
+ dgd += dgd_stride;
+ src += src_stride;
+ } while (--height != 0);
+
+ // Transpose M_trn.
+ transpose_M_win5(M, M_trn, 5);
+
+  // Copy the upper triangle of H into the lower one.
+ copy_upper_triangle(H, wiener_win2);
+
+ // Scaling the results.
+ uint8_t bit_depth_divider = 1;
+ if (bit_depth == AOM_BITS_12) {
+ bit_depth_divider = 16;
+ } else if (bit_depth == AOM_BITS_10) {
+ bit_depth_divider = 4;
+ }
+
+ for (int i = 0; i < wiener_win2; ++i) {
+ M[i] /= bit_depth_divider;
+ for (int j = 0; j < wiener_win2; ++j) {
+ H[i * wiener_win2 + j] /= bit_depth_divider;
+ }
+ }
+}
+
+void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8,
+ const uint8_t *src8, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ aom_bit_depth_t bit_depth) {
+ assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_REDUCED);
+
+ const int wiener_halfwin = wiener_win >> 1;
+ const int wiener_win2 = wiener_win * wiener_win;
+ memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
+
+ const int16_t *src = (const int16_t *)CONVERT_TO_SHORTPTR(src8);
+ const int16_t *dgd = (const int16_t *)CONVERT_TO_SHORTPTR(dgd8);
+ const int height = v_end - v_start;
+ const int width = h_end - h_start;
+ const int vert_offset = v_start - wiener_halfwin;
+ const int horiz_offset = h_start - wiener_halfwin;
+
+ int16_t avg = highbd_find_average_neon(dgd + v_start * dgd_stride + h_start,
+ dgd_stride, width, height);
+
+ src += v_start * src_stride + h_start;
+ dgd += vert_offset * dgd_stride + horiz_offset;
+
+ if (wiener_win == WIENER_WIN) {
+ highbd_compute_stats_win7_neon(dgd, dgd_stride, src, src_stride, width,
+ height, M, H, avg, bit_depth);
+ } else {
+ highbd_compute_stats_win5_neon(dgd, dgd_stride, src, src_stride, width,
+ height, M, H, avg, bit_depth);
+ }
+}
diff --git a/av1/encoder/arm/neon/highbd_rdopt_neon.c b/av1/encoder/arm/neon/highbd_rdopt_neon.c
new file mode 100644
index 000000000..4bf7ae6ce
--- /dev/null
+++ b/av1/encoder/arm/neon/highbd_rdopt_neon.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+int64_t av1_highbd_block_error_neon(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz, int bd) {
+ uint64x2_t err_u64 = vdupq_n_u64(0);
+ int64x2_t ssz_s64 = vdupq_n_s64(0);
+
+ const int shift = 2 * (bd - 8);
+ const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
+
+ do {
+ const int32x4_t c = vld1q_s32(coeff);
+ const int32x4_t d = vld1q_s32(dqcoeff);
+
+ const uint32x4_t diff = vreinterpretq_u32_s32(vabdq_s32(c, d));
+
+ err_u64 = vmlal_u32(err_u64, vget_low_u32(diff), vget_low_u32(diff));
+ err_u64 = vmlal_u32(err_u64, vget_high_u32(diff), vget_high_u32(diff));
+
+ ssz_s64 = vmlal_s32(ssz_s64, vget_low_s32(c), vget_low_s32(c));
+ ssz_s64 = vmlal_s32(ssz_s64, vget_high_s32(c), vget_high_s32(c));
+
+ coeff += 4;
+ dqcoeff += 4;
+ block_size -= 4;
+ } while (block_size != 0);
+
+ *ssz = (horizontal_add_s64x2(ssz_s64) + rounding) >> shift;
+ return ((int64_t)horizontal_add_u64x2(err_u64) + rounding) >> shift;
+}
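The unsigned accumulator for the error term is safe because vabdq_s32 yields a non-negative difference; both accumulators reduce to this scalar model (a sketch mirroring the rounding and bit-depth normalisation above):

static int64_t highbd_block_error_sketch(const tran_low_t *coeff,
                                         const tran_low_t *dqcoeff,
                                         intptr_t block_size, int64_t *ssz,
                                         int bd) {
  const int shift = 2 * (bd - 8);
  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
  int64_t err = 0, sqcoeff = 0;
  for (intptr_t i = 0; i < block_size; i++) {
    const int64_t diff = (int64_t)coeff[i] - dqcoeff[i];
    err += diff * diff;                       // Reconstruction error.
    sqcoeff += (int64_t)coeff[i] * coeff[i];  // Energy of the coefficients.
  }
  *ssz = (sqcoeff + rounding) >> shift;
  return (err + rounding) >> shift;
}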
diff --git a/av1/encoder/arm/neon/highbd_temporal_filter_neon.c b/av1/encoder/arm/neon/highbd_temporal_filter_neon.c
new file mode 100644
index 000000000..88e176f56
--- /dev/null
+++ b/av1/encoder/arm/neon/highbd_temporal_filter_neon.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void get_squared_error(
+ const uint16_t *frame1, const uint32_t stride1, const uint16_t *frame2,
+ const uint32_t stride2, const uint32_t block_width,
+ const uint32_t block_height, uint32_t *frame_sse,
+ const unsigned int dst_stride) {
+ uint32_t *dst = frame_sse;
+
+ uint32_t i = 0;
+ do {
+ uint32_t j = 0;
+ do {
+ uint16x8_t s = vld1q_u16(frame1 + i * stride1 + j);
+ uint16x8_t r = vld1q_u16(frame2 + i * stride2 + j);
+
+ uint16x8_t abs_diff = vabdq_u16(s, r);
+ uint32x4_t sse_lo =
+ vmull_u16(vget_low_u16(abs_diff), vget_low_u16(abs_diff));
+ uint32x4_t sse_hi =
+ vmull_u16(vget_high_u16(abs_diff), vget_high_u16(abs_diff));
+
+ vst1q_u32(dst + j, sse_lo);
+ vst1q_u32(dst + j + 4, sse_hi);
+
+ j += 8;
+ } while (j < block_width);
+
+ dst += dst_stride;
+ i++;
+ } while (i < block_height);
+}
+
+static uint32_t sum_kernel5x5_mask_single(const uint32x4_t vsrc[5][2],
+ const uint32x4_t mask_single) {
+ uint32x4_t vsums = vmulq_u32(vsrc[0][0], mask_single);
+ vsums = vmlaq_u32(vsums, vsrc[1][0], mask_single);
+ vsums = vmlaq_u32(vsums, vsrc[2][0], mask_single);
+ vsums = vmlaq_u32(vsums, vsrc[3][0], mask_single);
+ vsums = vmlaq_u32(vsums, vsrc[4][0], mask_single);
+ return horizontal_add_u32x4(vsums);
+}
+
+static uint32x4_t sum_kernel5x5_mask_double(const uint32x4_t vsrc[5][2],
+ const uint32x4_t mask1,
+ const uint32x4_t mask2) {
+ uint32x4_t vsums = vmulq_u32(vsrc[0][0], mask1);
+ vsums = vmlaq_u32(vsums, vsrc[1][0], mask1);
+ vsums = vmlaq_u32(vsums, vsrc[2][0], mask1);
+ vsums = vmlaq_u32(vsums, vsrc[3][0], mask1);
+ vsums = vmlaq_u32(vsums, vsrc[4][0], mask1);
+ vsums = vmlaq_u32(vsums, vsrc[0][1], mask2);
+ vsums = vmlaq_u32(vsums, vsrc[1][1], mask2);
+ vsums = vmlaq_u32(vsums, vsrc[2][1], mask2);
+ vsums = vmlaq_u32(vsums, vsrc[3][1], mask2);
+ vsums = vmlaq_u32(vsums, vsrc[4][1], mask2);
+ return vsums;
+}
+
+static void highbd_apply_temporal_filter(
+ const uint16_t *frame, const unsigned int stride,
+ const uint32_t block_width, const uint32_t block_height,
+ const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+ const uint32_t *frame_sse, const uint32_t frame_sse_stride,
+ const uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
+ const double decay_factor, const double inv_factor,
+ const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl,
+ int bd) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_neon[BH][BW] = { 0 };
+ const int half_window = TF_WINDOW_LENGTH >> 1;
+
+ uint32x4_t vsrc[5][2] = { 0 };
+ const uint32x4_t k0000 = vdupq_n_u32(0);
+ const uint32x4_t k1111 = vdupq_n_u32(1);
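+  // The constant names list lanes 3 down to 0, so k0113 loads { 3, 1, 1, 0 }.
+  // The 2 and 3 weights fold the clamped (edge-replicated) taps of the 5-tap
+  // horizontal window into a single column at the block borders.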
+ const uint32_t k3110_u32[4] = { 0, 1, 1, 3 };
+ const uint32_t k2111_u32[4] = { 1, 1, 1, 2 };
+ const uint32_t k1112_u32[4] = { 2, 1, 1, 1 };
+ const uint32_t k0113_u32[4] = { 3, 1, 1, 0 };
+ const uint32x4_t k3110 = vld1q_u32(k3110_u32);
+ const uint32x4_t k2111 = vld1q_u32(k2111_u32);
+ const uint32x4_t k1112 = vld1q_u32(k1112_u32);
+ const uint32x4_t k0113 = vld1q_u32(k0113_u32);
+
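+  // vmask1[i] / vmask2[i] together select the 5 consecutive sse values
+  // starting at lane offset i across two adjacent 4-lane vectors, giving the
+  // horizontal part of the 5x5 box sum for one output lane.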
+ uint32x4_t vmask1[4], vmask2[4];
+ vmask1[0] = k1111;
+ vmask2[0] = vextq_u32(k1111, k0000, 3);
+ vmask1[1] = vextq_u32(k0000, k1111, 3);
+ vmask2[1] = vextq_u32(k1111, k0000, 2);
+ vmask1[2] = vextq_u32(k0000, k1111, 2);
+ vmask2[2] = vextq_u32(k1111, k0000, 1);
+ vmask1[3] = vextq_u32(k0000, k1111, 1);
+ vmask2[3] = k1111;
+
+ uint32_t row = 0;
+ do {
+ uint32_t col = 0;
+ const uint32_t *src = frame_sse + row * frame_sse_stride;
+ if (row == 0) {
+ vsrc[2][0] = vld1q_u32(src);
+ vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride);
+
+ // First 2 rows of the 5x5 matrix are padded from the 1st.
+ vsrc[0][0] = vsrc[2][0];
+ vsrc[1][0] = vsrc[2][0];
+ } else if (row == 1) {
+ vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][0] = vld1q_u32(src);
+ vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride);
+
+      // First row of the 5x5 matrix is padded from the 1st.
+ vsrc[0][0] = vsrc[1][0];
+ } else if (row == block_height - 2) {
+ vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][0] = vld1q_u32(src);
+ vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+
+      // Last row of the 5x5 matrix is padded from the one before.
+ vsrc[4][0] = vsrc[3][0];
+ } else if (row == block_height - 1) {
+ vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][0] = vld1q_u32(src);
+
+ // Last 2 rows of the 5x5 matrix are padded from the 3rd.
+ vsrc[3][0] = vsrc[2][0];
+ vsrc[4][0] = vsrc[2][0];
+ } else {
+ vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][0] = vld1q_u32(src);
+ vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride);
+ }
+
+ acc_5x5_neon[row][0] = sum_kernel5x5_mask_single(vsrc, k0113);
+ acc_5x5_neon[row][1] = sum_kernel5x5_mask_single(vsrc, k1112);
+
+ col += 4;
+ src += 4;
+ // Traverse 4 columns at a time
+ do {
+ if (row == 0) {
+ vsrc[2][1] = vld1q_u32(src);
+ vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride);
+
+ // First 2 rows of the 5x5 matrix are padded from the 1st.
+ vsrc[0][1] = vsrc[2][1];
+ vsrc[1][1] = vsrc[2][1];
+ } else if (row == 1) {
+ vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][1] = vld1q_u32(src);
+ vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride);
+
+        // First row of the 5x5 matrix is padded from the 1st.
+ vsrc[0][1] = vsrc[1][1];
+ } else if (row == block_height - 2) {
+ vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][1] = vld1q_u32(src);
+ vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+
+        // Last row of the 5x5 matrix is padded from the one before.
+ vsrc[4][1] = vsrc[3][1];
+ } else if (row == block_height - 1) {
+ vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][1] = vld1q_u32(src);
+
+ // Last 2 rows of the 5x5 matrix are padded from the 3rd.
+ vsrc[3][1] = vsrc[2][1];
+ vsrc[4][1] = vsrc[2][1];
+ } else {
+ vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][1] = vld1q_u32(src);
+ vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride);
+ }
+
+ uint32x4_t sums[4];
+ sums[0] = sum_kernel5x5_mask_double(vsrc, vmask1[0], vmask2[0]);
+ sums[1] = sum_kernel5x5_mask_double(vsrc, vmask1[1], vmask2[1]);
+ sums[2] = sum_kernel5x5_mask_double(vsrc, vmask1[2], vmask2[2]);
+ sums[3] = sum_kernel5x5_mask_double(vsrc, vmask1[3], vmask2[3]);
+ vst1q_u32(&acc_5x5_neon[row][col - half_window],
+ horizontal_add_4d_u32x4(sums));
+
+ vsrc[0][0] = vsrc[0][1];
+ vsrc[1][0] = vsrc[1][1];
+ vsrc[2][0] = vsrc[2][1];
+ vsrc[3][0] = vsrc[3][1];
+ vsrc[4][0] = vsrc[4][1];
+
+ src += 4;
+ col += 4;
+ } while (col <= block_width - 4);
+
+ acc_5x5_neon[row][col - half_window] =
+ sum_kernel5x5_mask_single(vsrc, k2111);
+ acc_5x5_neon[row][col - half_window + 1] =
+ sum_kernel5x5_mask_single(vsrc, k3110);
+
+ row++;
+ } while (row < block_height);
+
+ // Perform filtering.
+ if (tf_wgt_calc_lvl == 0) {
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ // Scale down the difference for high bit depth input.
+ const uint32_t diff_sse =
+ (acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]) >> ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ } else {
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ // Scale down the difference for high bit depth input.
+ const uint32_t diff_sse =
+ (acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]) >> ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ }
+}
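Both filtering branches above compute the same per-pixel weight and differ only in evaluating exp() in double precision versus the faster approx_exp() path when tf_wgt_calc_lvl is non-zero:

$$w = \mathrm{TF\_WEIGHT\_SCALE} \cdot \exp(-\min(s, 7)), \qquad s = (\mathrm{weight\_factor} \cdot \mathrm{window\_error} + \mathrm{block\_error} \cdot \mathrm{inv\_factor}) \cdot \mathrm{d\_factor} \cdot \mathrm{decay\_factor}$$

The weight then scales the pixel value into accumulator[k] and is added to count[k], so the filtered pixel is later recovered as the weighted average accumulator[k] / count[k].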
+
+void av1_highbd_apply_temporal_filter_neon(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred8, uint32_t *accum,
+ uint16_t *count) {
+ const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+ assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+ (void)is_high_bitdepth;
+ assert(is_high_bitdepth);
+
+ // Block information.
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ // Frame information.
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ double d_factor[4] = { 0 };
+ uint32_t frame_sse[BW * BH] = { 0 };
+ uint32_t luma_sse_sum[BW * BH] = { 0 };
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+ }
+
+ // Handle planes in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const uint32_t frame_stride =
+ frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
+ const uint32_t frame_sse_stride = plane_w;
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+ const uint16_t *ref =
+ CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset;
+ const int ss_x_shift =
+ mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+ const int ss_y_shift =
+ mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ // Decay factors for non-local mean approach.
+ const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+ // Filter U-plane and V-plane using Y-plane. This is because motion
+ // search is only done on Y-plane, so the information from Y-plane
+ // will be more accurate. The luma sse sum is reused in both chroma
+ // planes.
+ if (plane == AOM_PLANE_U) {
+ for (unsigned int i = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ const int ww = frame_sse_stride
+ << ss_x_shift; // Width of Y-plane.
+ luma_sse_sum[i * BW + j] += frame_sse[yy * ww + xx];
+ }
+ }
+ }
+ }
+ }
+ get_squared_error(ref, frame_stride, pred + plane_offset, plane_w, plane_w,
+ plane_h, frame_sse, frame_sse_stride);
+
+ highbd_apply_temporal_filter(
+ pred + plane_offset, plane_w, plane_w, plane_h, subblock_mses,
+ accum + plane_offset, count + plane_offset, frame_sse, frame_sse_stride,
+ luma_sse_sum, inv_num_ref_pixels, decay_factor, inv_factor,
+ weight_factor, d_factor, tf_wgt_calc_lvl, mbd->bd);
+
+ plane_offset += plane_h * plane_w;
+ }
+}
+
+double av1_highbd_estimate_noise_from_single_plane_neon(const uint16_t *src,
+ int height, int width,
+ int stride,
+ int bitdepth,
+ int edge_thresh) {
+ uint16x8_t thresh = vdupq_n_u16(edge_thresh);
+ uint64x2_t acc = vdupq_n_u64(0);
+ // Count is in theory positive as it counts the number of times we're under
+ // the threshold, but it will be counted negatively in order to make best use
+ // of the vclt instruction, which sets every bit of a lane to 1 when the
+ // condition is true.
+ int32x4_t count = vdupq_n_s32(0);
+ int final_count = 0;
+ uint64_t final_acc = 0;
+ const uint16_t *src_start = src + stride + 1;
+ int h = 1;
+
+ do {
+ int w = 1;
+ const uint16_t *src_ptr = src_start;
+
+ while (w <= (width - 1) - 8) {
+ uint16x8_t mat[3][3];
+ mat[0][0] = vld1q_u16(src_ptr - stride - 1);
+ mat[0][1] = vld1q_u16(src_ptr - stride);
+ mat[0][2] = vld1q_u16(src_ptr - stride + 1);
+ mat[1][0] = vld1q_u16(src_ptr - 1);
+ mat[1][1] = vld1q_u16(src_ptr);
+ mat[1][2] = vld1q_u16(src_ptr + 1);
+ mat[2][0] = vld1q_u16(src_ptr + stride - 1);
+ mat[2][1] = vld1q_u16(src_ptr + stride);
+ mat[2][2] = vld1q_u16(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ uint16x8_t gxa = vaddq_u16(mat[0][0], mat[2][0]);
+ uint16x8_t gxb = vaddq_u16(mat[0][2], mat[2][2]);
+ gxa = vaddq_u16(gxa, vaddq_u16(mat[1][0], mat[1][0]));
+ gxb = vaddq_u16(gxb, vaddq_u16(mat[1][2], mat[1][2]));
+
+ uint16x8_t gya = vaddq_u16(mat[0][0], mat[0][2]);
+ uint16x8_t gyb = vaddq_u16(mat[2][0], mat[2][2]);
+ gya = vaddq_u16(gya, vaddq_u16(mat[0][1], mat[0][1]));
+ gyb = vaddq_u16(gyb, vaddq_u16(mat[2][1], mat[2][1]));
+
+ uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb);
+ ga = vrshlq_u16(ga, vdupq_n_s16(8 - bitdepth));
+
+      // Check which vector elements are under the threshold. The Laplacian is
+      // then unconditionally computed, and we accumulate zeros if we're not
+      // under the threshold. This is much faster than using an if statement.
+ uint16x8_t thresh_u16 = vcltq_u16(ga, thresh);
+
+ uint16x8_t center = vshlq_n_u16(mat[1][1], 2);
+
+ uint16x8_t adj0 = vaddq_u16(mat[0][1], mat[2][1]);
+ uint16x8_t adj1 = vaddq_u16(mat[1][0], mat[1][2]);
+ uint16x8_t adj = vaddq_u16(adj0, adj1);
+ adj = vaddq_u16(adj, adj);
+
+ uint16x8_t diag0 = vaddq_u16(mat[0][0], mat[0][2]);
+ uint16x8_t diag1 = vaddq_u16(mat[2][0], mat[2][2]);
+ uint16x8_t diag = vaddq_u16(diag0, diag1);
+
+ uint16x8_t v = vabdq_u16(vaddq_u16(center, diag), adj);
+ v = vandq_u16(vrshlq_u16(v, vdupq_n_s16(8 - bitdepth)), thresh_u16);
+ uint32x4_t v_u32 = vpaddlq_u16(v);
+
+ acc = vpadalq_u32(acc, v_u32);
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16));
+
+ w += 8;
+ src_ptr += 8;
+ }
+
+ if (w <= (width - 1) - 4) {
+ uint16x4_t mat[3][3];
+ mat[0][0] = vld1_u16(src_ptr - stride - 1);
+ mat[0][1] = vld1_u16(src_ptr - stride);
+ mat[0][2] = vld1_u16(src_ptr - stride + 1);
+ mat[1][0] = vld1_u16(src_ptr - 1);
+ mat[1][1] = vld1_u16(src_ptr);
+ mat[1][2] = vld1_u16(src_ptr + 1);
+ mat[2][0] = vld1_u16(src_ptr + stride - 1);
+ mat[2][1] = vld1_u16(src_ptr + stride);
+ mat[2][2] = vld1_u16(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ uint16x4_t gxa = vadd_u16(mat[0][0], mat[2][0]);
+ uint16x4_t gxb = vadd_u16(mat[0][2], mat[2][2]);
+ gxa = vadd_u16(gxa, vadd_u16(mat[1][0], mat[1][0]));
+ gxb = vadd_u16(gxb, vadd_u16(mat[1][2], mat[1][2]));
+
+ uint16x4_t gya = vadd_u16(mat[0][0], mat[0][2]);
+ uint16x4_t gyb = vadd_u16(mat[2][0], mat[2][2]);
+ gya = vadd_u16(gya, vadd_u16(mat[0][1], mat[0][1]));
+ gyb = vadd_u16(gyb, vadd_u16(mat[2][1], mat[2][1]));
+
+ uint16x4_t ga = vaba_u16(vabd_u16(gxa, gxb), gya, gyb);
+ ga = vrshl_u16(ga, vdup_n_s16(8 - bitdepth));
+
+      // Check which vector elements are under the threshold. The Laplacian is
+      // then unconditionally computed, and we accumulate zeros if we're not
+      // under the threshold. This is much faster than using an if statement.
+ uint16x4_t thresh_u16 = vclt_u16(ga, vget_low_u16(thresh));
+
+ uint16x4_t center = vshl_n_u16(mat[1][1], 2);
+
+ uint16x4_t adj0 = vadd_u16(mat[0][1], mat[2][1]);
+ uint16x4_t adj1 = vadd_u16(mat[1][0], mat[1][2]);
+ uint16x4_t adj = vadd_u16(adj0, adj1);
+ adj = vadd_u16(adj, adj);
+
+ uint16x4_t diag0 = vadd_u16(mat[0][0], mat[0][2]);
+ uint16x4_t diag1 = vadd_u16(mat[2][0], mat[2][2]);
+ uint16x4_t diag = vadd_u16(diag0, diag1);
+
+ uint16x4_t v = vabd_u16(vadd_u16(center, diag), adj);
+ v = vand_u16(v, thresh_u16);
+ uint32x4_t v_u32 = vmovl_u16(vrshl_u16(v, vdup_n_s16(8 - bitdepth)));
+
+ acc = vpadalq_u32(acc, v_u32);
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vaddw_s16(count, vreinterpret_s16_u16(thresh_u16));
+
+ w += 4;
+ src_ptr += 4;
+ }
+
+ while (w < width - 1) {
+ int mat[3][3];
+ mat[0][0] = *(src_ptr - stride - 1);
+ mat[0][1] = *(src_ptr - stride);
+ mat[0][2] = *(src_ptr - stride + 1);
+ mat[1][0] = *(src_ptr - 1);
+ mat[1][1] = *(src_ptr);
+ mat[1][2] = *(src_ptr + 1);
+ mat[2][0] = *(src_ptr + stride - 1);
+ mat[2][1] = *(src_ptr + stride);
+ mat[2][2] = *(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ const int gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+ 2 * (mat[1][0] - mat[1][2]);
+ const int gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+ 2 * (mat[0][1] - mat[2][1]);
+ const int ga = ROUND_POWER_OF_TWO(abs(gx) + abs(gy), bitdepth - 8);
+
+ // Accumulate Laplacian.
+ const int is_under = ga < edge_thresh;
+ const int v = 4 * mat[1][1] -
+ 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+ (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+ final_acc += ROUND_POWER_OF_TWO(abs(v), bitdepth - 8) * is_under;
+ final_count += is_under;
+
+ src_ptr++;
+ w++;
+ }
+ src_start += stride;
+ } while (++h < height - 1);
+
+ // We counted negatively, so subtract to get the final value.
+ final_count -= horizontal_add_s32x4(count);
+ final_acc += horizontal_add_u64x2(acc);
+ return (final_count < 16)
+ ? -1.0
+ : (double)final_acc / (6 * final_count) * SQRT_PI_BY_2;
+}
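The routine appears to follow the standard fast noise estimate (Immerkær-style), restricted to pixels whose Sobel gradient magnitude is below edge_thresh and bailing out with -1.0 when fewer than 16 pixels qualify:

$$\hat{\sigma} \approx \sqrt{\frac{\pi}{2}} \cdot \frac{1}{6N} \sum_{x,y} \left| (I * L)(x, y) \right|, \qquad L = \begin{pmatrix} 1 & -2 & 1 \\ -2 & 4 & -2 \\ 1 & -2 & 1 \end{pmatrix}$$

where N is final_count and the kernel L matches the 4*center - 2*(adjacent) + (diagonals) expression in the scalar tail, with both responses normalised back to an 8-bit scale before accumulation.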
diff --git a/av1/encoder/arm/neon/pickrst_neon.c b/av1/encoder/arm/neon/pickrst_neon.c
new file mode 100644
index 000000000..62270280b
--- /dev/null
+++ b/av1/encoder/arm/neon/pickrst_neon.c
@@ -0,0 +1,1261 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/arm/neon/pickrst_neon.h"
+#include "av1/encoder/pickrst.h"
+
+int64_t av1_lowbd_pixel_proj_error_neon(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+ int i, j, k;
+ const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+ const int32x4_t zero = vdupq_n_s32(0);
+ uint64x2_t sum64 = vreinterpretq_u64_s32(zero);
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ for (i = 0; i < height; ++i) {
+ int32x4_t err0 = zero;
+ for (j = 0; j <= width - 8; j += 8) {
+ const uint8x8_t d0 = vld1_u8(&dat[j]);
+ const uint8x8_t s0 = vld1_u8(&src[j]);
+ const int16x8_t flt0_16b =
+ vcombine_s16(vqmovn_s32(vld1q_s32(&flt0[j])),
+ vqmovn_s32(vld1q_s32(&flt0[j + 4])));
+ const int16x8_t flt1_16b =
+ vcombine_s16(vqmovn_s32(vld1q_s32(&flt1[j])),
+ vqmovn_s32(vld1q_s32(&flt1[j + 4])));
+ const int16x8_t u0 =
+ vreinterpretq_s16_u16(vshll_n_u8(d0, SGRPROJ_RST_BITS));
+ const int16x8_t flt0_0_sub_u = vsubq_s16(flt0_16b, u0);
+ const int16x8_t flt1_0_sub_u = vsubq_s16(flt1_16b, u0);
+ const int16x4_t flt0_16b_sub_u_lo = vget_low_s16(flt0_0_sub_u);
+ const int16x4_t flt0_16b_sub_u_hi = vget_high_s16(flt0_0_sub_u);
+ const int16x4_t flt1_16b_sub_u_lo = vget_low_s16(flt1_0_sub_u);
+ const int16x4_t flt1_16b_sub_u_hi = vget_high_s16(flt1_0_sub_u);
+
+ int32x4_t v0 = vmull_n_s16(flt0_16b_sub_u_lo, (int16_t)xq[0]);
+ v0 = vmlal_n_s16(v0, flt1_16b_sub_u_lo, (int16_t)xq[1]);
+ int32x4_t v1 = vmull_n_s16(flt0_16b_sub_u_hi, (int16_t)xq[0]);
+ v1 = vmlal_n_s16(v1, flt1_16b_sub_u_hi, (int16_t)xq[1]);
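+        // The narrowing shift takes an immediate; 11 == SGRPROJ_RST_BITS +
+        // SGRPROJ_PRJ_BITS, i.e. the `shift` defined above.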
+ const int16x4_t vr0 = vqrshrn_n_s32(v0, 11);
+ const int16x4_t vr1 = vqrshrn_n_s32(v1, 11);
+ const int16x8_t e0 = vaddq_s16(vcombine_s16(vr0, vr1),
+ vreinterpretq_s16_u16(vsubl_u8(d0, s0)));
+ const int16x4_t e0_lo = vget_low_s16(e0);
+ const int16x4_t e0_hi = vget_high_s16(e0);
+ err0 = vmlal_s16(err0, e0_lo, e0_lo);
+ err0 = vmlal_s16(err0, e0_hi, e0_hi);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = dat[k] << SGRPROJ_RST_BITS;
+ int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, 11) + dat[k] - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ sum64 = vpadalq_u32(sum64, vreinterpretq_u32_s32(err0));
+ }
+
+ } else if (params->r[0] > 0 || params->r[1] > 0) {
+ const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
+ const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+ const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+ for (i = 0; i < height; ++i) {
+ int32x4_t err0 = zero;
+ for (j = 0; j <= width - 8; j += 8) {
+ const uint8x8_t d0 = vld1_u8(&dat[j]);
+ const uint8x8_t s0 = vld1_u8(&src[j]);
+ const uint16x8_t d0s0 = vsubl_u8(d0, s0);
+ const uint16x8x2_t d0w =
+ vzipq_u16(vmovl_u8(d0), vreinterpretq_u16_s32(zero));
+
+ const int32x4_t flt_16b_lo = vld1q_s32(&flt[j]);
+ const int32x4_t flt_16b_hi = vld1q_s32(&flt[j + 4]);
+
+ int32x4_t v0 = vmulq_n_s32(flt_16b_lo, xq_active);
+ v0 = vmlsq_n_s32(v0, vreinterpretq_s32_u16(d0w.val[0]),
+ xq_active * (1 << SGRPROJ_RST_BITS));
+ int32x4_t v1 = vmulq_n_s32(flt_16b_hi, xq_active);
+ v1 = vmlsq_n_s32(v1, vreinterpretq_s32_u16(d0w.val[1]),
+ xq_active * (1 << SGRPROJ_RST_BITS));
+ const int16x4_t vr0 = vqrshrn_n_s32(v0, 11);
+ const int16x4_t vr1 = vqrshrn_n_s32(v1, 11);
+ const int16x8_t e0 =
+ vaddq_s16(vcombine_s16(vr0, vr1), vreinterpretq_s16_u16(d0s0));
+ const int16x4_t e0_lo = vget_low_s16(e0);
+ const int16x4_t e0_hi = vget_high_s16(e0);
+ err0 = vmlal_s16(err0, e0_lo, e0_lo);
+ err0 = vmlal_s16(err0, e0_hi, e0_hi);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = dat[k] << SGRPROJ_RST_BITS;
+ int32_t v = xq_active * (flt[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt += flt_stride;
+ sum64 = vpadalq_u32(sum64, vreinterpretq_u32_s32(err0));
+ }
+ } else {
+ uint32x4_t err0 = vreinterpretq_u32_s32(zero);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j <= width - 16; j += 16) {
+ const uint8x16_t d = vld1q_u8(&dat[j]);
+ const uint8x16_t s = vld1q_u8(&src[j]);
+ const uint8x16_t diff = vabdq_u8(d, s);
+ const uint8x8_t diff0 = vget_low_u8(diff);
+ const uint8x8_t diff1 = vget_high_u8(diff);
+ err0 = vpadalq_u16(err0, vmull_u8(diff0, diff0));
+ err0 = vpadalq_u16(err0, vmull_u8(diff1, diff1));
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t e = dat[k] - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ sum64 = vpaddlq_u32(err0);
+ }
+#if AOM_ARCH_AARCH64
+ err += vaddvq_u64(sum64);
+#else
+ err += vget_lane_u64(vadd_u64(vget_low_u64(sum64), vget_high_u64(sum64)), 0);
+#endif // AOM_ARCH_AARCH64
+ return err;
+}
+
+static INLINE uint8_t find_average_neon(const uint8_t *src, int src_stride,
+ int width, int height) {
+ uint64_t sum = 0;
+
+ if (width >= 16) {
+ int h = 0;
+    // We can accumulate up to 257 8-bit values in a 16-bit element, since
+    // 257 * 255 <= 65535. Given that each 16-bit vector has 8 elements, we
+    // can process up to int(257*8/width) rows before we need to widen to
+    // 32-bit vector elements.
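+    // For example, width == 16 allows 257 * 8 / 16 == 128 rows per batch.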
+ int h_overflow = 257 * 8 / width;
+ int h_limit = height > h_overflow ? h_overflow : height;
+ uint32x4_t avg_u32 = vdupq_n_u32(0);
+ do {
+ uint16x8_t avg_u16 = vdupq_n_u16(0);
+ do {
+ int j = width;
+ const uint8_t *src_ptr = src;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr);
+ avg_u16 = vpadalq_u8(avg_u16, s);
+ j -= 16;
+ src_ptr += 16;
+ } while (j >= 16);
+ if (j >= 8) {
+ uint8x8_t s = vld1_u8(src_ptr);
+ avg_u16 = vaddw_u8(avg_u16, s);
+ j -= 8;
+ src_ptr += 8;
+ }
+ // Scalar tail case.
+ while (j > 0) {
+ sum += src[width - j];
+ j--;
+ }
+ src += src_stride;
+ } while (++h < h_limit);
+ avg_u32 = vpadalq_u16(avg_u32, avg_u16);
+
+      h_limit += h_overflow;
+      h_limit = height > h_limit ? h_limit : height;
+ } while (h < height);
+ return (uint8_t)((horizontal_long_add_u32x4(avg_u32) + sum) /
+ (width * height));
+ }
+ if (width >= 8) {
+ int h = 0;
+    // We can accumulate up to 257 8-bit values in a 16-bit element, since
+    // 257 * 255 <= 65535. Given that each 16-bit vector has 4 elements, we
+    // can process up to int(257*4/width) rows before we need to widen to
+    // 32-bit vector elements.
+ int h_overflow = 257 * 4 / width;
+ int h_limit = height > h_overflow ? h_overflow : height;
+ uint32x2_t avg_u32 = vdup_n_u32(0);
+ do {
+ uint16x4_t avg_u16 = vdup_n_u16(0);
+ do {
+ int j = width;
+ const uint8_t *src_ptr = src;
+ uint8x8_t s = vld1_u8(src_ptr);
+ avg_u16 = vpadal_u8(avg_u16, s);
+ j -= 8;
+ src_ptr += 8;
+ // Scalar tail case.
+ while (j > 0) {
+ sum += src[width - j];
+ j--;
+ }
+ src += src_stride;
+ } while (++h < h_limit);
+ avg_u32 = vpadal_u16(avg_u32, avg_u16);
+
+      h_limit += h_overflow;
+      h_limit = height > h_limit ? h_limit : height;
+ } while (h < height);
+ return (uint8_t)((horizontal_long_add_u32x2(avg_u32) + sum) /
+ (width * height));
+ }
+ int i = height;
+ do {
+ int j = 0;
+ do {
+ sum += src[j];
+ } while (++j < width);
+ src += src_stride;
+ } while (--i != 0);
+ return (uint8_t)(sum / (width * height));
+}
+
+static INLINE void compute_sub_avg(const uint8_t *buf, int buf_stride, int avg,
+ int16_t *buf_avg, int buf_avg_stride,
+ int width, int height,
+ int downsample_factor) {
+ uint8x8_t avg_u8 = vdup_n_u8(avg);
+
+ if (width > 8) {
+ int i = 0;
+ do {
+ int j = width;
+ const uint8_t *buf_ptr = buf;
+ int16_t *buf_avg_ptr = buf_avg;
+ do {
+ uint8x8_t d = vld1_u8(buf_ptr);
+ vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d, avg_u8)));
+
+ j -= 8;
+ buf_ptr += 8;
+ buf_avg_ptr += 8;
+ } while (j >= 8);
+ while (j > 0) {
+ *buf_avg_ptr = (int16_t)buf[width - j] - (int16_t)avg;
+ buf_avg_ptr++;
+ j--;
+ }
+ buf += buf_stride;
+ buf_avg += buf_avg_stride;
+ i += downsample_factor;
+ } while (i < height);
+ } else {
+    // For width <= 8, don't use Neon.
+ for (int i = 0; i < height; i = i + downsample_factor) {
+ for (int j = 0; j < width; j++) {
+ buf_avg[j] = (int16_t)buf[j] - (int16_t)avg;
+ }
+ buf += buf_stride;
+ buf_avg += buf_avg_stride;
+ }
+ }
+}
+
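+// Compute the auto-covariance of one column of the wiener window with itself,
+// i.e. the upper triangle (including the diagonal) of one wiener_win x
+// wiener_win block on H's diagonal: 28 elements for wiener_win == 7 and 15
+// elements for wiener_win == 5. Each product is scaled by the downsample
+// factor with a left shift (df_s32).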
+static INLINE void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H,
+ const int wiener_win,
+ const int wiener_win2, int32x4_t df_s32) {
+ for (int row0 = 0; row0 < wiener_win; row0++) {
+ for (int row1 = row0; row1 < wiener_win; row1++) {
+ int auto_cov_idx =
+ (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1;
+
+ int32x4_t auto_cov =
+ vmull_s16(vget_low_s16(dgd[row0]), vget_low_s16(dgd[row1]));
+ auto_cov = vmlal_s16(auto_cov, vget_high_s16(dgd[row0]),
+ vget_high_s16(dgd[row1]));
+ auto_cov = vshlq_s32(auto_cov, df_s32);
+
+ H[auto_cov_idx] += horizontal_long_add_s32x4(auto_cov);
+ }
+ }
+}
+
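+// Same as compute_H_one_col, except that the last row may be scaled by 1, 2
+// or 3, so the scaling is applied with a multiply (last_row_df) rather than
+// a shift.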
+static INLINE void compute_H_one_col_last_row(int16x8_t *dgd, int col,
+ int64_t *H, const int wiener_win,
+ const int wiener_win2,
+ int last_row_df) {
+ for (int row0 = 0; row0 < wiener_win; row0++) {
+ for (int row1 = row0; row1 < wiener_win; row1++) {
+ int auto_cov_idx =
+ (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1;
+
+ int32x4_t auto_cov =
+ vmull_s16(vget_low_s16(dgd[row0]), vget_low_s16(dgd[row1]));
+ auto_cov = vmlal_s16(auto_cov, vget_high_s16(dgd[row0]),
+ vget_high_s16(dgd[row1]));
+ auto_cov = vmulq_n_s32(auto_cov, last_row_df);
+
+ H[auto_cov_idx] += horizontal_long_add_s32x4(auto_cov);
+ }
+ }
+}
+
+// When we load 8 values of int16_t type but need fewer than 8 of them for
+// processing, the mask below is used to zero out the extra values.
+const int16_t av1_neon_mask_16bit[16] = {
+ -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0,
+};
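+//
+// Typical usage, as in the stats functions below: load the mask so that
+// exactly (width % 8) leading lanes are kept, then clear the excess lanes:
+//   const int16x8_t mask = vld1q_s16(&av1_neon_mask_16bit[8] - (width % 8));
+//   v = vandq_s16(v, mask);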
+
+// This function computes two matrices: the cross-correlation between the src
+// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
+//
+// M is of size 7 * 7. It needs to be filled such that multiplying one element
+// from src with each element of a row of the wiener window will fill one
+// column of M. However this is not very convenient in terms of memory
+// accesses, as it means we do contiguous loads of dgd but strided stores to M.
+// As a result, we use an intermediate matrix M_trn which is instead filled
+// such that one row of the wiener window gives one row of M_trn. Once fully
+// computed, M_trn is then transposed to return M.
+//
+// H is of size 49 * 49. It is filled by multiplying every pair of elements of
+// the wiener window together. Since it is a symmetric matrix, we only compute
+// the upper triangle, and then copy it down to the lower one. Here we fill it
+// by taking each different pair of columns, and multiplying all the elements of
+// the first one with all the elements of the second one, with a special case
+// when multiplying a column by itself.
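+//
+// For example, the pair of columns (col0, col1) fills the wiener_win x
+// wiener_win block of H whose top-left corner is at row col0 * wiener_win and
+// column col1 * wiener_win, so its element (row0, row1) lands at
+// H[(col0 * wiener_win + row0) * wiener_win2 + col1 * wiener_win + row1].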
+static INLINE void compute_stats_win7_neon(int16_t *dgd_avg, int dgd_avg_stride,
+ int16_t *src_avg, int src_avg_stride,
+ int width, int v_start, int v_end,
+ int64_t *M, int64_t *H,
+ int downsample_factor,
+ int last_row_downsample_factor) {
+ const int wiener_win = 7;
+ const int wiener_win2 = wiener_win * wiener_win;
+ // The downsample factor can be either 1 or 4, so instead of multiplying the
+ // values by 1 or 4, we can left shift by 0 or 2 respectively, which is
+ // faster. (This doesn't apply to the last row where we can scale the values
+ // by 1, 2 or 3, so we keep the multiplication).
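+  // For example: 1 >> 1 == 0 (no scaling) and 4 >> 1 == 2 (multiply by 4).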
+ const int downsample_shift = downsample_factor >> 1;
+ const int16x8_t df_s16 = vdupq_n_s16(downsample_shift);
+ const int32x4_t df_s32 = vdupq_n_s32(downsample_shift);
+ const int16x8_t mask = vld1q_s16(&av1_neon_mask_16bit[8] - (width % 8));
+
+ // We use an intermediate matrix that will be transposed to get M.
+ int64_t M_trn[49];
+ memset(M_trn, 0, sizeof(M_trn));
+
+ int h = v_start;
+ do {
+ // Cross-correlation (M).
+ for (int row = 0; row < wiener_win; row++) {
+ int16x8_t dgd0 = vld1q_s16(dgd_avg + row * dgd_avg_stride);
+ int j = 0;
+ while (j <= width - 8) {
+ int16x8_t dgd1 = vld1q_s16(dgd_avg + row * dgd_avg_stride + j + 8);
+ // Load src and scale based on downsampling factor.
+ int16x8_t s = vshlq_s16(vld1q_s16(src_avg + j), df_s16);
+
+ // Compute all the elements of one row of M.
+ compute_M_one_row_win7(s, dgd0, dgd1, M_trn, wiener_win, row);
+
+ dgd0 = dgd1;
+ j += 8;
+ }
+ // Process remaining elements without Neon.
+ while (j < width) {
+ int16_t s = src_avg[j] * downsample_factor;
+ int16_t d0 = dgd_avg[row * dgd_avg_stride + 0 + j];
+ int16_t d1 = dgd_avg[row * dgd_avg_stride + 1 + j];
+ int16_t d2 = dgd_avg[row * dgd_avg_stride + 2 + j];
+ int16_t d3 = dgd_avg[row * dgd_avg_stride + 3 + j];
+ int16_t d4 = dgd_avg[row * dgd_avg_stride + 4 + j];
+ int16_t d5 = dgd_avg[row * dgd_avg_stride + 5 + j];
+ int16_t d6 = dgd_avg[row * dgd_avg_stride + 6 + j];
+
+ M_trn[row * wiener_win + 0] += d0 * s;
+ M_trn[row * wiener_win + 1] += d1 * s;
+ M_trn[row * wiener_win + 2] += d2 * s;
+ M_trn[row * wiener_win + 3] += d3 * s;
+ M_trn[row * wiener_win + 4] += d4 * s;
+ M_trn[row * wiener_win + 5] += d5 * s;
+ M_trn[row * wiener_win + 6] += d6 * s;
+
+ j++;
+ }
+ }
+
+ // Auto-covariance (H).
+ int j = 0;
+ while (j <= width - 8) {
+ for (int col0 = 0; col0 < wiener_win; col0++) {
+ int16x8_t dgd0[7];
+ dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0);
+ dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0);
+ dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0);
+ dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0);
+ dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0);
+ dgd0[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col0);
+ dgd0[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col0);
+
+ // Perform computation of the first column with itself (28 elements).
+ // For the first column this will fill the upper triangle of the 7x7
+ // matrix at the top left of the H matrix. For the next columns this
+ // will fill the upper triangle of the other 7x7 matrices around H's
+ // diagonal.
+ compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2, df_s32);
+
+ // All computation next to the matrix diagonal has already been done.
+ for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+ // Load second column and scale based on downsampling factor.
+ int16x8_t dgd1[7];
+ dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1);
+ dgd1[0] = vshlq_s16(dgd1[0], df_s16);
+ dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1);
+ dgd1[1] = vshlq_s16(dgd1[1], df_s16);
+ dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1);
+ dgd1[2] = vshlq_s16(dgd1[2], df_s16);
+ dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1);
+ dgd1[3] = vshlq_s16(dgd1[3], df_s16);
+ dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1);
+ dgd1[4] = vshlq_s16(dgd1[4], df_s16);
+ dgd1[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col1);
+ dgd1[5] = vshlq_s16(dgd1[5], df_s16);
+ dgd1[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col1);
+ dgd1[6] = vshlq_s16(dgd1[6], df_s16);
+
+ // Compute all elements from the combination of both columns (49
+ // elements).
+ compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+ wiener_win2);
+ }
+ }
+ j += 8;
+ }
+
+ if (j < width) {
+ // Process remaining columns using a mask to discard excess elements.
+ for (int col0 = 0; col0 < wiener_win; col0++) {
+ // Load first column.
+ int16x8_t dgd0[7];
+ dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0);
+ dgd0[0] = vandq_s16(dgd0[0], mask);
+ dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0);
+ dgd0[1] = vandq_s16(dgd0[1], mask);
+ dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0);
+ dgd0[2] = vandq_s16(dgd0[2], mask);
+ dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0);
+ dgd0[3] = vandq_s16(dgd0[3], mask);
+ dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0);
+ dgd0[4] = vandq_s16(dgd0[4], mask);
+ dgd0[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col0);
+ dgd0[5] = vandq_s16(dgd0[5], mask);
+ dgd0[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col0);
+ dgd0[6] = vandq_s16(dgd0[6], mask);
+
+ // Perform computation of the first column with itself (28 elements).
+ // For the first column this will fill the upper triangle of the 7x7
+ // matrix at the top left of the H matrix. For the next columns this
+ // will fill the upper triangle of the other 7x7 matrices around H's
+ // diagonal.
+ compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2, df_s32);
+
+ // All computation next to the matrix diagonal has already been done.
+ for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+ // Load second column and scale based on downsampling factor.
+ int16x8_t dgd1[7];
+ dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1);
+ dgd1[0] = vshlq_s16(dgd1[0], df_s16);
+ dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1);
+ dgd1[1] = vshlq_s16(dgd1[1], df_s16);
+ dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1);
+ dgd1[2] = vshlq_s16(dgd1[2], df_s16);
+ dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1);
+ dgd1[3] = vshlq_s16(dgd1[3], df_s16);
+ dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1);
+ dgd1[4] = vshlq_s16(dgd1[4], df_s16);
+ dgd1[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col1);
+ dgd1[5] = vshlq_s16(dgd1[5], df_s16);
+ dgd1[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col1);
+ dgd1[6] = vshlq_s16(dgd1[6], df_s16);
+
+ // Compute all elements from the combination of both columns (49
+ // elements).
+ compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+ wiener_win2);
+ }
+ }
+ }
+ dgd_avg += downsample_factor * dgd_avg_stride;
+ src_avg += src_avg_stride;
+ h += downsample_factor;
+ } while (h <= v_end - downsample_factor);
+
+ if (h < v_end) {
+ // The last row is scaled by a different downsample factor, so process
+ // separately.
+
+ // Cross-correlation (M).
+ for (int row = 0; row < 7; row++) {
+ int16x8_t dgd0 = vld1q_s16(dgd_avg + row * dgd_avg_stride);
+ int j = 0;
+ while (j <= width - 8) {
+ int16x8_t dgd1 = vld1q_s16(dgd_avg + row * dgd_avg_stride + j + 8);
+ // Load src vector and scale based on downsampling factor.
+ int16x8_t s =
+ vmulq_n_s16(vld1q_s16(src_avg + j), last_row_downsample_factor);
+
+ // Compute all the elements of one row of M.
+ compute_M_one_row_win7(s, dgd0, dgd1, M_trn, wiener_win, row);
+
+ dgd0 = dgd1;
+ j += 8;
+ }
+ // Process remaining elements without Neon.
+ while (j < width) {
+ int16_t s = src_avg[j];
+ int16_t d0 = dgd_avg[row * dgd_avg_stride + 0 + j];
+ int16_t d1 = dgd_avg[row * dgd_avg_stride + 1 + j];
+ int16_t d2 = dgd_avg[row * dgd_avg_stride + 2 + j];
+ int16_t d3 = dgd_avg[row * dgd_avg_stride + 3 + j];
+ int16_t d4 = dgd_avg[row * dgd_avg_stride + 4 + j];
+ int16_t d5 = dgd_avg[row * dgd_avg_stride + 5 + j];
+ int16_t d6 = dgd_avg[row * dgd_avg_stride + 6 + j];
+
+ M_trn[row * wiener_win + 0] += d0 * s * last_row_downsample_factor;
+ M_trn[row * wiener_win + 1] += d1 * s * last_row_downsample_factor;
+ M_trn[row * wiener_win + 2] += d2 * s * last_row_downsample_factor;
+ M_trn[row * wiener_win + 3] += d3 * s * last_row_downsample_factor;
+ M_trn[row * wiener_win + 4] += d4 * s * last_row_downsample_factor;
+ M_trn[row * wiener_win + 5] += d5 * s * last_row_downsample_factor;
+ M_trn[row * wiener_win + 6] += d6 * s * last_row_downsample_factor;
+
+ j++;
+ }
+ }
+
+ // Auto-covariance (H).
+ int j = 0;
+ while (j <= width - 8) {
+ int col0 = 0;
+ do {
+ // Load first column.
+ int16x8_t dgd0[7];
+ dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0);
+ dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0);
+ dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0);
+ dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0);
+ dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0);
+ dgd0[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col0);
+ dgd0[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col0);
+
+ // Perform computation of the first column with itself (28 elements).
+ // For the first column this will fill the upper triangle of the 7x7
+ // matrix at the top left of the H matrix. For the next columns this
+ // will fill the upper triangle of the other 7x7 matrices around H's
+ // diagonal.
+ compute_H_one_col_last_row(dgd0, col0, H, wiener_win, wiener_win2,
+ last_row_downsample_factor);
+
+ // All computation next to the matrix diagonal has already been done.
+ for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+ // Load second column and scale based on downsampling factor.
+ int16x8_t dgd1[7];
+ dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1);
+ dgd1[0] = vmulq_n_s16(dgd1[0], last_row_downsample_factor);
+ dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1);
+ dgd1[1] = vmulq_n_s16(dgd1[1], last_row_downsample_factor);
+ dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1);
+ dgd1[2] = vmulq_n_s16(dgd1[2], last_row_downsample_factor);
+ dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1);
+ dgd1[3] = vmulq_n_s16(dgd1[3], last_row_downsample_factor);
+ dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1);
+ dgd1[4] = vmulq_n_s16(dgd1[4], last_row_downsample_factor);
+ dgd1[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col1);
+ dgd1[5] = vmulq_n_s16(dgd1[5], last_row_downsample_factor);
+ dgd1[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col1);
+ dgd1[6] = vmulq_n_s16(dgd1[6], last_row_downsample_factor);
+
+ // Compute all elements from the combination of both columns (49
+ // elements).
+ compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+ wiener_win2);
+ }
+ } while (++col0 < wiener_win);
+ j += 8;
+ }
+
+ // Process remaining columns using a mask to discard excess elements.
+ if (j < width) {
+ int col0 = 0;
+ do {
+ // Load first column.
+ int16x8_t dgd0[7];
+ dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0);
+ dgd0[0] = vandq_s16(dgd0[0], mask);
+ dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0);
+ dgd0[1] = vandq_s16(dgd0[1], mask);
+ dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0);
+ dgd0[2] = vandq_s16(dgd0[2], mask);
+ dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0);
+ dgd0[3] = vandq_s16(dgd0[3], mask);
+ dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0);
+ dgd0[4] = vandq_s16(dgd0[4], mask);
+ dgd0[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col0);
+ dgd0[5] = vandq_s16(dgd0[5], mask);
+ dgd0[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col0);
+ dgd0[6] = vandq_s16(dgd0[6], mask);
+
+        // Perform computation of the first column with itself (28 elements).
+ // For the first column this will fill the upper triangle of the 7x7
+ // matrix at the top left of the H matrix. For the next columns this
+ // will fill the upper triangle of the other 7x7 matrices around H's
+ // diagonal.
+ compute_H_one_col_last_row(dgd0, col0, H, wiener_win, wiener_win2,
+ last_row_downsample_factor);
+
+ // All computation next to the matrix diagonal has already been done.
+ for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+ // Load second column and scale based on downsampling factor.
+ int16x8_t dgd1[7];
+ dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1);
+ dgd1[0] = vmulq_n_s16(dgd1[0], last_row_downsample_factor);
+ dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1);
+ dgd1[1] = vmulq_n_s16(dgd1[1], last_row_downsample_factor);
+ dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1);
+ dgd1[2] = vmulq_n_s16(dgd1[2], last_row_downsample_factor);
+ dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1);
+ dgd1[3] = vmulq_n_s16(dgd1[3], last_row_downsample_factor);
+ dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1);
+ dgd1[4] = vmulq_n_s16(dgd1[4], last_row_downsample_factor);
+ dgd1[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col1);
+ dgd1[5] = vmulq_n_s16(dgd1[5], last_row_downsample_factor);
+ dgd1[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col1);
+ dgd1[6] = vmulq_n_s16(dgd1[6], last_row_downsample_factor);
+
+ // Compute all elements from the combination of both columns (49
+ // elements).
+ compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+ wiener_win2);
+ }
+ } while (++col0 < wiener_win);
+ }
+ }
+
+ // Transpose M_trn.
+ transpose_M_win7(M, M_trn, 7);
+
+  // Copy the upper triangle of H into the lower one.
+ copy_upper_triangle(H, wiener_win2);
+}
+
+// This function computes two matrices: the cross-correlation between the src
+// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
+//
+// M is of size 5 * 5. It needs to be filled such that multiplying one element
+// from src with each element of a row of the wiener window will fill one
+// column of M. However this is not very convenient in terms of memory
+// accesses, as it means we do contiguous loads of dgd but strided stores to M.
+// As a result, we use an intermediate matrix M_trn which is instead filled
+// such that one row of the wiener window gives one row of M_trn. Once fully
+// computed, M_trn is then transposed to return M.
+//
+// H is of size 25 * 25. It is filled by multiplying every pair of elements of
+// the wiener window together. Since it is a symmetric matrix, we only compute
+// the upper triangle, and then copy it down to the lower one. Here we fill it
+// by taking each different pair of columns, and multiplying all the elements of
+// the first one with all the elements of the second one, with a special case
+// when multiplying a column by itself.
+static INLINE void compute_stats_win5_neon(int16_t *dgd_avg, int dgd_avg_stride,
+ int16_t *src_avg, int src_avg_stride,
+ int width, int v_start, int v_end,
+ int64_t *M, int64_t *H,
+ int downsample_factor,
+ int last_row_downsample_factor) {
+ const int wiener_win = 5;
+ const int wiener_win2 = wiener_win * wiener_win;
+ // The downsample factor can be either 1 or 4, so instead of multiplying the
+ // values by 1 or 4, we can left shift by 0 or 2 respectively, which is
+ // faster. (This doesn't apply to the last row where we can scale the values
+ // by 1, 2 or 3, so we keep the multiplication).
+ const int downsample_shift = downsample_factor >> 1;
+ const int16x8_t df_s16 = vdupq_n_s16(downsample_shift);
+ const int32x4_t df_s32 = vdupq_n_s32(downsample_shift);
+ const int16x8_t mask = vld1q_s16(&av1_neon_mask_16bit[8] - (width % 8));
+
+ // We use an intermediate matrix that will be transposed to get M.
+ int64_t M_trn[25];
+ memset(M_trn, 0, sizeof(M_trn));
+
+ int h = v_start;
+ do {
+ // Cross-correlation (M).
+ for (int row = 0; row < wiener_win; row++) {
+ int16x8_t dgd0 = vld1q_s16(dgd_avg + row * dgd_avg_stride);
+ int j = 0;
+ while (j <= width - 8) {
+ int16x8_t dgd1 = vld1q_s16(dgd_avg + row * dgd_avg_stride + j + 8);
+ // Load src vector and scale based on downsampling factor.
+ int16x8_t s = vshlq_s16(vld1q_s16(src_avg + j), df_s16);
+
+ // Compute all the elements of one row of M.
+ compute_M_one_row_win5(s, dgd0, dgd1, M_trn, wiener_win, row);
+
+ dgd0 = dgd1;
+ j += 8;
+ }
+
+ // Process remaining elements without Neon.
+ while (j < width) {
+ int16_t s = src_avg[j];
+ int16_t d0 = dgd_avg[row * dgd_avg_stride + 0 + j];
+ int16_t d1 = dgd_avg[row * dgd_avg_stride + 1 + j];
+ int16_t d2 = dgd_avg[row * dgd_avg_stride + 2 + j];
+ int16_t d3 = dgd_avg[row * dgd_avg_stride + 3 + j];
+ int16_t d4 = dgd_avg[row * dgd_avg_stride + 4 + j];
+
+ M_trn[row * wiener_win + 0] += d0 * s * downsample_factor;
+ M_trn[row * wiener_win + 1] += d1 * s * downsample_factor;
+ M_trn[row * wiener_win + 2] += d2 * s * downsample_factor;
+ M_trn[row * wiener_win + 3] += d3 * s * downsample_factor;
+ M_trn[row * wiener_win + 4] += d4 * s * downsample_factor;
+
+ j++;
+ }
+ }
+
+ // Auto-covariance (H).
+ int j = 0;
+ while (j <= width - 8) {
+ for (int col0 = 0; col0 < wiener_win; col0++) {
+ // Load first column.
+ int16x8_t dgd0[5];
+ dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0);
+ dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0);
+ dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0);
+ dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0);
+ dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0);
+
+ // Perform computation of the first column with itself (15 elements).
+ // For the first column this will fill the upper triangle of the 5x5
+ // matrix at the top left of the H matrix. For the next columns this
+ // will fill the upper triangle of the other 5x5 matrices around H's
+ // diagonal.
+ compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2, df_s32);
+
+ // All computation next to the matrix diagonal has already been done.
+ for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+ // Load second column and scale based on downsampling factor.
+ int16x8_t dgd1[5];
+ dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1);
+ dgd1[0] = vshlq_s16(dgd1[0], df_s16);
+ dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1);
+ dgd1[1] = vshlq_s16(dgd1[1], df_s16);
+ dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1);
+ dgd1[2] = vshlq_s16(dgd1[2], df_s16);
+ dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1);
+ dgd1[3] = vshlq_s16(dgd1[3], df_s16);
+ dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1);
+ dgd1[4] = vshlq_s16(dgd1[4], df_s16);
+
+ // Compute all elements from the combination of both columns (25
+ // elements).
+ compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+ wiener_win2);
+ }
+ }
+ j += 8;
+ }
+
+ // Process remaining columns using a mask to discard excess elements.
+ if (j < width) {
+ for (int col0 = 0; col0 < wiener_win; col0++) {
+ // Load first column.
+ int16x8_t dgd0[5];
+ dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0);
+ dgd0[0] = vandq_s16(dgd0[0], mask);
+ dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0);
+ dgd0[1] = vandq_s16(dgd0[1], mask);
+ dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0);
+ dgd0[2] = vandq_s16(dgd0[2], mask);
+ dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0);
+ dgd0[3] = vandq_s16(dgd0[3], mask);
+ dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0);
+ dgd0[4] = vandq_s16(dgd0[4], mask);
+
+ // Perform computation of the first column with itself (15 elements).
+ // For the first column this will fill the upper triangle of the 5x5
+ // matrix at the top left of the H matrix. For the next columns this
+ // will fill the upper triangle of the other 5x5 matrices around H's
+ // diagonal.
+ compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2, df_s32);
+
+ // All computation next to the matrix diagonal has already been done.
+ for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+ // Load second column and scale based on downsampling factor.
+ int16x8_t dgd1[5];
+ dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1);
+ dgd1[0] = vshlq_s16(dgd1[0], df_s16);
+ dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1);
+ dgd1[1] = vshlq_s16(dgd1[1], df_s16);
+ dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1);
+ dgd1[2] = vshlq_s16(dgd1[2], df_s16);
+ dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1);
+ dgd1[3] = vshlq_s16(dgd1[3], df_s16);
+ dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1);
+ dgd1[4] = vshlq_s16(dgd1[4], df_s16);
+
+ // Compute all elements from the combination of both columns (25
+ // elements).
+ compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+ wiener_win2);
+ }
+ }
+ }
+ dgd_avg += downsample_factor * dgd_avg_stride;
+ src_avg += src_avg_stride;
+ h += downsample_factor;
+ } while (h <= v_end - downsample_factor);
+
+ if (h < v_end) {
+ // The last row is scaled by a different downsample factor, so process
+ // separately.
+
+ // Cross-correlation (M).
+ for (int row = 0; row < wiener_win; row++) {
+ int16x8_t dgd0 = vld1q_s16(dgd_avg + row * dgd_avg_stride);
+ int j = 0;
+ while (j <= width - 8) {
+ int16x8_t dgd1 = vld1q_s16(dgd_avg + row * dgd_avg_stride + j + 8);
+ // Load src vector and scale based on downsampling factor.
+ int16x8_t s =
+ vmulq_n_s16(vld1q_s16(src_avg + j), last_row_downsample_factor);
+
+ // Compute all the elements of one row of M.
+ compute_M_one_row_win5(s, dgd0, dgd1, M_trn, wiener_win, row);
+
+ dgd0 = dgd1;
+ j += 8;
+ }
+
+ // Process remaining elements without Neon.
+ while (j < width) {
+ int16_t s = src_avg[j];
+ int16_t d0 = dgd_avg[row * dgd_avg_stride + 0 + j];
+ int16_t d1 = dgd_avg[row * dgd_avg_stride + 1 + j];
+ int16_t d2 = dgd_avg[row * dgd_avg_stride + 2 + j];
+ int16_t d3 = dgd_avg[row * dgd_avg_stride + 3 + j];
+ int16_t d4 = dgd_avg[row * dgd_avg_stride + 4 + j];
+
+ M_trn[row * wiener_win + 0] += d0 * s * last_row_downsample_factor;
+ M_trn[row * wiener_win + 1] += d1 * s * last_row_downsample_factor;
+ M_trn[row * wiener_win + 2] += d2 * s * last_row_downsample_factor;
+ M_trn[row * wiener_win + 3] += d3 * s * last_row_downsample_factor;
+ M_trn[row * wiener_win + 4] += d4 * s * last_row_downsample_factor;
+
+ j++;
+ }
+ }
+
+ // Auto-covariance (H).
+ int j = 0;
+ while (j <= width - 8) {
+ for (int col0 = 0; col0 < wiener_win; col0++) {
+ // Load first column.
+ int16x8_t dgd0[5];
+ dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0);
+ dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0);
+ dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0);
+ dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0);
+ dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0);
+
+ // Perform computation of the first column with itself (15 elements).
+ // For the first column this will fill the upper triangle of the 5x5
+ // matrix at the top left of the H matrix. For the next columns this
+ // will fill the upper triangle of the other 5x5 matrices around H's
+ // diagonal.
+ compute_H_one_col_last_row(dgd0, col0, H, wiener_win, wiener_win2,
+ last_row_downsample_factor);
+
+ // All computation next to the matrix diagonal has already been done.
+ for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+ // Load second column and scale based on downsampling factor.
+ int16x8_t dgd1[5];
+ dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1);
+ dgd1[0] = vmulq_n_s16(dgd1[0], last_row_downsample_factor);
+ dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1);
+ dgd1[1] = vmulq_n_s16(dgd1[1], last_row_downsample_factor);
+ dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1);
+ dgd1[2] = vmulq_n_s16(dgd1[2], last_row_downsample_factor);
+ dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1);
+ dgd1[3] = vmulq_n_s16(dgd1[3], last_row_downsample_factor);
+ dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1);
+ dgd1[4] = vmulq_n_s16(dgd1[4], last_row_downsample_factor);
+
+ // Compute all elements from the combination of both columns (25
+ // elements).
+ compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+ wiener_win2);
+ }
+ }
+ j += 8;
+ }
+
+ // Process remaining columns using a mask to discard excess elements.
+ if (j < width) {
+ for (int col0 = 0; col0 < wiener_win; col0++) {
+ // Load first column.
+ int16x8_t dgd0[5];
+ dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0);
+ dgd0[0] = vandq_s16(dgd0[0], mask);
+ dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0);
+ dgd0[1] = vandq_s16(dgd0[1], mask);
+ dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0);
+ dgd0[2] = vandq_s16(dgd0[2], mask);
+ dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0);
+ dgd0[3] = vandq_s16(dgd0[3], mask);
+ dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0);
+ dgd0[4] = vandq_s16(dgd0[4], mask);
+
+ // Perform computation of the first column with itself (15 elements).
+ // For the first column this will fill the upper triangle of the 5x5
+ // matrix at the top left of the H matrix. For the next columns this
+ // will fill the upper triangle of the other 5x5 matrices around H's
+ // diagonal.
+ compute_H_one_col_last_row(dgd0, col0, H, wiener_win, wiener_win2,
+ last_row_downsample_factor);
+
+ // All computation next to the matrix diagonal has already been done.
+ for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+ // Load second column and scale based on downsampling factor.
+ int16x8_t dgd1[5];
+ dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1);
+ dgd1[0] = vmulq_n_s16(dgd1[0], last_row_downsample_factor);
+ dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1);
+ dgd1[1] = vmulq_n_s16(dgd1[1], last_row_downsample_factor);
+ dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1);
+ dgd1[2] = vmulq_n_s16(dgd1[2], last_row_downsample_factor);
+ dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1);
+ dgd1[3] = vmulq_n_s16(dgd1[3], last_row_downsample_factor);
+ dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1);
+ dgd1[4] = vmulq_n_s16(dgd1[4], last_row_downsample_factor);
+
+ // Compute all elements from the combination of both columns (25
+ // elements).
+ compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+ wiener_win2);
+ }
+ }
+ }
+ }
+
+ // Transpose M_trn.
+ transpose_M_win5(M, M_trn, 5);
+
+  // Copy the upper triangle of H into the lower one.
+ copy_upper_triangle(H, wiener_win2);
+}
+
+void av1_compute_stats_neon(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int16_t *dgd_avg,
+ int16_t *src_avg, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ int use_downsampled_wiener_stats) {
+ assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
+
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = wiener_win >> 1;
+ const int32_t width = h_end - h_start;
+ const int32_t height = v_end - v_start;
+ const uint8_t *dgd_start = &dgd[v_start * dgd_stride + h_start];
+ memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
+
+ uint8_t avg = find_average_neon(dgd_start, dgd_stride, width, height);
+ assert(WIENER_STATS_DOWNSAMPLE_FACTOR == 4);
+ int downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+
+ int dgd_avg_stride = width + 2 * wiener_halfwin;
+ int src_avg_stride = width;
+
+ // Compute (dgd - avg) and store it in dgd_avg.
+ // The wiener window will slide along the dgd frame, centered on each pixel.
+ // For the top left pixel and all the pixels on the side of the frame this
+ // means half of the window will be outside of the frame. As such the actual
+ // buffer that we need to subtract the avg from will be 2 * wiener_halfwin
+ // wider and 2 * wiener_halfwin higher than the original dgd buffer.
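+  // For wiener_win == 7, wiener_halfwin == 3, so dgd_avg covers
+  // (width + 6) x (height + 6) samples.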
+ const int vert_offset = v_start - wiener_halfwin;
+ const int horiz_offset = h_start - wiener_halfwin;
+ const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
+ compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride,
+ width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, 1);
+
+  // Compute (src - avg), downsample if necessary and store in src_avg.
+ const uint8_t *src_start = src + h_start + v_start * src_stride;
+ compute_sub_avg(src_start, src_stride * downsample_factor, avg, src_avg,
+ src_avg_stride, width, height, downsample_factor);
+
+ // Since the height is not necessarily a multiple of the downsample factor,
+ // the last line of src will be scaled according to how many rows remain.
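+  // For example, height % 4 == 3 means the last downsampled row stands in
+  // for the 3 remaining source rows; when height is a multiple of the factor
+  // this value is unused.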
+ int last_row_downsample_factor =
+ use_downsampled_wiener_stats ? height % downsample_factor : 1;
+
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_win7_neon(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride,
+ width, v_start, v_end, M, H, downsample_factor,
+ last_row_downsample_factor);
+ } else {
+ compute_stats_win5_neon(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride,
+ width, v_start, v_end, M, H, downsample_factor,
+ last_row_downsample_factor);
+ }
+}
+
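+// Accumulate the least-squares terms over the whole plane:
+//   H[i][j] = sum(f_i * f_j) / size and C[i] = sum(f_i * s) / size,
+// where f_i = flt_i - u, s = (src - dat) << SGRPROJ_RST_BITS and
+// u = dat << SGRPROJ_RST_BITS.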
+static INLINE void calc_proj_params_r0_r1_neon(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ assert(width % 8 == 0);
+ const int size = width * height;
+
+ int64x2_t h00_lo = vdupq_n_s64(0);
+ int64x2_t h00_hi = vdupq_n_s64(0);
+ int64x2_t h11_lo = vdupq_n_s64(0);
+ int64x2_t h11_hi = vdupq_n_s64(0);
+ int64x2_t h01_lo = vdupq_n_s64(0);
+ int64x2_t h01_hi = vdupq_n_s64(0);
+ int64x2_t c0_lo = vdupq_n_s64(0);
+ int64x2_t c0_hi = vdupq_n_s64(0);
+ int64x2_t c1_lo = vdupq_n_s64(0);
+ int64x2_t c1_hi = vdupq_n_s64(0);
+
+ do {
+ const uint8_t *src_ptr = src8;
+ const uint8_t *dat_ptr = dat8;
+ int32_t *flt0_ptr = flt0;
+ int32_t *flt1_ptr = flt1;
+ int w = width;
+
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t d = vld1_u8(dat_ptr);
+ int32x4_t f0_lo = vld1q_s32(flt0_ptr);
+ int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);
+ int32x4_t f1_lo = vld1q_s32(flt1_ptr);
+ int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
+
+ int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));
+ int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS));
+
+ int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u));
+ int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u));
+ f0_lo = vsubw_s16(f0_lo, vget_low_s16(u));
+ f0_hi = vsubw_s16(f0_hi, vget_high_s16(u));
+ f1_lo = vsubw_s16(f1_lo, vget_low_s16(u));
+ f1_hi = vsubw_s16(f1_hi, vget_high_s16(u));
+
+ h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
+ h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
+ h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
+ h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
+
+ h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
+ h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
+ h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
+ h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
+
+ h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo));
+ h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo));
+ h01_hi = vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi));
+ h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi));
+
+ c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
+ c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
+ c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
+ c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
+
+ c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
+ c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
+ c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
+ c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
+
+ src_ptr += 8;
+ dat_ptr += 8;
+ flt0_ptr += 8;
+ flt1_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src8 += src_stride;
+ dat8 += dat_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ } while (--height != 0);
+
+ H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
+ H[0][1] = horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size;
+ H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
+ H[1][0] = H[0][1];
+ C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
+ C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
+}
+
+static INLINE void calc_proj_params_r0_neon(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int32_t *flt0, int flt0_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ assert(width % 8 == 0);
+ const int size = width * height;
+
+ int64x2_t h00_lo = vdupq_n_s64(0);
+ int64x2_t h00_hi = vdupq_n_s64(0);
+ int64x2_t c0_lo = vdupq_n_s64(0);
+ int64x2_t c0_hi = vdupq_n_s64(0);
+
+ do {
+ const uint8_t *src_ptr = src8;
+ const uint8_t *dat_ptr = dat8;
+ int32_t *flt0_ptr = flt0;
+ int w = width;
+
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t d = vld1_u8(dat_ptr);
+ int32x4_t f0_lo = vld1q_s32(flt0_ptr);
+ int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);
+
+ int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));
+ int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS));
+
+ int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u));
+ int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u));
+ f0_lo = vsubw_s16(f0_lo, vget_low_s16(u));
+ f0_hi = vsubw_s16(f0_hi, vget_high_s16(u));
+
+ h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
+ h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
+ h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
+ h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
+
+ c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
+ c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
+ c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
+ c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
+
+ src_ptr += 8;
+ dat_ptr += 8;
+ flt0_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src8 += src_stride;
+ dat8 += dat_stride;
+ flt0 += flt0_stride;
+ } while (--height != 0);
+
+ H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
+ C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
+}
+
+static INLINE void calc_proj_params_r1_neon(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ assert(width % 8 == 0);
+ const int size = width * height;
+
+ int64x2_t h11_lo = vdupq_n_s64(0);
+ int64x2_t h11_hi = vdupq_n_s64(0);
+ int64x2_t c1_lo = vdupq_n_s64(0);
+ int64x2_t c1_hi = vdupq_n_s64(0);
+
+ do {
+ const uint8_t *src_ptr = src8;
+ const uint8_t *dat_ptr = dat8;
+ int32_t *flt1_ptr = flt1;
+ int w = width;
+
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t d = vld1_u8(dat_ptr);
+ int32x4_t f1_lo = vld1q_s32(flt1_ptr);
+ int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
+
+ int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));
+ int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS));
+
+ int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u));
+ int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u));
+ f1_lo = vsubw_s16(f1_lo, vget_low_s16(u));
+ f1_hi = vsubw_s16(f1_hi, vget_high_s16(u));
+
+ h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
+ h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
+ h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
+ h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
+
+ c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
+ c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
+ c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
+ c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
+
+ src_ptr += 8;
+ dat_ptr += 8;
+ flt1_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src8 += src_stride;
+ dat8 += dat_stride;
+ flt1 += flt1_stride;
+ } while (--height != 0);
+
+ H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
+ C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
+}
+
+// This function calls one of 3 subfunctions, depending on the following
+// cases:
+// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements
+// of C and H need to be computed.
+// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+// non-zero and need to be computed.
+// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+// non-zero and need to be computed.
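+//
+// The resulting H and C form the normal equations of the least-squares fit;
+// the caller solves H * xq = C for the sgr projection coefficients.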
+void av1_calc_proj_params_neon(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8,
+ int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2],
+ int64_t C[2], const sgr_params_type *params) {
+ if ((params->r[0] > 0) && (params->r[1] > 0)) {
+ calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C);
+ } else if (params->r[0] > 0) {
+ calc_proj_params_r0_neon(src8, width, height, src_stride, dat8, dat_stride,
+ flt0, flt0_stride, H, C);
+ } else if (params->r[1] > 0) {
+ calc_proj_params_r1_neon(src8, width, height, src_stride, dat8, dat_stride,
+ flt1, flt1_stride, H, C);
+ }
+}
diff --git a/av1/encoder/arm/neon/pickrst_neon.h b/av1/encoder/arm/neon/pickrst_neon.h
new file mode 100644
index 000000000..d9a9ad435
--- /dev/null
+++ b/av1/encoder/arm/neon/pickrst_neon.h
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_
+#define AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+
+// When we load 8 values of int16_t type but need fewer than 8 of them for
+// processing, the mask below is used to zero out the extra values.
+extern const int16_t av1_neon_mask_16bit[16];
+
+static INLINE void copy_upper_triangle(int64_t *H, const int wiener_win2) {
+ for (int i = 0; i < wiener_win2 - 2; i = i + 2) {
+ // Transpose the first 2x2 square. It needs a special case as the element
+ // of the bottom left is on the diagonal.
+ int64x2_t row0 = vld1q_s64(H + i * wiener_win2 + i + 1);
+ int64x2_t row1 = vld1q_s64(H + (i + 1) * wiener_win2 + i + 1);
+
+ int64x2_t tr_row = aom_vtrn2q_s64(row0, row1);
+
+ vst1_s64(H + (i + 1) * wiener_win2 + i, vget_low_s64(row0));
+ vst1q_s64(H + (i + 2) * wiener_win2 + i, tr_row);
+
+ // Transpose and store all the remaining 2x2 squares of the line.
+ for (int j = i + 3; j < wiener_win2; j = j + 2) {
+ row0 = vld1q_s64(H + i * wiener_win2 + j);
+ row1 = vld1q_s64(H + (i + 1) * wiener_win2 + j);
+
+ int64x2_t tr_row0 = aom_vtrn1q_s64(row0, row1);
+ int64x2_t tr_row1 = aom_vtrn2q_s64(row0, row1);
+
+ vst1q_s64(H + j * wiener_win2 + i, tr_row0);
+ vst1q_s64(H + (j + 1) * wiener_win2 + i, tr_row1);
+ }
+ }
+}
+
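+// Transpose the 5x5 matrix M_trn (stored row-major as int64_t) into M. Rows
+// are handled in pairs: each 2x2 sub-block is transposed with
+// aom_vtrn1q_s64 / aom_vtrn2q_s64, and the odd final row and column are
+// handled with 64-bit half-vector loads and stores.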
+static INLINE void transpose_M_win5(int64_t *M, int64_t *M_trn,
+ const int wiener_win) {
+ // 1st and 2nd rows.
+ int64x2_t row00 = vld1q_s64(M_trn);
+ int64x2_t row10 = vld1q_s64(M_trn + wiener_win);
+ vst1q_s64(M, aom_vtrn1q_s64(row00, row10));
+ vst1q_s64(M + wiener_win, aom_vtrn2q_s64(row00, row10));
+
+ int64x2_t row02 = vld1q_s64(M_trn + 2);
+ int64x2_t row12 = vld1q_s64(M_trn + wiener_win + 2);
+ vst1q_s64(M + 2 * wiener_win, aom_vtrn1q_s64(row02, row12));
+ vst1q_s64(M + 3 * wiener_win, aom_vtrn2q_s64(row02, row12));
+
+ // Last column only needs trn2.
+ int64x2_t row03 = vld1q_s64(M_trn + 3);
+ int64x2_t row13 = vld1q_s64(M_trn + wiener_win + 3);
+ vst1q_s64(M + 4 * wiener_win, aom_vtrn2q_s64(row03, row13));
+
+ // 3rd and 4th rows.
+ int64x2_t row20 = vld1q_s64(M_trn + 2 * wiener_win);
+ int64x2_t row30 = vld1q_s64(M_trn + 3 * wiener_win);
+ vst1q_s64(M + 2, aom_vtrn1q_s64(row20, row30));
+ vst1q_s64(M + wiener_win + 2, aom_vtrn2q_s64(row20, row30));
+
+ int64x2_t row22 = vld1q_s64(M_trn + 2 * wiener_win + 2);
+ int64x2_t row32 = vld1q_s64(M_trn + 3 * wiener_win + 2);
+ vst1q_s64(M + 2 * wiener_win + 2, aom_vtrn1q_s64(row22, row32));
+ vst1q_s64(M + 3 * wiener_win + 2, aom_vtrn2q_s64(row22, row32));
+
+ // Last column only needs trn2.
+ int64x2_t row23 = vld1q_s64(M_trn + 2 * wiener_win + 3);
+ int64x2_t row33 = vld1q_s64(M_trn + 3 * wiener_win + 3);
+ vst1q_s64(M + 4 * wiener_win + 2, aom_vtrn2q_s64(row23, row33));
+
+ // Last row.
+ int64x2_t row40 = vld1q_s64(M_trn + 4 * wiener_win);
+ vst1_s64(M + 4, vget_low_s64(row40));
+ vst1_s64(M + 1 * wiener_win + 4, vget_high_s64(row40));
+
+ int64x2_t row42 = vld1q_s64(M_trn + 4 * wiener_win + 2);
+ vst1_s64(M + 2 * wiener_win + 4, vget_low_s64(row42));
+ vst1_s64(M + 3 * wiener_win + 4, vget_high_s64(row42));
+
+ // Element on the bottom right of M_trn is copied as is.
+ vst1_s64(M + 4 * wiener_win + 4, vld1_s64(M_trn + 4 * wiener_win + 4));
+}
+
+static INLINE void transpose_M_win7(int64_t *M, int64_t *M_trn,
+ const int wiener_win) {
+ // 1st and 2nd rows.
+ int64x2_t row00 = vld1q_s64(M_trn);
+ int64x2_t row10 = vld1q_s64(M_trn + wiener_win);
+ vst1q_s64(M, aom_vtrn1q_s64(row00, row10));
+ vst1q_s64(M + wiener_win, aom_vtrn2q_s64(row00, row10));
+
+ int64x2_t row02 = vld1q_s64(M_trn + 2);
+ int64x2_t row12 = vld1q_s64(M_trn + wiener_win + 2);
+ vst1q_s64(M + 2 * wiener_win, aom_vtrn1q_s64(row02, row12));
+ vst1q_s64(M + 3 * wiener_win, aom_vtrn2q_s64(row02, row12));
+
+ int64x2_t row04 = vld1q_s64(M_trn + 4);
+ int64x2_t row14 = vld1q_s64(M_trn + wiener_win + 4);
+ vst1q_s64(M + 4 * wiener_win, aom_vtrn1q_s64(row04, row14));
+ vst1q_s64(M + 5 * wiener_win, aom_vtrn2q_s64(row04, row14));
+
+ // Last column only needs trn2.
+ int64x2_t row05 = vld1q_s64(M_trn + 5);
+ int64x2_t row15 = vld1q_s64(M_trn + wiener_win + 5);
+ vst1q_s64(M + 6 * wiener_win, aom_vtrn2q_s64(row05, row15));
+
+ // 3rd and 4th rows.
+ int64x2_t row20 = vld1q_s64(M_trn + 2 * wiener_win);
+ int64x2_t row30 = vld1q_s64(M_trn + 3 * wiener_win);
+ vst1q_s64(M + 2, aom_vtrn1q_s64(row20, row30));
+ vst1q_s64(M + wiener_win + 2, aom_vtrn2q_s64(row20, row30));
+
+ int64x2_t row22 = vld1q_s64(M_trn + 2 * wiener_win + 2);
+ int64x2_t row32 = vld1q_s64(M_trn + 3 * wiener_win + 2);
+ vst1q_s64(M + 2 * wiener_win + 2, aom_vtrn1q_s64(row22, row32));
+ vst1q_s64(M + 3 * wiener_win + 2, aom_vtrn2q_s64(row22, row32));
+
+ int64x2_t row24 = vld1q_s64(M_trn + 2 * wiener_win + 4);
+ int64x2_t row34 = vld1q_s64(M_trn + 3 * wiener_win + 4);
+ vst1q_s64(M + 4 * wiener_win + 2, aom_vtrn1q_s64(row24, row34));
+ vst1q_s64(M + 5 * wiener_win + 2, aom_vtrn2q_s64(row24, row34));
+
+ // Last column only needs trn2.
+ int64x2_t row25 = vld1q_s64(M_trn + 2 * wiener_win + 5);
+ int64x2_t row35 = vld1q_s64(M_trn + 3 * wiener_win + 5);
+ vst1q_s64(M + 6 * wiener_win + 2, aom_vtrn2q_s64(row25, row35));
+
+ // 5th and 6th rows.
+ int64x2_t row40 = vld1q_s64(M_trn + 4 * wiener_win);
+ int64x2_t row50 = vld1q_s64(M_trn + 5 * wiener_win);
+ vst1q_s64(M + 4, aom_vtrn1q_s64(row40, row50));
+ vst1q_s64(M + wiener_win + 4, aom_vtrn2q_s64(row40, row50));
+
+ int64x2_t row42 = vld1q_s64(M_trn + 4 * wiener_win + 2);
+ int64x2_t row52 = vld1q_s64(M_trn + 5 * wiener_win + 2);
+ vst1q_s64(M + 2 * wiener_win + 4, aom_vtrn1q_s64(row42, row52));
+ vst1q_s64(M + 3 * wiener_win + 4, aom_vtrn2q_s64(row42, row52));
+
+ int64x2_t row44 = vld1q_s64(M_trn + 4 * wiener_win + 4);
+ int64x2_t row54 = vld1q_s64(M_trn + 5 * wiener_win + 4);
+ vst1q_s64(M + 4 * wiener_win + 4, aom_vtrn1q_s64(row44, row54));
+ vst1q_s64(M + 5 * wiener_win + 4, aom_vtrn2q_s64(row44, row54));
+
+ // Last column only needs trn2.
+ int64x2_t row45 = vld1q_s64(M_trn + 4 * wiener_win + 5);
+ int64x2_t row55 = vld1q_s64(M_trn + 5 * wiener_win + 5);
+ vst1q_s64(M + 6 * wiener_win + 4, aom_vtrn2q_s64(row45, row55));
+
+ // Last row.
+ int64x2_t row60 = vld1q_s64(M_trn + 6 * wiener_win);
+ vst1_s64(M + 6, vget_low_s64(row60));
+ vst1_s64(M + 1 * wiener_win + 6, vget_high_s64(row60));
+
+ int64x2_t row62 = vld1q_s64(M_trn + 6 * wiener_win + 2);
+ vst1_s64(M + 2 * wiener_win + 6, vget_low_s64(row62));
+ vst1_s64(M + 3 * wiener_win + 6, vget_high_s64(row62));
+
+ int64x2_t row64 = vld1q_s64(M_trn + 6 * wiener_win + 4);
+ vst1_s64(M + 4 * wiener_win + 6, vget_low_s64(row64));
+ vst1_s64(M + 5 * wiener_win + 6, vget_high_s64(row64));
+
+ // Element on the bottom right of M_trn is copied as is.
+ vst1_s64(M + 6 * wiener_win + 6, vld1_s64(M_trn + 6 * wiener_win + 6));
+}
+
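+// Accumulate one row of M_trn from an 8-wide strip of src: vextq_s16 slides
+// the wiener window one tap at a time across the [dgd0, dgd1] pair, so tap k
+// multiplies src with dgd elements k..k+7. Pairs of taps are folded into
+// 64-bit accumulators with horizontal_add_2d_s32 and vpadalq_s32; the final
+// odd tap uses horizontal_long_add_s32x4.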
+static INLINE void compute_M_one_row_win5(int16x8_t src, int16x8_t dgd0,
+ int16x8_t dgd1, int64_t *M,
+ const int wiener_win, int row) {
+ int64x2_t m_01 = vld1q_s64(M + row * wiener_win + 0);
+ int64x2_t m_23 = vld1q_s64(M + row * wiener_win + 2);
+
+ int32x4_t m0 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd0));
+ m0 = vmlal_s16(m0, vget_high_s16(src), vget_high_s16(dgd0));
+
+ int16x8_t dgd01 = vextq_s16(dgd0, dgd1, 1);
+ int32x4_t m1 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd01));
+ m1 = vmlal_s16(m1, vget_high_s16(src), vget_high_s16(dgd01));
+
+ m0 = horizontal_add_2d_s32(m0, m1);
+ m_01 = vpadalq_s32(m_01, m0);
+ vst1q_s64(M + row * wiener_win + 0, m_01);
+
+ int16x8_t dgd02 = vextq_s16(dgd0, dgd1, 2);
+ int32x4_t m2 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd02));
+ m2 = vmlal_s16(m2, vget_high_s16(src), vget_high_s16(dgd02));
+
+ int16x8_t dgd03 = vextq_s16(dgd0, dgd1, 3);
+ int32x4_t m3 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd03));
+ m3 = vmlal_s16(m3, vget_high_s16(src), vget_high_s16(dgd03));
+
+ m2 = horizontal_add_2d_s32(m2, m3);
+ m_23 = vpadalq_s32(m_23, m2);
+ vst1q_s64(M + row * wiener_win + 2, m_23);
+
+ int16x8_t dgd04 = vextq_s16(dgd0, dgd1, 4);
+ int32x4_t m4 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd04));
+ m4 = vmlal_s16(m4, vget_high_s16(src), vget_high_s16(dgd04));
+ M[row * wiener_win + 4] += horizontal_long_add_s32x4(m4);
+}
+
+static INLINE void compute_M_one_row_win7(int16x8_t src, int16x8_t dgd0,
+ int16x8_t dgd1, int64_t *M,
+ const int wiener_win, int row) {
+ int64x2_t m_01 = vld1q_s64(M + row * wiener_win + 0);
+ int64x2_t m_23 = vld1q_s64(M + row * wiener_win + 2);
+ int64x2_t m_45 = vld1q_s64(M + row * wiener_win + 4);
+
+ int32x4_t m0 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd0));
+ m0 = vmlal_s16(m0, vget_high_s16(src), vget_high_s16(dgd0));
+
+ int16x8_t dgd01 = vextq_s16(dgd0, dgd1, 1);
+ int32x4_t m1 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd01));
+ m1 = vmlal_s16(m1, vget_high_s16(src), vget_high_s16(dgd01));
+
+ m0 = horizontal_add_2d_s32(m0, m1);
+ m_01 = vpadalq_s32(m_01, m0);
+ vst1q_s64(M + row * wiener_win + 0, m_01);
+
+ int16x8_t dgd02 = vextq_s16(dgd0, dgd1, 2);
+ int32x4_t m2 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd02));
+ m2 = vmlal_s16(m2, vget_high_s16(src), vget_high_s16(dgd02));
+
+ int16x8_t dgd03 = vextq_s16(dgd0, dgd1, 3);
+ int32x4_t m3 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd03));
+ m3 = vmlal_s16(m3, vget_high_s16(src), vget_high_s16(dgd03));
+
+ m2 = horizontal_add_2d_s32(m2, m3);
+ m_23 = vpadalq_s32(m_23, m2);
+ vst1q_s64(M + row * wiener_win + 2, m_23);
+
+ int16x8_t dgd04 = vextq_s16(dgd0, dgd1, 4);
+ int32x4_t m4 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd04));
+ m4 = vmlal_s16(m4, vget_high_s16(src), vget_high_s16(dgd04));
+
+ int16x8_t dgd05 = vextq_s16(dgd0, dgd1, 5);
+ int32x4_t m5 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd05));
+ m5 = vmlal_s16(m5, vget_high_s16(src), vget_high_s16(dgd05));
+
+ m4 = horizontal_add_2d_s32(m4, m5);
+ m_45 = vpadalq_s32(m_45, m4);
+ vst1q_s64(M + row * wiener_win + 4, m_45);
+
+ int16x8_t dgd06 = vextq_s16(dgd0, dgd1, 6);
+ int32x4_t m6 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd06));
+ m6 = vmlal_s16(m6, vget_high_s16(src), vget_high_s16(dgd06));
+ M[row * wiener_win + 6] += horizontal_long_add_s32x4(m6);
+}
+
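+// Compute all wiener_win * wiener_win cross products between two distinct
+// columns of the wiener window and accumulate them into H.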
+static INLINE void compute_H_two_cols(int16x8_t *dgd0, int16x8_t *dgd1,
+ int col0, int col1, int64_t *H,
+ const int wiener_win,
+ const int wiener_win2) {
+ for (int row0 = 0; row0 < wiener_win; row0++) {
+ for (int row1 = 0; row1 < wiener_win; row1++) {
+ int auto_cov_idx =
+ (col0 * wiener_win + row0) * wiener_win2 + (col1 * wiener_win) + row1;
+
+ int32x4_t auto_cov =
+ vmull_s16(vget_low_s16(dgd0[row0]), vget_low_s16(dgd1[row1]));
+ auto_cov = vmlal_s16(auto_cov, vget_high_s16(dgd0[row0]),
+ vget_high_s16(dgd1[row1]));
+
+ H[auto_cov_idx] += horizontal_long_add_s32x4(auto_cov);
+ }
+ }
+}
+
+#endif // AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_
diff --git a/av1/encoder/arm/neon/picksrt_neon.c b/av1/encoder/arm/neon/picksrt_neon.c
deleted file mode 100644
index 1346d6b97..000000000
--- a/av1/encoder/arm/neon/picksrt_neon.c
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-#include <math.h>
-
-#include "aom/aom_integer.h"
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
-#include "av1/common/restoration.h"
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-int64_t av1_lowbd_pixel_proj_error_neon(
- const uint8_t *src8, int width, int height, int src_stride,
- const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
- int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
- int i, j, k;
- const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
- const int32x4_t zero = vdupq_n_s32(0);
- uint64x2_t sum64 = vreinterpretq_u64_s32(zero);
- const uint8_t *src = src8;
- const uint8_t *dat = dat8;
-
- int64_t err = 0;
- if (params->r[0] > 0 && params->r[1] > 0) {
- for (i = 0; i < height; ++i) {
- int32x4_t err0 = zero;
- for (j = 0; j <= width - 8; j += 8) {
- const uint8x8_t d0 = vld1_u8(&dat[j]);
- const uint8x8_t s0 = vld1_u8(&src[j]);
- const int16x8_t flt0_16b =
- vcombine_s16(vqmovn_s32(vld1q_s32(&flt0[j])),
- vqmovn_s32(vld1q_s32(&flt0[j + 4])));
- const int16x8_t flt1_16b =
- vcombine_s16(vqmovn_s32(vld1q_s32(&flt1[j])),
- vqmovn_s32(vld1q_s32(&flt1[j + 4])));
- const int16x8_t u0 =
- vreinterpretq_s16_u16(vshll_n_u8(d0, SGRPROJ_RST_BITS));
- const int16x8_t flt0_0_sub_u = vsubq_s16(flt0_16b, u0);
- const int16x8_t flt1_0_sub_u = vsubq_s16(flt1_16b, u0);
- const int16x4_t flt0_16b_sub_u_lo = vget_low_s16(flt0_0_sub_u);
- const int16x4_t flt0_16b_sub_u_hi = vget_high_s16(flt0_0_sub_u);
- const int16x4_t flt1_16b_sub_u_lo = vget_low_s16(flt1_0_sub_u);
- const int16x4_t flt1_16b_sub_u_hi = vget_high_s16(flt1_0_sub_u);
-
- int32x4_t v0 = vmull_n_s16(flt0_16b_sub_u_lo, (int16_t)xq[0]);
- v0 = vmlal_n_s16(v0, flt1_16b_sub_u_lo, (int16_t)xq[1]);
- int32x4_t v1 = vmull_n_s16(flt0_16b_sub_u_hi, (int16_t)xq[0]);
- v1 = vmlal_n_s16(v1, flt1_16b_sub_u_hi, (int16_t)xq[1]);
- const int16x4_t vr0 = vqrshrn_n_s32(v0, 11);
- const int16x4_t vr1 = vqrshrn_n_s32(v1, 11);
- const int16x8_t e0 = vaddq_s16(vcombine_s16(vr0, vr1),
- vreinterpretq_s16_u16(vsubl_u8(d0, s0)));
- const int16x4_t e0_lo = vget_low_s16(e0);
- const int16x4_t e0_hi = vget_high_s16(e0);
- err0 = vmlal_s16(err0, e0_lo, e0_lo);
- err0 = vmlal_s16(err0, e0_hi, e0_hi);
- }
- for (k = j; k < width; ++k) {
- const int32_t u = dat[k] << SGRPROJ_RST_BITS;
- int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
- const int32_t e = ROUND_POWER_OF_TWO(v, 11) + dat[k] - src[k];
- err += e * e;
- }
- dat += dat_stride;
- src += src_stride;
- flt0 += flt0_stride;
- flt1 += flt1_stride;
- sum64 = vpadalq_u32(sum64, vreinterpretq_u32_s32(err0));
- }
-
- } else if (params->r[0] > 0 || params->r[1] > 0) {
- const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
- const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
- const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
- for (i = 0; i < height; ++i) {
- int32x4_t err0 = zero;
- for (j = 0; j <= width - 8; j += 8) {
- const uint8x8_t d0 = vld1_u8(&dat[j]);
- const uint8x8_t s0 = vld1_u8(&src[j]);
- const uint16x8_t d0s0 = vsubl_u8(d0, s0);
- const uint16x8x2_t d0w =
- vzipq_u16(vmovl_u8(d0), vreinterpretq_u16_s32(zero));
-
- const int32x4_t flt_16b_lo = vld1q_s32(&flt[j]);
- const int32x4_t flt_16b_hi = vld1q_s32(&flt[j + 4]);
-
- int32x4_t v0 = vmulq_n_s32(flt_16b_lo, xq_active);
- v0 = vmlsq_n_s32(v0, vreinterpretq_s32_u16(d0w.val[0]),
- xq_active * (1 << SGRPROJ_RST_BITS));
- int32x4_t v1 = vmulq_n_s32(flt_16b_hi, xq_active);
- v1 = vmlsq_n_s32(v1, vreinterpretq_s32_u16(d0w.val[1]),
- xq_active * (1 << SGRPROJ_RST_BITS));
- const int16x4_t vr0 = vqrshrn_n_s32(v0, 11);
- const int16x4_t vr1 = vqrshrn_n_s32(v1, 11);
- const int16x8_t e0 =
- vaddq_s16(vcombine_s16(vr0, vr1), vreinterpretq_s16_u16(d0s0));
- const int16x4_t e0_lo = vget_low_s16(e0);
- const int16x4_t e0_hi = vget_high_s16(e0);
- err0 = vmlal_s16(err0, e0_lo, e0_lo);
- err0 = vmlal_s16(err0, e0_hi, e0_hi);
- }
- for (k = j; k < width; ++k) {
- const int32_t u = dat[k] << SGRPROJ_RST_BITS;
- int32_t v = xq_active * (flt[k] - u);
- const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
- err += e * e;
- }
- dat += dat_stride;
- src += src_stride;
- flt += flt_stride;
- sum64 = vpadalq_u32(sum64, vreinterpretq_u32_s32(err0));
- }
- } else {
- uint32x4_t err0 = vreinterpretq_u32_s32(zero);
- for (i = 0; i < height; ++i) {
- for (j = 0; j <= width - 16; j += 16) {
- const uint8x16_t d = vld1q_u8(&dat[j]);
- const uint8x16_t s = vld1q_u8(&src[j]);
- const uint8x16_t diff = vabdq_u8(d, s);
- const uint8x8_t diff0 = vget_low_u8(diff);
- const uint8x8_t diff1 = vget_high_u8(diff);
- err0 = vpadalq_u16(err0, vmull_u8(diff0, diff0));
- err0 = vpadalq_u16(err0, vmull_u8(diff1, diff1));
- }
- for (k = j; k < width; ++k) {
- const int32_t e = dat[k] - src[k];
- err += e * e;
- }
- dat += dat_stride;
- src += src_stride;
- }
- sum64 = vpaddlq_u32(err0);
- }
-#if AOM_ARCH_AARCH64
- err += vaddvq_u64(sum64);
-#else
- err += vget_lane_u64(vadd_u64(vget_low_u64(sum64), vget_high_u64(sum64)), 0);
-#endif // AOM_ARCH_AARCH64
- return err;
-}
diff --git a/av1/encoder/arm/neon/reconinter_enc_neon.c b/av1/encoder/arm/neon/reconinter_enc_neon.c
index e5975b0c6..03afa303c 100644
--- a/av1/encoder/arm/neon/reconinter_enc_neon.c
+++ b/av1/encoder/arm/neon/reconinter_enc_neon.c
@@ -94,13 +94,13 @@ void aom_upsampled_pred_neon(MACROBLOCKD *xd, const AV1_COMMON *const cm,
} else if (!subpel_y_q3) {
const int16_t *const filter_x =
av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q3 << 1);
- aom_convolve8_horiz_neon(ref, ref_stride, comp_pred, width, filter_x, 16,
- NULL, -1, width, height);
+ aom_convolve8_horiz(ref, ref_stride, comp_pred, width, filter_x, 16, NULL,
+ -1, width, height);
} else if (!subpel_x_q3) {
const int16_t *const filter_y =
av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q3 << 1);
- aom_convolve8_vert_neon(ref, ref_stride, comp_pred, width, NULL, -1,
- filter_y, 16, width, height);
+ aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, filter_y,
+ 16, width, height);
} else {
DECLARE_ALIGNED(16, uint8_t,
im_block[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
@@ -117,11 +117,10 @@ void aom_upsampled_pred_neon(MACROBLOCKD *xd, const AV1_COMMON *const cm,
const int im_vert_offset = im_stride * ((filter_params->taps >> 1) - 1);
assert(im_height <= (MAX_SB_SIZE * 2 + 16) + 16);
- aom_convolve8_horiz_neon(ref - ref_vert_offset, ref_stride, im_block,
- MAX_SB_SIZE, filter_x, 16, NULL, -1, width,
- im_height);
- aom_convolve8_vert_neon(im_block + im_vert_offset, MAX_SB_SIZE, comp_pred,
- width, NULL, -1, filter_y, 16, width, height);
+ aom_convolve8_horiz(ref - ref_vert_offset, ref_stride, im_block,
+ MAX_SB_SIZE, filter_x, 16, NULL, -1, width, im_height);
+ aom_convolve8_vert(im_block + im_vert_offset, MAX_SB_SIZE, comp_pred, width,
+ NULL, -1, filter_y, 16, width, height);
}
}
@@ -138,3 +137,153 @@ void aom_comp_avg_upsampled_pred_neon(MACROBLOCKD *xd,
aom_comp_avg_pred_neon(comp_pred, pred, width, height, comp_pred, width);
}
+
+void aom_dist_wtd_comp_avg_upsampled_pred_neon(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
+ aom_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
+
+ aom_dist_wtd_comp_avg_pred_neon(comp_pred, pred, width, height, comp_pred,
+ width, jcp_param);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_upsampled_pred_neon(MACROBLOCKD *xd,
+ const struct AV1Common *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred8, int width, int height,
+ int subpel_x_q3, int subpel_y_q3,
+ const uint8_t *ref8, int ref_stride, int bd,
+ int subpel_search) {
+  // Expect xd == NULL only in tests.
+ if (xd != NULL) {
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int ref_num = 0;
+ const int is_intrabc = is_intrabc_block(mi);
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (is_scaled) {
+ int plane = 0;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *const pre_buf =
+ is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+ InterPredParams inter_pred_params;
+ inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+ const int_interpfilters filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ av1_init_inter_params(
+ &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+ xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+ av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
+ &inter_pred_params);
+ return;
+ }
+ }
+
+ const InterpFilterParams *filter = av1_get_filter(subpel_search);
+
+ if (!subpel_x_q3 && !subpel_y_q3) {
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ if (width > 4) {
+ assert(width % 8 == 0);
+ int i = height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t r = vld1q_u16(ref + j);
+ vst1q_u16(comp_pred + j, r);
+ j += 8;
+ } while (j < width);
+ ref += ref_stride;
+ comp_pred += width;
+ } while (--i != 0);
+ } else if (width == 4) {
+ int i = height;
+ do {
+ uint16x4_t r = vld1_u16(ref);
+ vst1_u16(comp_pred, r);
+ ref += ref_stride;
+ comp_pred += width;
+ } while (--i != 0);
+ } else {
+ assert(width == 2);
+ int i = height / 2;
+ do {
+ uint16x4_t r = load_u16_2x2(ref, ref_stride);
+ store_u16_2x1(comp_pred + 0 * width, r, 0);
+ store_u16_2x1(comp_pred + 1 * width, r, 1);
+ ref += 2 * ref_stride;
+ comp_pred += 2 * width;
+ } while (--i != 0);
+ }
+ } else if (!subpel_y_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ aom_highbd_convolve8_horiz_neon(ref8, ref_stride, comp_pred8, width, kernel,
+ 16, NULL, -1, width, height, bd);
+ } else if (!subpel_x_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ aom_highbd_convolve8_vert_neon(ref8, ref_stride, comp_pred8, width, NULL,
+ -1, kernel, 16, width, height, bd);
+ } else {
+ DECLARE_ALIGNED(16, uint16_t,
+ temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+ const int16_t *const kernel_x =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ const int16_t *const kernel_y =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ const int intermediate_height =
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+ assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+ aom_highbd_convolve8_horiz_neon(
+ ref8 - ref_stride * ((filter->taps >> 1) - 1), ref_stride,
+ CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+ intermediate_height, bd);
+ aom_highbd_convolve8_vert_neon(
+ CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
+ MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
+ bd);
+ }
+}
+
+void aom_highbd_comp_avg_upsampled_pred_neon(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, int subpel_search) {
+ aom_highbd_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8,
+ ref_stride, bd, subpel_search);
+
+ aom_highbd_comp_avg_pred_neon(comp_pred8, pred8, width, height, comp_pred8,
+ width);
+}
+
+void aom_highbd_dist_wtd_comp_avg_upsampled_pred_neon(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
+ int subpel_search) {
+ aom_highbd_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8,
+ ref_stride, bd, subpel_search);
+
+ aom_highbd_dist_wtd_comp_avg_pred_neon(comp_pred8, pred8, width, height,
+ comp_pred8, width, jcp_param);
+}
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/av1/encoder/arm/neon/shift_neon.h b/av1/encoder/arm/neon/shift_neon.h
new file mode 100644
index 000000000..d73aef2f2
--- /dev/null
+++ b/av1/encoder/arm/neon/shift_neon.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_
+#define AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h" // For AOM_INLINE.
+
+#define SHIFT_LOOP_HELPER(name, type, intrinsic, arg) \
+ static AOM_INLINE void name(const type *in, type *out, int size) { \
+ int i = 0; \
+ do { \
+ out[i] = intrinsic(in[i], arg); \
+ } while (++i < size); \
+ }
+
+SHIFT_LOOP_HELPER(shift_left_2_s16_x4, int16x4_t, vshl_n_s16, 2)
+SHIFT_LOOP_HELPER(shift_left_2_s16_x8, int16x8_t, vshlq_n_s16, 2)
+SHIFT_LOOP_HELPER(shift_left_2_s32_x4, int32x4_t, vshlq_n_s32, 2)
+SHIFT_LOOP_HELPER(shift_right_2_round_s16_x8, int16x8_t, vrshrq_n_s16, 2)
+SHIFT_LOOP_HELPER(shift_right_2_round_s32_x4, int32x4_t, vrshrq_n_s32, 2)
+SHIFT_LOOP_HELPER(shift_right_4_round_s16_x8, int16x8_t, vrshrq_n_s16, 4)
+SHIFT_LOOP_HELPER(shift_right_4_round_s32_x4, int32x4_t, vrshrq_n_s32, 4)
+
+// Addition instructions have slightly better performance compared to shift
+// instructions on some micro-architectures, so use these for shifts by one.
+
+SHIFT_LOOP_HELPER(shift_left_1_s16_x4, int16x4_t, vadd_s16, in[i])
+SHIFT_LOOP_HELPER(shift_left_1_s16_x8, int16x8_t, vaddq_s16, in[i])
+SHIFT_LOOP_HELPER(shift_right_1_round_s16_x4, int16x4_t, vrhadd_s16,
+ vdup_n_s16(0))
+SHIFT_LOOP_HELPER(shift_right_1_round_s16_x8, int16x8_t, vrhaddq_s16,
+ vdupq_n_s16(0))
+SHIFT_LOOP_HELPER(shift_right_1_round_s32_x4, int32x4_t, vrhaddq_s32,
+ vdupq_n_s32(0))
+
+#undef SHIFT_LOOP_HELPER
+
+#endif // AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_
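
The one-bit helpers above lean on two identities: x + x == x << 1, and a rounding halving add against zero equals a rounding shift right by one, i.e. vrhadd(x, 0) == (x + 1) >> 1. A minimal host-side sketch checking both (illustrative only, not part of the library):

```c
#include <arm_neon.h>
#include <assert.h>

// Check the identities the one-bit shift helpers rely on:
//   x + x        == x << 1
//   vrhadd(x, 0) == (x + 1) >> 1  (rounding shift right by one)
static void check_shift_identities(void) {
  const int16x8_t x = vdupq_n_s16(25);

  int16x8_t lhs = vaddq_s16(x, x);       // shift left by one via addition
  int16x8_t rhs = vshlq_n_s16(x, 1);
  assert(vgetq_lane_s16(lhs, 0) == vgetq_lane_s16(rhs, 0));  // both 50

  lhs = vrhaddq_s16(x, vdupq_n_s16(0));  // rounding halving add against zero
  rhs = vrshrq_n_s16(x, 1);              // rounding shift right by one
  assert(vgetq_lane_s16(lhs, 0) == vgetq_lane_s16(rhs, 0));  // both 13
  (void)lhs;
  (void)rhs;
}
```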
diff --git a/av1/encoder/arm/neon/temporal_filter_neon.c b/av1/encoder/arm/neon/temporal_filter_neon.c
index 163768b7c..986f14386 100644
--- a/av1/encoder/arm/neon/temporal_filter_neon.c
+++ b/av1/encoder/arm/neon/temporal_filter_neon.c
@@ -22,179 +22,6 @@
// For the squared error buffer, add padding for 4 samples.
#define SSE_STRIDE (BW + 4)
-#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-// clang-format off
-
-DECLARE_ALIGNED(16, static const uint8_t, kSlidingWindowMask[]) = {
- 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00,
- 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00,
- 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00,
- 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
-};
-
-// clang-format on
-
-static INLINE void get_abs_diff(const uint8_t *frame1, const uint32_t stride1,
- const uint8_t *frame2, const uint32_t stride2,
- const uint32_t block_width,
- const uint32_t block_height,
- uint8_t *frame_abs_diff,
- const unsigned int dst_stride) {
- uint8_t *dst = frame_abs_diff;
-
- uint32_t i = 0;
- do {
- uint32_t j = 0;
- do {
- uint8x16_t s = vld1q_u8(frame1 + i * stride1 + j);
- uint8x16_t r = vld1q_u8(frame2 + i * stride2 + j);
- uint8x16_t abs_diff = vabdq_u8(s, r);
- vst1q_u8(dst + j + 2, abs_diff);
- j += 16;
- } while (j < block_width);
-
- dst += dst_stride;
- i++;
- } while (i < block_height);
-}
-
-static INLINE uint8x16_t load_and_pad(const uint8_t *src, const uint32_t col,
- const uint32_t block_width) {
- uint8x8_t s = vld1_u8(src);
-
- if (col == 0) {
- const uint8_t lane2 = vget_lane_u8(s, 2);
- s = vset_lane_u8(lane2, s, 0);
- s = vset_lane_u8(lane2, s, 1);
- } else if (col >= block_width - 4) {
- const uint8_t lane5 = vget_lane_u8(s, 5);
- s = vset_lane_u8(lane5, s, 6);
- s = vset_lane_u8(lane5, s, 7);
- }
- return vcombine_u8(s, s);
-}
-
-static void apply_temporal_filter(
- const uint8_t *frame, const unsigned int stride, const uint32_t block_width,
- const uint32_t block_height, const int *subblock_mses,
- unsigned int *accumulator, uint16_t *count, const uint8_t *frame_abs_diff,
- const uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
- const double decay_factor, const double inv_factor,
- const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl) {
- assert(((block_width == 16) || (block_width == 32)) &&
- ((block_height == 16) || (block_height == 32)));
-
- uint32_t acc_5x5_neon[BH][BW];
- const uint8x16x2_t vmask = vld1q_u8_x2(kSlidingWindowMask);
-
- // Traverse 4 columns at a time - first and last two columns need padding.
- for (uint32_t col = 0; col < block_width; col += 4) {
- uint8x16_t vsrc[5][2];
- const uint8_t *src = frame_abs_diff + col;
-
- // Load, pad (for first and last two columns) and mask 3 rows from the top.
- for (int i = 2; i < 5; i++) {
- const uint8x16_t s = load_and_pad(src, col, block_width);
- vsrc[i][0] = vandq_u8(s, vmask.val[0]);
- vsrc[i][1] = vandq_u8(s, vmask.val[1]);
- src += SSE_STRIDE;
- }
-
- // Pad the top 2 rows.
- vsrc[0][0] = vsrc[2][0];
- vsrc[0][1] = vsrc[2][1];
- vsrc[1][0] = vsrc[2][0];
- vsrc[1][1] = vsrc[2][1];
-
- for (unsigned int row = 0; row < block_height; row++) {
- uint32x4_t sum_01 = vdupq_n_u32(0);
- uint32x4_t sum_23 = vdupq_n_u32(0);
-
- sum_01 = vdotq_u32(sum_01, vsrc[0][0], vsrc[0][0]);
- sum_01 = vdotq_u32(sum_01, vsrc[1][0], vsrc[1][0]);
- sum_01 = vdotq_u32(sum_01, vsrc[2][0], vsrc[2][0]);
- sum_01 = vdotq_u32(sum_01, vsrc[3][0], vsrc[3][0]);
- sum_01 = vdotq_u32(sum_01, vsrc[4][0], vsrc[4][0]);
-
- sum_23 = vdotq_u32(sum_23, vsrc[0][1], vsrc[0][1]);
- sum_23 = vdotq_u32(sum_23, vsrc[1][1], vsrc[1][1]);
- sum_23 = vdotq_u32(sum_23, vsrc[2][1], vsrc[2][1]);
- sum_23 = vdotq_u32(sum_23, vsrc[3][1], vsrc[3][1]);
- sum_23 = vdotq_u32(sum_23, vsrc[4][1], vsrc[4][1]);
-
- vst1q_u32(&acc_5x5_neon[row][col], vpaddq_u32(sum_01, sum_23));
-
- // Push all rows in the sliding window up one.
- for (int i = 0; i < 4; i++) {
- vsrc[i][0] = vsrc[i + 1][0];
- vsrc[i][1] = vsrc[i + 1][1];
- }
-
- if (row <= block_height - 4) {
- // Load next row into the bottom of the sliding window.
- uint8x16_t s = load_and_pad(src, col, block_width);
- vsrc[4][0] = vandq_u8(s, vmask.val[0]);
- vsrc[4][1] = vandq_u8(s, vmask.val[1]);
- src += SSE_STRIDE;
- } else {
- // Pad the bottom 2 rows.
- vsrc[4][0] = vsrc[3][0];
- vsrc[4][1] = vsrc[3][1];
- }
- }
- }
-
- // Perform filtering.
- if (tf_wgt_calc_lvl == 0) {
- for (unsigned int i = 0, k = 0; i < block_height; i++) {
- for (unsigned int j = 0; j < block_width; j++, k++) {
- const int pixel_value = frame[i * stride + j];
- const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];
-
- const double window_error = diff_sse * inv_num_ref_pixels;
- const int subblock_idx =
- (i >= block_height / 2) * 2 + (j >= block_width / 2);
- const double block_error = (double)subblock_mses[subblock_idx];
- const double combined_error =
- weight_factor * window_error + block_error * inv_factor;
- // Compute filter weight.
- double scaled_error =
- combined_error * d_factor[subblock_idx] * decay_factor;
- scaled_error = AOMMIN(scaled_error, 7);
- const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
- accumulator[k] += weight * pixel_value;
- count[k] += weight;
- }
- }
- } else {
- for (unsigned int i = 0, k = 0; i < block_height; i++) {
- for (unsigned int j = 0; j < block_width; j++, k++) {
- const int pixel_value = frame[i * stride + j];
- const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];
-
- const double window_error = diff_sse * inv_num_ref_pixels;
- const int subblock_idx =
- (i >= block_height / 2) * 2 + (j >= block_width / 2);
- const double block_error = (double)subblock_mses[subblock_idx];
- const double combined_error =
- weight_factor * window_error + block_error * inv_factor;
- // Compute filter weight.
- double scaled_error =
- combined_error * d_factor[subblock_idx] * decay_factor;
- scaled_error = AOMMIN(scaled_error, 7);
- const float fweight =
- approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
- const int weight = iroundpf(fweight);
- accumulator[k] += weight * pixel_value;
- count[k] += weight;
- }
- }
- }
-}
-
-#else // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
-
// When using vld1q_u16_x4, compilers may insert an alignment hint of 256 bits.
DECLARE_ALIGNED(32, static const uint16_t, kSlidingWindowMask[]) = {
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000,
@@ -230,8 +57,7 @@ static INLINE void get_squared_error(
} while (j < block_width);
dst += dst_stride;
- i++;
- } while (i < block_height);
+ } while (++i < block_height);
}
static INLINE uint16x8_t load_and_pad(const uint16_t *src, const uint32_t col,
@@ -351,8 +177,6 @@ static void apply_temporal_filter(
}
}
-#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
void av1_apply_temporal_filter_neon(
const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
@@ -393,11 +217,7 @@ void av1_apply_temporal_filter_neon(
double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
s_decay = CLIP(s_decay, 1e-5, 1);
double d_factor[4] = { 0 };
-#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
- uint8_t frame_abs_diff[SSE_STRIDE * BH] = { 0 };
-#else // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
-#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
uint32_t luma_sse_sum[BW * BH] = { 0 };
for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
@@ -436,32 +256,6 @@ void av1_apply_temporal_filter_neon(
// search is only done on Y-plane, so the information from Y-plane
// will be more accurate. The luma sse sum is reused in both chroma
// planes.
-#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
- if (plane == AOM_PLANE_U) {
- for (unsigned int i = 0; i < plane_h; i++) {
- for (unsigned int j = 0; j < plane_w; j++) {
- for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
- for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
- const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
- const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
- luma_sse_sum[i * BW + j] +=
- (frame_abs_diff[yy * SSE_STRIDE + xx + 2] *
- frame_abs_diff[yy * SSE_STRIDE + xx + 2]);
- }
- }
- }
- }
- }
-
- get_abs_diff(ref, frame_stride, pred + plane_offset, plane_w, plane_w,
- plane_h, frame_abs_diff, SSE_STRIDE);
-
- apply_temporal_filter(pred + plane_offset, plane_w, plane_w, plane_h,
- subblock_mses, accum + plane_offset,
- count + plane_offset, frame_abs_diff, luma_sse_sum,
- inv_num_ref_pixels, decay_factor, inv_factor,
- weight_factor, d_factor, tf_wgt_calc_lvl);
-#else // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
if (plane == AOM_PLANE_U) {
for (unsigned int i = 0; i < plane_h; i++) {
for (unsigned int j = 0; j < plane_w; j++) {
@@ -484,8 +278,271 @@ void av1_apply_temporal_filter_neon(
count + plane_offset, frame_sse, luma_sse_sum,
inv_num_ref_pixels, decay_factor, inv_factor,
weight_factor, d_factor, tf_wgt_calc_lvl);
-#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
plane_offset += plane_h * plane_w;
}
}
+
+double av1_estimate_noise_from_single_plane_neon(const uint8_t *src, int height,
+ int width, int stride,
+ int edge_thresh) {
+ uint16x8_t thresh = vdupq_n_u16(edge_thresh);
+ uint32x4_t acc = vdupq_n_u32(0);
+  // The count is logically positive, since it tallies how many pixels fall
+  // under the threshold, but it is accumulated negatively to make best use
+  // of the vclt instruction, which sets every bit of a lane to 1 when the
+  // condition is true.
+ int32x4_t count = vdupq_n_s32(0);
+ int final_count = 0;
+ int64_t final_acc = 0;
+ const uint8_t *src_start = src + stride + 1;
+ int h = 1;
+
+ do {
+ int w = 1;
+ const uint8_t *src_ptr = src_start;
+
+ while (w <= (width - 1) - 16) {
+ uint8x16_t mat[3][3];
+ mat[0][0] = vld1q_u8(src_ptr - stride - 1);
+ mat[0][1] = vld1q_u8(src_ptr - stride);
+ mat[0][2] = vld1q_u8(src_ptr - stride + 1);
+ mat[1][0] = vld1q_u8(src_ptr - 1);
+ mat[1][1] = vld1q_u8(src_ptr);
+ mat[1][2] = vld1q_u8(src_ptr + 1);
+ mat[2][0] = vld1q_u8(src_ptr + stride - 1);
+ mat[2][1] = vld1q_u8(src_ptr + stride);
+ mat[2][2] = vld1q_u8(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ uint16x8_t gxa_lo =
+ vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[2][0]));
+ uint16x8_t gxa_hi =
+ vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[2][0]));
+ uint16x8_t gxb_lo =
+ vaddl_u8(vget_low_u8(mat[0][2]), vget_low_u8(mat[2][2]));
+ uint16x8_t gxb_hi =
+ vaddl_u8(vget_high_u8(mat[0][2]), vget_high_u8(mat[2][2]));
+ gxa_lo = vaddq_u16(
+ gxa_lo, vaddl_u8(vget_low_u8(mat[1][0]), vget_low_u8(mat[1][0])));
+ gxa_hi = vaddq_u16(
+ gxa_hi, vaddl_u8(vget_high_u8(mat[1][0]), vget_high_u8(mat[1][0])));
+ gxb_lo = vaddq_u16(
+ gxb_lo, vaddl_u8(vget_low_u8(mat[1][2]), vget_low_u8(mat[1][2])));
+ gxb_hi = vaddq_u16(
+ gxb_hi, vaddl_u8(vget_high_u8(mat[1][2]), vget_high_u8(mat[1][2])));
+
+ uint16x8_t gya_lo =
+ vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[0][2]));
+ uint16x8_t gya_hi =
+ vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[0][2]));
+ uint16x8_t gyb_lo =
+ vaddl_u8(vget_low_u8(mat[2][0]), vget_low_u8(mat[2][2]));
+ uint16x8_t gyb_hi =
+ vaddl_u8(vget_high_u8(mat[2][0]), vget_high_u8(mat[2][2]));
+ gya_lo = vaddq_u16(
+ gya_lo, vaddl_u8(vget_low_u8(mat[0][1]), vget_low_u8(mat[0][1])));
+ gya_hi = vaddq_u16(
+ gya_hi, vaddl_u8(vget_high_u8(mat[0][1]), vget_high_u8(mat[0][1])));
+ gyb_lo = vaddq_u16(
+ gyb_lo, vaddl_u8(vget_low_u8(mat[2][1]), vget_low_u8(mat[2][1])));
+ gyb_hi = vaddq_u16(
+ gyb_hi, vaddl_u8(vget_high_u8(mat[2][1]), vget_high_u8(mat[2][1])));
+
+ uint16x8_t ga_lo = vabaq_u16(vabdq_u16(gxa_lo, gxb_lo), gya_lo, gyb_lo);
+ uint16x8_t ga_hi = vabaq_u16(vabdq_u16(gxa_hi, gxb_hi), gya_hi, gyb_hi);
+
+ // Check which vector elements are under the threshold. The Laplacian is
+ // then unconditionally computed and we accumulate zeros if we're not
+ // under the threshold. This is much faster than using an if statement.
+ uint16x8_t thresh_u16_lo = vcltq_u16(ga_lo, thresh);
+ uint16x8_t thresh_u16_hi = vcltq_u16(ga_hi, thresh);
+
+ uint16x8_t center_lo = vshll_n_u8(vget_low_u8(mat[1][1]), 2);
+ uint16x8_t center_hi = vshll_n_u8(vget_high_u8(mat[1][1]), 2);
+
+ uint16x8_t adj0_lo =
+ vaddl_u8(vget_low_u8(mat[0][1]), vget_low_u8(mat[2][1]));
+ uint16x8_t adj0_hi =
+ vaddl_u8(vget_high_u8(mat[0][1]), vget_high_u8(mat[2][1]));
+ uint16x8_t adj1_lo =
+ vaddl_u8(vget_low_u8(mat[1][0]), vget_low_u8(mat[1][2]));
+ uint16x8_t adj1_hi =
+ vaddl_u8(vget_high_u8(mat[1][0]), vget_high_u8(mat[1][2]));
+ uint16x8_t adj_lo = vaddq_u16(adj0_lo, adj1_lo);
+ adj_lo = vaddq_u16(adj_lo, adj_lo);
+ uint16x8_t adj_hi = vaddq_u16(adj0_hi, adj1_hi);
+ adj_hi = vaddq_u16(adj_hi, adj_hi);
+
+ uint16x8_t diag0_lo =
+ vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[0][2]));
+ uint16x8_t diag0_hi =
+ vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[0][2]));
+ uint16x8_t diag1_lo =
+ vaddl_u8(vget_low_u8(mat[2][0]), vget_low_u8(mat[2][2]));
+ uint16x8_t diag1_hi =
+ vaddl_u8(vget_high_u8(mat[2][0]), vget_high_u8(mat[2][2]));
+ uint16x8_t diag_lo = vaddq_u16(diag0_lo, diag1_lo);
+ uint16x8_t diag_hi = vaddq_u16(diag0_hi, diag1_hi);
+
+ uint16x8_t v_lo = vaddq_u16(center_lo, diag_lo);
+ v_lo = vabdq_u16(v_lo, adj_lo);
+ uint16x8_t v_hi = vaddq_u16(center_hi, diag_hi);
+ v_hi = vabdq_u16(v_hi, adj_hi);
+
+ acc = vpadalq_u16(acc, vandq_u16(v_lo, thresh_u16_lo));
+ acc = vpadalq_u16(acc, vandq_u16(v_hi, thresh_u16_hi));
+
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16_lo));
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16_hi));
+
+ w += 16;
+ src_ptr += 16;
+ }
+
+ if (w <= (width - 1) - 8) {
+ uint8x8_t mat[3][3];
+ mat[0][0] = vld1_u8(src_ptr - stride - 1);
+ mat[0][1] = vld1_u8(src_ptr - stride);
+ mat[0][2] = vld1_u8(src_ptr - stride + 1);
+ mat[1][0] = vld1_u8(src_ptr - 1);
+ mat[1][1] = vld1_u8(src_ptr);
+ mat[1][2] = vld1_u8(src_ptr + 1);
+ mat[2][0] = vld1_u8(src_ptr + stride - 1);
+ mat[2][1] = vld1_u8(src_ptr + stride);
+ mat[2][2] = vld1_u8(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ uint16x8_t gxa = vaddl_u8(mat[0][0], mat[2][0]);
+ uint16x8_t gxb = vaddl_u8(mat[0][2], mat[2][2]);
+ gxa = vaddq_u16(gxa, vaddl_u8(mat[1][0], mat[1][0]));
+ gxb = vaddq_u16(gxb, vaddl_u8(mat[1][2], mat[1][2]));
+
+ uint16x8_t gya = vaddl_u8(mat[0][0], mat[0][2]);
+ uint16x8_t gyb = vaddl_u8(mat[2][0], mat[2][2]);
+ gya = vaddq_u16(gya, vaddl_u8(mat[0][1], mat[0][1]));
+ gyb = vaddq_u16(gyb, vaddl_u8(mat[2][1], mat[2][1]));
+
+ uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb);
+
+ // Check which vector elements are under the threshold. The Laplacian is
+ // then unconditionally computed and we accumulate zeros if we're not
+ // under the threshold. This is much faster than using an if statement.
+ uint16x8_t thresh_u16 = vcltq_u16(ga, thresh);
+
+ uint16x8_t center = vshll_n_u8(mat[1][1], 2);
+
+ uint16x8_t adj0 = vaddl_u8(mat[0][1], mat[2][1]);
+ uint16x8_t adj1 = vaddl_u8(mat[1][0], mat[1][2]);
+ uint16x8_t adj = vaddq_u16(adj0, adj1);
+ adj = vaddq_u16(adj, adj);
+
+ uint16x8_t diag0 = vaddl_u8(mat[0][0], mat[0][2]);
+ uint16x8_t diag1 = vaddl_u8(mat[2][0], mat[2][2]);
+ uint16x8_t diag = vaddq_u16(diag0, diag1);
+
+ uint16x8_t v = vaddq_u16(center, diag);
+ v = vabdq_u16(v, adj);
+
+ acc = vpadalq_u16(acc, vandq_u16(v, thresh_u16));
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16));
+
+ w += 8;
+ src_ptr += 8;
+ }
+
+ if (w <= (width - 1) - 4) {
+ uint16x8_t mask = vcombine_u16(vdup_n_u16(65535), vdup_n_u16(0));
+ uint8x8_t mat[3][3];
+ mat[0][0] = load_u8_4x1(src_ptr - stride - 1);
+ mat[0][1] = load_u8_4x1(src_ptr - stride);
+ mat[0][2] = load_u8_4x1(src_ptr - stride + 1);
+ mat[1][0] = load_u8_4x1(src_ptr - 1);
+ mat[1][1] = load_u8_4x1(src_ptr);
+ mat[1][2] = load_u8_4x1(src_ptr + 1);
+ mat[2][0] = load_u8_4x1(src_ptr + stride - 1);
+ mat[2][1] = load_u8_4x1(src_ptr + stride);
+ mat[2][2] = load_u8_4x1(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ uint16x8_t gxa = vaddl_u8(mat[0][0], mat[2][0]);
+ uint16x8_t gxb = vaddl_u8(mat[0][2], mat[2][2]);
+ gxa = vaddq_u16(gxa, vaddl_u8(mat[1][0], mat[1][0]));
+ gxb = vaddq_u16(gxb, vaddl_u8(mat[1][2], mat[1][2]));
+
+ uint16x8_t gya = vaddl_u8(mat[0][0], mat[0][2]);
+ uint16x8_t gyb = vaddl_u8(mat[2][0], mat[2][2]);
+ gya = vaddq_u16(gya, vaddl_u8(mat[0][1], mat[0][1]));
+ gyb = vaddq_u16(gyb, vaddl_u8(mat[2][1], mat[2][1]));
+
+ uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb);
+
+ // Check which vector elements are under the threshold. The Laplacian is
+ // then unconditionally computed and we accumulate zeros if we're not
+ // under the threshold. This is much faster than using an if statement.
+ uint16x8_t thresh_u16 = vandq_u16(vcltq_u16(ga, thresh), mask);
+
+ uint16x8_t center = vshll_n_u8(mat[1][1], 2);
+
+ uint16x8_t adj0 = vaddl_u8(mat[0][1], mat[2][1]);
+ uint16x8_t adj1 = vaddl_u8(mat[1][0], mat[1][2]);
+ uint16x8_t adj = vaddq_u16(adj0, adj1);
+ adj = vaddq_u16(adj, adj);
+
+ uint16x8_t diag0 = vaddl_u8(mat[0][0], mat[0][2]);
+ uint16x8_t diag1 = vaddl_u8(mat[2][0], mat[2][2]);
+ uint16x8_t diag = vaddq_u16(diag0, diag1);
+
+ uint16x8_t v = vaddq_u16(center, diag);
+ v = vabdq_u16(v, adj);
+
+ acc = vpadalq_u16(acc, vandq_u16(v, thresh_u16));
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16));
+
+ w += 4;
+ src_ptr += 4;
+ }
+
+ while (w < width - 1) {
+ int mat[3][3];
+ mat[0][0] = *(src_ptr - stride - 1);
+ mat[0][1] = *(src_ptr - stride);
+ mat[0][2] = *(src_ptr - stride + 1);
+ mat[1][0] = *(src_ptr - 1);
+ mat[1][1] = *(src_ptr);
+ mat[1][2] = *(src_ptr + 1);
+ mat[2][0] = *(src_ptr + stride - 1);
+ mat[2][1] = *(src_ptr + stride);
+ mat[2][2] = *(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ const int gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+ 2 * (mat[1][0] - mat[1][2]);
+ const int gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+ 2 * (mat[0][1] - mat[2][1]);
+ const int ga = abs(gx) + abs(gy);
+
+ // Accumulate Laplacian.
+ const int is_under = ga < edge_thresh;
+ const int v = 4 * mat[1][1] -
+ 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+ (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+ final_acc += abs(v) * is_under;
+ final_count += is_under;
+
+ src_ptr++;
+ w++;
+ }
+ src_start += stride;
+ } while (++h < height - 1);
+
+ // We counted negatively, so subtract to get the final value.
+ final_count -= horizontal_add_s32x4(count);
+ final_acc += horizontal_long_add_u32x4(acc);
+ return (final_count < 16)
+ ? -1.0
+ : (double)final_acc / (6 * final_count) * SQRT_PI_BY_2;
+}
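
The negative-counting idiom used in av1_estimate_noise_from_single_plane_neon() can be isolated as follows; count_below_threshold is a hypothetical helper, and n is assumed to be a multiple of 8:

```c
#include <arm_neon.h>
#include <stdint.h>

// Branchless counting: a true vclt lane is 0xFFFF, i.e. -1 when
// reinterpreted as signed, so pairwise-accumulating the comparison mask
// counts matches negatively. Assumes n is a multiple of 8.
static int count_below_threshold(const uint16_t *vals, int n,
                                 uint16_t thresh) {
  const uint16x8_t vthresh = vdupq_n_u16(thresh);
  int32x4_t count = vdupq_n_s32(0);
  for (int i = 0; i < n; i += 8) {
    const uint16x8_t v = vld1q_u16(vals + i);
    const uint16x8_t mask = vcltq_u16(v, vthresh);  // all-ones where v < thresh
    count = vpadalq_s16(count, vreinterpretq_s16_u16(mask));
  }
  // Negate the horizontal sum to recover the positive count.
  int32_t lanes[4];
  vst1q_s32(lanes, count);
  return -(lanes[0] + lanes[1] + lanes[2] + lanes[3]);
}
```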
diff --git a/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c b/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c
new file mode 100644
index 000000000..5a52e701a
--- /dev/null
+++ b/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c
@@ -0,0 +1,299 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// For the squared error buffer, add padding for 4 samples.
+#define SSE_STRIDE (BW + 4)
+
+// clang-format off
+
+DECLARE_ALIGNED(16, static const uint8_t, kSlidingWindowMask[]) = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00,
+ 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00,
+ 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00,
+ 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+// clang-format on
+
+static INLINE void get_abs_diff(const uint8_t *frame1, const uint32_t stride1,
+ const uint8_t *frame2, const uint32_t stride2,
+ const uint32_t block_width,
+ const uint32_t block_height,
+ uint8_t *frame_abs_diff,
+ const unsigned int dst_stride) {
+ uint8_t *dst = frame_abs_diff;
+
+ uint32_t i = 0;
+ do {
+ uint32_t j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(frame1 + i * stride1 + j);
+ uint8x16_t r = vld1q_u8(frame2 + i * stride2 + j);
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ vst1q_u8(dst + j + 2, abs_diff);
+ j += 16;
+ } while (j < block_width);
+
+ dst += dst_stride;
+ } while (++i < block_height);
+}
+
+static INLINE uint8x16_t load_and_pad(const uint8_t *src, const uint32_t col,
+ const uint32_t block_width) {
+ uint8x8_t s = vld1_u8(src);
+
+ if (col == 0) {
+ const uint8_t lane2 = vget_lane_u8(s, 2);
+ s = vset_lane_u8(lane2, s, 0);
+ s = vset_lane_u8(lane2, s, 1);
+ } else if (col >= block_width - 4) {
+ const uint8_t lane5 = vget_lane_u8(s, 5);
+ s = vset_lane_u8(lane5, s, 6);
+ s = vset_lane_u8(lane5, s, 7);
+ }
+ return vcombine_u8(s, s);
+}
+
+static void apply_temporal_filter(
+ const uint8_t *frame, const unsigned int stride, const uint32_t block_width,
+ const uint32_t block_height, const int *subblock_mses,
+ unsigned int *accumulator, uint16_t *count, const uint8_t *frame_abs_diff,
+ const uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
+ const double decay_factor, const double inv_factor,
+ const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_neon[BH][BW];
+ const uint8x16x2_t vmask = vld1q_u8_x2(kSlidingWindowMask);
+
+ // Traverse 4 columns at a time - first and last two columns need padding.
+ for (uint32_t col = 0; col < block_width; col += 4) {
+ uint8x16_t vsrc[5][2];
+ const uint8_t *src = frame_abs_diff + col;
+
+ // Load, pad (for first and last two columns) and mask 3 rows from the top.
+ for (int i = 2; i < 5; i++) {
+ const uint8x16_t s = load_and_pad(src, col, block_width);
+ vsrc[i][0] = vandq_u8(s, vmask.val[0]);
+ vsrc[i][1] = vandq_u8(s, vmask.val[1]);
+ src += SSE_STRIDE;
+ }
+
+ // Pad the top 2 rows.
+ vsrc[0][0] = vsrc[2][0];
+ vsrc[0][1] = vsrc[2][1];
+ vsrc[1][0] = vsrc[2][0];
+ vsrc[1][1] = vsrc[2][1];
+
+ for (unsigned int row = 0; row < block_height; row++) {
+ uint32x4_t sum_01 = vdupq_n_u32(0);
+ uint32x4_t sum_23 = vdupq_n_u32(0);
+
+ sum_01 = vdotq_u32(sum_01, vsrc[0][0], vsrc[0][0]);
+ sum_01 = vdotq_u32(sum_01, vsrc[1][0], vsrc[1][0]);
+ sum_01 = vdotq_u32(sum_01, vsrc[2][0], vsrc[2][0]);
+ sum_01 = vdotq_u32(sum_01, vsrc[3][0], vsrc[3][0]);
+ sum_01 = vdotq_u32(sum_01, vsrc[4][0], vsrc[4][0]);
+
+ sum_23 = vdotq_u32(sum_23, vsrc[0][1], vsrc[0][1]);
+ sum_23 = vdotq_u32(sum_23, vsrc[1][1], vsrc[1][1]);
+ sum_23 = vdotq_u32(sum_23, vsrc[2][1], vsrc[2][1]);
+ sum_23 = vdotq_u32(sum_23, vsrc[3][1], vsrc[3][1]);
+ sum_23 = vdotq_u32(sum_23, vsrc[4][1], vsrc[4][1]);
+
+ vst1q_u32(&acc_5x5_neon[row][col], vpaddq_u32(sum_01, sum_23));
+
+ // Push all rows in the sliding window up one.
+ for (int i = 0; i < 4; i++) {
+ vsrc[i][0] = vsrc[i + 1][0];
+ vsrc[i][1] = vsrc[i + 1][1];
+ }
+
+ if (row <= block_height - 4) {
+ // Load next row into the bottom of the sliding window.
+ uint8x16_t s = load_and_pad(src, col, block_width);
+ vsrc[4][0] = vandq_u8(s, vmask.val[0]);
+ vsrc[4][1] = vandq_u8(s, vmask.val[1]);
+ src += SSE_STRIDE;
+ } else {
+ // Pad the bottom 2 rows.
+ vsrc[4][0] = vsrc[3][0];
+ vsrc[4][1] = vsrc[3][1];
+ }
+ }
+ }
+
+ // Perform filtering.
+ if (tf_wgt_calc_lvl == 0) {
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ } else {
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ }
+}
+
+void av1_apply_temporal_filter_neon_dotprod(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+ assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!");
+ assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
+ assert(!is_high_bitdepth && "Only support low bit-depth with Neon!");
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+ (void)is_high_bitdepth;
+
+ // Block information.
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ // Frame information.
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ double d_factor[4] = { 0 };
+ uint8_t frame_abs_diff[SSE_STRIDE * BH] = { 0 };
+ uint32_t luma_sse_sum[BW * BH] = { 0 };
+
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+ }
+
+ // Handle planes in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const uint32_t frame_stride =
+ frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+ const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
+ const int ss_x_shift =
+ mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+ const int ss_y_shift =
+ mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ // Decay factors for non-local mean approach.
+ const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+ // Filter U-plane and V-plane using Y-plane. This is because motion
+ // search is only done on Y-plane, so the information from Y-plane
+ // will be more accurate. The luma sse sum is reused in both chroma
+ // planes.
+ if (plane == AOM_PLANE_U) {
+ for (unsigned int i = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] +=
+ (frame_abs_diff[yy * SSE_STRIDE + xx + 2] *
+ frame_abs_diff[yy * SSE_STRIDE + xx + 2]);
+ }
+ }
+ }
+ }
+ }
+
+ get_abs_diff(ref, frame_stride, pred + plane_offset, plane_w, plane_w,
+ plane_h, frame_abs_diff, SSE_STRIDE);
+
+ apply_temporal_filter(pred + plane_offset, plane_w, plane_w, plane_h,
+ subblock_mses, accum + plane_offset,
+ count + plane_offset, frame_abs_diff, luma_sse_sum,
+ inv_num_ref_pixels, decay_factor, inv_factor,
+ weight_factor, d_factor, tf_wgt_calc_lvl);
+
+ plane_offset += plane_h * plane_w;
+ }
+}
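
The masked vdotq_u32 loop above computes, for every pixel, the sum of squared absolute differences over a 5x5 window, four output columns per iteration (each 128-bit vector holds the same padded 8 samples twice, masked to two different 5-sample windows). A scalar sketch of the quantity accumulated into acc_5x5_neon, assuming the two-sample borders have already been replicated into the padded buffer as load_and_pad() does:

```c
#include <stdint.h>

// Illustrative scalar reference for the 5x5 windowed sum of squared
// differences. abs_diff must point at the first real pixel of a buffer
// whose two rows/columns of border have been replicated.
static void acc_5x5_c(const uint8_t *abs_diff, int stride, int width,
                      int height, uint32_t *acc) {
  for (int r = 0; r < height; r++) {
    for (int c = 0; c < width; c++) {
      uint32_t sum = 0;
      for (int i = -2; i <= 2; i++) {
        for (int j = -2; j <= 2; j++) {
          const uint32_t d = abs_diff[(r + i) * stride + (c + j)];
          sum += d * d;
        }
      }
      acc[r * width + c] = sum;
    }
  }
}
```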
diff --git a/av1/encoder/arm/neon/txfm_neon.h b/av1/encoder/arm/neon/txfm_neon.h
new file mode 100644
index 000000000..635364f46
--- /dev/null
+++ b/av1/encoder/arm/neon/txfm_neon.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_
+#define AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_
+
+#include "aom/aom_integer.h" // For AOM_INLINE.
+
+static AOM_INLINE void ud_adjust_input_and_stride(int ud_flip,
+ const int16_t **input,
+ int *stride, int out_size) {
+ if (ud_flip) {
+ *input = *input + (out_size - 1) * *stride;
+ *stride = -*stride;
+ }
+}
+
+#endif // AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_
diff --git a/av1/encoder/arm/neon/wedge_utils_neon.c b/av1/encoder/arm/neon/wedge_utils_neon.c
index 54d8d1911..1b35269b3 100644
--- a/av1/encoder/arm/neon/wedge_utils_neon.c
+++ b/av1/encoder/arm/neon/wedge_utils_neon.c
@@ -75,3 +75,57 @@ uint64_t av1_wedge_sse_from_residuals_neon(const int16_t *r1, const int16_t *d,
uint64_t csse = horizontal_add_u64x2(vaddq_u64(v_csse[0], v_csse[1]));
return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
}
+
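+// Returns 1 if the mask-weighted sum of ds exceeds the limit, 0 otherwise;
+// the per-lane products are widened so the final comparison happens in 64
+// bits.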
+int8_t av1_wedge_sign_from_residuals_neon(const int16_t *ds, const uint8_t *m,
+ int N, int64_t limit) {
+ int32x4_t acc[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };
+
+ do {
+ int16x8_t ds_l = vld1q_s16(ds);
+ int16x8_t ds_h = vld1q_s16(ds + 8);
+
+ int8x16_t m_s8 = vreinterpretq_s8_u8(vld1q_u8(m));
+ int16x8_t m_l = vmovl_s8(vget_low_s8(m_s8));
+ int16x8_t m_h = vmovl_s8(vget_high_s8(m_s8));
+
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(ds_l), vget_low_s16(m_l));
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(ds_l), vget_high_s16(m_l));
+ acc[2] = vmlal_s16(acc[2], vget_low_s16(ds_h), vget_low_s16(m_h));
+ acc[3] = vmlal_s16(acc[3], vget_high_s16(ds_h), vget_high_s16(m_h));
+
+ ds += 16;
+ m += 16;
+ N -= 16;
+ } while (N != 0);
+
+ int64x2_t sum = vpaddlq_s32(acc[0]);
+ sum = vpadalq_s32(sum, acc[1]);
+ sum = vpadalq_s32(sum, acc[2]);
+ sum = vpadalq_s32(sum, acc[3]);
+
+ return (horizontal_add_s64x2(sum) > limit);
+}
+
+void av1_wedge_compute_delta_squares_neon(int16_t *d_ptr, const int16_t *a_ptr,
+ const int16_t *b_ptr, int N) {
+ do {
+ int16x8_t a = vld1q_s16(a_ptr);
+ int16x8_t b = vld1q_s16(b_ptr);
+
+ int32x4_t sq_lo = vmull_s16(vget_low_s16(a), vget_low_s16(a));
+ int32x4_t sq_hi = vmull_s16(vget_high_s16(a), vget_high_s16(a));
+
+ sq_lo = vmlsl_s16(sq_lo, vget_low_s16(b), vget_low_s16(b));
+ sq_hi = vmlsl_s16(sq_hi, vget_high_s16(b), vget_high_s16(b));
+
+ int16x8_t res = vcombine_s16(vqmovn_s32(sq_lo), vqmovn_s32(sq_hi));
+
+ vst1q_s16(d_ptr, res);
+
+ d_ptr += 8;
+ a_ptr += 8;
+ b_ptr += 8;
+ N -= 8;
+ } while (N != 0);
+}
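
For reference, a scalar sketch of av1_wedge_compute_delta_squares_neon() above; the clamp mirrors the saturating vqmovn_s32 narrowing:

```c
#include <stdint.h>

// Scalar sketch: d[i] = a[i]^2 - b[i]^2, computed in 32 bits and then
// saturated to the int16_t range.
static void wedge_delta_squares_c(int16_t *d, const int16_t *a,
                                  const int16_t *b, int n) {
  for (int i = 0; i < n; i++) {
    int32_t diff = (int32_t)a[i] * a[i] - (int32_t)b[i] * b[i];
    if (diff > INT16_MAX) diff = INT16_MAX;
    if (diff < INT16_MIN) diff = INT16_MIN;
    d[i] = (int16_t)diff;
  }
}
```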
diff --git a/av1/encoder/av1_noise_estimate.c b/av1/encoder/av1_noise_estimate.c
index 4419085be..25007bb6d 100644
--- a/av1/encoder/av1_noise_estimate.c
+++ b/av1/encoder/av1_noise_estimate.c
@@ -34,18 +34,19 @@ static INLINE int noise_est_svc(const struct AV1_COMP *const cpi) {
#endif
void av1_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) {
+ const int64_t area = (int64_t)width * height;
ne->enabled = 0;
- ne->level = (width * height < 1280 * 720) ? kLowLow : kLow;
+ ne->level = (area < 1280 * 720) ? kLowLow : kLow;
ne->value = 0;
ne->count = 0;
ne->thresh = 90;
ne->last_w = 0;
ne->last_h = 0;
- if (width * height >= 1920 * 1080) {
+ if (area >= 1920 * 1080) {
ne->thresh = 200;
- } else if (width * height >= 1280 * 720) {
+ } else if (area >= 1280 * 720) {
ne->thresh = 140;
- } else if (width * height >= 640 * 360) {
+ } else if (area >= 640 * 360) {
ne->thresh = 115;
}
ne->num_frames_estimate = 15;
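
The (int64_t) casts above move the multiply itself into 64 bits; without them, width * height is evaluated in int and can overflow before the implicit widening. A minimal sketch with hypothetical dimensions:

```c
#include <stdint.h>

static int64_t area_example(void) {
  // Hypothetical dimensions chosen so a 32-bit product would overflow.
  const int32_t w = 46341, h = 46341;
  const int64_t wrong = w * h;           // multiply happens in 32 bits first
                                         // (formally undefined behavior)
  const int64_t right = (int64_t)w * h;  // multiply happens in 64 bits:
                                         // 2147488281 > INT32_MAX
  (void)wrong;
  return right;
}
```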
@@ -171,7 +172,7 @@ void av1_update_noise_estimate(AV1_COMP *const cpi) {
unsigned int max_bin = 0;
unsigned int max_bin_count = 0;
unsigned int bin_cnt;
- int bsize = BLOCK_16X16;
+ BLOCK_SIZE bsize = BLOCK_16X16;
// Loop over sub-sample of 16x16 blocks of frame, and for blocks that have
// been encoded as zero/small mv at least x consecutive frames, compute
// the variance to update estimate of noise in the source.
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 39aa02701..a9e7978fd 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -43,6 +43,7 @@
#include "av1/encoder/ethread.h"
#include "av1/encoder/mcomp.h"
#include "av1/encoder/palette.h"
+#include "av1/encoder/pickrst.h"
#include "av1/encoder/segmentation.h"
#include "av1/encoder/tokenize.h"
@@ -64,7 +65,7 @@ static INLINE void write_uniform(aom_writer *w, int n, int v) {
#if !CONFIG_REALTIME_ONLY
static AOM_INLINE void loop_restoration_write_sb_coeffs(
- const AV1_COMMON *const cm, MACROBLOCKD *xd, const RestorationUnitInfo *rui,
+ const AV1_COMMON *const cm, MACROBLOCKD *xd, int runit_idx,
aom_writer *const w, int plane, FRAME_COUNTS *counts);
#endif
@@ -1027,9 +1028,10 @@ static AOM_INLINE void write_intra_prediction_modes(const AV1_COMMON *cm,
write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w);
if (uv_mode == UV_CFL_PRED)
write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w);
- if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) {
+ const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode);
+ if (use_angle_delta && av1_is_directional_mode(intra_mode)) {
write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV],
- ec_ctx->angle_delta_cdf[uv_mode - V_PRED]);
+ ec_ctx->angle_delta_cdf[intra_mode - V_PRED]);
}
}
@@ -1621,15 +1623,18 @@ static AOM_INLINE void write_modes_sb(
const int num_planes = av1_num_planes(cm);
for (int plane = 0; plane < num_planes; ++plane) {
int rcol0, rcol1, rrow0, rrow1;
+
+ // Skip some unnecessary work if loop restoration is disabled
+ if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+
if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
&rcol0, &rcol1, &rrow0, &rrow1)) {
- const int rstride = cm->rst_info[plane].horz_units_per_tile;
+ const int rstride = cm->rst_info[plane].horz_units;
for (int rrow = rrow0; rrow < rrow1; ++rrow) {
for (int rcol = rcol0; rcol < rcol1; ++rcol) {
const int runit_idx = rcol + rrow * rstride;
- const RestorationUnitInfo *rui =
- &cm->rst_info[plane].unit_info[runit_idx];
- loop_restoration_write_sb_coeffs(cm, xd, rui, w, plane, td->counts);
+ loop_restoration_write_sb_coeffs(cm, xd, runit_idx, w, plane,
+ td->counts);
}
}
}
@@ -1913,8 +1918,9 @@ static AOM_INLINE void write_sgrproj_filter(const SgrprojInfo *sgrproj_info,
}
static AOM_INLINE void loop_restoration_write_sb_coeffs(
- const AV1_COMMON *const cm, MACROBLOCKD *xd, const RestorationUnitInfo *rui,
+ const AV1_COMMON *const cm, MACROBLOCKD *xd, int runit_idx,
aom_writer *const w, int plane, FRAME_COUNTS *counts) {
+ const RestorationUnitInfo *rui = &cm->rst_info[plane].unit_info[runit_idx];
const RestorationInfo *rsi = cm->rst_info + plane;
RestorationType frame_rtype = rsi->frame_restoration_type;
assert(frame_rtype != RESTORE_NONE);
@@ -1935,9 +1941,21 @@ static AOM_INLINE void loop_restoration_write_sb_coeffs(
#endif
switch (unit_rtype) {
case RESTORE_WIENER:
+#if DEBUG_LR_COSTING
+ assert(!memcmp(
+ ref_wiener_info,
+ &lr_ref_params[RESTORE_SWITCHABLE][plane][runit_idx].wiener_info,
+ sizeof(*ref_wiener_info)));
+#endif
write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w);
break;
case RESTORE_SGRPROJ:
+#if DEBUG_LR_COSTING
+ assert(!memcmp(&ref_sgrproj_info->xqd,
+ &lr_ref_params[RESTORE_SWITCHABLE][plane][runit_idx]
+ .sgrproj_info.xqd,
+ sizeof(ref_sgrproj_info->xqd)));
+#endif
write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w);
break;
default: assert(unit_rtype == RESTORE_NONE); break;
@@ -1949,6 +1967,12 @@ static AOM_INLINE void loop_restoration_write_sb_coeffs(
++counts->wiener_restore[unit_rtype != RESTORE_NONE];
#endif
if (unit_rtype != RESTORE_NONE) {
+#if DEBUG_LR_COSTING
+ assert(
+ !memcmp(ref_wiener_info,
+ &lr_ref_params[RESTORE_WIENER][plane][runit_idx].wiener_info,
+ sizeof(*ref_wiener_info)));
+#endif
write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w);
}
} else if (frame_rtype == RESTORE_SGRPROJ) {
@@ -1958,6 +1982,12 @@ static AOM_INLINE void loop_restoration_write_sb_coeffs(
++counts->sgrproj_restore[unit_rtype != RESTORE_NONE];
#endif
if (unit_rtype != RESTORE_NONE) {
+#if DEBUG_LR_COSTING
+ assert(!memcmp(
+ &ref_sgrproj_info->xqd,
+ &lr_ref_params[RESTORE_SGRPROJ][plane][runit_idx].sgrproj_info.xqd,
+ sizeof(ref_sgrproj_info->xqd)));
+#endif
write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w);
}
}
@@ -3335,7 +3365,7 @@ uint32_t av1_write_obu_header(AV1LevelParams *const level_params,
aom_wb_write_literal(&wb, 0, 1); // forbidden bit.
aom_wb_write_literal(&wb, (int)obu_type, 4);
aom_wb_write_literal(&wb, obu_extension ? 1 : 0, 1);
- aom_wb_write_literal(&wb, 1, 1); // obu_has_payload_length_field
+ aom_wb_write_literal(&wb, 1, 1); // obu_has_size_field
aom_wb_write_literal(&wb, 0, 1); // reserved
if (obu_extension) {
diff --git a/av1/encoder/bitstream.h b/av1/encoder/bitstream.h
index 5999f9e3c..12e8a630d 100644
--- a/av1/encoder/bitstream.h
+++ b/av1/encoder/bitstream.h
@@ -74,6 +74,9 @@ typedef struct {
// Index of next job to be processed.
int next_job_idx;
+  // Initialized to false; set to true by a worker thread that encounters an
+  // error, so that the remaining worker threads abort their processing.
+ bool pack_bs_mt_exit;
} AV1EncPackBSSync;
/*!\endcond */
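
The pack_bs_mt_exit flag documented above is a cooperative-abort pattern: each worker checks the flag before claiming the next job, so one failing thread winds down all of them. A minimal sketch with hypothetical names (the real code synchronizes through libaom's own job-dispatch mutex):

```c
#include <pthread.h>
#include <stdbool.h>

// Hypothetical job-dispatch state shared by the worker threads.
typedef struct {
  pthread_mutex_t mutex;
  int next_job_idx;
  int num_jobs;
  bool exit_flag;  // set by a worker that hits an error
} JobSync;

// Claim the next job, or return -1 if all work is done or aborted.
static int get_next_job(JobSync *s) {
  int job = -1;
  pthread_mutex_lock(&s->mutex);
  if (!s->exit_flag && s->next_job_idx < s->num_jobs) job = s->next_job_idx++;
  pthread_mutex_unlock(&s->mutex);
  return job;
}

// Called by a worker on error; the other workers stop at their next poll.
static void signal_error(JobSync *s) {
  pthread_mutex_lock(&s->mutex);
  s->exit_flag = true;
  pthread_mutex_unlock(&s->mutex);
}
```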
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 360b9d4f3..33d2d8c2a 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -1322,6 +1322,12 @@ typedef struct macroblock {
uint8_t color_sensitivity_sb_alt[MAX_MB_PLANE - 1];
//! Color sensitivity flag for the coding block.
uint8_t color_sensitivity[MAX_MB_PLANE - 1];
+ //! Coding block distortion value for uv/color, minimum over the inter modes.
+ int64_t min_dist_inter_uv;
+
+ //! The buffer used by search_tx_type() to swap dqcoeff in macroblockd_plane
+ // so we can keep dqcoeff of the best tx_type.
+ tran_low_t *dqcoeff_buf;
/**@}*/
/*****************************************************************************
@@ -1330,6 +1336,18 @@ typedef struct macroblock {
/**@{*/
//! Variance of the source frame.
unsigned int source_variance;
+ //! Flag to indicate coding block is zero sad.
+ int block_is_zero_sad;
+ //! Flag to indicate superblock ME in variance partition is determined to be
+ // good/reliable, and so the superblock MV will be tested in the
+ // nonrd_pickmode. This is only used for LAST_FRAME.
+ int sb_me_partition;
+ //! Flag to indicate to test the superblock MV for the coding block in the
+ // nonrd_pickmode.
+ int sb_me_block;
+ //! Motion vector from superblock MV derived from int_pro_motion() in
+ // the variance_partitioning.
+ int_mv sb_me_mv;
//! SSE of the current predictor.
unsigned int pred_sse[REF_FRAMES];
//! Prediction for ML based partition.
@@ -1366,6 +1384,23 @@ typedef struct macroblock {
* fast encoding stage for screen content tool detemination.
*/
int palette_pixels;
+
+ /*!\brief Pointer to the structure which stores the statistics used by
+ * sb-level multi-pass encoding.
+ */
+ struct SB_FIRST_PASS_STATS *sb_stats_cache;
+
+ /*!\brief Pointer to the structure which stores the statistics used by
+ * first-pass when superblock is searched twice consecutively.
+ */
+ struct SB_FIRST_PASS_STATS *sb_fp_stats;
+
+#if CONFIG_PARTITION_SEARCH_ORDER
+ /*!\brief Pointer to RD_STATS structure to be used in
+ * av1_rd_partition_search().
+ */
+ RD_STATS *rdcost;
+#endif // CONFIG_PARTITION_SEARCH_ORDER
} MACROBLOCK;
#undef SINGLE_REF_MODES
diff --git a/av1/encoder/compound_type.c b/av1/encoder/compound_type.c
index 1992f232e..3b0ee8824 100644
--- a/av1/encoder/compound_type.c
+++ b/av1/encoder/compound_type.c
@@ -1024,8 +1024,9 @@ static INLINE int prune_mode_by_skip_rd(const AV1_COMP *const cpi,
int64_t ref_skip_rd, int mode_rate) {
int eval_txfm = 1;
const int txfm_rd_gate_level =
- get_txfm_rd_gate_level(cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
- TX_SEARCH_DEFAULT, /*eval_motion_mode=*/0);
+ get_txfm_rd_gate_level(cpi->common.seq_params->enable_masked_compound,
+ cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
+ TX_SEARCH_COMP_TYPE_MODE, /*eval_motion_mode=*/0);
// Check if the mode is good enough based on skip rd
if (txfm_rd_gate_level) {
int64_t sse_y = compute_sse_plane(x, xd, PLANE_TYPE_Y, bsize);
@@ -1108,8 +1109,9 @@ static int64_t masked_compound_type_rd(
// TODO(nithya): Handle wedge_newmv_search if extending for lower speed
// setting
const int txfm_rd_gate_level =
- get_txfm_rd_gate_level(cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
- TX_SEARCH_DEFAULT, /*eval_motion_mode=*/0);
+ get_txfm_rd_gate_level(cm->seq_params->enable_masked_compound,
+ cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
+ TX_SEARCH_COMP_TYPE_MODE, /*eval_motion_mode=*/0);
if (txfm_rd_gate_level) {
int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd_cur,
txfm_rd_gate_level, 1);
diff --git a/av1/encoder/context_tree.c b/av1/encoder/context_tree.c
index 2bd2d7f46..7b8240dda 100644
--- a/av1/encoder/context_tree.c
+++ b/av1/encoder/context_tree.c
@@ -304,8 +304,6 @@ void av1_setup_sms_tree(AV1_COMP *const cpi, ThreadData *td) {
}
void av1_free_sms_tree(ThreadData *td) {
- if (td->sms_tree != NULL) {
- aom_free(td->sms_tree);
- td->sms_tree = NULL;
- }
+ aom_free(td->sms_tree);
+ td->sms_tree = NULL;
}
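
This hunk is one of several in the patch that drop a redundant NULL check around aom_free(). Like free(3), aom_free() is a no-op when passed NULL, so the guard buys nothing; what matters is resetting the pointer afterwards so a repeated dealloc pass stays safe:

    // aom_free(NULL) is safe, so the guard can go; nulling the pointer
    // afterwards makes a second call through this path harmless.
    aom_free(td->sms_tree);
    td->sms_tree = NULL;
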
diff --git a/av1/encoder/encode_strategy.c b/av1/encoder/encode_strategy.c
index 90279b002..878cec59e 100644
--- a/av1/encoder/encode_strategy.c
+++ b/av1/encoder/encode_strategy.c
@@ -724,6 +724,7 @@ static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest,
#endif
const AV1EncoderConfig *const oxcf = &cpi->oxcf;
AV1_COMMON *const cm = &cpi->common;
+
GF_GROUP *const gf_group = &cpi->ppi->gf_group;
FRAME_UPDATE_TYPE update_type =
get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
@@ -1658,6 +1659,15 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
cm->quant_params.using_qmatrix = oxcf->q_cfg.using_qm;
}
+ const int is_intra_frame = frame_params.frame_type == KEY_FRAME ||
+ frame_params.frame_type == INTRA_ONLY_FRAME;
+ FeatureFlags *const features = &cm->features;
+ if (!is_stat_generation_stage(cpi) &&
+ (oxcf->pass == AOM_RC_ONE_PASS || oxcf->pass >= AOM_RC_SECOND_PASS) &&
+ is_intra_frame) {
+ av1_set_screen_content_options(cpi, features);
+ }
+
#if CONFIG_REALTIME_ONLY
if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) !=
AOM_CODEC_OK) {
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 50f046d75..2c6e49f24 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -523,7 +523,7 @@ static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td,
MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
- PC_TREE *const pc_root = td->rt_pc_root;
+ PC_TREE *const pc_root = td->pc_root;
#if CONFIG_RT_ML_PARTITIONING
if (sf->part_sf.partition_search_type == ML_BASED_PARTITION) {
@@ -731,9 +731,12 @@ static int sb_qp_sweep(AV1_COMP *const cpi, ThreadData *td,
av1_restore_sb_state(sb_org_stats, cpi, td, tile_data, mi_row, mi_col);
cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex = backup_current_qindex;
- PC_TREE *const pc_root = av1_alloc_pc_tree_node(bsize);
+ td->pc_root = av1_alloc_pc_tree_node(bsize);
+ if (!td->pc_root)
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize,
- &cur_rdc, cur_rdc, pc_root, sms_tree, NULL,
+ &cur_rdc, cur_rdc, td->pc_root, sms_tree, NULL,
SB_DRY_PASS, NULL);
if ((rdc_winner.rdcost > cur_rdc.rdcost) ||
@@ -760,6 +763,7 @@ static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
const int seg_skip) {
AV1_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
const SPEED_FEATURES *const sf = &cpi->sf;
const TileInfo *const tile_info = &tile_data->tile_info;
MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
@@ -787,11 +791,15 @@ static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, rd_use_partition_time);
#endif
- PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
- &dummy_rate, &dummy_dist, 1, pc_root);
- av1_free_pc_tree_recursive(pc_root, num_planes, 0, 0,
+ &dummy_rate, &dummy_dist, 1, td->pc_root);
+ av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0,
sf->part_sf.partition_search_type);
+ td->pc_root = NULL;
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, rd_use_partition_time);
#endif
@@ -803,20 +811,16 @@ static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
const BLOCK_SIZE bsize =
seg_skip ? sb_size : sf->part_sf.fixed_partition_size;
av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
- PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
- &dummy_rate, &dummy_dist, 1, pc_root);
- av1_free_pc_tree_recursive(pc_root, num_planes, 0, 0,
+ &dummy_rate, &dummy_dist, 1, td->pc_root);
+ av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0,
sf->part_sf.partition_search_type);
+ td->pc_root = NULL;
} else {
- SB_FIRST_PASS_STATS *sb_org_stats = NULL;
-
- if (cpi->oxcf.sb_qp_sweep) {
- CHECK_MEM_ERROR(
- cm, sb_org_stats,
- (SB_FIRST_PASS_STATS *)aom_malloc(sizeof(SB_FIRST_PASS_STATS)));
- av1_backup_sb_state(sb_org_stats, cpi, td, tile_data, mi_row, mi_col);
- }
// The most exhaustive recursive partition search
SuperBlockEnc *sb_enc = &x->sb_enc;
// No stats for overlay frames. Exclude key frame.
@@ -843,12 +847,16 @@ static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
!(has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME &&
cpi->oxcf.gf_cfg.lag_in_frames == 0) &&
cm->delta_q_info.delta_q_present_flag) {
+ AOM_CHECK_MEM_ERROR(
+ x->e_mbd.error_info, td->mb.sb_stats_cache,
+ (SB_FIRST_PASS_STATS *)aom_malloc(sizeof(*td->mb.sb_stats_cache)));
+ av1_backup_sb_state(td->mb.sb_stats_cache, cpi, td, tile_data, mi_row,
+ mi_col);
assert(x->rdmult_delta_qindex == x->delta_qindex);
- assert(sb_org_stats);
const int best_qp_diff =
sb_qp_sweep(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, sms_root,
- sb_org_stats) -
+ td->mb.sb_stats_cache) -
x->rdmult_delta_qindex;
sb_qp_sweep_init_quantizers(cpi, td, tile_data, sms_root, &dummy_rdc,
@@ -859,10 +867,13 @@ static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex;
av1_reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col);
- av1_restore_sb_state(sb_org_stats, cpi, td, tile_data, mi_row, mi_col);
+ av1_restore_sb_state(td->mb.sb_stats_cache, cpi, td, tile_data, mi_row,
+ mi_col);
cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex =
backup_current_qindex;
+ aom_free(td->mb.sb_stats_cache);
+ td->mb.sb_stats_cache = NULL;
}
if (num_passes == 1) {
#if CONFIG_PARTITION_SEARCH_ORDER
@@ -873,24 +884,36 @@ static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
av1_rd_partition_search(cpi, td, tile_data, tp, sms_root, mi_row,
mi_col, sb_size, &this_rdc);
} else {
- PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
- &dummy_rdc, dummy_rdc, pc_root, sms_root, NULL,
- SB_SINGLE_PASS, NULL);
+ &dummy_rdc, dummy_rdc, td->pc_root, sms_root,
+ NULL, SB_SINGLE_PASS, NULL);
}
#else
- PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
- &dummy_rdc, dummy_rdc, pc_root, sms_root, NULL,
+ &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL,
SB_SINGLE_PASS, NULL);
#endif // CONFIG_PARTITION_SEARCH_ORDER
} else {
// First pass
- SB_FIRST_PASS_STATS sb_fp_stats;
- av1_backup_sb_state(&sb_fp_stats, cpi, td, tile_data, mi_row, mi_col);
- PC_TREE *const pc_root_p0 = av1_alloc_pc_tree_node(sb_size);
+ AOM_CHECK_MEM_ERROR(
+ x->e_mbd.error_info, td->mb.sb_fp_stats,
+ (SB_FIRST_PASS_STATS *)aom_malloc(sizeof(*td->mb.sb_fp_stats)));
+ av1_backup_sb_state(td->mb.sb_fp_stats, cpi, td, tile_data, mi_row,
+ mi_col);
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
- &dummy_rdc, dummy_rdc, pc_root_p0, sms_root, NULL,
+ &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL,
SB_DRY_PASS, NULL);
// Second pass
@@ -899,14 +922,19 @@ static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
av1_reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col);
av1_reset_simple_motion_tree_partition(sms_root, sb_size);
- av1_restore_sb_state(&sb_fp_stats, cpi, td, tile_data, mi_row, mi_col);
+ av1_restore_sb_state(td->mb.sb_fp_stats, cpi, td, tile_data, mi_row,
+ mi_col);
- PC_TREE *const pc_root_p1 = av1_alloc_pc_tree_node(sb_size);
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
- &dummy_rdc, dummy_rdc, pc_root_p1, sms_root, NULL,
+ &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL,
SB_WET_PASS, NULL);
+ aom_free(td->mb.sb_fp_stats);
+ td->mb.sb_fp_stats = NULL;
}
- aom_free(sb_org_stats);
// Reset to 0 so that it wouldn't be used elsewhere mistakenly.
sb_enc->tpl_data_count = 0;
@@ -1124,6 +1152,17 @@ static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td,
// top-right superblock to finish encoding.
enc_row_mt->sync_read_ptr(
row_mt_sync, sb_row, sb_col_in_tile - delay_wait_for_top_right_sb(cpi));
+
+#if CONFIG_MULTITHREAD
+ if (row_mt_enabled) {
+ pthread_mutex_lock(enc_row_mt->mutex_);
+ const bool row_mt_exit = enc_row_mt->row_mt_exit;
+ pthread_mutex_unlock(enc_row_mt->mutex_);
+ // Exit in case any worker has encountered an error.
+ if (row_mt_exit) return;
+ }
+#endif
+
const int update_cdf = tile_data->allow_update_cdf && row_mt_enabled;
if (update_cdf && (tile_info->mi_row_start != mi_row)) {
if ((tile_info->mi_col_start == mi_col)) {
@@ -1155,6 +1194,9 @@ static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td,
x->content_state_sb.lighting_change = 0;
x->content_state_sb.low_sumdiff = 0;
x->force_zeromv_skip_for_sb = 0;
+ x->sb_me_block = 0;
+ x->sb_me_partition = 0;
+ x->sb_me_mv.as_int = 0;
if (cpi->oxcf.mode == ALLINTRA) {
x->intra_sb_rdmult_modifier = 128;
@@ -1230,7 +1272,7 @@ void av1_alloc_tile_data(AV1_COMP *cpi) {
av1_row_mt_mem_dealloc(cpi);
- if (cpi->tile_data != NULL) aom_free(cpi->tile_data);
+ aom_free(cpi->tile_data);
CHECK_MEM_ERROR(
cm, cpi->tile_data,
aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data)));
@@ -1441,7 +1483,7 @@ static AOM_INLINE void encode_tiles(AV1_COMP *cpi) {
}
}
- av1_dealloc_mb_data(cm, mb);
+ av1_dealloc_mb_data(mb, av1_num_planes(cm));
}
// Set the relative distance of a reference frame w.r.t. current frame
@@ -1670,6 +1712,19 @@ static void populate_thresh_to_force_zeromv_skip(AV1_COMP *cpi) {
}
}
+static void free_block_hash_buffers(uint32_t *block_hash_values[2][2],
+ int8_t *is_block_same[2][3]) {
+ for (int k = 0; k < 2; ++k) {
+ for (int j = 0; j < 2; ++j) {
+ aom_free(block_hash_values[k][j]);
+ }
+
+ for (int j = 0; j < 3; ++j) {
+ aom_free(is_block_same[k][j]);
+ }
+ }
+}
+
/*!\brief Encoder setup (only for the current frame), encoding, and reconstruction
* for a single frame
*
@@ -1740,26 +1795,34 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
// add to hash table
const int pic_width = cpi->source->y_crop_width;
const int pic_height = cpi->source->y_crop_height;
- uint32_t *block_hash_values[2][2];
- int8_t *is_block_same[2][3];
+ uint32_t *block_hash_values[2][2] = { { NULL } };
+ int8_t *is_block_same[2][3] = { { NULL } };
int k, j;
+ bool error = false;
- for (k = 0; k < 2; k++) {
- for (j = 0; j < 2; j++) {
- CHECK_MEM_ERROR(cm, block_hash_values[k][j],
- aom_malloc(sizeof(uint32_t) * pic_width * pic_height));
+ for (k = 0; k < 2 && !error; ++k) {
+ for (j = 0; j < 2; ++j) {
+ block_hash_values[k][j] = (uint32_t *)aom_malloc(
+ sizeof(*block_hash_values[0][0]) * pic_width * pic_height);
+ if (!block_hash_values[k][j]) {
+ error = true;
+ break;
+ }
}
- for (j = 0; j < 3; j++) {
- CHECK_MEM_ERROR(cm, is_block_same[k][j],
- aom_malloc(sizeof(int8_t) * pic_width * pic_height));
+ for (j = 0; j < 3 && !error; ++j) {
+ is_block_same[k][j] = (int8_t *)aom_malloc(
+ sizeof(*is_block_same[0][0]) * pic_width * pic_height);
+ if (!is_block_same[k][j]) error = true;
}
}
av1_hash_table_init(intrabc_hash_info);
- if (!av1_hash_table_create(&intrabc_hash_info->intrabc_hash_table)) {
+ if (error ||
+ !av1_hash_table_create(&intrabc_hash_info->intrabc_hash_table)) {
+ free_block_hash_buffers(block_hash_values, is_block_same);
aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
- "Error allocating intrabc_hash_table");
+ "Error allocating intrabc_hash_table and buffers");
}
hash_table_created = 1;
av1_generate_block_2x2_hash_value(intrabc_hash_info, cpi->source,
@@ -1769,7 +1832,6 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
const int max_sb_size =
(1 << (cm->seq_params->mib_size_log2 + MI_SIZE_LOG2));
int src_idx = 0;
- bool error = false;
for (int size = 4; size <= max_sb_size; size *= 2, src_idx = !src_idx) {
const int dst_idx = !src_idx;
av1_generate_block_hash_value(
@@ -1787,15 +1849,7 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
}
}
- for (k = 0; k < 2; k++) {
- for (j = 0; j < 2; j++) {
- aom_free(block_hash_values[k][j]);
- }
-
- for (j = 0; j < 3; j++) {
- aom_free(is_block_same[k][j]);
- }
- }
+ free_block_hash_buffers(block_hash_values, is_block_same);
if (error) {
aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
@@ -1957,12 +2011,19 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
// Preallocate the pc_tree for realtime coding to reduce the cost of
// memory allocation.
const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode;
- td->rt_pc_root = use_nonrd_mode
- ? av1_alloc_pc_tree_node(cm->seq_params->sb_size)
- : NULL;
+ if (use_nonrd_mode) {
+ td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ } else {
+ td->pc_root = NULL;
+ }
+
encode_tiles(cpi);
- av1_free_pc_tree_recursive(td->rt_pc_root, av1_num_planes(cm), 0, 0,
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
cpi->sf.part_sf.partition_search_type);
+ td->pc_root = NULL;
}
}
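
The allocation checks added throughout encodeframe.c report failure through aom_internal_error(), which longjmps to the handler installed at the encoder's setjmp site rather than returning, so no cleanup code is needed at the call site itself. A minimal sketch of the mechanism these hunks rely on (the surrounding function is illustrative; the aom_internal_error() call matches the library's actual signature):

    #include <setjmp.h>

    void encode_with_error_handler(ThreadData *td,
                                   struct aom_internal_error_info *error_info,
                                   BLOCK_SIZE sb_size) {
      if (setjmp(error_info->jmp)) {
        // Control arrives here after any aom_internal_error() on this thread;
        // release partially built state before unwinding further.
        error_info->setjmp = 0;
        return;
      }
      error_info->setjmp = 1;

      td->pc_root = av1_alloc_pc_tree_node(sb_size);
      if (!td->pc_root)
        aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
                           "Failed to allocate PC_TREE");  // does not return
      // ... encoding continues with a valid pc_root ...
    }
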
diff --git a/av1/encoder/encodeframe_utils.c b/av1/encoder/encodeframe_utils.c
index 29d7fe4df..94298c839 100644
--- a/av1/encoder/encodeframe_utils.c
+++ b/av1/encoder/encodeframe_utils.c
@@ -22,7 +22,7 @@ void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit,
const int mi_col, int *const rdmult) {
const AV1_COMMON *const cm = &cpi->common;
- const int bsize_base = BLOCK_16X16;
+ const BLOCK_SIZE bsize_base = BLOCK_16X16;
const int num_mi_w = mi_size_wide[bsize_base];
const int num_mi_h = mi_size_high[bsize_base];
const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
@@ -177,7 +177,7 @@ int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
const int block_mi_width_sr =
coded_to_superres_mi(mi_size_wide[bsize], cm->superres_scale_denominator);
- const int bsize_base = BLOCK_16X16;
+ const BLOCK_SIZE bsize_base = BLOCK_16X16;
const int num_mi_w = mi_size_wide[bsize_base];
const int num_mi_h = mi_size_high[bsize_base];
const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w;
@@ -588,13 +588,13 @@ void av1_sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts,
update_cdf(cdf_v, CFL_IDX_V(idx), CFL_ALPHABET_SIZE);
}
}
- if (av1_is_directional_mode(get_uv_mode(uv_mode)) &&
- av1_use_angle_delta(bsize)) {
+ const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode);
+ if (av1_is_directional_mode(intra_mode) && av1_use_angle_delta(bsize)) {
#if CONFIG_ENTROPY_STATS
- ++counts->angle_delta[uv_mode - UV_V_PRED]
+ ++counts->angle_delta[intra_mode - V_PRED]
[mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA];
#endif
- update_cdf(fc->angle_delta_cdf[uv_mode - UV_V_PRED],
+ update_cdf(fc->angle_delta_cdf[intra_mode - V_PRED],
mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA,
2 * MAX_ANGLE_DELTA + 1);
}
@@ -1743,3 +1743,23 @@ void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td,
default: assert(0);
}
}
+
+void av1_dealloc_src_diff_buf(struct macroblock *mb, int num_planes) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ aom_free(mb->plane[plane].src_diff);
+ mb->plane[plane].src_diff = NULL;
+ }
+}
+
+void av1_alloc_src_diff_buf(const struct AV1Common *cm, struct macroblock *mb) {
+ const int num_planes = av1_num_planes(cm);
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const int subsampling_xy =
+ plane ? cm->seq_params->subsampling_x + cm->seq_params->subsampling_y
+ : 0;
+ const int sb_size = MAX_SB_SQUARE >> subsampling_xy;
+ CHECK_MEM_ERROR(cm, mb->plane[plane].src_diff,
+ (int16_t *)aom_memalign(
+ 32, sizeof(*mb->plane[plane].src_diff) * sb_size));
+ }
+}
diff --git a/av1/encoder/encodeframe_utils.h b/av1/encoder/encodeframe_utils.h
index 24a36c590..14c71b880 100644
--- a/av1/encoder/encodeframe_utils.h
+++ b/av1/encoder/encodeframe_utils.h
@@ -430,25 +430,26 @@ void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td,
const TileInfo *const tile_info, const int mi_row,
const int mi_col);
-static AOM_INLINE void av1_dealloc_mb_data(struct AV1Common *cm,
- struct macroblock *mb) {
+void av1_dealloc_src_diff_buf(struct macroblock *mb, int num_planes);
+
+static AOM_INLINE void av1_dealloc_mb_data(struct macroblock *mb,
+ int num_planes) {
aom_free(mb->txfm_search_info.mb_rd_record);
mb->txfm_search_info.mb_rd_record = NULL;
aom_free(mb->inter_modes_info);
mb->inter_modes_info = NULL;
- const int num_planes = av1_num_planes(cm);
- for (int plane = 0; plane < num_planes; plane++) {
- aom_free(mb->plane[plane].src_diff);
- mb->plane[plane].src_diff = NULL;
- }
+ av1_dealloc_src_diff_buf(mb, num_planes);
aom_free(mb->e_mbd.seg_mask);
mb->e_mbd.seg_mask = NULL;
aom_free(mb->winner_mode_stats);
mb->winner_mode_stats = NULL;
+
+ aom_free(mb->dqcoeff_buf);
+ mb->dqcoeff_buf = NULL;
}
static AOM_INLINE void allocate_winner_mode_stats(const AV1_COMP *cpi,
@@ -468,6 +469,8 @@ static AOM_INLINE void allocate_winner_mode_stats(const AV1_COMP *cpi,
winner_mode_count * sizeof(mb->winner_mode_stats[0])));
}
+void av1_alloc_src_diff_buf(const struct AV1Common *cm, struct macroblock *mb);
+
static AOM_INLINE void av1_alloc_mb_data(const AV1_COMP *cpi,
struct macroblock *mb) {
const AV1_COMMON *cm = &cpi->common;
@@ -483,21 +486,20 @@ static AOM_INLINE void av1_alloc_mb_data(const AV1_COMP *cpi,
cm, mb->inter_modes_info,
(InterModesInfo *)aom_malloc(sizeof(*mb->inter_modes_info)));
}
- const int num_planes = av1_num_planes(cm);
- for (int plane = 0; plane < num_planes; plane++) {
- const int subsampling_xy =
- plane ? cm->seq_params->subsampling_x + cm->seq_params->subsampling_y
- : 0;
- const int sb_size = MAX_SB_SQUARE >> subsampling_xy;
- CHECK_MEM_ERROR(cm, mb->plane[plane].src_diff,
- (int16_t *)aom_memalign(
- 32, sizeof(*mb->plane[plane].src_diff) * sb_size));
- }
+
+ av1_alloc_src_diff_buf(cm, mb);
+
CHECK_MEM_ERROR(cm, mb->e_mbd.seg_mask,
(uint8_t *)aom_memalign(
16, 2 * MAX_SB_SQUARE * sizeof(mb->e_mbd.seg_mask[0])));
allocate_winner_mode_stats(cpi, mb);
+
+ const int max_sb_square_y = 1
+ << num_pels_log2_lookup[cm->seq_params->sb_size];
+ CHECK_MEM_ERROR(
+ cm, mb->dqcoeff_buf,
+ (tran_low_t *)aom_memalign(32, max_sb_square_y * sizeof(tran_low_t)));
}
// This function will compute the number of reference frames to be disabled
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 78efa0c3b..c78761dd9 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -449,10 +449,16 @@ static void encode_block(int plane, int block, int blk_row, int blk_col,
av1_set_txb_context(x, plane, block, tx_size, a, l);
if (p->eobs[block]) {
- *(args->skip) = 0;
+ // As long as any YUV plane has non-zero quantized transform coefficients,
+ // the mbmi->skip_txfm flag is set to 0.
+ mbmi->skip_txfm = 0;
av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
pd->dst.stride, p->eobs[block],
cm->features.reduced_tx_set_used);
+ } else {
+ // mbmi->skip_txfm is left as 1 only when all YUV planes have zero
+ // quantized transform coefficients; otherwise it has already been cleared.
+ mbmi->skip_txfm &= 1;
}
// TODO(debargha, jingning): Temporarily disable txk_type check for eob=0
@@ -650,13 +656,19 @@ void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
assert(bsize < BLOCK_SIZES_ALL);
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *mbmi = xd->mi[0];
+ // In the current encoder implementation, for inter blocks,
+ // mbmi->skip_txfm is set to 1 only when all YUV planes have zero
+ // quantized transform coefficients.
+ // For intra blocks, this flag is set to 0, since skipped blocks are so rare
+ // that transmitting skip_txfm = 1 is very expensive.
+ // mbmi->skip_txfm is initialized to 1 here and is updated in encode_block()
+ // based on transform, quantization, and (if enabled) trellis optimization.
mbmi->skip_txfm = 1;
if (x->txfm_search_info.skip_txfm) return;
struct optimize_ctx ctx;
struct encode_b_args arg = {
- cpi, x, &ctx, &mbmi->skip_txfm,
- NULL, NULL, dry_run, cpi->optimize_seg_arr[mbmi->segment_id]
+ cpi, x, &ctx, NULL, NULL, dry_run, cpi->optimize_seg_arr[mbmi->segment_id]
};
const AV1_COMMON *const cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
@@ -727,6 +739,7 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
const AV1_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
@@ -820,9 +833,9 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
}
- // For intra mode, skipped blocks are so rare that transmitting skip=1 is
- // very expensive.
- *(args->skip) = 0;
+ // For intra mode, skipped blocks are so rare that transmitting
+ // skip_txfm = 1 is very expensive.
+ mbmi->skip_txfm = 0;
if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
@@ -841,8 +854,9 @@ void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
const int ss_y = pd->subsampling_y;
ENTROPY_CONTEXT ta[MAX_MIB_SIZE] = { 0 };
ENTROPY_CONTEXT tl[MAX_MIB_SIZE] = { 0 };
- struct encode_b_args arg = { cpi, x, NULL, &(xd->mi[0]->skip_txfm),
- ta, tl, dry_run, enable_optimize_b };
+ struct encode_b_args arg = {
+ cpi, x, NULL, ta, tl, dry_run, enable_optimize_b
+ };
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
if (enable_optimize_b) {
av1_get_entropy_contexts(plane_bsize, pd, ta, tl);
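
Taken together, the encodemb.c hunks retire the args->skip pointer in favor of writing mbmi->skip_txfm directly, giving the flag a simple per-block lifecycle. A condensed sketch of that lifecycle (num_tx_blocks stands in for the real per-plane transform-block iteration):

    // Condensed sketch of the skip_txfm lifecycle implied by these hunks.
    mbmi->skip_txfm = 1;  // optimistic: assume all planes quantize to zero
    for (int blk = 0; blk < num_tx_blocks; ++blk) {  // hypothetical iteration
      if (p->eobs[blk]) {
        mbmi->skip_txfm = 0;   // any nonzero coefficient disables the skip
      } else {
        mbmi->skip_txfm &= 1;  // keeps the current value: stays 1 only if no
                               // earlier block cleared it
      }
    }
    // Intra blocks always finish with skip_txfm = 0, since signalling skip
    // for the rare all-zero intra block costs more bits than it saves.
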
diff --git a/av1/encoder/encodemb.h b/av1/encoder/encodemb.h
index b819e8244..f97bf8f51 100644
--- a/av1/encoder/encodemb.h
+++ b/av1/encoder/encodemb.h
@@ -56,7 +56,6 @@ struct encode_b_args {
const struct AV1_COMP *cpi;
MACROBLOCK *x;
struct optimize_ctx *ctx;
- uint8_t *skip;
ENTROPY_CONTEXT *ta;
ENTROPY_CONTEXT *tl;
RUN_TYPE dry_run;
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index e1cb49b80..07b6ffebe 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -236,16 +236,16 @@ double av1_get_compression_ratio(const AV1_COMMON *const cm,
size_t encoded_frame_size) {
const int upscaled_width = cm->superres_upscaled_width;
const int height = cm->height;
- const int luma_pic_size = upscaled_width * height;
+ const int64_t luma_pic_size = (int64_t)upscaled_width * height;
const SequenceHeader *const seq_params = cm->seq_params;
const BITSTREAM_PROFILE profile = seq_params->profile;
const int pic_size_profile_factor =
profile == PROFILE_0 ? 15 : (profile == PROFILE_1 ? 30 : 36);
encoded_frame_size =
(encoded_frame_size > 129 ? encoded_frame_size - 128 : 1);
- const size_t uncompressed_frame_size =
+ const int64_t uncompressed_frame_size =
(luma_pic_size * pic_size_profile_factor) >> 3;
- return uncompressed_frame_size / (double)encoded_frame_size;
+ return (double)uncompressed_frame_size / encoded_frame_size;
}
static void auto_tile_size_balancing(AV1_COMMON *const cm, int num_sbs,
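
The widening to int64_t guards a real overflow: with PROFILE_2's pic_size_profile_factor of 36, the 32-bit product luma_pic_size * 36 overflows at frame sizes AV1 permits. Worked numbers (frame sizes illustrative):

    //  7680 x 4320:  33,177,600 luma samples; x 36 = 1,194,393,600  (fits int32)
    // 16384 x 8704: 142,606,336 luma samples; x 36 = 5,133,828,096  (overflows
    //               int32, whose max is 2,147,483,647 -- hence the int64_t math)
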
@@ -362,9 +362,9 @@ void av1_update_frame_size(AV1_COMP *cpi) {
static INLINE int does_level_match(int width, int height, double fps,
int lvl_width, int lvl_height,
double lvl_fps, int lvl_dim_mult) {
- const int64_t lvl_luma_pels = lvl_width * lvl_height;
+ const int64_t lvl_luma_pels = (int64_t)lvl_width * lvl_height;
const double lvl_display_sample_rate = lvl_luma_pels * lvl_fps;
- const int64_t luma_pels = width * height;
+ const int64_t luma_pels = (int64_t)width * height;
const double display_sample_rate = luma_pels * fps;
return luma_pels <= lvl_luma_pels &&
display_sample_rate <= lvl_display_sample_rate &&
@@ -642,6 +642,12 @@ static void init_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
cm->height = oxcf->frm_dim_cfg.height;
cpi->is_dropped_frame = false;
+ InitialDimensions *const initial_dimensions = &cpi->initial_dimensions;
+ initial_dimensions->width = cm->width;
+ initial_dimensions->height = cm->height;
+
+ cpi->frame_size_related_setup_done = false;
+
alloc_compressor_data(cpi);
// Single thread case: use counts in common.
@@ -916,14 +922,14 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
cpi->td.firstpass_ctx = NULL;
alloc_compressor_data(cpi);
realloc_segmentation_maps(cpi);
- initial_dimensions->width = initial_dimensions->height = 0;
+ initial_dimensions->width = cm->width;
+ initial_dimensions->height = cm->height;
+ cpi->frame_size_related_setup_done = false;
}
av1_update_frame_size(cpi);
rc->is_src_frame_alt_ref = 0;
- set_tile_info(cm, &cpi->oxcf.tile_cfg);
-
if (!cpi->ppi->rtc_ref.set_ref_frame_config)
cpi->ext_flags.refresh_frame.update_pending = 0;
cpi->ext_flags.refresh_frame_context_pending = 0;
@@ -1278,7 +1284,7 @@ AV1_PRIMARY *av1_create_primary_compressor(
enc_set_mb_mi(&mi_params, oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height,
BLOCK_4X4);
- const int bsize = BLOCK_16X16;
+ const BLOCK_SIZE bsize = BLOCK_16X16;
const int w = mi_size_wide[bsize];
const int h = mi_size_high[bsize];
const int num_cols = (mi_params.mi_cols + w - 1) / w;
@@ -1410,6 +1416,8 @@ AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf,
cm->current_frame.frame_number = 0;
cpi->rc.frame_number_encoded = 0;
cpi->rc.prev_frame_is_dropped = 0;
+ cpi->rc.max_consec_drop = INT_MAX;
+ cpi->rc.drop_count_consec = 0;
cm->current_frame_id = -1;
cpi->tile_data = NULL;
cpi->last_show_frame_buf = NULL;
@@ -1495,7 +1503,7 @@ AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf,
cpi->palette_pixel_num = 0;
{
- const int bsize = BLOCK_16X16;
+ const BLOCK_SIZE bsize = BLOCK_16X16;
const int w = mi_size_wide[bsize];
const int h = mi_size_high[bsize];
const int num_cols = (max_mi_cols + w - 1) / w;
@@ -1510,7 +1518,7 @@ AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf,
#if CONFIG_TUNE_VMAF
{
- const int bsize = BLOCK_64X64;
+ const BLOCK_SIZE bsize = BLOCK_64X64;
const int w = mi_size_wide[bsize];
const int h = mi_size_high[bsize];
const int num_cols = (mi_params->mi_cols + w - 1) / w;
@@ -1612,17 +1620,6 @@ AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf,
snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
#endif // CONFIG_INTERNAL_STATS
-// This function will change the state and free the mutex of corresponding
-// workers and terminate the object. The object can not be re-used unless a call
-// to reset() is made.
-static AOM_INLINE void terminate_worker_data(AV1_PRIMARY *ppi) {
- PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
- for (int t = p_mt_info->num_workers - 1; t >= 0; --t) {
- AVxWorker *const worker = &p_mt_info->workers[t];
- aom_get_worker_interface()->end(worker);
- }
-}
-
void av1_remove_primary_compressor(AV1_PRIMARY *ppi) {
if (!ppi) return;
#if !CONFIG_REALTIME_ONLY
@@ -1650,11 +1647,14 @@ void av1_remove_primary_compressor(AV1_PRIMARY *ppi) {
av1_tpl_dealloc(&tpl_data->tpl_mt_sync);
#endif
- terminate_worker_data(ppi);
+ av1_terminate_workers(ppi);
free_thread_data(ppi);
aom_free(ppi->p_mt_info.tile_thr_data);
+ ppi->p_mt_info.tile_thr_data = NULL;
aom_free(ppi->p_mt_info.workers);
+ ppi->p_mt_info.workers = NULL;
+ ppi->p_mt_info.num_workers = 0;
aom_free(ppi);
}
@@ -1699,6 +1699,7 @@ void av1_remove_compressor(AV1_COMP *cpi) {
pthread_mutex_t *const enc_row_mt_mutex_ = mt_info->enc_row_mt.mutex_;
pthread_cond_t *const enc_row_mt_cond_ = mt_info->enc_row_mt.cond_;
pthread_mutex_t *const gm_mt_mutex_ = mt_info->gm_sync.mutex_;
+ pthread_mutex_t *const tpl_error_mutex_ = mt_info->tpl_row_mt.mutex_;
pthread_mutex_t *const pack_bs_mt_mutex_ = mt_info->pack_bs_sync.mutex_;
if (enc_row_mt_mutex_ != NULL) {
pthread_mutex_destroy(enc_row_mt_mutex_);
@@ -1712,6 +1713,10 @@ void av1_remove_compressor(AV1_COMP *cpi) {
pthread_mutex_destroy(gm_mt_mutex_);
aom_free(gm_mt_mutex_);
}
+ if (tpl_error_mutex_ != NULL) {
+ pthread_mutex_destroy(tpl_error_mutex_);
+ aom_free(tpl_error_mutex_);
+ }
if (pack_bs_mt_mutex_ != NULL) {
pthread_mutex_destroy(pack_bs_mt_mutex_);
aom_free(pack_bs_mt_mutex_);
@@ -1720,11 +1725,11 @@ void av1_remove_compressor(AV1_COMP *cpi) {
av1_row_mt_mem_dealloc(cpi);
if (mt_info->num_workers > 1) {
+ av1_row_mt_sync_mem_dealloc(&cpi->ppi->intra_row_mt_sync);
av1_loop_filter_dealloc(&mt_info->lf_row_sync);
av1_cdef_mt_dealloc(&mt_info->cdef_sync);
#if !CONFIG_REALTIME_ONLY
av1_loop_restoration_dealloc(&mt_info->lr_row_sync);
- av1_gm_dealloc(&mt_info->gm_sync);
av1_tf_mt_dealloc(&mt_info->tf_sync);
#endif
}
@@ -1950,6 +1955,7 @@ void av1_set_screen_content_options(AV1_COMP *cpi, FeatureFlags *features) {
const int stride = cpi->unfiltered_source->y_stride;
const int width = cpi->unfiltered_source->y_width;
const int height = cpi->unfiltered_source->y_height;
+ const int64_t area = (int64_t)width * height;
const int bd = cm->seq_params->bit_depth;
const int blk_w = 16;
const int blk_h = 16;
@@ -1957,10 +1963,10 @@ void av1_set_screen_content_options(AV1_COMP *cpi, FeatureFlags *features) {
const int color_thresh = 4;
const unsigned int var_thresh = 0;
// Counts of blocks with no more than color_thresh colors.
- int counts_1 = 0;
+ int64_t counts_1 = 0;
// Counts of blocks with no more than color_thresh colors and variance larger
// than var_thresh.
- int counts_2 = 0;
+ int64_t counts_2 = 0;
for (int r = 0; r + blk_h <= height; r += blk_h) {
for (int c = 0; c + blk_w <= width; c += blk_w) {
@@ -1985,17 +1991,15 @@ void av1_set_screen_content_options(AV1_COMP *cpi, FeatureFlags *features) {
}
// The threshold values are selected experimentally.
- features->allow_screen_content_tools =
- counts_1 * blk_h * blk_w * 10 > width * height;
+ features->allow_screen_content_tools = counts_1 * blk_h * blk_w * 10 > area;
// IntraBC would force loop filters off, so we use more strict rules that also
// requires that the block has high variance.
features->allow_intrabc = features->allow_screen_content_tools &&
- counts_2 * blk_h * blk_w * 12 > width * height;
+ counts_2 * blk_h * blk_w * 12 > area;
cpi->use_screen_content_tools = features->allow_screen_content_tools;
cpi->is_screen_content_type =
- features->allow_intrabc ||
- (counts_1 * blk_h * blk_w * 10 > width * height * 4 &&
- counts_2 * blk_h * blk_w * 30 > width * height);
+ features->allow_intrabc || (counts_1 * blk_h * blk_w * 10 > area * 4 &&
+ counts_2 * blk_h * blk_w * 30 > area);
}
static void init_motion_estimation(AV1_COMP *cpi) {
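
With blk_w = blk_h = 16, the detection rule counts_1 * blk_h * blk_w * 10 > area says that blocks with at most color_thresh (4) colors must cover more than one tenth of the frame; intrabc additionally requires the high-variance subset to cover more than one twelfth. A worked instance (frame size illustrative):

    // 1280 x 720: area = 921,600, giving 80 x 45 = 3,600 blocks of 16 x 16.
    // allow_screen_content_tools:  counts_1 * 256 * 10 > 921,600
    //                          <=> counts_1 > 360  (10% of the 3,600 blocks)
    // allow_intrabc additionally:  counts_2 * 256 * 12 > 921,600
    //                          <=> counts_2 > 300  (1/12 of the blocks)
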
@@ -2044,29 +2048,6 @@ static void init_motion_estimation(AV1_COMP *cpi) {
}
}
-#if !CONFIG_REALTIME_ONLY
-#define COUPLED_CHROMA_FROM_LUMA_RESTORATION 0
-static void set_restoration_unit_size(int width, int height, int sx, int sy,
- RestorationInfo *rst) {
- (void)width;
- (void)height;
- (void)sx;
- (void)sy;
-#if COUPLED_CHROMA_FROM_LUMA_RESTORATION
- int s = AOMMIN(sx, sy);
-#else
- int s = 0;
-#endif // !COUPLED_CHROMA_FROM_LUMA_RESTORATION
-
- if (width * height > 352 * 288)
- rst[0].restoration_unit_size = RESTORATION_UNITSIZE_MAX;
- else
- rst[0].restoration_unit_size = (RESTORATION_UNITSIZE_MAX >> 1);
- rst[1].restoration_unit_size = rst[0].restoration_unit_size >> s;
- rst[2].restoration_unit_size = rst[1].restoration_unit_size;
-}
-#endif
-
static void init_ref_frame_bufs(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
int i;
@@ -2088,13 +2069,15 @@ static void init_ref_frame_bufs(AV1_COMP *cpi) {
#endif
}
+// TODO(chengchen): consider renaming this function, as it is necessary
+// for the encoder to set up critical parameters, and it no longer deals
+// with the initial width.
void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
int subsampling_x, int subsampling_y) {
AV1_COMMON *const cm = &cpi->common;
SequenceHeader *const seq_params = cm->seq_params;
- InitialDimensions *const initial_dimensions = &cpi->initial_dimensions;
- if (!initial_dimensions->width ||
+ if (!cpi->frame_size_related_setup_done ||
seq_params->use_highbitdepth != use_highbitdepth ||
seq_params->subsampling_x != subsampling_x ||
seq_params->subsampling_y != subsampling_y) {
@@ -2114,9 +2097,8 @@ void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
init_motion_estimation(cpi); // TODO(agrange) This can be removed.
- initial_dimensions->width = cm->width;
- initial_dimensions->height = cm->height;
cpi->initial_mbs = cm->mi_params.MBs;
+ cpi->frame_size_related_setup_done = true;
}
}
@@ -2153,18 +2135,18 @@ int av1_set_size_literal(AV1_COMP *cpi, int width, int height) {
setup_denoiser_buffer(cpi);
#endif
- if (initial_dimensions->width && initial_dimensions->height &&
- (cm->width > initial_dimensions->width ||
- cm->height > initial_dimensions->height)) {
+ if (cm->width > initial_dimensions->width ||
+ cm->height > initial_dimensions->height) {
av1_free_context_buffers(cm);
av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf);
av1_free_sms_tree(&cpi->td);
av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm));
cpi->td.firstpass_ctx = NULL;
- alloc_mb_mode_info_buffers(cpi);
alloc_compressor_data(cpi);
realloc_segmentation_maps(cpi);
- initial_dimensions->width = initial_dimensions->height = 0;
+ initial_dimensions->width = cm->width;
+ initial_dimensions->height = cm->height;
+ cpi->frame_size_related_setup_done = false;
}
alloc_mb_mode_info_buffers(cpi);
av1_update_frame_size(cpi);
@@ -2232,11 +2214,6 @@ void av1_set_frame_size(AV1_COMP *cpi, int width, int height) {
#if !CONFIG_REALTIME_ONLY
if (is_restoration_used(cm)) {
- const int frame_width = cm->superres_upscaled_width;
- const int frame_height = cm->superres_upscaled_height;
- set_restoration_unit_size(frame_width, frame_height,
- seq_params->subsampling_x,
- seq_params->subsampling_y, cm->rst_info);
for (int i = 0; i < num_planes; ++i)
cm->rst_info[i].frame_restoration_type = RESTORE_NONE;
@@ -2251,6 +2228,7 @@ void av1_set_frame_size(AV1_COMP *cpi, int width, int height) {
init_motion_estimation(cpi);
+ int has_valid_ref_frame = 0;
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
if (buf != NULL) {
@@ -2258,9 +2236,15 @@ void av1_set_frame_size(AV1_COMP *cpi, int width, int height) {
av1_setup_scale_factors_for_frame(sf, buf->buf.y_crop_width,
buf->buf.y_crop_height, cm->width,
cm->height);
+ has_valid_ref_frame |= av1_is_valid_scale(sf);
if (av1_is_scaled(sf)) aom_extend_frame_borders(&buf->buf, num_planes);
}
}
+ if (!frame_is_intra_only(cm) && !has_valid_ref_frame) {
+ aom_internal_error(
+ cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Can't find at least one reference frame with valid size");
+ }
av1_setup_scale_factors_for_frame(&cm->sf_identity, cm->width, cm->height,
cm->width, cm->height);
@@ -2310,18 +2294,8 @@ static void cdef_restoration_frame(AV1_COMP *cpi, AV1_COMMON *cm,
start_timing(cpi, cdef_time);
#endif
const int num_workers = cpi->mt_info.num_mod_workers[MOD_CDEF];
- const int use_screen_content_model =
- cm->quant_params.base_qindex >
- AOMMAX(cpi->sf.rt_sf.screen_content_cdef_filter_qindex_thresh,
- cpi->rc.best_quality + 5) &&
- cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
// Find CDEF parameters
- av1_cdef_search(&cpi->mt_info, &cm->cur_frame->buf, cpi->source, cm, xd,
- cpi->sf.lpf_sf.cdef_pick_method, cpi->td.mb.rdmult,
- cpi->sf.rt_sf.skip_cdef_sb, cpi->oxcf.tool_cfg.cdef_control,
- use_screen_content_model,
- cpi->ppi->rtc_ref.non_reference_frame,
- cpi->rc.rtc_external_ratectrl);
+ av1_cdef_search(cpi);
// Apply the filter
if ((skip_apply_postproc_filters & SKIP_APPLY_CDEF) == 0) {
@@ -2614,14 +2588,18 @@ static int encode_without_recode(AV1_COMP *cpi) {
if (cpi->ref_frame_flags & av1_ref_frame_flag_list[GOLDEN_FRAME]) {
const YV12_BUFFER_CONFIG *const ref =
get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
- if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)
+ if (ref == NULL || ref->y_crop_width != cm->width ||
+ ref->y_crop_height != cm->height) {
cpi->ref_frame_flags ^= AOM_GOLD_FLAG;
+ }
}
if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]) {
const YV12_BUFFER_CONFIG *const ref =
get_ref_frame_yv12_buf(cm, ALTREF_FRAME);
- if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)
+ if (ref == NULL || ref->y_crop_width != cm->width ||
+ ref->y_crop_height != cm->height) {
cpi->ref_frame_flags ^= AOM_ALT_FLAG;
+ }
}
}
@@ -2632,16 +2610,9 @@ static int encode_without_recode(AV1_COMP *cpi) {
#endif // CONFIG_FPMT_TEST
if (scale_references ||
cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
- // For SVC the inter-layer/spatial prediction is not done for newmv
- // (zero_mode is forced), and since the scaled references are only
- // use for newmv search, we can avoid scaling here when
- // force_zero_mode_spatial_ref is set for SVC mode.
- // Also add condition for dynamic_resize: for dynamic_resize we always
- // check for scaling references for now.
- if (!frame_is_intra_only(cm) &&
- (!cpi->ppi->use_svc || !cpi->svc.force_zero_mode_spatial_ref ||
- cpi->oxcf.resize_cfg.resize_mode == RESIZE_DYNAMIC))
+ if (!frame_is_intra_only(cm)) {
av1_scale_references(cpi, filter_scaler, phase_scaler, 1);
+ }
}
av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
@@ -3051,7 +3022,7 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
rc->projected_frame_size, psnr.sse[0]);
++rd_command->frame_index;
if (rd_command->frame_index == rd_command->frame_count) {
- exit(0);
+ return AOM_CODEC_ERROR;
}
#endif // CONFIG_RD_COMMAND
@@ -3585,10 +3556,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
start_timing(cpi, encode_frame_to_data_rate_time);
#endif
- if (frame_is_intra_only(cm)) {
- av1_set_screen_content_options(cpi, features);
- }
-
#if !CONFIG_REALTIME_ONLY
calculate_frame_avg_haar_energy(cpi);
#endif
@@ -3736,19 +3703,14 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
cpi->num_tg = DEFAULT_MAX_NUM_TG;
}
- // For 1 pass CBR, check if we are dropping this frame.
- // Never drop on key frame, or for frame whose base layer is key.
- if (has_no_stats_stage(cpi) && oxcf->rc_cfg.mode == AOM_CBR &&
- current_frame->frame_type != KEY_FRAME &&
- !(cpi->ppi->use_svc &&
- cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)) {
- FRAME_UPDATE_TYPE update_type =
- cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
- (void)update_type;
- assert(
- IMPLIES(cpi->is_dropped_frame, (update_type == OVERLAY_UPDATE ||
- update_type == INTNL_OVERLAY_UPDATE)));
- if (av1_rc_drop_frame(cpi)) {
+ // For 1 pass CBR mode: check if we are dropping this frame.
+ if (has_no_stats_stage(cpi) && oxcf->rc_cfg.mode == AOM_CBR) {
+ // Always drop for a spatial enhancement layer if the layer bandwidth is 0.
+ // Otherwise check for frame dropping based on the buffer level in
+ // av1_rc_drop_frame().
+ if ((cpi->svc.spatial_layer_id > 0 &&
+ cpi->oxcf.rc_cfg.target_bandwidth == 0) ||
+ av1_rc_drop_frame(cpi)) {
cpi->is_dropped_frame = true;
}
if (cpi->is_dropped_frame) {
@@ -4438,10 +4400,8 @@ void print_internal_stats(AV1_PRIMARY *ppi) {
fclose(f);
- if (ppi->ssim_vars != NULL) {
- aom_free(ppi->ssim_vars);
- ppi->ssim_vars = NULL;
- }
+ aom_free(ppi->ssim_vars);
+ ppi->ssim_vars = NULL;
}
}
#endif // CONFIG_INTERNAL_STATS
@@ -4481,7 +4441,16 @@ static AOM_INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
static AOM_INLINE void update_gf_group_index(AV1_COMP *cpi) {
// Increment the gf group index ready for the next frame.
- ++cpi->gf_frame_index;
+ if (is_one_pass_rt_params(cpi) &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+ ++cpi->gf_frame_index;
+ // Reset gf_frame_index in case it reaches MAX_STATIC_GF_GROUP_LENGTH
+ // for real time encoding.
+ if (cpi->gf_frame_index == MAX_STATIC_GF_GROUP_LENGTH)
+ cpi->gf_frame_index = 0;
+ } else {
+ ++cpi->gf_frame_index;
+ }
}
static void update_fb_of_context_type(const AV1_COMP *const cpi,
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 9b2b4aea7..0a8bcdec2 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -1442,6 +1442,8 @@ typedef struct RD_COUNTS {
typedef struct ThreadData {
MACROBLOCK mb;
+ MvCosts *mv_costs_alloc;
+ IntraBCMVCosts *dv_costs_alloc;
RD_COUNTS rd_counts;
FRAME_COUNTS *counts;
PC_TREE_SHARED_BUFFERS shared_coeff_buf;
@@ -1454,6 +1456,7 @@ typedef struct ThreadData {
CONV_BUF_TYPE *tmp_conv_dst;
uint64_t abs_sum_level;
uint8_t *tmp_pred_bufs[2];
+ uint8_t *wiener_tmp_pred_buf;
int intrabc_used;
int deltaq_used;
int coefficient_size;
@@ -1464,7 +1467,9 @@ typedef struct ThreadData {
int32_t num_64x64_blocks;
PICK_MODE_CONTEXT *firstpass_ctx;
TemporalFilterData tf_data;
+ TplBuffers tpl_tmp_buffers;
TplTxfmStats tpl_txfm_stats;
+ GlobalMotionData gm_data;
// Pointer to the array of structures to store gradient information of each
// pixel in a superblock. The buffer consists of MAX_SB_SQUARE pixel-level
// structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV).
@@ -1474,8 +1479,8 @@ typedef struct ThreadData {
// store source variance and log of source variance of each 4x4 sub-block
// for subsequent retrieval.
Block4x4VarInfo *src_var_info_of_4x4_sub_blocks;
- // The pc tree root for RTC non-rd case.
- PC_TREE *rt_pc_root;
+ // Pointer to pc tree root.
+ PC_TREE *pc_root;
} ThreadData;
struct EncWorkerData;
@@ -1526,6 +1531,19 @@ typedef struct {
*/
int allocated_sb_rows;
+ /*!
+ * Initialized to false, set to true by the worker thread that encounters an
+ * error in order to abort the processing of other worker threads.
+ */
+ bool row_mt_exit;
+
+ /*!
+ * Initialized to false, set to true during first pass encoding by the worker
+ * thread that encounters an error in order to abort the processing of other
+ * worker threads.
+ */
+ bool firstpass_mt_exit;
+
#if CONFIG_MULTITHREAD
/*!
* Mutex lock used while dispatching jobs.
@@ -1619,6 +1637,45 @@ typedef struct RestoreStateBuffers {
} RestoreStateBuffers;
/*!
+ * \brief Parameters related to restoration types.
+ */
+typedef struct {
+ /*!
+ * Stores the best coefficients for Wiener restoration.
+ */
+ WienerInfo wiener;
+
+ /*!
+ * Stores the best coefficients for Sgrproj restoration.
+ */
+ SgrprojInfo sgrproj;
+
+ /*!
+ * The rtype to use for this unit given a frame rtype as index. Indices:
+ * WIENER, SGRPROJ, SWITCHABLE.
+ */
+ RestorationType best_rtype[RESTORE_TYPES - 1];
+} RestUnitSearchInfo;
+
+/*!
+ * \brief Structure to hold the search parameters per restoration unit and
+ * the intermediate Wiener filter buffer used in the pick-filter stage of
+ * loop restoration.
+ */
+typedef struct {
+ /*!
+ * Array of pointers to 'RestUnitSearchInfo' which holds data related to
+ * restoration types.
+ */
+ RestUnitSearchInfo *rusi[MAX_MB_PLANE];
+
+ /*!
+ * Buffer used to hold dgd-avg data during SIMD call of Wiener filter.
+ */
+ int16_t *dgd_avg;
+} AV1LrPickStruct;
+
+/*!
* \brief Primary Encoder parameters related to multi-threading.
*/
typedef struct PrimaryMultiThreadInfo {
@@ -2938,6 +2995,11 @@ typedef struct AV1_COMP {
TemporalFilterCtx tf_ctx;
/*!
+ * Pointer to CDEF search context.
+ */
+ CdefSearchCtx *cdef_search_ctx;
+
+ /*!
* Variables related to forcing integer mv decisions for the current frame.
*/
ForceIntegerMVInfo force_intpel_info;
@@ -3101,7 +3163,9 @@ typedef struct AV1_COMP {
FRAME_INDEX_SET frame_index_set;
/*!
- * Structure to store the dimensions of current frame.
+ * Structure to store cm->width and cm->height from the last call to
+ * alloc_compressor_data().
+ * TODO(chengchen): rename this variable or delete it.
*/
InitialDimensions initial_dimensions;
@@ -3114,6 +3178,24 @@ typedef struct AV1_COMP {
int initial_mbs;
/*!
+ * Flag to indicate whether the frame size information has been
+ * set up and propagated to the associated allocations.
+ */
+ bool frame_size_related_setup_done;
+
+ /*!
+ * The width of the most recently encoded frame.
+ * It is updated in encoder_encode().
+ */
+ int last_coded_width;
+
+ /*!
+ * The height of the most recently encoded frame.
+ * It is updated in encoder_encode().
+ */
+ int last_coded_height;
+
+ /*!
* Resize related parameters.
*/
ResizePendingParams resize_pending_params;
@@ -3220,6 +3302,11 @@ typedef struct AV1_COMP {
AV1LrStruct lr_ctxt;
/*!
+ * Loop Restoration context used during pick stage.
+ */
+ AV1LrPickStruct pick_lr_ctxt;
+
+ /*!
* Pointer to list of tables with film grain parameters.
*/
aom_film_grain_table_t *film_grain_table;
@@ -3515,6 +3602,8 @@ typedef struct AV1_COMP {
/*!
* SSE between the current frame and the reconstructed last frame.
+ * It is only used for CBR mode.
+ * It is not used if the reference frame has a different frame size.
*/
uint64_t rec_sse;
@@ -3782,6 +3871,10 @@ int av1_get_quantizer(struct AV1_COMP *cpi);
int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *input_size);
+void av1_alloc_mb_wiener_var_pred_buf(AV1_COMMON *cm, ThreadData *td);
+
+void av1_dealloc_mb_wiener_var_pred_buf(ThreadData *td);
+
// Set screen content options.
// This function estimates whether to use screen content tools, by counting
// the portion of blocks that have few luma colors.
diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h
index 9a1d60f20..d0fd78266 100644
--- a/av1/encoder/encoder_alloc.h
+++ b/av1/encoder/encoder_alloc.h
@@ -13,10 +13,13 @@
#define AOM_AV1_ENCODER_ENCODER_ALLOC_H_
#include "av1/encoder/block.h"
+#include "av1/encoder/encodeframe_utils.h"
#include "av1/encoder/encoder.h"
#include "av1/encoder/encodetxb.h"
#include "av1/encoder/ethread.h"
+#include "av1/encoder/global_motion_facade.h"
#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/pickcdef.h"
#ifdef __cplusplus
extern "C" {
@@ -24,11 +27,9 @@ extern "C" {
static AOM_INLINE void dealloc_context_buffers_ext(
MBMIExtFrameBufferInfo *mbmi_ext_info) {
- if (mbmi_ext_info->frame_base) {
- aom_free(mbmi_ext_info->frame_base);
- mbmi_ext_info->frame_base = NULL;
- mbmi_ext_info->alloc_size = 0;
- }
+ aom_free(mbmi_ext_info->frame_base);
+ mbmi_ext_info->frame_base = NULL;
+ mbmi_ext_info->alloc_size = 0;
}
static AOM_INLINE void alloc_context_buffers_ext(
@@ -64,14 +65,14 @@ static AOM_INLINE void alloc_compressor_data(AV1_COMP *cpi) {
if (!is_stat_generation_stage(cpi)) av1_alloc_txb_buf(cpi);
- if (cpi->td.mb.mv_costs) {
- aom_free(cpi->td.mb.mv_costs);
- cpi->td.mb.mv_costs = NULL;
- }
- // Avoid the memory allocation of 'mv_costs' for allintra encoding mode.
+ aom_free(cpi->td.mv_costs_alloc);
+ cpi->td.mv_costs_alloc = NULL;
+ // Avoid the memory allocation of 'mv_costs_alloc' for allintra encoding
+ // mode.
if (cpi->oxcf.kf_cfg.key_freq_max != 0) {
- CHECK_MEM_ERROR(cm, cpi->td.mb.mv_costs,
- (MvCosts *)aom_calloc(1, sizeof(MvCosts)));
+ CHECK_MEM_ERROR(cm, cpi->td.mv_costs_alloc,
+ (MvCosts *)aom_calloc(1, sizeof(*cpi->td.mv_costs_alloc)));
+ cpi->td.mb.mv_costs = cpi->td.mv_costs_alloc;
}
av1_setup_shared_coeff_buffer(cm->seq_params, &cpi->td.shared_coeff_buf,
@@ -79,6 +80,9 @@ static AOM_INLINE void alloc_compressor_data(AV1_COMP *cpi) {
av1_setup_sms_tree(cpi, &cpi->td);
cpi->td.firstpass_ctx =
av1_alloc_pmc(cpi, BLOCK_16X16, &cpi->td.shared_coeff_buf);
+ if (!cpi->td.firstpass_ctx)
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
}
// Allocate mbmi buffers which are used to store mode information at block
@@ -178,7 +182,7 @@ static AOM_INLINE void release_compound_type_rd_buffers(
static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
TokenInfo *token_info = &cpi->token_info;
-
+ const int num_planes = av1_num_planes(cm);
dealloc_context_buffers_ext(&cpi->mbmi_ext_info);
aom_free(cpi->tile_data);
@@ -220,15 +224,25 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) {
release_obmc_buffers(&cpi->td.mb.obmc_buffer);
- if (cpi->td.mb.mv_costs) {
- aom_free(cpi->td.mb.mv_costs);
- cpi->td.mb.mv_costs = NULL;
- }
+ aom_free(cpi->td.mv_costs_alloc);
+ cpi->td.mv_costs_alloc = NULL;
+ aom_free(cpi->td.dv_costs_alloc);
+ cpi->td.dv_costs_alloc = NULL;
- if (cpi->td.mb.dv_costs) {
- aom_free(cpi->td.mb.dv_costs);
- cpi->td.mb.dv_costs = NULL;
- }
+ aom_free(cpi->td.mb.sb_stats_cache);
+ cpi->td.mb.sb_stats_cache = NULL;
+
+ aom_free(cpi->td.mb.sb_fp_stats);
+ cpi->td.mb.sb_fp_stats = NULL;
+
+#if CONFIG_PARTITION_SEARCH_ORDER
+ aom_free(cpi->td.mb.rdcost);
+ cpi->td.mb.rdcost = NULL;
+#endif
+
+ av1_free_pc_tree_recursive(cpi->td.pc_root, num_planes, 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ cpi->td.pc_root = NULL;
for (int i = 0; i < 2; i++)
for (int j = 0; j < 2; j++) {
@@ -236,33 +250,55 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) {
cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j] = NULL;
}
+ av1_hash_table_destroy(&cpi->td.mb.intrabc_hash_info.intrabc_hash_table);
+
aom_free(cm->tpl_mvs);
cm->tpl_mvs = NULL;
- if (cpi->td.pixel_gradient_info) {
- aom_free(cpi->td.pixel_gradient_info);
- cpi->td.pixel_gradient_info = NULL;
- }
+ aom_free(cpi->td.pixel_gradient_info);
+ cpi->td.pixel_gradient_info = NULL;
- if (cpi->td.src_var_info_of_4x4_sub_blocks) {
- aom_free(cpi->td.src_var_info_of_4x4_sub_blocks);
- cpi->td.src_var_info_of_4x4_sub_blocks = NULL;
- }
+ aom_free(cpi->td.src_var_info_of_4x4_sub_blocks);
+ cpi->td.src_var_info_of_4x4_sub_blocks = NULL;
- if (cpi->td.vt64x64) {
- aom_free(cpi->td.vt64x64);
- cpi->td.vt64x64 = NULL;
- }
+ aom_free(cpi->td.vt64x64);
+ cpi->td.vt64x64 = NULL;
- av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm));
+ av1_free_pmc(cpi->td.firstpass_ctx, num_planes);
cpi->td.firstpass_ctx = NULL;
+ const int is_highbitdepth = cpi->tf_ctx.is_highbitdepth;
+ // This call ensures that the buffers allocated by tf_alloc_and_reset_data()
+ // in av1_temporal_filter() for single-threaded encode are freed in case an
+ // error is encountered during temporal filtering (on early termination,
+ // tf_dealloc_data() in av1_temporal_filter() would not be invoked).
+ tf_dealloc_data(&cpi->td.tf_data, is_highbitdepth);
+
+ // This call ensures that tpl_tmp_buffers for single-threaded encode are freed
+ // in case of an error during tpl.
+ tpl_dealloc_temp_buffers(&cpi->td.tpl_tmp_buffers);
+
+ // This call ensures that the global motion (gm) data buffers for
+ // single-threaded encode are freed in case of an error during gm.
+ gm_dealloc_data(&cpi->td.gm_data);
+
+ // This call ensures that CDEF search context buffers are deallocated in case
+ // of an error during cdef search.
+ av1_cdef_dealloc_data(cpi->cdef_search_ctx);
+ aom_free(cpi->cdef_search_ctx);
+ cpi->cdef_search_ctx = NULL;
+
+ av1_dealloc_mb_data(&cpi->td.mb, num_planes);
+
+ av1_dealloc_mb_wiener_var_pred_buf(&cpi->td);
+
av1_free_txb_buf(cpi);
av1_free_context_buffers(cm);
aom_free_frame_buffer(&cpi->last_frame_uf);
#if !CONFIG_REALTIME_ONLY
av1_free_restoration_buffers(cm);
+ av1_free_firstpass_data(&cpi->firstpass_data);
#endif
if (!is_stat_generation_stage(cpi)) {
@@ -270,6 +306,13 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) {
&cpi->mt_info.cdef_sync);
}
+ for (int plane = 0; plane < num_planes; plane++) {
+ aom_free(cpi->pick_lr_ctxt.rusi[plane]);
+ cpi->pick_lr_ctxt.rusi[plane] = NULL;
+ }
+ aom_free(cpi->pick_lr_ctxt.dgd_avg);
+ cpi->pick_lr_ctxt.dgd_avg = NULL;
+
aom_free_frame_buffer(&cpi->trial_frame_rst);
aom_free_frame_buffer(&cpi->scaled_source);
aom_free_frame_buffer(&cpi->scaled_last_source);
@@ -304,16 +347,12 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) {
aom_free(cpi->svc.layer_context);
cpi->svc.layer_context = NULL;
- if (cpi->consec_zero_mv) {
- aom_free(cpi->consec_zero_mv);
- cpi->consec_zero_mv = NULL;
- cpi->consec_zero_mv_alloc_size = 0;
- }
+ aom_free(cpi->consec_zero_mv);
+ cpi->consec_zero_mv = NULL;
+ cpi->consec_zero_mv_alloc_size = 0;
- if (cpi->src_sad_blk_64x64) {
- aom_free(cpi->src_sad_blk_64x64);
- cpi->src_sad_blk_64x64 = NULL;
- }
+ aom_free(cpi->src_sad_blk_64x64);
+ cpi->src_sad_blk_64x64 = NULL;
aom_free(cpi->mb_weber_stats);
cpi->mb_weber_stats = NULL;
@@ -399,15 +438,23 @@ static AOM_INLINE YV12_BUFFER_CONFIG *realloc_and_scale_source(
"Failed to reallocate scaled source buffer");
assert(cpi->scaled_source.y_crop_width == scaled_width);
assert(cpi->scaled_source.y_crop_height == scaled_height);
- av1_resize_and_extend_frame_nonnormative(
- cpi->unscaled_source, &cpi->scaled_source, (int)cm->seq_params->bit_depth,
- num_planes);
+ if (!av1_resize_and_extend_frame_nonnormative(
+ cpi->unscaled_source, &cpi->scaled_source,
+ (int)cm->seq_params->bit_depth, num_planes))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to reallocate buffers during resize");
return &cpi->scaled_source;
}
// Deallocate allocated thread_data.
static AOM_INLINE void free_thread_data(AV1_PRIMARY *ppi) {
PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+ const int num_tf_workers =
+ AOMMIN(p_mt_info->num_mod_workers[MOD_TF], p_mt_info->num_workers);
+ const int num_tpl_workers =
+ AOMMIN(p_mt_info->num_mod_workers[MOD_TPL], p_mt_info->num_workers);
+ const int is_highbitdepth = ppi->seq_params.use_highbitdepth;
+ const int num_planes = ppi->seq_params.monochrome ? 1 : MAX_MB_PLANE;
for (int t = 1; t < p_mt_info->num_workers; ++t) {
EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[t];
thread_data->td = thread_data->original_td;
@@ -429,12 +476,42 @@ static AOM_INLINE void free_thread_data(AV1_PRIMARY *ppi) {
thread_data->td->hash_value_buffer[x][y] = NULL;
}
}
+ aom_free(thread_data->td->mv_costs_alloc);
+ thread_data->td->mv_costs_alloc = NULL;
+ aom_free(thread_data->td->dv_costs_alloc);
+ thread_data->td->dv_costs_alloc = NULL;
aom_free(thread_data->td->counts);
- av1_free_pmc(thread_data->td->firstpass_ctx,
- ppi->seq_params.monochrome ? 1 : MAX_MB_PLANE);
+ av1_free_pmc(thread_data->td->firstpass_ctx, num_planes);
thread_data->td->firstpass_ctx = NULL;
av1_free_shared_coeff_buffer(&thread_data->td->shared_coeff_buf);
av1_free_sms_tree(thread_data->td);
+ // This call ensures that the buffers allocated by tf_alloc_and_reset_data()
+ // in prepare_tf_workers() for MT encode are freed in case an error is
+ // encountered during temporal filtering (on early termination,
+ // tf_dealloc_thread_data() in av1_tf_do_filtering_mt() would not be
+ // invoked).
+ if (t < num_tf_workers)
+ tf_dealloc_data(&thread_data->td->tf_data, is_highbitdepth);
+ // This call ensures that tpl_tmp_buffers for MT encode are freed in case of
+ // an error during tpl.
+ if (t < num_tpl_workers)
+ tpl_dealloc_temp_buffers(&thread_data->td->tpl_tmp_buffers);
+ // This call ensures that the buffers in gm_data for MT encode are freed in
+ // case of an error during gm.
+ gm_dealloc_data(&thread_data->td->gm_data);
+ av1_dealloc_mb_data(&thread_data->td->mb, num_planes);
+ aom_free(thread_data->td->mb.sb_stats_cache);
+ thread_data->td->mb.sb_stats_cache = NULL;
+ aom_free(thread_data->td->mb.sb_fp_stats);
+ thread_data->td->mb.sb_fp_stats = NULL;
+#if CONFIG_PARTITION_SEARCH_ORDER
+ aom_free(thread_data->td->mb.rdcost);
+ thread_data->td->mb.rdcost = NULL;
+#endif
+ av1_free_pc_tree_recursive(thread_data->td->pc_root, num_planes, 0, 0,
+ SEARCH_PARTITION);
+ thread_data->td->pc_root = NULL;
+ av1_dealloc_mb_wiener_var_pred_buf(thread_data->td);
aom_free(thread_data->td);
}
}
diff --git a/av1/encoder/encoder_utils.c b/av1/encoder/encoder_utils.c
index bc136b187..f9e446bfb 100644
--- a/av1/encoder/encoder_utils.c
+++ b/av1/encoder/encoder_utils.c
@@ -521,7 +521,12 @@ static void process_tpl_stats_frame(AV1_COMP *cpi) {
cpi->ppi->p_rc.gfu_boost, gfu_boost,
cpi->ppi->p_rc.num_stats_used_for_gfu_boost);
} else {
- const int gfu_boost = (int)(200.0 / cpi->rd.r0);
+ // TPL may only look at a subset of frames in the gf group when the
+ // speed feature 'reduce_num_frames' is on, which affects the r0
+ // calculation. Thus, to compensate for TPL not using all frames, a
+ // factor is used to adjust r0.
+ const int gfu_boost =
+ (int)(200.0 * cpi->ppi->tpl_data.r0_adjust_factor / cpi->rd.r0);
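// Illustrative arithmetic (hypothetical numbers, not from the patch): with
// r0 = 0.5 and r0_adjust_factor = 1.2, gfu_boost = (int)(200.0 * 1.2 / 0.5)
// = 480, versus 400 without the adjustment.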
cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost(
MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR,
cpi->ppi->p_rc.gfu_boost, gfu_boost, cpi->rc.frames_to_key);
@@ -691,6 +696,17 @@ void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter,
continue;
}
+ // For RTC-SVC: if force_zero_mode_spatial_ref is enabled, check if the
+ // motion search can be skipped for the references: last, golden, altref.
+ // If so, we can skip scaling that reference.
+ if (cpi->ppi->use_svc && cpi->svc.force_zero_mode_spatial_ref &&
+ cpi->ppi->rtc_ref.set_ref_frame_config) {
+ if (ref_frame == LAST_FRAME && cpi->svc.skip_mvsearch_last) continue;
+ if (ref_frame == GOLDEN_FRAME && cpi->svc.skip_mvsearch_gf) continue;
+ if (ref_frame == ALTREF_FRAME && cpi->svc.skip_mvsearch_altref)
+ continue;
+ }
+
if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
// Replace the reference buffer with a copy having a thicker border,
// if the reference buffer is higher resolution than the current
@@ -745,19 +761,25 @@ void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter,
}
#if CONFIG_AV1_HIGHBITDEPTH
if (use_optimized_scaler && has_optimized_scaler &&
- cm->seq_params->bit_depth == AOM_BITS_8)
+ cm->seq_params->bit_depth == AOM_BITS_8) {
av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase,
num_planes);
- else
- av1_resize_and_extend_frame_nonnormative(
- ref, &new_fb->buf, (int)cm->seq_params->bit_depth, num_planes);
+ } else if (!av1_resize_and_extend_frame_nonnormative(
+ ref, &new_fb->buf, (int)cm->seq_params->bit_depth,
+ num_planes)) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate buffer during resize");
+ }
#else
- if (use_optimized_scaler && has_optimized_scaler)
+ if (use_optimized_scaler && has_optimized_scaler) {
av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase,
num_planes);
- else
- av1_resize_and_extend_frame_nonnormative(
- ref, &new_fb->buf, (int)cm->seq_params->bit_depth, num_planes);
+ } else if (!av1_resize_and_extend_frame_nonnormative(
+ ref, &new_fb->buf, (int)cm->seq_params->bit_depth,
+ num_planes)) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate buffer during resize");
+ }
#endif
cpi->scaled_ref_buf[ref_frame - 1] = new_fb;
alloc_frame_mvs(cm, new_fb);
diff --git a/av1/encoder/encoder_utils.h b/av1/encoder/encoder_utils.h
index 92e69dad1..196676ec8 100644
--- a/av1/encoder/encoder_utils.h
+++ b/av1/encoder/encoder_utils.h
@@ -510,8 +510,8 @@ MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x32x4d)
#define HIGHBD_OBFP_WRAPPER_8(WIDTH, HEIGHT) \
HIGHBD_OBFP(BLOCK_##WIDTH##X##HEIGHT, \
aom_highbd_obmc_sad##WIDTH##x##HEIGHT##_bits8, \
- aom_highbd_obmc_variance##WIDTH##x##HEIGHT, \
- aom_highbd_obmc_sub_pixel_variance##WIDTH##x##HEIGHT)
+ aom_highbd_8_obmc_variance##WIDTH##x##HEIGHT, \
+ aom_highbd_8_obmc_sub_pixel_variance##WIDTH##x##HEIGHT)
#define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \
ppi->fn_ptr[BT].osdf = OSDF; \
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 430c6aec8..c15e396f9 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -9,15 +9,17 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include <assert.h>
+
#include "av1/common/warped_motion.h"
#include "av1/common/thread_common.h"
#include "av1/encoder/allintra_vis.h"
#include "av1/encoder/bitstream.h"
#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
#include "av1/encoder/encoder.h"
#include "av1/encoder/encoder_alloc.h"
-#include "av1/encoder/encodeframe_utils.h"
#include "av1/encoder/ethread.h"
#if !CONFIG_REALTIME_ONLY
#include "av1/encoder/firstpass.h"
@@ -194,7 +196,7 @@ static void row_mt_sync_mem_alloc(AV1EncRowMultiThreadSync *row_mt_sync,
}
// Deallocate row based multi-threading synchronization related mutex and data
-static void row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync) {
+void av1_row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync) {
if (row_mt_sync != NULL) {
#if CONFIG_MULTITHREAD
int i;
@@ -264,6 +266,8 @@ static void row_mt_mem_alloc(AV1_COMP *cpi, int max_rows, int max_cols,
enc_row_mt->allocated_rows = max_rows;
enc_row_mt->allocated_cols = max_cols - 1;
enc_row_mt->allocated_sb_rows = sb_rows;
+ enc_row_mt->row_mt_exit = false;
+ enc_row_mt->firstpass_mt_exit = false;
}
void av1_row_mt_mem_dealloc(AV1_COMP *cpi) {
@@ -278,7 +282,7 @@ void av1_row_mt_mem_dealloc(AV1_COMP *cpi) {
int tile_index = tile_row * tile_cols + tile_col;
TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
- row_mt_sync_mem_dealloc(&this_tile->row_mt_sync);
+ av1_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync);
if (cpi->oxcf.algo_cfg.cdf_update_mode) aom_free(this_tile->row_ctx);
}
@@ -391,18 +395,68 @@ static AOM_INLINE void switch_tile_and_get_next_job(
}
#if !CONFIG_REALTIME_ONLY
+static void set_firstpass_encode_done(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
+ const int unit_height = mi_size_high[fp_block_size];
+
+ // In case of multithreading of firstpass encode, due to top-right
+ // dependency, the worker on a firstpass row waits for the completion of the
+ // firstpass processing of the top and top-right fp_blocks. Hence, if a
+ // thread (main/worker) encounters an error, mark the firstpass processing
+ // of every row in the frame as complete to avoid dependent workers waiting
+ // indefinitely.
+ for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileDataEnc *const tile_data =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ TileInfo *tile = &tile_data->tile_info;
+ AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
+ const int unit_cols_in_tile =
+ av1_get_unit_cols_in_tile(tile, fp_block_size);
+ for (int mi_row = tile->mi_row_start, unit_row_in_tile = 0;
+ mi_row < tile->mi_row_end;
+ mi_row += unit_height, unit_row_in_tile++) {
+ enc_row_mt->sync_write_ptr(row_mt_sync, unit_row_in_tile,
+ unit_cols_in_tile - 1, unit_cols_in_tile);
+ }
+ }
+ }
+}
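/*
 * Minimal sketch (not part of the patch) of the idea behind
 * set_firstpass_encode_done(): on error, every row is marked fully processed
 * so that workers blocked on the top/top-right dependency wake up and exit
 * instead of waiting forever. RowSync, wait_for_col() and mark_all_rows_done()
 * are invented names; compile with -pthread.
 */
#include <pthread.h>

enum { kRows = 8, kCols = 16 };

typedef struct {
  pthread_mutex_t mutex;
  pthread_cond_t cond;
  int cols_done[kRows];  // encode progress (in columns) of each row
} RowSync;

// Dependency wait: block until `row` has progressed beyond column `col`.
static void wait_for_col(RowSync *s, int row, int col) {
  pthread_mutex_lock(&s->mutex);
  while (s->cols_done[row] <= col) pthread_cond_wait(&s->cond, &s->mutex);
  pthread_mutex_unlock(&s->mutex);
}

// Error path: declare every row complete so that all waiters return.
static void mark_all_rows_done(RowSync *s) {
  pthread_mutex_lock(&s->mutex);
  for (int row = 0; row < kRows; row++) s->cols_done[row] = kCols;
  pthread_cond_broadcast(&s->cond);
  pthread_mutex_unlock(&s->mutex);
}

int main(void) {
  RowSync s = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, { 0 } };
  mark_all_rows_done(&s);
  wait_for_col(&s, 3, 5);  // returns immediately: row 3 is "done"
  return 0;
}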
+
static int fp_enc_row_mt_worker_hook(void *arg1, void *unused) {
EncWorkerData *const thread_data = (EncWorkerData *)arg1;
AV1_COMP *const cpi = thread_data->cpi;
- AV1_COMMON *const cm = &cpi->common;
int thread_id = thread_data->thread_id;
AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
- int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
#if CONFIG_MULTITHREAD
pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
#endif
(void)unused;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+ enc_row_mt->firstpass_mt_exit = true;
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ set_firstpass_encode_done(cpi);
+ return 0;
+ }
+ error_info->setjmp = 1;
+ AV1_COMMON *const cm = &cpi->common;
+ int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
assert(cur_tile_id != -1);
const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
@@ -413,8 +467,9 @@ static int fp_enc_row_mt_worker_hook(void *arg1, void *unused) {
#if CONFIG_MULTITHREAD
pthread_mutex_lock(enc_row_mt_mutex_);
#endif
- if (!get_next_job(&cpi->tile_data[cur_tile_id], &current_mi_row,
- unit_height)) {
+ bool firstpass_mt_exit = enc_row_mt->firstpass_mt_exit;
+ if (!firstpass_mt_exit && !get_next_job(&cpi->tile_data[cur_tile_id],
+ &current_mi_row, unit_height)) {
// No jobs are available for the current tile. Query for the status of
// other tiles and get the next job if available
switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id,
@@ -424,7 +479,9 @@ static int fp_enc_row_mt_worker_hook(void *arg1, void *unused) {
#if CONFIG_MULTITHREAD
pthread_mutex_unlock(enc_row_mt_mutex_);
#endif
- if (end_of_frame == 1) break;
+ // When firstpass_mt_exit is set to true, other workers need not pursue any
+ // further jobs.
+ if (firstpass_mt_exit || end_of_frame) break;
TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
@@ -444,7 +501,7 @@ static int fp_enc_row_mt_worker_hook(void *arg1, void *unused) {
pthread_mutex_unlock(enc_row_mt_mutex_);
#endif
}
-
+ error_info->setjmp = 0;
return 1;
}
#endif
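/*
 * Self-contained sketch (not part of the patch) of the setjmp()-based error
 * handling used by the worker hooks above: aom_internal_error() longjmp()s
 * back into the hook when the setjmp field is armed, and the hook reports
 * failure by returning 0. All names below are invented for illustration.
 */
#include <setjmp.h>
#include <stdio.h>

typedef struct {
  jmp_buf jmp;
  int setjmp_armed;  // mirrors aom_internal_error_info::setjmp
} ErrorInfo;

static void fail(ErrorInfo *e) {
  if (e->setjmp_armed) longjmp(e->jmp, 1);  // unwind back into the hook
}

static int worker_hook(ErrorInfo *e) {
  // The jmp_buf is only valid while this function is on the stack, so the
  // armed flag must be cleared on every exit path.
  if (setjmp(e->jmp)) {
    e->setjmp_armed = 0;
    return 0;  // error
  }
  e->setjmp_armed = 1;
  fail(e);  // simulate a deep failure, e.g. an allocation error
  e->setjmp_armed = 0;
  return 1;  // success (never reached in this sketch)
}

int main(void) {
  ErrorInfo e = { .setjmp_armed = 0 };
  printf("hook returned %d\n", worker_hook(&e));  // prints 0
  return 0;
}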
@@ -455,6 +512,7 @@ static void launch_loop_filter_rows(AV1_COMMON *cm, EncWorkerData *thread_data,
AV1LfSync *const lf_sync = (AV1LfSync *)thread_data->lf_sync;
const int sb_rows = get_sb_rows_in_frame(cm);
AV1LfMTInfo *cur_job_info;
+ bool row_mt_exit = false;
(void)enc_row_mt;
#if CONFIG_MULTITHREAD
pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
@@ -469,43 +527,116 @@ static void launch_loop_filter_rows(AV1_COMMON *cm, EncWorkerData *thread_data,
const int next_sb_row = AOMMIN(sb_rows - 1, cur_sb_row + 1);
// Wait for current and next superblock row to finish encoding.
pthread_mutex_lock(enc_row_mt_mutex_);
- while (enc_row_mt->num_tile_cols_done[cur_sb_row] < cm->tiles.cols ||
- enc_row_mt->num_tile_cols_done[next_sb_row] < cm->tiles.cols) {
+ while (!enc_row_mt->row_mt_exit &&
+ (enc_row_mt->num_tile_cols_done[cur_sb_row] < cm->tiles.cols ||
+ enc_row_mt->num_tile_cols_done[next_sb_row] < cm->tiles.cols)) {
pthread_cond_wait(enc_row_mt->cond_, enc_row_mt_mutex_);
}
+ row_mt_exit = enc_row_mt->row_mt_exit;
pthread_mutex_unlock(enc_row_mt_mutex_);
#endif
+ if (row_mt_exit) return;
+
av1_thread_loop_filter_rows(
lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd,
cur_job_info->mi_row, cur_job_info->plane, cur_job_info->dir,
- lpf_opt_level, lf_sync, lf_data->params_buf, lf_data->tx_buf,
- mib_size_log2);
+ lpf_opt_level, lf_sync, &thread_data->error_info, lf_data->params_buf,
+ lf_data->tx_buf, mib_size_log2);
+ }
+}
+
+static void set_encoding_done(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+ const int mib_size = cm->seq_params->mib_size;
+
+ // In case of row-multithreading, due to top-right dependency, the worker on
+ // an SB row waits for the completion of the encode of the top and top-right
+ // SBs. Hence, if a thread (main/worker) encounters an error, mark the
+ // encoding of every SB row in the frame as complete so that the dependent
+ // workers of every tile do not wait indefinitely.
+ for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+ TileDataEnc *const this_tile =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ const TileInfo *const tile_info = &this_tile->tile_info;
+ AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+ const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
+ for (int mi_row = tile_info->mi_row_start, sb_row_in_tile = 0;
+ mi_row < tile_info->mi_row_end;
+ mi_row += mib_size, sb_row_in_tile++) {
+ enc_row_mt->sync_write_ptr(row_mt_sync, sb_row_in_tile,
+ sb_cols_in_tile - 1, sb_cols_in_tile);
+ }
+ }
}
}
static int enc_row_mt_worker_hook(void *arg1, void *unused) {
EncWorkerData *const thread_data = (EncWorkerData *)arg1;
AV1_COMP *const cpi = thread_data->cpi;
- AV1_COMMON *const cm = &cpi->common;
int thread_id = thread_data->thread_id;
AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
- int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
- const int mib_size_log2 = cm->seq_params->mib_size_log2;
#if CONFIG_MULTITHREAD
pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
#endif
(void)unused;
+
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ AV1LfSync *const lf_sync = thread_data->lf_sync;
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+ enc_row_mt->row_mt_exit = true;
+ // Wake up all the workers waiting in launch_loop_filter_rows() to exit in
+ // case of an error.
+ pthread_cond_broadcast(enc_row_mt->cond_);
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ set_encoding_done(cpi);
+
+ if (cpi->mt_info.pipeline_lpf_mt_with_enc) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(lf_sync->job_mutex);
+ lf_sync->lf_mt_exit = true;
+ pthread_mutex_unlock(lf_sync->job_mutex);
+#endif
+ av1_set_vert_loop_filter_done(&cpi->common, lf_sync,
+ cpi->common.seq_params->mib_size_log2);
+ }
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ AV1_COMMON *const cm = &cpi->common;
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
+
// Preallocate the pc_tree for realtime coding to reduce the cost of memory
// allocation.
- thread_data->td->rt_pc_root =
- cpi->sf.rt_sf.use_nonrd_pick_mode
- ? av1_alloc_pc_tree_node(cm->seq_params->sb_size)
- : NULL;
+ if (cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ thread_data->td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size);
+ if (!thread_data->td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ } else {
+ thread_data->td->pc_root = NULL;
+ }
assert(cur_tile_id != -1);
const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
int end_of_frame = 0;
+ bool row_mt_exit = false;
// When master thread does not have a valid job to process, xd->tile_ctx
// is not set and it contains NULL pointer. This can result in NULL pointer
@@ -518,7 +649,12 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) {
#if CONFIG_MULTITHREAD
pthread_mutex_lock(enc_row_mt_mutex_);
#endif
- if (!get_next_job(&cpi->tile_data[cur_tile_id], &current_mi_row,
+ row_mt_exit = enc_row_mt->row_mt_exit;
+ // The row_mt_exit check here could be avoided, as it is also checked after
+ // sync_read_ptr() in encode_sb_row(). However, checking it here lets the
+ // worker return before calling get_next_job().
+ if (!row_mt_exit &&
+ !get_next_job(&cpi->tile_data[cur_tile_id], &current_mi_row,
cm->seq_params->mib_size)) {
// No jobs are available for the current tile. Query for the status of
// other tiles and get the next job if available
@@ -529,7 +665,14 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) {
#if CONFIG_MULTITHREAD
pthread_mutex_unlock(enc_row_mt_mutex_);
#endif
- if (end_of_frame == 1) break;
+ // When row_mt_exit is set to true, other workers need not pursue any
+ // further jobs.
+ if (row_mt_exit) {
+ error_info->setjmp = 0;
+ return 1;
+ }
+
+ if (end_of_frame) break;
TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
@@ -583,26 +726,46 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) {
// encoding and loop filter stage.
launch_loop_filter_rows(cm, thread_data, enc_row_mt, mib_size_log2);
}
- av1_free_pc_tree_recursive(thread_data->td->rt_pc_root, av1_num_planes(cm), 0,
- 0, cpi->sf.part_sf.partition_search_type);
+ av1_free_pc_tree_recursive(thread_data->td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ thread_data->td->pc_root = NULL;
+ error_info->setjmp = 0;
return 1;
}
static int enc_worker_hook(void *arg1, void *unused) {
EncWorkerData *const thread_data = (EncWorkerData *)arg1;
AV1_COMP *const cpi = thread_data->cpi;
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
const AV1_COMMON *const cm = &cpi->common;
const int tile_cols = cm->tiles.cols;
const int tile_rows = cm->tiles.rows;
int t;
(void)unused;
+
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+ return 0;
+ }
+ error_info->setjmp = 1;
+
// Preallocate the pc_tree for realtime coding to reduce the cost of memory
// allocation.
- thread_data->td->rt_pc_root =
- cpi->sf.rt_sf.use_nonrd_pick_mode
- ? av1_alloc_pc_tree_node(cm->seq_params->sb_size)
- : NULL;
+ if (cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ thread_data->td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size);
+ if (!thread_data->td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ } else {
+ thread_data->td->pc_root = NULL;
+ }
for (t = thread_data->start; t < tile_rows * tile_cols;
t += cpi->mt_info.num_workers) {
@@ -616,9 +779,10 @@ static int enc_worker_hook(void *arg1, void *unused) {
av1_encode_tile(cpi, thread_data->td, tile_row, tile_col);
}
- av1_free_pc_tree_recursive(thread_data->td->rt_pc_root, av1_num_planes(cm), 0,
- 0, cpi->sf.part_sf.partition_search_type);
-
+ av1_free_pc_tree_recursive(thread_data->td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ thread_data->td->pc_root = NULL;
+ error_info->setjmp = 0;
return 1;
}
@@ -651,10 +815,11 @@ void av1_init_lr_mt_buffers(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
AV1LrSync *lr_sync = &cpi->mt_info.lr_row_sync;
if (lr_sync->sync_range) {
- int num_lr_workers =
- av1_get_num_mod_workers_for_alloc(&cpi->ppi->p_mt_info, MOD_LR);
if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
return;
+ int num_lr_workers =
+ av1_get_num_mod_workers_for_alloc(&cpi->ppi->p_mt_info, MOD_LR);
+ assert(num_lr_workers <= lr_sync->num_workers);
lr_sync->lrworkerdata[num_lr_workers - 1].rst_tmpbuf = cm->rst_tmpbuf;
lr_sync->lrworkerdata[num_lr_workers - 1].rlbs = cm->rlbs;
}
@@ -720,16 +885,21 @@ void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass) {
av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_lf_workers);
}
+ // Initialize tpl MT object.
+ AV1TplRowMultiThreadInfo *tpl_row_mt = &mt_info->tpl_row_mt;
+ if (tpl_row_mt->mutex_ == NULL) {
+ CHECK_MEM_ERROR(cm, tpl_row_mt->mutex_,
+ aom_malloc(sizeof(*(tpl_row_mt->mutex_))));
+ if (tpl_row_mt->mutex_) pthread_mutex_init(tpl_row_mt->mutex_, NULL);
+ }
+ tpl_row_mt->tpl_mt_exit = false;
+
#if !CONFIG_REALTIME_ONLY
if (is_restoration_used(cm)) {
// Initialize loop restoration MT object.
AV1LrSync *lr_sync = &mt_info->lr_row_sync;
- int rst_unit_size;
- if (cm->width * cm->height > 352 * 288)
- rst_unit_size = RESTORATION_UNITSIZE_MAX;
- else
- rst_unit_size = (RESTORATION_UNITSIZE_MAX >> 1);
- int num_rows_lr = av1_lr_count_units_in_tile(rst_unit_size, cm->height);
+ int rst_unit_size = cpi->sf.lpf_sf.min_lr_unit_size;
+ int num_rows_lr = av1_lr_count_units(rst_unit_size, cm->height);
int num_lr_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LR);
if (!lr_sync->sync_range || num_rows_lr > lr_sync->rows ||
num_lr_workers > lr_sync->num_workers ||
@@ -754,7 +924,7 @@ void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass) {
// Computes the number of workers to be considered while allocating memory for a
// multi-threaded module under FPMT.
-int av1_get_num_mod_workers_for_alloc(PrimaryMultiThreadInfo *const p_mt_info,
+int av1_get_num_mod_workers_for_alloc(const PrimaryMultiThreadInfo *p_mt_info,
MULTI_THREADED_MODULES mod_name) {
int num_mod_workers = p_mt_info->num_mod_workers[mod_name];
if (p_mt_info->num_mod_workers[MOD_FRAME_ENC] > 1) {
@@ -798,6 +968,9 @@ void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass) {
// Set up firstpass PICK_MODE_CONTEXT.
thread_data->td->firstpass_ctx = av1_alloc_pmc(
ppi->cpi, BLOCK_16X16, &thread_data->td->shared_coeff_buf);
+ if (!thread_data->td->firstpass_ctx)
+ aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
}
if (!is_first_pass && i < num_enc_workers) {
@@ -894,6 +1067,7 @@ void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass) {
void av1_create_workers(AV1_PRIMARY *ppi, int num_workers) {
PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ assert(p_mt_info->num_workers == 0);
AOM_CHECK_MEM_ERROR(&ppi->error, p_mt_info->workers,
aom_malloc(num_workers * sizeof(*p_mt_info->workers)));
@@ -925,6 +1099,17 @@ void av1_create_workers(AV1_PRIMARY *ppi, int num_workers) {
}
}
+// This function changes the state and frees the mutex of the corresponding
+// workers and terminates the object. The object cannot be re-used unless a
+// call to reset() is made.
+void av1_terminate_workers(AV1_PRIMARY *ppi) {
+ PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+ for (int t = 0; t < p_mt_info->num_workers; ++t) {
+ AVxWorker *const worker = &p_mt_info->workers[t];
+ aom_get_worker_interface()->end(worker);
+ }
+}
+
// This function returns 1 if frame parallel encode is supported for
// the current configuration. Returns 0 otherwise.
static AOM_INLINE int is_fpmt_config(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) {
@@ -1111,6 +1296,7 @@ static AOM_INLINE void prepare_fpmt_workers(AV1_PRIMARY *ppi,
if (is_restoration_used(cm)) {
// Back up the original LR buffers before update.
int idx = i + mt_info->num_workers - 1;
+ assert(idx < mt_info->lr_row_sync.num_workers);
mt_info->restore_state_buf.rst_tmpbuf =
mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf;
mt_info->restore_state_buf.rlbs =
@@ -1153,27 +1339,6 @@ static AOM_INLINE void launch_fpmt_workers(AV1_PRIMARY *ppi) {
}
}
-// Synchronize level 1 workers.
-static AOM_INLINE void sync_fpmt_workers(AV1_PRIMARY *ppi) {
- const AVxWorkerInterface *const winterface = aom_get_worker_interface();
- int num_workers = ppi->p_mt_info.p_num_workers;
- int had_error = 0;
- // Points to error in the earliest display order frame in the parallel set.
- const struct aom_internal_error_info *error;
-
- // Encoding ends.
- for (int i = num_workers - 1; i >= 0; i--) {
- AVxWorker *const worker = ppi->p_mt_info.p_workers[i];
- if (!winterface->sync(worker)) {
- had_error = 1;
- error = ((AV1_COMP *)worker->data1)->common.error;
- }
- }
-
- if (had_error)
- aom_internal_error(&ppi->error, error->error_code, "%s", error->detail);
-}
-
// Restore worker states after parallel encode.
static AOM_INLINE void restore_workers_after_fpmt(AV1_PRIMARY *ppi,
int parallel_frame_count) {
@@ -1203,6 +1368,7 @@ static AOM_INLINE void restore_workers_after_fpmt(AV1_PRIMARY *ppi,
if (is_restoration_used(cm)) {
// Restore the original LR buffers.
int idx = i + mt_info->num_workers - 1;
+ assert(idx < mt_info->lr_row_sync.num_workers);
mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf =
mt_info->restore_state_buf.rst_tmpbuf;
mt_info->lr_row_sync.lrworkerdata[idx].rlbs =
@@ -1215,6 +1381,30 @@ static AOM_INLINE void restore_workers_after_fpmt(AV1_PRIMARY *ppi,
}
}
+// Synchronize level 1 workers.
+static AOM_INLINE void sync_fpmt_workers(AV1_PRIMARY *ppi,
+ int frames_in_parallel_set) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ int num_workers = ppi->p_mt_info.p_num_workers;
+ int had_error = 0;
+ // Points to error in the earliest display order frame in the parallel set.
+ const struct aom_internal_error_info *error;
+
+ // Encoding ends.
+ for (int i = num_workers - 1; i >= 0; --i) {
+ AVxWorker *const worker = ppi->p_mt_info.p_workers[i];
+ if (!winterface->sync(worker)) {
+ had_error = 1;
+ error = ppi->parallel_cpi[i]->common.error;
+ }
+ }
+
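+ // Restore the workers' states before reporting any error: when had_error is
+ // set, aom_internal_error() below does a longjmp(), so the restore must
+ // happen before it is called.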
+ restore_workers_after_fpmt(ppi, frames_in_parallel_set);
+
+ if (had_error)
+ aom_internal_error(&ppi->error, error->error_code, "%s", error->detail);
+}
+
static int get_compressed_data_hook(void *arg1, void *arg2) {
AV1_COMP *cpi = (AV1_COMP *)arg1;
AV1_COMP_DATA *cpi_data = (AV1_COMP_DATA *)arg2;
@@ -1236,8 +1426,7 @@ int av1_compress_parallel_frames(AV1_PRIMARY *const ppi,
prepare_fpmt_workers(ppi, first_cpi_data, get_compressed_data_hook,
frames_in_parallel_set);
launch_fpmt_workers(ppi);
- sync_fpmt_workers(ppi);
- restore_workers_after_fpmt(ppi, frames_in_parallel_set);
+ sync_fpmt_workers(ppi, frames_in_parallel_set);
// Release cpi->scaled_ref_buf corresponding to frames in the current parallel
// encode set.
@@ -1254,6 +1443,7 @@ static AOM_INLINE void launch_workers(MultiThreadInfo *const mt_info,
const AVxWorkerInterface *const winterface = aom_get_worker_interface();
for (int i = num_workers - 1; i >= 0; i--) {
AVxWorker *const worker = &mt_info->workers[i];
+ worker->had_error = 0;
if (i == 0)
winterface->execute(worker);
else
@@ -1264,17 +1454,33 @@ static AOM_INLINE void launch_workers(MultiThreadInfo *const mt_info,
static AOM_INLINE void sync_enc_workers(MultiThreadInfo *const mt_info,
AV1_COMMON *const cm, int num_workers) {
const AVxWorkerInterface *const winterface = aom_get_worker_interface();
- int had_error = mt_info->workers[0].had_error;
+ const AVxWorker *const worker_main = &mt_info->workers[0];
+ int had_error = worker_main->had_error;
+ struct aom_internal_error_info error_info;
+
+ // Read the error_info of main thread.
+ if (had_error) {
+ error_info = ((EncWorkerData *)worker_main->data1)->error_info;
+ }
// Encoding ends.
for (int i = num_workers - 1; i > 0; i--) {
AVxWorker *const worker = &mt_info->workers[i];
- had_error |= !winterface->sync(worker);
+ if (!winterface->sync(worker)) {
+ had_error = 1;
+ error_info = ((EncWorkerData *)worker->data1)->error_info;
+ }
}
if (had_error)
- aom_internal_error(cm->error, AOM_CODEC_ERROR,
- "Failed to encode tile data");
+ aom_internal_error(cm->error, error_info.error_code, "%s",
+ error_info.detail);
+
+ // Restore xd->error_info of the main thread back to cm->error so that the
+ // multithreaded code, when executed using a single thread, has a valid
+ // xd->error_info.
+ MACROBLOCKD *const xd = &((EncWorkerData *)worker_main->data1)->td->mb.e_mbd;
+ xd->error_info = cm->error;
}
static AOM_INLINE void accumulate_counters_enc_workers(AV1_COMP *cpi,
@@ -1292,13 +1498,15 @@ static AOM_INLINE void accumulate_counters_enc_workers(AV1_COMP *cpi,
// Keep these conditional expressions in sync with the corresponding ones
// in prepare_enc_workers().
if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
- aom_free(thread_data->td->mb.mv_costs);
+ aom_free(thread_data->td->mv_costs_alloc);
+ thread_data->td->mv_costs_alloc = NULL;
}
if (cpi->sf.intra_sf.dv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
- aom_free(thread_data->td->mb.dv_costs);
+ aom_free(thread_data->td->dv_costs_alloc);
+ thread_data->td->dv_costs_alloc = NULL;
}
}
- av1_dealloc_mb_data(&cpi->common, &thread_data->td->mb);
+ av1_dealloc_mb_data(&thread_data->td->mb, av1_num_planes(&cpi->common));
// Accumulate counters.
if (i > 0) {
@@ -1362,8 +1570,10 @@ static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
// Keep these conditional expressions in sync with the corresponding ones
// in accumulate_counters_enc_workers().
if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
- CHECK_MEM_ERROR(cm, thread_data->td->mb.mv_costs,
- (MvCosts *)aom_malloc(sizeof(MvCosts)));
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->mv_costs_alloc,
+ (MvCosts *)aom_malloc(sizeof(*thread_data->td->mv_costs_alloc)));
+ thread_data->td->mb.mv_costs = thread_data->td->mv_costs_alloc;
memcpy(thread_data->td->mb.mv_costs, cpi->td.mb.mv_costs,
sizeof(MvCosts));
}
@@ -1373,8 +1583,10 @@ static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
// aom_free() call for the same.
thread_data->td->mb.dv_costs = NULL;
if (av1_need_dv_costs(cpi)) {
- CHECK_MEM_ERROR(cm, thread_data->td->mb.dv_costs,
- (IntraBCMVCosts *)aom_malloc(sizeof(IntraBCMVCosts)));
+ CHECK_MEM_ERROR(cm, thread_data->td->dv_costs_alloc,
+ (IntraBCMVCosts *)aom_malloc(
+ sizeof(*thread_data->td->dv_costs_alloc)));
+ thread_data->td->mb.dv_costs = thread_data->td->dv_costs_alloc;
memcpy(thread_data->td->mb.dv_costs, cpi->td.mb.dv_costs,
sizeof(IntraBCMVCosts));
}
@@ -1438,26 +1650,17 @@ static AOM_INLINE void fp_prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
thread_data->td = thread_data->original_td;
}
- // Before encoding a frame, copy the thread data from cpi.
if (thread_data->td != &cpi->td) {
+ // Before encoding a frame, copy the thread data from cpi.
thread_data->td->mb = cpi->td.mb;
- // Keep this conditional expression in sync with the corresponding one
- // in av1_fp_encode_tiles_row_mt().
- if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
- CHECK_MEM_ERROR(cm, thread_data->td->mb.mv_costs,
- (MvCosts *)aom_malloc(sizeof(MvCosts)));
- memcpy(thread_data->td->mb.mv_costs, cpi->td.mb.mv_costs,
- sizeof(MvCosts));
- }
+ av1_alloc_src_diff_buf(cm, &thread_data->td->mb);
}
-
- av1_alloc_mb_data(cpi, &thread_data->td->mb);
}
}
#endif
// Computes the number of workers for row multi-threading of encoding stage
-static AOM_INLINE int compute_num_enc_row_mt_workers(AV1_COMMON *const cm,
+static AOM_INLINE int compute_num_enc_row_mt_workers(const AV1_COMMON *cm,
int max_threads) {
TileInfo tile_info;
const int tile_cols = cm->tiles.cols;
@@ -1476,7 +1679,7 @@ static AOM_INLINE int compute_num_enc_row_mt_workers(AV1_COMMON *const cm,
}
// Computes the number of workers for tile multi-threading of encoding stage
-static AOM_INLINE int compute_num_enc_tile_mt_workers(AV1_COMMON *const cm,
+static AOM_INLINE int compute_num_enc_tile_mt_workers(const AV1_COMMON *cm,
int max_threads) {
const int tile_cols = cm->tiles.cols;
const int tile_rows = cm->tiles.rows;
@@ -1494,7 +1697,7 @@ int av1_get_max_num_workers(const AV1_COMP *cpi) {
}
// Computes the number of workers for encoding stage (row/tile multi-threading)
-int av1_compute_num_enc_workers(AV1_COMP *cpi, int max_workers) {
+int av1_compute_num_enc_workers(const AV1_COMP *cpi, int max_workers) {
if (max_workers <= 1) return 1;
if (cpi->oxcf.row_mt)
return compute_num_enc_row_mt_workers(&cpi->common, max_workers);
@@ -1752,6 +1955,15 @@ void av1_encode_tiles_row_mt(AV1_COMP *cpi) {
}
#if !CONFIG_REALTIME_ONLY
+static void dealloc_thread_data_src_diff_buf(AV1_COMP *cpi, int num_workers) {
+ for (int i = num_workers - 1; i >= 0; --i) {
+ EncWorkerData *const thread_data = &cpi->mt_info.tile_thr_data[i];
+ if (thread_data->td != &cpi->td)
+ av1_dealloc_src_diff_buf(&thread_data->td->mb,
+ av1_num_planes(&cpi->common));
+ }
+}
+
void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
MultiThreadInfo *const mt_info = &cpi->mt_info;
@@ -1814,18 +2026,7 @@ void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi) {
fp_prepare_enc_workers(cpi, fp_enc_row_mt_worker_hook, num_workers);
launch_workers(&cpi->mt_info, num_workers);
sync_enc_workers(&cpi->mt_info, cm, num_workers);
- for (int i = num_workers - 1; i >= 0; i--) {
- EncWorkerData *const thread_data = &cpi->mt_info.tile_thr_data[i];
- if (thread_data->td != &cpi->td) {
- // Keep this conditional expression in sync with the corresponding one
- // in fp_prepare_enc_workers().
- if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
- aom_free(thread_data->td->mb.mv_costs);
- }
- assert(!thread_data->td->mb.dv_costs);
- }
- av1_dealloc_mb_data(cm, &thread_data->td->mb);
- }
+ dealloc_thread_data_src_diff_buf(cpi, num_workers);
}
void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
@@ -1894,6 +2095,27 @@ void av1_tpl_row_mt_sync_write(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r,
#endif // CONFIG_MULTITHREAD
}
+static AOM_INLINE void set_mode_estimation_done(AV1_COMP *cpi) {
+ const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ const BLOCK_SIZE bsize =
+ convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
+ const int mi_height = mi_size_high[bsize];
+ AV1TplRowMultiThreadInfo *const tpl_row_mt = &cpi->mt_info.tpl_row_mt;
+ const int tplb_cols_in_tile =
+ ROUND_POWER_OF_TWO(mi_params->mi_cols, mi_size_wide_log2[bsize]);
+ // In case of tpl row-multithreading, due to top-right dependency, the worker
+ // on an mb_row waits for the completion of the tpl processing of the top and
+ // top-right blocks. Hence, if a thread (main/worker) encounters an error,
+ // mark the tpl processing of every mb_row in the frame as complete to avoid
+ // dependent workers waiting indefinitely.
+ for (int mi_row = 0, tplb_row = 0; mi_row < mi_params->mi_rows;
+ mi_row += mi_height, tplb_row++) {
+ (*tpl_row_mt->sync_write_ptr)(&tpl_data->tpl_mt_sync, tplb_row,
+ tplb_cols_in_tile - 1, tplb_cols_in_tile);
+ }
+}
+
// Each worker calls tpl_worker_hook() and computes the tpl data.
static int tpl_worker_hook(void *arg1, void *unused) {
(void)unused;
@@ -1903,11 +2125,36 @@ static int tpl_worker_hook(void *arg1, void *unused) {
MACROBLOCK *x = &thread_data->td->mb;
MACROBLOCKD *xd = &x->e_mbd;
TplTxfmStats *tpl_txfm_stats = &thread_data->td->tpl_txfm_stats;
+ TplBuffers *tpl_tmp_buffers = &thread_data->td->tpl_tmp_buffers;
CommonModeInfoParams *mi_params = &cm->mi_params;
+ int num_active_workers = cpi->ppi->tpl_data.tpl_mt_sync.num_threads_working;
+
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+ AV1TplRowMultiThreadInfo *const tpl_row_mt = &cpi->mt_info.tpl_row_mt;
+ (void)tpl_row_mt;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *tpl_error_mutex_ = tpl_row_mt->mutex_;
+#endif
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(tpl_error_mutex_);
+ tpl_row_mt->tpl_mt_exit = true;
+ pthread_mutex_unlock(tpl_error_mutex_);
+#endif
+ set_mode_estimation_done(cpi);
+ return 0;
+ }
+ error_info->setjmp = 1;
+
BLOCK_SIZE bsize = convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
TX_SIZE tx_size = max_txsize_lookup[bsize];
int mi_height = mi_size_high[bsize];
- int num_active_workers = cpi->ppi->tpl_data.tpl_mt_sync.num_threads_working;
av1_init_tpl_txfm_stats(tpl_txfm_stats);
@@ -1919,8 +2166,10 @@ static int tpl_worker_hook(void *arg1, void *unused) {
xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
xd->mb_to_bottom_edge =
GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
- av1_mc_flow_dispenser_row(cpi, tpl_txfm_stats, x, mi_row, bsize, tx_size);
+ av1_mc_flow_dispenser_row(cpi, tpl_txfm_stats, tpl_tmp_buffers, x, mi_row,
+ bsize, tx_size);
}
+ error_info->setjmp = 0;
return 1;
}
@@ -2005,6 +2254,11 @@ static AOM_INLINE void prepare_tpl_workers(AV1_COMP *cpi, AVxWorkerHook hook,
// OBMC buffers are used only to init MS params and remain unused when
// called from tpl, hence set the buffers to defaults.
av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer);
+ if (!tpl_alloc_temp_buffers(&thread_data->td->tpl_tmp_buffers,
+ cpi->ppi->tpl_data.tpl_bsize_1d)) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+ "Error allocating tpl data");
+ }
thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
}
@@ -2056,6 +2310,11 @@ void av1_mc_flow_dispenser_mt(AV1_COMP *cpi) {
#if CONFIG_BITRATE_ACCURACY
tpl_accumulate_txfm_stats(&cpi->td, &cpi->mt_info, num_workers);
#endif // CONFIG_BITRATE_ACCURACY
+ for (int i = num_workers - 1; i >= 0; i--) {
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+ ThreadData *td = thread_data->td;
+ if (td != &cpi->td) tpl_dealloc_temp_buffers(&td->tpl_tmp_buffers);
+ }
}
// Deallocate memory for temporal filter multi-thread synchronization.
@@ -2079,7 +2338,7 @@ static AOM_INLINE int tf_get_next_job(AV1TemporalFilterSync *tf_mt_sync,
pthread_mutex_t *tf_mutex_ = tf_mt_sync->mutex_;
pthread_mutex_lock(tf_mutex_);
#endif
- if (tf_mt_sync->next_tf_row < mb_rows) {
+ if (!tf_mt_sync->tf_mt_exit && tf_mt_sync->next_tf_row < mb_rows) {
*current_mb_row = tf_mt_sync->next_tf_row;
tf_mt_sync->next_tf_row++;
do_next_row = 1;
@@ -2099,6 +2358,28 @@ static int tf_worker_hook(void *arg1, void *unused) {
TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
AV1TemporalFilterSync *tf_sync = &cpi->mt_info.tf_sync;
const struct scale_factors *scale = &cpi->tf_ctx.sf;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *tf_mutex_ = tf_sync->mutex_;
+#endif
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(tf_mutex_);
+ tf_sync->tf_mt_exit = true;
+ pthread_mutex_unlock(tf_mutex_);
+#endif
+ return 0;
+ }
+ error_info->setjmp = 1;
+
const int num_planes = av1_num_planes(&cpi->common);
assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
@@ -2115,6 +2396,7 @@ static int tf_worker_hook(void *arg1, void *unused) {
tf_restore_state(mbd, input_mb_mode_info, input_buffer, num_planes);
+ error_info->setjmp = 0;
return 1;
}
@@ -2123,6 +2405,7 @@ static void prepare_tf_workers(AV1_COMP *cpi, AVxWorkerHook hook,
int num_workers, int is_highbitdepth) {
MultiThreadInfo *mt_info = &cpi->mt_info;
mt_info->tf_sync.next_tf_row = 0;
+ mt_info->tf_sync.tf_mt_exit = false;
for (int i = num_workers - 1; i >= 0; i--) {
AVxWorker *worker = &mt_info->workers[i];
EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
@@ -2227,19 +2510,6 @@ static AOM_INLINE void switch_direction(AV1_COMP *cpi, int *frame_idx,
get_next_gm_job(cpi, frame_idx, *(cur_dir));
}
-// Initializes inliers, num_inliers and segment_map.
-static AOM_INLINE void init_gm_thread_data(
- const GlobalMotionInfo *gm_info, GlobalMotionThreadData *thread_data) {
- for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
- MotionModel motion_params = thread_data->motion_models[m];
- av1_zero(motion_params.params);
- motion_params.num_inliers = 0;
- }
-
- av1_zero_array(thread_data->segment_map,
- gm_info->segment_map_w * gm_info->segment_map_h);
-}
-
// Hook function for each thread in global motion multi-threading.
static int gm_mt_worker_hook(void *arg1, void *unused) {
(void)unused;
@@ -2247,16 +2517,34 @@ static int gm_mt_worker_hook(void *arg1, void *unused) {
EncWorkerData *thread_data = (EncWorkerData *)arg1;
AV1_COMP *cpi = thread_data->cpi;
GlobalMotionInfo *gm_info = &cpi->gm_info;
- MultiThreadInfo *mt_info = &cpi->mt_info;
- JobInfo *job_info = &mt_info->gm_sync.job_info;
+ AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync;
+ JobInfo *job_info = &gm_sync->job_info;
int thread_id = thread_data->thread_id;
- GlobalMotionThreadData *gm_thread_data =
- &mt_info->gm_sync.thread_data[thread_id];
- int cur_dir = job_info->thread_id_to_dir[thread_id];
+ GlobalMotionData *gm_thread_data = &thread_data->td->gm_data;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *gm_mt_mutex_ = gm_sync->mutex_;
+#endif
+
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
#if CONFIG_MULTITHREAD
- pthread_mutex_t *gm_mt_mutex_ = mt_info->gm_sync.mutex_;
+ pthread_mutex_lock(gm_mt_mutex_);
+ gm_sync->gm_mt_exit = true;
+ pthread_mutex_unlock(gm_mt_mutex_);
#endif
+ return 0;
+ }
+ error_info->setjmp = 1;
+ int cur_dir = job_info->thread_id_to_dir[thread_id];
+ bool gm_mt_exit = false;
while (1) {
int ref_buf_idx = -1;
@@ -2264,9 +2552,10 @@ static int gm_mt_worker_hook(void *arg1, void *unused) {
pthread_mutex_lock(gm_mt_mutex_);
#endif
+ gm_mt_exit = gm_sync->gm_mt_exit;
// Populates ref_buf_idx(the reference frame type) for which global motion
// estimation will be done.
- if (!get_next_gm_job(cpi, &ref_buf_idx, cur_dir)) {
+ if (!gm_mt_exit && !get_next_gm_job(cpi, &ref_buf_idx, cur_dir)) {
// No jobs are available for the current direction. Switch
// to other direction and get the next job, if available.
switch_direction(cpi, &ref_buf_idx, &cur_dir);
@@ -2276,15 +2565,15 @@ static int gm_mt_worker_hook(void *arg1, void *unused) {
pthread_mutex_unlock(gm_mt_mutex_);
#endif
- if (ref_buf_idx == -1) break;
-
- init_gm_thread_data(gm_info, gm_thread_data);
+ // When gm_mt_exit is set to true, other workers need not pursue any
+ // further jobs.
+ if (gm_mt_exit || ref_buf_idx == -1) break;
// Compute global motion for the given ref_buf_idx.
av1_compute_gm_for_valid_ref_frames(
- cpi, gm_info->ref_buf, ref_buf_idx, gm_thread_data->motion_models,
- gm_thread_data->segment_map, gm_info->segment_map_w,
- gm_info->segment_map_h);
+ cpi, error_info, gm_info->ref_buf, ref_buf_idx,
+ gm_thread_data->motion_models, gm_thread_data->segment_map,
+ gm_info->segment_map_w, gm_info->segment_map_h);
#if CONFIG_MULTITHREAD
pthread_mutex_lock(gm_mt_mutex_);
@@ -2300,6 +2589,7 @@ static int gm_mt_worker_hook(void *arg1, void *unused) {
pthread_mutex_unlock(gm_mt_mutex_);
#endif
}
+ error_info->setjmp = 0;
return 1;
}
@@ -2307,6 +2597,7 @@ static int gm_mt_worker_hook(void *arg1, void *unused) {
static AOM_INLINE void prepare_gm_workers(AV1_COMP *cpi, AVxWorkerHook hook,
int num_workers) {
MultiThreadInfo *mt_info = &cpi->mt_info;
+ mt_info->gm_sync.gm_mt_exit = false;
for (int i = num_workers - 1; i >= 0; i--) {
AVxWorker *worker = &mt_info->workers[i];
EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
@@ -2325,6 +2616,9 @@ static AOM_INLINE void prepare_gm_workers(AV1_COMP *cpi, AVxWorkerHook hook,
} else {
thread_data->td = thread_data->original_td;
}
+
+ if (thread_data->td != &cpi->td)
+ gm_alloc_data(cpi, &thread_data->td->gm_data);
}
}
@@ -2351,69 +2645,28 @@ static AOM_INLINE int compute_gm_workers(const AV1_COMP *cpi) {
}
// Frees the memory allocated for each worker in global motion multi-threading.
-void av1_gm_dealloc(AV1GlobalMotionSync *gm_sync_data) {
- if (gm_sync_data->thread_data != NULL) {
- for (int j = 0; j < gm_sync_data->allocated_workers; j++) {
- GlobalMotionThreadData *thread_data = &gm_sync_data->thread_data[j];
- aom_free(thread_data->segment_map);
-
- for (int m = 0; m < RANSAC_NUM_MOTIONS; m++)
- aom_free(thread_data->motion_models[m].inliers);
- }
- aom_free(gm_sync_data->thread_data);
- }
-}
-
-// Allocates memory for inliers and segment_map for each worker in global motion
-// multi-threading.
-static AOM_INLINE void gm_alloc(AV1_COMP *cpi, int num_workers) {
- AV1_COMMON *cm = &cpi->common;
- AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync;
- GlobalMotionInfo *gm_info = &cpi->gm_info;
-
- gm_sync->allocated_workers = num_workers;
- gm_sync->allocated_width = cpi->source->y_width;
- gm_sync->allocated_height = cpi->source->y_height;
-
- CHECK_MEM_ERROR(cm, gm_sync->thread_data,
- aom_malloc(sizeof(*gm_sync->thread_data) * num_workers));
-
- for (int i = 0; i < num_workers; i++) {
- GlobalMotionThreadData *thread_data = &gm_sync->thread_data[i];
- CHECK_MEM_ERROR(
- cm, thread_data->segment_map,
- aom_malloc(sizeof(*thread_data->segment_map) * gm_info->segment_map_w *
- gm_info->segment_map_h));
-
- for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
- CHECK_MEM_ERROR(
- cm, thread_data->motion_models[m].inliers,
- aom_malloc(sizeof(*thread_data->motion_models[m].inliers) * 2 *
- MAX_CORNERS));
- }
+static AOM_INLINE void gm_dealloc_thread_data(AV1_COMP *cpi, int num_workers) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ for (int j = 0; j < num_workers; j++) {
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[j];
+ ThreadData *td = thread_data->td;
+ if (td != &cpi->td) gm_dealloc_data(&td->gm_data);
}
}
// Implements multi-threading for global motion.
void av1_global_motion_estimation_mt(AV1_COMP *cpi) {
- AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync;
- JobInfo *job_info = &gm_sync->job_info;
+ JobInfo *job_info = &cpi->mt_info.gm_sync.job_info;
av1_zero(*job_info);
int num_workers = compute_gm_workers(cpi);
- if (num_workers > gm_sync->allocated_workers ||
- cpi->source->y_width != gm_sync->allocated_width ||
- cpi->source->y_height != gm_sync->allocated_height) {
- av1_gm_dealloc(gm_sync);
- gm_alloc(cpi, num_workers);
- }
-
assign_thread_to_dir(job_info->thread_id_to_dir, num_workers);
prepare_gm_workers(cpi, gm_mt_worker_hook, num_workers);
launch_workers(&cpi->mt_info, num_workers);
sync_enc_workers(&cpi->mt_info, &cpi->common, num_workers);
+ gm_dealloc_thread_data(cpi, num_workers);
}
#endif // !CONFIG_REALTIME_ONLY
@@ -2455,6 +2708,7 @@ static AOM_INLINE void prepare_wiener_var_workers(AV1_COMP *const cpi,
if (thread_data->td != &cpi->td) {
thread_data->td->mb = cpi->td.mb;
+ av1_alloc_mb_wiener_var_pred_buf(&cpi->common, thread_data->td);
}
}
}
@@ -2496,7 +2750,8 @@ static int cal_mb_wiener_var_hook(void *arg1, void *unused) {
// TODO(chengchen): properly accumulate the distortion and rate.
av1_calc_mb_wiener_var_row(cpi, x, xd, current_mi_row, src_diff, coeff,
qcoeff, dqcoeff, &sum_rec_distortion,
- &sum_est_rate);
+ &sum_est_rate,
+ thread_data->td->wiener_tmp_pred_buf);
#if CONFIG_MULTITHREAD
pthread_mutex_lock(enc_row_mt_mutex_);
#endif
@@ -2508,6 +2763,17 @@ static int cal_mb_wiener_var_hook(void *arg1, void *unused) {
return 1;
}
+static void dealloc_mb_wiener_var_mt_data(AV1_COMP *cpi, int num_workers) {
+ av1_row_mt_sync_mem_dealloc(&cpi->ppi->intra_row_mt_sync);
+
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ for (int j = 0; j < num_workers; ++j) {
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[j];
+ ThreadData *td = thread_data->td;
+ if (td != &cpi->td) av1_dealloc_mb_wiener_var_pred_buf(td);
+ }
+}
+
// This function is the multi-threading version of computing the wiener
// variance.
// Note that the wiener variance is used for allintra mode (1 pass) and its
@@ -2537,8 +2803,7 @@ void av1_calc_mb_wiener_var_mt(AV1_COMP *cpi, int num_workers,
prepare_wiener_var_workers(cpi, cal_mb_wiener_var_hook, num_workers);
launch_workers(mt_info, num_workers);
sync_enc_workers(mt_info, cm, num_workers);
-
- row_mt_sync_mem_dealloc(intra_row_mt_sync);
+ dealloc_mb_wiener_var_mt_data(cpi, num_workers);
}
// Compare and order tiles based on absolute sum of tx coeffs.
@@ -2713,14 +2978,40 @@ static int pack_bs_worker_hook(void *arg1, void *arg2) {
const CommonTileParams *const tiles = &cm->tiles;
const int num_tiles = tiles->cols * tiles->rows;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *const pack_bs_mutex = pack_bs_sync->mutex_;
+#endif
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pack_bs_mutex);
+ pack_bs_sync->pack_bs_mt_exit = true;
+ pthread_mutex_unlock(pack_bs_mutex);
+#endif
+ return 0;
+ }
+ error_info->setjmp = 1;
+
while (1) {
#if CONFIG_MULTITHREAD
- pthread_mutex_lock(pack_bs_sync->mutex_);
+ pthread_mutex_lock(pack_bs_mutex);
#endif
- const int tile_idx = get_next_pack_bs_tile_idx(pack_bs_sync, num_tiles);
+ const int tile_idx =
+ pack_bs_sync->pack_bs_mt_exit
+ ? -1
+ : get_next_pack_bs_tile_idx(pack_bs_sync, num_tiles);
#if CONFIG_MULTITHREAD
- pthread_mutex_unlock(pack_bs_sync->mutex_);
+ pthread_mutex_unlock(pack_bs_mutex);
#endif
+ // When pack_bs_mt_exit is set to true, other workers need not pursue any
+ // further jobs.
if (tile_idx == -1) break;
TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx;
@@ -2728,6 +3019,7 @@ static int pack_bs_worker_hook(void *arg1, void *arg2) {
av1_pack_tile_info(cpi, thread_data->td, &pack_bs_params[tile_idx]);
}
+ error_info->setjmp = 0;
return 1;
}
@@ -2902,8 +3194,9 @@ static AOM_INLINE void cdef_reset_job_info(AV1CdefSync *cdef_sync) {
// populates next job information and returns 1, else returns 0.
static AOM_INLINE int cdef_get_next_job(AV1CdefSync *cdef_sync,
CdefSearchCtx *cdef_search_ctx,
- int *cur_fbr, int *cur_fbc,
- int *sb_count) {
+ volatile int *cur_fbr,
+ volatile int *cur_fbc,
+ volatile int *sb_count) {
#if CONFIG_MULTITHREAD
pthread_mutex_lock(cdef_sync->mutex_);
#endif // CONFIG_MULTITHREAD
@@ -2913,15 +3206,15 @@ static AOM_INLINE int cdef_get_next_job(AV1CdefSync *cdef_sync,
// If a block is skip, do not process the block and
// check the skip condition for the next block.
- while ((!cdef_sync->end_of_frame) &&
- (cdef_sb_skip(cdef_search_ctx->mi_params, cdef_sync->fbr,
- cdef_sync->fbc))) {
+ while (!cdef_sync->cdef_mt_exit && !cdef_sync->end_of_frame &&
+ cdef_sb_skip(cdef_search_ctx->mi_params, cdef_sync->fbr,
+ cdef_sync->fbc)) {
update_next_job_info(cdef_sync, nvfb, nhfb);
}
// Populates information needed for current job and update the row,
// column indices of the next block to be processed.
- if (cdef_sync->end_of_frame == 0) {
+ if (!cdef_sync->cdef_mt_exit && cdef_sync->end_of_frame == 0) {
do_next_block = 1;
*cur_fbr = cdef_sync->fbr;
*cur_fbc = cdef_sync->fbc;
@@ -2937,43 +3230,68 @@ static AOM_INLINE int cdef_get_next_job(AV1CdefSync *cdef_sync,
// Hook function for each thread in CDEF search multi-threading.
static int cdef_filter_block_worker_hook(void *arg1, void *arg2) {
- AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg1;
- CdefSearchCtx *cdef_search_ctx = (CdefSearchCtx *)arg2;
- int cur_fbr, cur_fbc, sb_count;
+ EncWorkerData *thread_data = (EncWorkerData *)arg1;
+ AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg2;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *cdef_mutex_ = cdef_sync->mutex_;
+#endif
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ CdefSearchCtx *cdef_search_ctx = thread_data->cpi->cdef_search_ctx;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(cdef_mutex_);
+ cdef_sync->cdef_mt_exit = true;
+ pthread_mutex_unlock(cdef_mutex_);
+#endif
+ return 0;
+ }
+ error_info->setjmp = 1;
+
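+ // These locals are modified between the setjmp() call above and a potential
+ // longjmp() out of av1_cdef_mse_calc_block(); per C11 7.13.2.1, automatic
+ // objects changed in that window must be volatile-qualified to have
+ // well-defined values when setjmp() returns again.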
+ volatile int cur_fbr, cur_fbc, sb_count;
while (cdef_get_next_job(cdef_sync, cdef_search_ctx, &cur_fbr, &cur_fbc,
&sb_count)) {
- av1_cdef_mse_calc_block(cdef_search_ctx, cur_fbr, cur_fbc, sb_count);
+ av1_cdef_mse_calc_block(cdef_search_ctx, error_info, cur_fbr, cur_fbc,
+ sb_count);
}
+ error_info->setjmp = 0;
return 1;
}
// Assigns CDEF search hook function and thread data to each worker.
-static void prepare_cdef_workers(MultiThreadInfo *mt_info,
- CdefSearchCtx *cdef_search_ctx,
- AVxWorkerHook hook, int num_workers) {
+static void prepare_cdef_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+ int num_workers) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
for (int i = num_workers - 1; i >= 0; i--) {
AVxWorker *worker = &mt_info->workers[i];
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+
+ thread_data->cpi = cpi;
worker->hook = hook;
- worker->data1 = &mt_info->cdef_sync;
- worker->data2 = cdef_search_ctx;
+ worker->data1 = thread_data;
+ worker->data2 = &mt_info->cdef_sync;
}
}
// Implements multi-threading for CDEF search.
-void av1_cdef_mse_calc_frame_mt(AV1_COMMON *cm, MultiThreadInfo *mt_info,
- CdefSearchCtx *cdef_search_ctx) {
+void av1_cdef_mse_calc_frame_mt(AV1_COMP *cpi) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
AV1CdefSync *cdef_sync = &mt_info->cdef_sync;
const int num_workers = mt_info->num_mod_workers[MOD_CDEF_SEARCH];
cdef_reset_job_info(cdef_sync);
- prepare_cdef_workers(mt_info, cdef_search_ctx, cdef_filter_block_worker_hook,
- num_workers);
+ prepare_cdef_workers(cpi, cdef_filter_block_worker_hook, num_workers);
launch_workers(mt_info, num_workers);
- sync_enc_workers(mt_info, cm, num_workers);
+ sync_enc_workers(mt_info, &cpi->common, num_workers);
}
// Computes num_workers for temporal filter multi-threading.
-static AOM_INLINE int compute_num_tf_workers(AV1_COMP *cpi) {
+static AOM_INLINE int compute_num_tf_workers(const AV1_COMP *cpi) {
// For single-pass encode, using no. of workers as per tf block size was not
// found to improve speed. Hence the thread assignment for single-pass encode
// is kept based on compute_num_enc_workers().
@@ -3058,11 +3376,10 @@ static int compute_num_mod_workers(AV1_COMP *cpi,
case MOD_AI:
if (cpi->oxcf.pass == AOM_RC_ONE_PASS) {
num_mod_workers = compute_num_ai_workers(cpi);
- break;
} else {
num_mod_workers = 0;
- break;
}
+ break;
default: assert(0); break;
}
return (num_mod_workers);
diff --git a/av1/encoder/ethread.h b/av1/encoder/ethread.h
index 6c4bce4db..f3f86292b 100644
--- a/av1/encoder/ethread.h
+++ b/av1/encoder/ethread.h
@@ -23,6 +23,7 @@ typedef struct EncWorkerData {
struct AV1_COMP *cpi;
struct ThreadData *td;
struct ThreadData *original_td;
+ struct aom_internal_error_info error_info;
AV1LfSync *lf_sync;
LFWorkerData *lf_data;
int start;
@@ -52,9 +53,9 @@ void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts,
void av1_row_mt_mem_dealloc(AV1_COMP *cpi);
-void av1_global_motion_estimation_mt(AV1_COMP *cpi);
+void av1_row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync);
-void av1_gm_dealloc(AV1GlobalMotionSync *gm_sync_data);
+void av1_global_motion_estimation_mt(AV1_COMP *cpi);
#if !CONFIG_REALTIME_ONLY
void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
@@ -87,6 +88,8 @@ int av1_get_max_num_workers(const AV1_COMP *cpi);
void av1_create_workers(AV1_PRIMARY *ppi, int num_workers);
+void av1_terminate_workers(AV1_PRIMARY *ppi);
+
void av1_init_frame_mt(AV1_PRIMARY *ppi, AV1_COMP *cpi);
void av1_init_cdef_worker(AV1_COMP *cpi);
@@ -99,13 +102,12 @@ void av1_init_lr_mt_buffers(AV1_COMP *cpi);
void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass);
#endif // CONFIG_MULTITHREAD
-int av1_get_num_mod_workers_for_alloc(PrimaryMultiThreadInfo *const p_mt_info,
+int av1_get_num_mod_workers_for_alloc(const PrimaryMultiThreadInfo *p_mt_info,
MULTI_THREADED_MODULES mod_name);
void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass);
-void av1_cdef_mse_calc_frame_mt(AV1_COMMON *cm, MultiThreadInfo *mt_info,
- CdefSearchCtx *cdef_search_ctx);
+void av1_cdef_mse_calc_frame_mt(AV1_COMP *cpi);
void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync);
@@ -116,7 +118,7 @@ void av1_write_tile_obu_mt(
unsigned int *max_tile_size, uint32_t *const obu_header_size,
uint8_t **tile_data_start, const int num_workers);
-int av1_compute_num_enc_workers(AV1_COMP *cpi, int max_workers);
+int av1_compute_num_enc_workers(const AV1_COMP *cpi, int max_workers);
int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf);
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index 1fad149e3..363111310 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -33,6 +33,7 @@
#include "av1/encoder/block.h"
#include "av1/encoder/dwt.h"
#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
#include "av1/encoder/encodemb.h"
#include "av1/encoder/encodemv.h"
#include "av1/encoder/encoder.h"
@@ -251,39 +252,69 @@ static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
// Refine the motion search range according to the frame dimension
// for the first-pass test.
-static int get_search_range(const InitialDimensions *initial_dimensions) {
+static int get_search_range(int width, int height) {
int sr = 0;
- const int dim = AOMMIN(initial_dimensions->width, initial_dimensions->height);
+ const int dim = AOMMIN(width, height);
while ((dim << sr) < MAX_FULL_PEL_VAL) ++sr;
return sr;
}
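Concretely, sr counts how many doublings of the smaller frame dimension are
needed to reach MAX_FULL_PEL_VAL. A standalone sketch of the same loop, with
the constant's value assumed here purely for illustration:

#include <stdio.h>

#define MAX_FULL_PEL_VAL 1023  // assumed value, for illustration only

static int get_search_range(int width, int height) {
  int sr = 0;
  const int dim = width < height ? width : height;
  while ((dim << sr) < MAX_FULL_PEL_VAL) ++sr;
  return sr;
}

int main(void) {
  // 480x270 source: 270 < 1023, 540 < 1023, 1080 >= 1023, so sr = 2.
  printf("%d\n", get_search_range(480, 270));
  return 0;
}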
+static AOM_INLINE const search_site_config *
+av1_get_first_pass_search_site_config(const AV1_COMP *cpi, MACROBLOCK *x,
+ SEARCH_METHODS search_method) {
+ const int ref_stride = x->e_mbd.plane[0].pre[0].stride;
+
+  // For AVIF applications, even the source frames can have changing
+  // resolution, so we need to check the strides manually.
+  // AV1_COMP::mv_search_params.search_site_config is a compressor-level cache
+  // that's shared by multiple threads. In most cases where all frames have the
+  // same resolution, the cache contains the search site config that we need.
+ const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params;
+ if (ref_stride == mv_search_params->search_site_cfg[SS_CFG_FPF]->stride) {
+ return mv_search_params->search_site_cfg[SS_CFG_FPF];
+ }
+
+  // If the cache does not contain the correct stride, then we will need to
+  // rely on the thread-level config MACROBLOCK::search_site_cfg_buf. If even
+  // the thread-level config doesn't match, then we need to update it.
+ search_method = search_method_lookup[search_method];
+ assert(search_method_lookup[search_method] == search_method &&
+ "The search_method_lookup table should be idempotent.");
+ if (ref_stride != x->search_site_cfg_buf[search_method].stride) {
+ av1_refresh_search_site_config(x->search_site_cfg_buf, search_method,
+ ref_stride);
+ }
+
+ return x->search_site_cfg_buf;
+}
+
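In other words, the lookup is a two-level cache: try the shared, read-only
compressor-level config first, and fall back to the mutable per-thread copy
only on a stride mismatch, rebuilding it when needed. A minimal sketch of that
fallback shape; all types and names below are illustrative placeholders, not
the libaom API:

typedef struct {
  int stride;
  // ... precomputed search sites for this stride ...
} SiteConfig;

typedef struct {
  const SiteConfig *shared_cfg;  // compressor-level cache, shared, read-only
  SiteConfig thread_cfg;         // per-thread fallback, safe to rebuild
} Searcher;

static void refresh_config(SiteConfig *cfg, int stride) {
  cfg->stride = stride;  // a real implementation would also rebuild the sites
}

static const SiteConfig *get_config(Searcher *s, int ref_stride) {
  // Fast path: the shared cache already matches this stride.
  if (ref_stride == s->shared_cfg->stride) return s->shared_cfg;
  // Slow path: use the thread-local copy, rebuilding it on mismatch.
  if (ref_stride != s->thread_cfg.stride)
    refresh_config(&s->thread_cfg, ref_stride);
  return &s->thread_cfg;
}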
static AOM_INLINE void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
const MV *ref_mv,
FULLPEL_MV *best_mv,
int *best_motion_err) {
+ AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv);
int tmp_err;
const BLOCK_SIZE bsize = xd->mi[0]->bsize;
const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
- const int sr = get_search_range(&cpi->initial_dimensions);
+ const int sr = get_search_range(cm->width, cm->height);
const int step_param = cpi->sf.fp_sf.reduce_mv_step_param + sr;
const search_site_config *first_pass_search_sites =
- cpi->mv_search_params.search_site_cfg[SS_CFG_FPF];
+ av1_get_first_pass_search_site_config(cpi, x, NSTEP);
const int fine_search_interval =
- cpi->is_screen_content_type && cpi->common.features.allow_intrabc;
+ cpi->is_screen_content_type && cm->features.allow_intrabc;
FULLPEL_MOTION_SEARCH_PARAMS ms_params;
av1_make_default_fullpel_ms_params(&ms_params, cpi, x, bsize, ref_mv,
- start_mv, first_pass_search_sites,
+ start_mv, first_pass_search_sites, NSTEP,
fine_search_interval);
- av1_set_mv_search_method(&ms_params, first_pass_search_sites, NSTEP);
FULLPEL_MV this_best_mv;
+ FULLPEL_MV_STATS best_mv_stats;
tmp_err = av1_full_pixel_search(start_mv, &ms_params, step_param, NULL,
- &this_best_mv, NULL);
+ &this_best_mv, &best_mv_stats, NULL);
if (tmp_err < INT_MAX) {
aom_variance_fn_ptr_t v_fn_ptr = cpi->ppi->fn_ptr[bsize];
@@ -744,9 +775,8 @@ static int firstpass_inter_prediction(
if ((current_frame->frame_number > 1) && golden_frame != NULL) {
FULLPEL_MV tmp_mv = kZeroFullMv;
// Assume 0,0 motion with no mv overhead.
- xd->plane[0].pre[0].buf = golden_frame->y_buffer + recon_yoffset;
- xd->plane[0].pre[0].stride = golden_frame->y_stride;
- xd->plane[0].pre[0].width = golden_frame->y_width;
+ av1_setup_pre_planes(xd, 0, golden_frame, 0, 0, NULL, 1);
+ xd->plane[0].pre[0].buf += recon_yoffset;
gf_motion_error =
get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize,
&x->plane[0].src, &xd->plane[0].pre[0]);
@@ -1032,9 +1062,11 @@ static void setup_firstpass_data(AV1_COMMON *const cm,
}
}
-static void free_firstpass_data(FirstPassData *firstpass_data) {
+void av1_free_firstpass_data(FirstPassData *firstpass_data) {
aom_free(firstpass_data->raw_motion_err_list);
+ firstpass_data->raw_motion_err_list = NULL;
aom_free(firstpass_data->mb_stats);
+ firstpass_data->mb_stats = NULL;
}
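Note that resetting the pointers to NULL after aom_free() makes this teardown
idempotent: freeing a NULL pointer is a no-op, so the now-exported function can
safely be called more than once, e.g. from both the normal path and an error
path, without a double free.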
int av1_get_unit_rows_in_tile(const TileInfo *tile,
@@ -1073,17 +1105,7 @@ static void first_pass_tiles(AV1_COMP *cpi, const BLOCK_SIZE fp_block_size) {
AV1_COMMON *const cm = &cpi->common;
const int tile_cols = cm->tiles.cols;
const int tile_rows = cm->tiles.rows;
- const int num_planes = av1_num_planes(&cpi->common);
- for (int plane = 0; plane < num_planes; plane++) {
- const int subsampling_xy =
- plane ? cm->seq_params->subsampling_x + cm->seq_params->subsampling_y
- : 0;
- const int sb_size = MAX_SB_SQUARE >> subsampling_xy;
- CHECK_MEM_ERROR(
- cm, cpi->td.mb.plane[plane].src_diff,
- (int16_t *)aom_memalign(
- 32, sizeof(*cpi->td.mb.plane[plane].src_diff) * sb_size));
- }
+
for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
TileDataEnc *const tile_data =
@@ -1091,12 +1113,6 @@ static void first_pass_tiles(AV1_COMP *cpi, const BLOCK_SIZE fp_block_size) {
first_pass_tile(cpi, &cpi->td, tile_data, fp_block_size);
}
}
- for (int plane = 0; plane < num_planes; plane++) {
- if (cpi->td.mb.plane[plane].src_diff) {
- aom_free(cpi->td.mb.plane[plane].src_diff);
- cpi->td.mb.plane[plane].src_diff = NULL;
- }
- }
}
void av1_first_pass_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
@@ -1187,6 +1203,16 @@ void av1_first_pass_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
enc_row_mt->sync_read_ptr(row_mt_sync, unit_row_in_tile, unit_col_in_tile);
+#if CONFIG_MULTITHREAD
+ if (cpi->ppi->p_mt_info.num_workers > 1) {
+ pthread_mutex_lock(enc_row_mt->mutex_);
+ bool firstpass_mt_exit = enc_row_mt->firstpass_mt_exit;
+ pthread_mutex_unlock(enc_row_mt->mutex_);
+ // Exit in case any worker has encountered an error.
+ if (firstpass_mt_exit) return;
+ }
+#endif
+
if (unit_col_in_tile == 0) {
last_mv = *first_top_mv;
}
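The hunk above is a cooperative-abort pattern: a worker that fails sets a
shared flag, and every other worker polls that flag, under the mutex that
guards it, at a safe synchronization point and returns early. A minimal
standalone sketch of the pattern, using plain pthreads and hypothetical names:

#include <pthread.h>
#include <stdbool.h>

typedef struct {
  pthread_mutex_t mutex;
  bool mt_exit;  // set once by the first worker that hits an error
} RowMtState;

// Called by a worker when it encounters an unrecoverable error.
static void signal_abort(RowMtState *state) {
  pthread_mutex_lock(&state->mutex);
  state->mt_exit = true;
  pthread_mutex_unlock(&state->mutex);
}

// Polled by every worker at a safe point (e.g. the top of each row);
// returns true if the worker should stop and return early.
static bool should_abort(RowMtState *state) {
  pthread_mutex_lock(&state->mutex);
  const bool exit_now = state->mt_exit;
  pthread_mutex_unlock(&state->mutex);
  return exit_now;
}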
@@ -1246,7 +1272,7 @@ void av1_noop_first_pass_frame(AV1_COMP *cpi, const int64_t ts_duration) {
setup_firstpass_data(cm, &cpi->firstpass_data, unit_rows, unit_cols);
FRAME_STATS *mb_stats = cpi->firstpass_data.mb_stats;
FRAME_STATS stats = accumulate_frame_stats(mb_stats, unit_rows, unit_cols);
- free_firstpass_data(&cpi->firstpass_data);
+ av1_free_firstpass_data(&cpi->firstpass_data);
update_firstpass_stats(cpi, &stats, 1.0, current_frame->frame_number,
ts_duration, BLOCK_16X16);
}
@@ -1365,6 +1391,7 @@ void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) {
av1_init_mode_probs(cm->fc);
av1_init_mv_probs(cm);
av1_initialize_rd_consts(cpi);
+ av1_alloc_src_diff_buf(cm, &cpi->td.mb);
enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy;
enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy;
@@ -1382,7 +1409,8 @@ void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) {
frame_is_intra_only(cm) ? 0 : unit_rows * unit_cols;
const double raw_err_stdev =
raw_motion_error_stdev(raw_motion_err_list, total_raw_motion_err_count);
- free_firstpass_data(&cpi->firstpass_data);
+ av1_free_firstpass_data(&cpi->firstpass_data);
+ av1_dealloc_src_diff_buf(&cpi->td.mb, av1_num_planes(cm));
  // Clamp the image start to rows/2. This number of rows is discarded at the
  // top and bottom as dead data, so rows/2 means the frame is blank.
diff --git a/av1/encoder/firstpass.h b/av1/encoder/firstpass.h
index e18e9e4b5..d01363a80 100644
--- a/av1/encoder/firstpass.h
+++ b/av1/encoder/firstpass.h
@@ -568,6 +568,8 @@ void av1_first_pass_row(struct AV1_COMP *cpi, struct ThreadData *td,
const BLOCK_SIZE fp_block_size);
void av1_end_first_pass(struct AV1_COMP *cpi);
+void av1_free_firstpass_data(FirstPassData *firstpass_data);
+
void av1_twopass_zero_stats(FIRSTPASS_STATS *section);
void av1_accumulate_stats(FIRSTPASS_STATS *section,
const FIRSTPASS_STATS *frame);
diff --git a/av1/encoder/global_motion.c b/av1/encoder/global_motion.c
index bc5e186c1..73910de12 100644
--- a/av1/encoder/global_motion.c
+++ b/av1/encoder/global_motion.c
@@ -30,6 +30,83 @@
// Border over which to compute the global motion
#define ERRORADV_BORDER 0
+/* clang-format off */
+// Error metric used for global motion evaluation.
+// For 8-bit input, the pixel error used to index this table will always
+// be between -255 and +255. But for 10- and 12-bit input, we use
+// interpolation, which means that we need to support indices of -256 and +256
+// as well.
+// Therefore, the table is offset so that logical index 0 corresponds to
+// error_measure_lut[256].
+const int error_measure_lut[513] = {
+ // pow 0.7
+ 16384, 16384, 16339, 16294, 16249, 16204, 16158, 16113,
+ 16068, 16022, 15977, 15932, 15886, 15840, 15795, 15749,
+ 15703, 15657, 15612, 15566, 15520, 15474, 15427, 15381,
+ 15335, 15289, 15242, 15196, 15149, 15103, 15056, 15010,
+ 14963, 14916, 14869, 14822, 14775, 14728, 14681, 14634,
+ 14587, 14539, 14492, 14445, 14397, 14350, 14302, 14254,
+ 14206, 14159, 14111, 14063, 14015, 13967, 13918, 13870,
+ 13822, 13773, 13725, 13676, 13628, 13579, 13530, 13481,
+ 13432, 13383, 13334, 13285, 13236, 13187, 13137, 13088,
+ 13038, 12988, 12939, 12889, 12839, 12789, 12739, 12689,
+ 12639, 12588, 12538, 12487, 12437, 12386, 12335, 12285,
+ 12234, 12183, 12132, 12080, 12029, 11978, 11926, 11875,
+ 11823, 11771, 11719, 11667, 11615, 11563, 11511, 11458,
+ 11406, 11353, 11301, 11248, 11195, 11142, 11089, 11036,
+ 10982, 10929, 10875, 10822, 10768, 10714, 10660, 10606,
+ 10552, 10497, 10443, 10388, 10333, 10279, 10224, 10168,
+ 10113, 10058, 10002, 9947, 9891, 9835, 9779, 9723,
+ 9666, 9610, 9553, 9497, 9440, 9383, 9326, 9268,
+ 9211, 9153, 9095, 9037, 8979, 8921, 8862, 8804,
+ 8745, 8686, 8627, 8568, 8508, 8449, 8389, 8329,
+ 8269, 8208, 8148, 8087, 8026, 7965, 7903, 7842,
+ 7780, 7718, 7656, 7593, 7531, 7468, 7405, 7341,
+ 7278, 7214, 7150, 7086, 7021, 6956, 6891, 6826,
+ 6760, 6695, 6628, 6562, 6495, 6428, 6361, 6293,
+ 6225, 6157, 6089, 6020, 5950, 5881, 5811, 5741,
+ 5670, 5599, 5527, 5456, 5383, 5311, 5237, 5164,
+ 5090, 5015, 4941, 4865, 4789, 4713, 4636, 4558,
+ 4480, 4401, 4322, 4242, 4162, 4080, 3998, 3916,
+ 3832, 3748, 3663, 3577, 3490, 3402, 3314, 3224,
+ 3133, 3041, 2948, 2854, 2758, 2661, 2562, 2461,
+ 2359, 2255, 2148, 2040, 1929, 1815, 1698, 1577,
+ 1452, 1323, 1187, 1045, 894, 731, 550, 339,
+ 0, 339, 550, 731, 894, 1045, 1187, 1323,
+ 1452, 1577, 1698, 1815, 1929, 2040, 2148, 2255,
+ 2359, 2461, 2562, 2661, 2758, 2854, 2948, 3041,
+ 3133, 3224, 3314, 3402, 3490, 3577, 3663, 3748,
+ 3832, 3916, 3998, 4080, 4162, 4242, 4322, 4401,
+ 4480, 4558, 4636, 4713, 4789, 4865, 4941, 5015,
+ 5090, 5164, 5237, 5311, 5383, 5456, 5527, 5599,
+ 5670, 5741, 5811, 5881, 5950, 6020, 6089, 6157,
+ 6225, 6293, 6361, 6428, 6495, 6562, 6628, 6695,
+ 6760, 6826, 6891, 6956, 7021, 7086, 7150, 7214,
+ 7278, 7341, 7405, 7468, 7531, 7593, 7656, 7718,
+ 7780, 7842, 7903, 7965, 8026, 8087, 8148, 8208,
+ 8269, 8329, 8389, 8449, 8508, 8568, 8627, 8686,
+ 8745, 8804, 8862, 8921, 8979, 9037, 9095, 9153,
+ 9211, 9268, 9326, 9383, 9440, 9497, 9553, 9610,
+ 9666, 9723, 9779, 9835, 9891, 9947, 10002, 10058,
+ 10113, 10168, 10224, 10279, 10333, 10388, 10443, 10497,
+ 10552, 10606, 10660, 10714, 10768, 10822, 10875, 10929,
+ 10982, 11036, 11089, 11142, 11195, 11248, 11301, 11353,
+ 11406, 11458, 11511, 11563, 11615, 11667, 11719, 11771,
+ 11823, 11875, 11926, 11978, 12029, 12080, 12132, 12183,
+ 12234, 12285, 12335, 12386, 12437, 12487, 12538, 12588,
+ 12639, 12689, 12739, 12789, 12839, 12889, 12939, 12988,
+ 13038, 13088, 13137, 13187, 13236, 13285, 13334, 13383,
+ 13432, 13481, 13530, 13579, 13628, 13676, 13725, 13773,
+ 13822, 13870, 13918, 13967, 14015, 14063, 14111, 14159,
+ 14206, 14254, 14302, 14350, 14397, 14445, 14492, 14539,
+ 14587, 14634, 14681, 14728, 14775, 14822, 14869, 14916,
+ 14963, 15010, 15056, 15103, 15149, 15196, 15242, 15289,
+ 15335, 15381, 15427, 15474, 15520, 15566, 15612, 15657,
+ 15703, 15749, 15795, 15840, 15886, 15932, 15977, 16022,
+ 16068, 16113, 16158, 16204, 16249, 16294, 16339, 16384,
+ 16384,
+};
+/* clang-format on */
+
int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost) {
return best_erroradvantage < erroradv_tr &&
best_erroradvantage * params_cost < erroradv_prod_tr;
@@ -110,15 +187,76 @@ static void force_wmtype(WarpedMotionParams *wm, TransformationType wmtype) {
}
#if CONFIG_AV1_HIGHBITDEPTH
-static int64_t highbd_warp_error(
- WarpedMotionParams *wm, const uint16_t *const ref, int width, int height,
- int stride, const uint16_t *const dst, int p_col, int p_row, int p_width,
- int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd,
- int64_t best_error, uint8_t *segment_map, int segment_map_stride) {
+static INLINE int generic_sad_highbd(const uint16_t *const ref, int ref_stride,
+ const uint16_t *const dst, int dst_stride,
+ int p_width, int p_height) {
+ // This function should only be called for patches smaller than
+  // This function should only be called for patches no larger than
+  // WARP_ERROR_BLOCK x WARP_ERROR_BLOCK. This keeps the number of pixels
+  // small enough that we don't need a 64-bit accumulator.
+
+ int sad = 0;
+ for (int i = 0; i < p_height; ++i) {
+ for (int j = 0; j < p_width; ++j) {
+ sad += abs(dst[j + i * dst_stride] - ref[j + i * ref_stride]);
+ }
+ }
+ return sad;
+}
+
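To see why a 32-bit accumulator suffices here: a full patch is at most
32 x 32 = 1024 pixels, and with 12-bit input each absolute difference is at
most 4095, so the worst-case sum is 1024 * 4095 = 4,193,280, far below
INT_MAX = 2,147,483,647. The same bound covers the lowbd generic_sad() later
in this file.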
+#if WARP_ERROR_BLOCK != 32
+#error "Need to change SAD call size in highbd_segmented_frame_error"
+#endif // WARP_ERROR_BLOCK != 32
+static int64_t highbd_segmented_frame_error(
+ const uint16_t *const ref, int ref_stride, const uint16_t *const dst,
+ int dst_stride, int p_width, int p_height, int bd, uint8_t *segment_map,
+ int segment_map_stride) {
+ (void)bd;
+ int patch_w, patch_h;
+ const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+ const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+ int64_t sum_error = 0;
+ for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
+ for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
+ int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+ int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+ // Only compute the error if this block contains inliers from the motion
+ // model
+ if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
+
+ // avoid computing error into the frame padding
+ patch_w = AOMMIN(error_bsize_w, p_width - j);
+ patch_h = AOMMIN(error_bsize_h, p_height - i);
+
+ if (patch_w == WARP_ERROR_BLOCK && patch_h == WARP_ERROR_BLOCK) {
+ sum_error += aom_highbd_sad32x32(
+ CONVERT_TO_BYTEPTR(ref + j + i * ref_stride), ref_stride,
+ CONVERT_TO_BYTEPTR(dst + j + i * dst_stride), dst_stride);
+ } else {
+ sum_error += generic_sad_highbd(ref + j + i * ref_stride, ref_stride,
+ dst + j + i * dst_stride, dst_stride,
+ patch_w, patch_h);
+ }
+ }
+ }
+ return sum_error;
+}
+
+#if WARP_ERROR_BLOCK != 32
+#error "Need to change SAD call size in highbd_warp_error"
+#endif // WARP_ERROR_BLOCK != 32
+static int64_t highbd_warp_error(WarpedMotionParams *wm,
+ const uint16_t *const ref, int ref_width,
+ int ref_height, int ref_stride,
+ const uint16_t *const dst, int dst_stride,
+ int p_col, int p_row, int p_width,
+ int p_height, int subsampling_x,
+ int subsampling_y, int bd, int64_t best_error,
+ uint8_t *segment_map, int segment_map_stride) {
int64_t gm_sumerr = 0;
const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
- uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
+ DECLARE_ALIGNED(32, uint16_t, tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]);
ConvolveParams conv_params = get_conv_params(0, 0, bd);
conv_params.use_dist_wtd_comp_avg = 0;
@@ -131,14 +269,22 @@ static int64_t highbd_warp_error(
if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
// avoid warping extra 8x8 blocks in the padded region of the frame
// when p_width and p_height are not multiples of WARP_ERROR_BLOCK
- const int warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
- const int warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
- highbd_warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w,
- warp_h, WARP_ERROR_BLOCK, subsampling_x, subsampling_y,
- bd, &conv_params);
- gm_sumerr += av1_calc_highbd_frame_error(tmp, WARP_ERROR_BLOCK,
- dst + j + i * p_stride, warp_w,
- warp_h, p_stride, bd);
+ const int warp_w = AOMMIN(error_bsize_w, p_col + ref_width - j);
+ const int warp_h = AOMMIN(error_bsize_h, p_row + ref_height - i);
+ highbd_warp_plane(wm, ref, ref_width, ref_height, ref_stride, tmp, j, i,
+ warp_w, warp_h, WARP_ERROR_BLOCK, subsampling_x,
+ subsampling_y, bd, &conv_params);
+
+ if (warp_w == WARP_ERROR_BLOCK && warp_h == WARP_ERROR_BLOCK) {
+ gm_sumerr += aom_highbd_sad32x32(
+ CONVERT_TO_BYTEPTR(tmp), WARP_ERROR_BLOCK,
+ CONVERT_TO_BYTEPTR(dst + j + i * dst_stride), dst_stride);
+ } else {
+ gm_sumerr +=
+ generic_sad_highbd(tmp, WARP_ERROR_BLOCK, dst + j + i * dst_stride,
+ dst_stride, warp_w, warp_h);
+ }
+
if (gm_sumerr > best_error) return INT64_MAX;
}
}
@@ -146,10 +292,67 @@ static int64_t highbd_warp_error(
}
#endif
+static INLINE int generic_sad(const uint8_t *const ref, int ref_stride,
+ const uint8_t *const dst, int dst_stride,
+ int p_width, int p_height) {
+  // This function should only be called for patches no larger than
+  // WARP_ERROR_BLOCK x WARP_ERROR_BLOCK. This keeps the number of pixels
+  // small enough that we don't need a 64-bit accumulator.
+ assert(p_width <= WARP_ERROR_BLOCK && p_height <= WARP_ERROR_BLOCK);
+
+ int sad = 0;
+ for (int i = 0; i < p_height; ++i) {
+ for (int j = 0; j < p_width; ++j) {
+ sad += abs(dst[j + i * dst_stride] - ref[j + i * ref_stride]);
+ }
+ }
+ return sad;
+}
+
+#if WARP_ERROR_BLOCK != 32
+#error "Need to change SAD call size in segmented_warp_error"
+#endif // WARP_ERROR_BLOCK != 32
+static int64_t segmented_frame_error(const uint8_t *const ref, int ref_stride,
+ const uint8_t *const dst, int dst_stride,
+ int p_width, int p_height,
+ uint8_t *segment_map,
+ int segment_map_stride) {
+ int patch_w, patch_h;
+ const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+ const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+ int64_t sum_error = 0;
+ for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
+ for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
+ int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+ int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+ // Only compute the error if this block contains inliers from the motion
+ // model
+ if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
+
+ // avoid computing error into the frame padding
+ patch_w = AOMMIN(error_bsize_w, p_width - j);
+ patch_h = AOMMIN(error_bsize_h, p_height - i);
+
+ if (patch_w == WARP_ERROR_BLOCK && patch_h == WARP_ERROR_BLOCK) {
+ sum_error += aom_sad32x32(ref + j + i * ref_stride, ref_stride,
+ dst + j + i * dst_stride, dst_stride);
+ } else {
+ sum_error +=
+ generic_sad(ref + j + i * ref_stride, ref_stride,
+ dst + j + i * dst_stride, dst_stride, patch_w, patch_h);
+ }
+ }
+ }
+ return sum_error;
+}
+
+#if WARP_ERROR_BLOCK != 32
+#error "Need to change SAD call size in warp_error"
+#endif // WARP_ERROR_BLOCK != 32
static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
- int width, int height, int stride,
- const uint8_t *const dst, int p_col, int p_row,
- int p_width, int p_height, int p_stride,
+ int ref_width, int ref_height, int ref_stride,
+ const uint8_t *const dst, int dst_stride, int p_col,
+ int p_row, int p_width, int p_height,
int subsampling_x, int subsampling_y,
int64_t best_error, uint8_t *segment_map,
int segment_map_stride) {
@@ -170,62 +373,72 @@ static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
// avoid warping extra 8x8 blocks in the padded region of the frame
// when p_width and p_height are not multiples of WARP_ERROR_BLOCK
- warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
- warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
- warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, warp_h,
- WARP_ERROR_BLOCK, subsampling_x, subsampling_y, &conv_params);
-
- gm_sumerr +=
- av1_calc_frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride,
- warp_w, warp_h, p_stride);
+ warp_w = AOMMIN(error_bsize_w, p_col + ref_width - j);
+ warp_h = AOMMIN(error_bsize_h, p_row + ref_height - i);
+ warp_plane(wm, ref, ref_width, ref_height, ref_stride, tmp, j, i, warp_w,
+ warp_h, WARP_ERROR_BLOCK, subsampling_x, subsampling_y,
+ &conv_params);
+
+ if (warp_w == WARP_ERROR_BLOCK && warp_h == WARP_ERROR_BLOCK) {
+ gm_sumerr += aom_sad32x32(tmp, WARP_ERROR_BLOCK,
+ dst + j + i * dst_stride, dst_stride);
+ } else {
+ gm_sumerr +=
+ generic_sad(tmp, WARP_ERROR_BLOCK, dst + j + i * dst_stride,
+ dst_stride, warp_w, warp_h);
+ }
+
if (gm_sumerr > best_error) return INT64_MAX;
}
}
return gm_sumerr;
}
+int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
+ int ref_stride, uint8_t *dst, int dst_stride,
+ int p_width, int p_height,
+ uint8_t *segment_map,
+ int segment_map_stride) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (use_hbd) {
+ return highbd_segmented_frame_error(
+ CONVERT_TO_SHORTPTR(ref), ref_stride, CONVERT_TO_SHORTPTR(dst),
+ dst_stride, p_width, p_height, bd, segment_map, segment_map_stride);
+ }
+#endif
+ (void)use_hbd;
+ (void)bd;
+ return segmented_frame_error(ref, ref_stride, dst, dst_stride, p_width,
+ p_height, segment_map, segment_map_stride);
+}
+
int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
- const uint8_t *ref, int width, int height, int stride,
- uint8_t *dst, int p_col, int p_row, int p_width,
- int p_height, int p_stride, int subsampling_x,
+ const uint8_t *ref, int ref_width, int ref_height,
+ int ref_stride, uint8_t *dst, int dst_stride, int p_col,
+ int p_row, int p_width, int p_height, int subsampling_x,
int subsampling_y, int64_t best_error,
uint8_t *segment_map, int segment_map_stride) {
- force_wmtype(wm, wm->wmtype);
- assert(wm->wmtype <= AFFINE);
if (!av1_get_shear_params(wm)) return INT64_MAX;
#if CONFIG_AV1_HIGHBITDEPTH
if (use_hbd)
- return highbd_warp_error(wm, CONVERT_TO_SHORTPTR(ref), width, height,
- stride, CONVERT_TO_SHORTPTR(dst), p_col, p_row,
- p_width, p_height, p_stride, subsampling_x,
- subsampling_y, bd, best_error, segment_map,
- segment_map_stride);
+ return highbd_warp_error(wm, CONVERT_TO_SHORTPTR(ref), ref_width,
+ ref_height, ref_stride, CONVERT_TO_SHORTPTR(dst),
+ dst_stride, p_col, p_row, p_width, p_height,
+ subsampling_x, subsampling_y, bd, best_error,
+ segment_map, segment_map_stride);
#endif
(void)use_hbd;
(void)bd;
- return warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width,
- p_height, p_stride, subsampling_x, subsampling_y,
- best_error, segment_map, segment_map_stride);
-}
-
-// Factors used to calculate the thresholds for av1_warp_error
-static double thresh_factors[GM_MAX_REFINEMENT_STEPS] = { 1.25, 1.20, 1.15,
- 1.10, 1.05 };
-
-static INLINE int64_t calc_approx_erroradv_threshold(
- double scaling_factor, int64_t erroradv_threshold) {
- return erroradv_threshold <
- (int64_t)(((double)INT64_MAX / scaling_factor) + 0.5)
- ? (int64_t)(scaling_factor * erroradv_threshold + 0.5)
- : INT64_MAX;
+ return warp_error(wm, ref, ref_width, ref_height, ref_stride, dst, dst_stride,
+ p_col, p_row, p_width, p_height, subsampling_x,
+ subsampling_y, best_error, segment_map, segment_map_stride);
}
int64_t av1_refine_integerized_param(
WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd,
uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst,
int d_width, int d_height, int d_stride, int n_refinements,
- int64_t best_frame_error, uint8_t *segment_map, int segment_map_stride,
- int64_t erroradv_threshold) {
+ int64_t ref_frame_error, uint8_t *segment_map, int segment_map_stride) {
static const int max_trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
const int border = ERRORADV_BORDER;
int i = 0, p;
@@ -238,36 +451,51 @@ int64_t av1_refine_integerized_param(
int32_t best_param;
force_wmtype(wm, wmtype);
+ wm->wmtype = get_wmtype(wm);
+
+ if (n_refinements == 0) {
+ // Compute the maximum error value that will be accepted, so that
+ // av1_warp_error can terminate early if it proves the model will not
+ // be accepted.
+ int64_t selection_threshold = (int64_t)lrint(ref_frame_error * erroradv_tr);
+ return av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, d_stride, border,
+ border, d_width - 2 * border, d_height - 2 * border,
+ 0, 0, selection_threshold, segment_map,
+ segment_map_stride);
+ }
+
+ // When refining, use a slightly higher threshold for the initial error
+ // calculation - see comment above erroradv_early_tr for why.
+ int64_t selection_threshold =
+ (int64_t)lrint(ref_frame_error * erroradv_early_tr);
best_error =
av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
- dst + border * d_stride + border, border, border,
- d_width - 2 * border, d_height - 2 * border, d_stride, 0,
- 0, best_frame_error, segment_map, segment_map_stride);
+ dst + border * d_stride + border, d_stride, border, border,
+ d_width - 2 * border, d_height - 2 * border, 0, 0,
+ selection_threshold, segment_map, segment_map_stride);
- if (n_refinements == 0) {
- wm->wmtype = get_wmtype(wm);
- return best_error;
+ if (best_error > selection_threshold) {
+ return INT64_MAX;
}
- best_error = AOMMIN(best_error, best_frame_error);
step = 1 << (n_refinements - 1);
for (i = 0; i < n_refinements; i++, step >>= 1) {
- int64_t error_adv_thresh =
- calc_approx_erroradv_threshold(thresh_factors[i], erroradv_threshold);
for (p = 0; p < n_params; ++p) {
int step_dir = 0;
- // Skip searches for parameters that are forced to be 0
param = param_mat + p;
curr_param = *param;
best_param = curr_param;
// look to the left
+ // Note: We have to use force_wmtype() to keep the proper symmetry for
+ // ROTZOOM type models
*param = add_param_offset(p, curr_param, -step);
+ force_wmtype(wm, wmtype);
step_error =
av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
- dst + border * d_stride + border, border, border,
- d_width - 2 * border, d_height - 2 * border, d_stride,
- 0, 0, AOMMIN(best_error, error_adv_thresh),
- segment_map, segment_map_stride);
+ dst + border * d_stride + border, d_stride, border,
+ border, d_width - 2 * border, d_height - 2 * border, 0,
+ 0, best_error, segment_map, segment_map_stride);
if (step_error < best_error) {
best_error = step_error;
best_param = *param;
@@ -276,40 +504,42 @@ int64_t av1_refine_integerized_param(
// look to the right
*param = add_param_offset(p, curr_param, step);
+ force_wmtype(wm, wmtype);
step_error =
av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
- dst + border * d_stride + border, border, border,
- d_width - 2 * border, d_height - 2 * border, d_stride,
- 0, 0, AOMMIN(best_error, error_adv_thresh),
- segment_map, segment_map_stride);
+ dst + border * d_stride + border, d_stride, border,
+ border, d_width - 2 * border, d_height - 2 * border, 0,
+ 0, best_error, segment_map, segment_map_stride);
if (step_error < best_error) {
best_error = step_error;
best_param = *param;
step_dir = 1;
}
- *param = best_param;
// look to the direction chosen above repeatedly until error increases
// for the biggest step size
while (step_dir) {
*param = add_param_offset(p, best_param, step * step_dir);
+ force_wmtype(wm, wmtype);
step_error =
av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
- dst + border * d_stride + border, border, border,
- d_width - 2 * border, d_height - 2 * border,
- d_stride, 0, 0, AOMMIN(best_error, error_adv_thresh),
- segment_map, segment_map_stride);
+ dst + border * d_stride + border, d_stride, border,
+ border, d_width - 2 * border, d_height - 2 * border,
+ 0, 0, best_error, segment_map, segment_map_stride);
if (step_error < best_error) {
best_error = step_error;
best_param = *param;
} else {
- *param = best_param;
step_dir = 0;
}
}
+
+ // Restore best parameter value so far
+ *param = best_param;
+ force_wmtype(wm, wmtype);
}
}
- force_wmtype(wm, wmtype);
+
wm->wmtype = get_wmtype(wm);
return best_error;
}
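The refinement loop above is per-parameter coordinate descent: probe one step
to either side of each parameter, keep stepping in whichever direction helped
until the error stops falling, restore the best value before moving to the
next parameter, and halve the step size on each outer iteration. A
self-contained sketch of the same search shape on a toy objective; the names
and the objective are illustrative only, not libaom code:

#include <stdio.h>

// Toy objective, minimized at params = (3, -2); stands in for the warp error.
static long long toy_error(const int *p) {
  const long long dx = p[0] - 3, dy = p[1] + 2;
  return dx * dx + dy * dy;
}

static long long refine(int *params, int n_params, int n_refinements) {
  long long best_error = toy_error(params);
  for (int step = 1 << (n_refinements - 1); step > 0; step >>= 1) {
    for (int p = 0; p < n_params; ++p) {
      const int curr = params[p];
      int best_param = curr;
      int step_dir = 0;
      // look to the left
      params[p] = curr - step;
      long long err = toy_error(params);
      if (err < best_error) {
        best_error = err;
        best_param = params[p];
        step_dir = -1;
      }
      // look to the right
      params[p] = curr + step;
      err = toy_error(params);
      if (err < best_error) {
        best_error = err;
        best_param = params[p];
        step_dir = 1;
      }
      // keep stepping in the chosen direction until the error stops falling
      while (step_dir) {
        params[p] = best_param + step * step_dir;
        err = toy_error(params);
        if (err < best_error) {
          best_error = err;
          best_param = params[p];
        } else {
          step_dir = 0;
        }
      }
      // restore the best value found for this parameter
      params[p] = best_param;
    }
  }
  return best_error;
}

int main(void) {
  int params[2] = { 0, 0 };
  const long long err = refine(params, 2, 3);  // step sizes 4, 2, 1
  // Converges to params = (3, -2) with error 0.
  printf("params = (%d, %d), error = %lld\n", params[0], params[1], err);
  return 0;
}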
diff --git a/av1/encoder/global_motion.h b/av1/encoder/global_motion.h
index cf1d0fd02..8c9c60f0f 100644
--- a/av1/encoder/global_motion.h
+++ b/av1/encoder/global_motion.h
@@ -40,7 +40,7 @@ typedef struct {
// Pointer to hold inliers from motion model.
uint8_t *segment_map;
-} GlobalMotionThreadData;
+} GlobalMotionData;
typedef struct {
// Holds the mapping of each thread to past/future direction.
@@ -63,43 +63,82 @@ typedef struct {
// Data related to assigning jobs for global motion multi-threading.
JobInfo job_info;
- // Data specific to each worker in global motion multi-threading.
- // thread_data[i] stores the thread specific data for worker 'i'.
- GlobalMotionThreadData *thread_data;
-
#if CONFIG_MULTITHREAD
// Mutex lock used while dispatching jobs.
pthread_mutex_t *mutex_;
#endif
- // Width and height for which segment_map is allocated for each thread.
- int allocated_width;
- int allocated_height;
-
- // Number of workers for which thread_data is allocated.
- int8_t allocated_workers;
+ // Initialized to false, set to true by the worker thread that encounters an
+ // error in order to abort the processing of other worker threads.
+ bool gm_mt_exit;
} AV1GlobalMotionSync;
void av1_convert_model_to_params(const double *params,
WarpedMotionParams *model);
-// TODO(sarahparker) These need to be retuned for speed 0 and 1 to
-// maximize gains from segmented error metric
+// Criteria for accepting a global motion model
static const double erroradv_tr = 0.65;
static const double erroradv_prod_tr = 20000;
+// Early exit threshold for global motion refinement
+// This is set slightly higher than erroradv_tr, as a compromise between
+// two factors:
+//
+// 1) By rejecting un-promising models early, we can reduce the encode time
+// spent trying to refine them
+//
+// 2) When we refine a model, its error may decrease to below the acceptance
+// threshold even if the model is initially above the threshold
+static const double erroradv_early_tr = 0.70;
+
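Taken together, the two thresholds accept a model only if its warp error, as a
fraction of the zero-motion reference error, beats erroradv_tr, and that
fraction weighted by the parameter signaling cost stays under erroradv_prod_tr.
A minimal restatement of the test, mirroring av1_is_enough_erroradvantage()
from global_motion.c shown earlier in this diff, with the constants copied
from this header:

#include <stdbool.h>
#include <stdint.h>

static bool model_is_accepted(int64_t warp_error, int64_t ref_frame_error,
                              int params_cost) {
  const double erroradvantage = (double)warp_error / (double)ref_frame_error;
  return erroradvantage < 0.65 /* erroradv_tr */ &&
         erroradvantage * params_cost < 20000 /* erroradv_prod_tr */;
}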
int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost);
void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width,
int height, int *inliers,
int num_inliers);
+extern const int error_measure_lut[513];
+
+static INLINE int error_measure(int err) {
+ return error_measure_lut[256 + err];
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE int highbd_error_measure(int err, int bd) {
+ const int b = bd - 8;
+ const int bmask = (1 << b) - 1;
+ const int v = (1 << b);
+
+ // Split error into two parts and do an interpolated table lookup
+ // To compute the table index and interpolation value, we want to calculate
+ // the quotient and remainder of err / 2^b. But it is very important that
+ // the division must round down, and the remainder must be positive,
+ // ie. in the range [0, 2^b).
+ //
+ // In C, the >> and & operators do what we want, but the / and % operators
+ // give the wrong results for negative inputs. So we must use >> and & here.
+ //
+ // For example, if bd == 10 and err == -5, compare the results:
+ // (-5) >> 2 = -2, (-5) & 3 = 3
+ // vs. (-5) / 4 = -1, (-5) % 4 = -1
+ const int e1 = err >> b;
+ const int e2 = err & bmask;
+ return error_measure_lut[256 + e1] * (v - e2) +
+ error_measure_lut[257 + e1] * e2;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
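Completing the comment's example: with bd == 10 we have b = 2 and v = 4, so
err == -5 gives e1 = -2 and e2 = 3, and the result is
error_measure_lut[254] * (4 - 3) + error_measure_lut[255] * 3
= 550 * 1 + 339 * 3 = 1567. The two weights always sum to v, so
highbd_error_measure() returns v times the scale of the 8-bit error_measure().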
+int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
+ int ref_stride, uint8_t *dst, int dst_stride,
+ int p_width, int p_height,
+ uint8_t *segment_map, int segment_map_stride);
+
// Returns the error between the result of applying motion 'wm' to the frame
// described by 'ref' and the frame described by 'dst'.
int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
- const uint8_t *ref, int width, int height, int stride,
- uint8_t *dst, int p_col, int p_row, int p_width,
- int p_height, int p_stride, int subsampling_x,
+ const uint8_t *ref, int ref_width, int ref_height,
+ int ref_stride, uint8_t *dst, int dst_stride, int p_col,
+ int p_row, int p_width, int p_height, int subsampling_x,
int subsampling_y, int64_t best_error,
uint8_t *segment_map, int segment_map_stride);
@@ -110,8 +149,7 @@ int64_t av1_refine_integerized_param(
WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd,
uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst,
int d_width, int d_height, int d_stride, int n_refinements,
- int64_t best_frame_error, uint8_t *segment_map, int segment_map_stride,
- int64_t erroradv_threshold);
+ int64_t ref_frame_error, uint8_t *segment_map, int segment_map_stride);
#ifdef __cplusplus
} // extern "C"
diff --git a/av1/encoder/global_motion_facade.c b/av1/encoder/global_motion_facade.c
index 1a00cbba4..02a4e70ed 100644
--- a/av1/encoder/global_motion_facade.c
+++ b/av1/encoder/global_motion_facade.c
@@ -20,8 +20,9 @@
#include "av1/encoder/rdopt.h"
#include "av1/encoder/global_motion_facade.h"
-// Highest motion model to search.
-#define GLOBAL_TRANS_TYPES_ENC 3
+// Range of model types to search
+#define FIRST_GLOBAL_TRANS_TYPE ROTZOOM
+#define LAST_GLOBAL_TRANS_TYPE ROTZOOM
// Computes the cost for the warp parameters.
static int gm_get_params_cost(const WarpedMotionParams *gm,
@@ -73,47 +74,46 @@ static int gm_get_params_cost(const WarpedMotionParams *gm,
return (params_cost << AV1_PROB_COST_SHIFT);
}
-// Calculates the threshold to be used for warp error computation.
-static AOM_INLINE int64_t calc_erroradv_threshold(int64_t ref_frame_error) {
- return (int64_t)(ref_frame_error * erroradv_tr + 0.5);
-}
-
// For the given reference frame, computes the global motion parameters for
// different motion models and finds the best.
static AOM_INLINE void compute_global_motion_for_ref_frame(
- AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+ AV1_COMP *cpi, struct aom_internal_error_info *error_info,
+ YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
MotionModel *motion_models, uint8_t *segment_map, const int segment_map_w,
const int segment_map_h, const WarpedMotionParams *ref_params) {
- ThreadData *const td = &cpi->td;
- MACROBLOCK *const x = &td->mb;
AV1_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &x->e_mbd;
- int i;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
int src_width = cpi->source->y_crop_width;
int src_height = cpi->source->y_crop_height;
int src_stride = cpi->source->y_stride;
- WarpedMotionParams tmp_wm_params;
- const double *params_this_motion;
assert(ref_buf[frame] != NULL);
- TransformationType model;
int bit_depth = cpi->common.seq_params->bit_depth;
GlobalMotionMethod global_motion_method = default_global_motion_method;
int num_refinements = cpi->sf.gm_sf.num_refinement_steps;
-
- for (model = ROTZOOM; model < GLOBAL_TRANS_TYPES_ENC; ++model) {
- if (!aom_compute_global_motion(model, cpi->source, ref_buf[frame],
- bit_depth, global_motion_method,
- motion_models, RANSAC_NUM_MOTIONS)) {
+ bool mem_alloc_failed = false;
+
+ // Select the best model based on fractional error reduction.
+ // By initializing this to erroradv_tr, the same logic which is used to
+  // By initializing this to erroradv_tr, the same logic that selects the
+  // best model will automatically filter out any model that doesn't meet
+  // the required quality threshold.
+ for (TransformationType model = FIRST_GLOBAL_TRANS_TYPE;
+ model <= LAST_GLOBAL_TRANS_TYPE; ++model) {
+ if (!aom_compute_global_motion(
+ model, cpi->source, ref_buf[frame], bit_depth, global_motion_method,
+ motion_models, RANSAC_NUM_MOTIONS, &mem_alloc_failed)) {
+ if (mem_alloc_failed) {
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate global motion buffers");
+ }
continue;
}
- int64_t best_ref_frame_error = 0;
- int64_t best_warp_error = INT64_MAX;
- for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
+ for (int i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
if (motion_models[i].num_inliers == 0) continue;
- params_this_motion = motion_models[i].params;
- av1_convert_model_to_params(params_this_motion, &tmp_wm_params);
+ WarpedMotionParams tmp_wm_params;
+ av1_convert_model_to_params(motion_models[i].params, &tmp_wm_params);
// Skip models that we won't use (IDENTITY or TRANSLATION)
//
@@ -133,29 +133,26 @@ static AOM_INLINE void compute_global_motion_for_ref_frame(
int64_t ref_frame_error = av1_segmented_frame_error(
is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer,
- ref_buf[frame]->y_stride, cpi->source->y_buffer, src_width,
- src_height, src_stride, segment_map, segment_map_w);
+ ref_buf[frame]->y_stride, cpi->source->y_buffer, src_stride,
+ src_width, src_height, segment_map, segment_map_w);
if (ref_frame_error == 0) continue;
- const int64_t erroradv_threshold =
- calc_erroradv_threshold(ref_frame_error);
-
const int64_t warp_error = av1_refine_integerized_param(
&tmp_wm_params, tmp_wm_params.wmtype, is_cur_buf_hbd(xd), xd->bd,
ref_buf[frame]->y_buffer, ref_buf[frame]->y_crop_width,
ref_buf[frame]->y_crop_height, ref_buf[frame]->y_stride,
cpi->source->y_buffer, src_width, src_height, src_stride,
- num_refinements, best_warp_error, segment_map, segment_map_w,
- erroradv_threshold);
+ num_refinements, ref_frame_error, segment_map, segment_map_w);
// av1_refine_integerized_param() can return a simpler model type than
// its input, so re-check model type here
if (tmp_wm_params.wmtype <= TRANSLATION) continue;
- if (warp_error < best_warp_error) {
- best_ref_frame_error = ref_frame_error;
- best_warp_error = warp_error;
+ double erroradvantage = (double)warp_error / ref_frame_error;
+
+ if (erroradvantage < best_erroradv) {
+ best_erroradv = erroradvantage;
// Save the wm_params modified by
// av1_refine_integerized_param() rather than motion index to
// avoid rerunning refine() below.
@@ -163,47 +160,41 @@ static AOM_INLINE void compute_global_motion_for_ref_frame(
sizeof(WarpedMotionParams));
}
}
- assert(cm->global_motion[frame].wmtype <= AFFINE);
- if (!av1_get_shear_params(&cm->global_motion[frame]))
- cm->global_motion[frame] = default_warp_params;
+ }
+
+ if (!av1_get_shear_params(&cm->global_motion[frame]))
+ cm->global_motion[frame] = default_warp_params;
#if 0
- // We never choose translational models, so this code is disabled
- if (cm->global_motion[frame].wmtype == TRANSLATION) {
- cm->global_motion[frame].wmmat[0] =
- convert_to_trans_prec(cm->features.allow_high_precision_mv,
- cm->global_motion[frame].wmmat[0]) *
- GM_TRANS_ONLY_DECODE_FACTOR;
- cm->global_motion[frame].wmmat[1] =
- convert_to_trans_prec(cm->features.allow_high_precision_mv,
- cm->global_motion[frame].wmmat[1]) *
- GM_TRANS_ONLY_DECODE_FACTOR;
- }
+ // We never choose translational models, so this code is disabled
+ if (cm->global_motion[frame].wmtype == TRANSLATION) {
+ cm->global_motion[frame].wmmat[0] =
+ convert_to_trans_prec(cm->features.allow_high_precision_mv,
+ cm->global_motion[frame].wmmat[0]) *
+ GM_TRANS_ONLY_DECODE_FACTOR;
+ cm->global_motion[frame].wmmat[1] =
+ convert_to_trans_prec(cm->features.allow_high_precision_mv,
+ cm->global_motion[frame].wmmat[1]) *
+ GM_TRANS_ONLY_DECODE_FACTOR;
+ }
#endif
- if (cm->global_motion[frame].wmtype == IDENTITY) continue;
-
- // Once we get here, best_ref_frame_error must be > 0. This is because
- // of the logic above, which skips over any models which have
- // ref_frame_error == 0
- assert(best_ref_frame_error > 0);
-
- // If the best error advantage found doesn't meet the threshold for
- // this motion type, revert to IDENTITY.
- if (!av1_is_enough_erroradvantage(
- (double)best_warp_error / best_ref_frame_error,
- gm_get_params_cost(&cm->global_motion[frame], ref_params,
- cm->features.allow_high_precision_mv))) {
- cm->global_motion[frame] = default_warp_params;
- }
+ if (cm->global_motion[frame].wmtype == IDENTITY) return;
- if (cm->global_motion[frame].wmtype != IDENTITY) break;
+ // If the best error advantage found doesn't meet the threshold for
+ // this motion type, revert to IDENTITY.
+ if (!av1_is_enough_erroradvantage(
+ best_erroradv,
+ gm_get_params_cost(&cm->global_motion[frame], ref_params,
+ cm->features.allow_high_precision_mv))) {
+ cm->global_motion[frame] = default_warp_params;
}
}
// Computes global motion for the given reference frame.
void av1_compute_gm_for_valid_ref_frames(
- AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+ AV1_COMP *cpi, struct aom_internal_error_info *error_info,
+ YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
MotionModel *motion_models, uint8_t *segment_map, int segment_map_w,
int segment_map_h) {
AV1_COMMON *const cm = &cpi->common;
@@ -211,9 +202,9 @@ void av1_compute_gm_for_valid_ref_frames(
cm->prev_frame ? &cm->prev_frame->global_motion[frame]
: &default_warp_params;
- compute_global_motion_for_ref_frame(cpi, ref_buf, frame, motion_models,
- segment_map, segment_map_w, segment_map_h,
- ref_params);
+ compute_global_motion_for_ref_frame(cpi, error_info, ref_buf, frame,
+ motion_models, segment_map, segment_map_w,
+ segment_map_h, ref_params);
}
// Loops over valid reference frames and computes global motion estimation.
@@ -223,13 +214,15 @@ static AOM_INLINE void compute_global_motion_for_references(
MotionModel *motion_models, uint8_t *segment_map, const int segment_map_w,
const int segment_map_h) {
AV1_COMMON *const cm = &cpi->common;
+ struct aom_internal_error_info *const error_info =
+ cpi->td.mb.e_mbd.error_info;
// Compute global motion w.r.t. reference frames starting from the nearest ref
// frame in a given direction.
for (int frame = 0; frame < num_ref_frames; frame++) {
int ref_frame = reference_frame[frame].frame;
- av1_compute_gm_for_valid_ref_frames(cpi, ref_buf, ref_frame, motion_models,
- segment_map, segment_map_w,
- segment_map_h);
+ av1_compute_gm_for_valid_ref_frames(cpi, error_info, ref_buf, ref_frame,
+ motion_models, segment_map,
+ segment_map_w, segment_map_h);
// If global motion w.r.t. current ref frame is
// INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t
// the remaining ref frames in that direction.
@@ -361,40 +354,6 @@ static AOM_INLINE void update_valid_ref_frames_for_gm(
}
}
-// Deallocates segment_map and inliers.
-static AOM_INLINE void dealloc_global_motion_data(MotionModel *motion_models,
- uint8_t *segment_map) {
- aom_free(segment_map);
-
- for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
- aom_free(motion_models[m].inliers);
- }
-}
-
-// Allocates and initializes memory for segment_map and MotionModel.
-static AOM_INLINE bool alloc_global_motion_data(MotionModel *motion_models,
- uint8_t **segment_map,
- const int segment_map_w,
- const int segment_map_h) {
- av1_zero_array(motion_models, RANSAC_NUM_MOTIONS);
- for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
- motion_models[m].inliers =
- aom_malloc(sizeof(*(motion_models[m].inliers)) * 2 * MAX_CORNERS);
- if (!motion_models[m].inliers) {
- dealloc_global_motion_data(motion_models, NULL);
- return false;
- }
- }
-
- *segment_map = (uint8_t *)aom_calloc(segment_map_w * segment_map_h,
- sizeof(*segment_map));
- if (!*segment_map) {
- dealloc_global_motion_data(motion_models, NULL);
- return false;
- }
- return true;
-}
-
// Initializes parameters used for computing global motion.
static AOM_INLINE void setup_global_motion_info_params(AV1_COMP *cpi) {
GlobalMotionInfo *const gm_info = &cpi->gm_info;
@@ -439,11 +398,7 @@ static AOM_INLINE void setup_global_motion_info_params(AV1_COMP *cpi) {
// Computes global motion w.r.t. valid reference frames.
static AOM_INLINE void global_motion_estimation(AV1_COMP *cpi) {
GlobalMotionInfo *const gm_info = &cpi->gm_info;
- MotionModel motion_models[RANSAC_NUM_MOTIONS];
- uint8_t *segment_map = NULL;
-
- alloc_global_motion_data(motion_models, &segment_map, gm_info->segment_map_w,
- gm_info->segment_map_h);
+ GlobalMotionData *gm_data = &cpi->td.gm_data;
// Compute global motion w.r.t. past reference frames and future reference
// frames
@@ -451,11 +406,9 @@ static AOM_INLINE void global_motion_estimation(AV1_COMP *cpi) {
if (gm_info->num_ref_frames[dir] > 0)
compute_global_motion_for_references(
cpi, gm_info->ref_buf, gm_info->reference_frames[dir],
- gm_info->num_ref_frames[dir], motion_models, segment_map,
- gm_info->segment_map_w, gm_info->segment_map_h);
+ gm_info->num_ref_frames[dir], gm_data->motion_models,
+ gm_data->segment_map, gm_info->segment_map_w, gm_info->segment_map_h);
}
-
- dealloc_global_motion_data(motion_models, segment_map);
}
// Global motion estimation for the current frame is computed. This computation
@@ -478,13 +431,19 @@ void av1_compute_global_motion_facade(AV1_COMP *cpi) {
}
if (cpi->common.current_frame.frame_type == INTER_FRAME && cpi->source &&
- cpi->oxcf.tool_cfg.enable_global_motion && !gm_info->search_done) {
+ cpi->oxcf.tool_cfg.enable_global_motion && !gm_info->search_done &&
+ cpi->sf.gm_sf.gm_search_type != GM_DISABLE_SEARCH) {
setup_global_motion_info_params(cpi);
- if (cpi->mt_info.num_workers > 1)
- av1_global_motion_estimation_mt(cpi);
- else
- global_motion_estimation(cpi);
- gm_info->search_done = 1;
+ // Terminate early if the total number of reference frames is zero.
+ if (cpi->gm_info.num_ref_frames[0] || cpi->gm_info.num_ref_frames[1]) {
+ gm_alloc_data(cpi, &cpi->td.gm_data);
+ if (cpi->mt_info.num_workers > 1)
+ av1_global_motion_estimation_mt(cpi);
+ else
+ global_motion_estimation(cpi);
+ gm_dealloc_data(&cpi->td.gm_data);
+ gm_info->search_done = 1;
+ }
}
memcpy(cm->cur_frame->global_motion, cm->global_motion,
sizeof(cm->cur_frame->global_motion));
diff --git a/av1/encoder/global_motion_facade.h b/av1/encoder/global_motion_facade.h
index dfdedf715..f13989aa2 100644
--- a/av1/encoder/global_motion_facade.h
+++ b/av1/encoder/global_motion_facade.h
@@ -18,8 +18,36 @@ extern "C" {
struct yv12_buffer_config;
struct AV1_COMP;
+// Allocates memory for members of GlobalMotionData.
+static AOM_INLINE void gm_alloc_data(AV1_COMP *cpi, GlobalMotionData *gm_data) {
+ AV1_COMMON *cm = &cpi->common;
+ GlobalMotionInfo *gm_info = &cpi->gm_info;
+
+ CHECK_MEM_ERROR(cm, gm_data->segment_map,
+ aom_malloc(sizeof(*gm_data->segment_map) *
+ gm_info->segment_map_w * gm_info->segment_map_h));
+
+ av1_zero_array(gm_data->motion_models, RANSAC_NUM_MOTIONS);
+ for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
+ CHECK_MEM_ERROR(cm, gm_data->motion_models[m].inliers,
+ aom_malloc(sizeof(*gm_data->motion_models[m].inliers) * 2 *
+ MAX_CORNERS));
+ }
+}
+
+// Deallocates the memory allocated for members of GlobalMotionData.
+static AOM_INLINE void gm_dealloc_data(GlobalMotionData *gm_data) {
+ aom_free(gm_data->segment_map);
+ gm_data->segment_map = NULL;
+ for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
+ aom_free(gm_data->motion_models[m].inliers);
+ gm_data->motion_models[m].inliers = NULL;
+ }
+}
+
void av1_compute_gm_for_valid_ref_frames(
- AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+ AV1_COMP *cpi, struct aom_internal_error_info *error_info,
+ YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
MotionModel *motion_models, uint8_t *segment_map, int segment_map_w,
int segment_map_h);
void av1_compute_global_motion_facade(struct AV1_COMP *cpi);
diff --git a/av1/encoder/hash_motion.c b/av1/encoder/hash_motion.c
index 164aa0978..8b04e22d6 100644
--- a/av1/encoder/hash_motion.c
+++ b/av1/encoder/hash_motion.c
@@ -128,7 +128,7 @@ bool av1_hash_table_create(hash_table *p_hash_table) {
}
p_hash_table->p_lookup_table =
(Vector **)aom_calloc(kMaxAddr, sizeof(p_hash_table->p_lookup_table[0]));
- if (!p_hash_table) return false;
+ if (!p_hash_table->p_lookup_table) return false;
return true;
}
@@ -141,13 +141,16 @@ static bool hash_table_add_to_table(hash_table *p_hash_table,
if (p_hash_table->p_lookup_table[hash_value] == NULL) {
return false;
}
- aom_vector_setup(p_hash_table->p_lookup_table[hash_value], 10,
- sizeof(curr_block_hash[0]));
- aom_vector_push_back(p_hash_table->p_lookup_table[hash_value],
- curr_block_hash);
+ if (aom_vector_setup(p_hash_table->p_lookup_table[hash_value], 10,
+ sizeof(curr_block_hash[0])) == VECTOR_ERROR)
+ return false;
+ if (aom_vector_push_back(p_hash_table->p_lookup_table[hash_value],
+ curr_block_hash) == VECTOR_ERROR)
+ return false;
} else {
- aom_vector_push_back(p_hash_table->p_lookup_table[hash_value],
- curr_block_hash);
+ if (aom_vector_push_back(p_hash_table->p_lookup_table[hash_value],
+ curr_block_hash) == VECTOR_ERROR)
+ return false;
}
return true;
}
diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index 4c2f8d039..a108e8148 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -312,17 +312,51 @@ void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
}
}
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void highbd_wht_fwd_txfm(TX_SIZE tx_size, const int16_t *src_diff,
+ ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ switch (tx_size) {
+    // As the output transform coefficients of the 4x4 Hadamard transform can
+    // be represented using 15 bits (for 12-bit clip), use the lowbd variant of
+    // hadamard_4x4.
+ case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); break;
+ case TX_8X8: aom_highbd_hadamard_8x8(src_diff, src_stride, coeff); break;
+ case TX_16X16:
+ aom_highbd_hadamard_16x16(src_diff, src_stride, coeff);
+ break;
+ case TX_32X32:
+ aom_highbd_hadamard_32x32(src_diff, src_stride, coeff);
+ break;
+ default: assert(0);
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static INLINE void wht_fwd_txfm(TX_SIZE tx_size, const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ switch (tx_size) {
+ case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); break;
+ case TX_8X8: aom_hadamard_8x8(src_diff, src_stride, coeff); break;
+ case TX_16X16: aom_hadamard_16x16(src_diff, src_stride, coeff); break;
+ case TX_32X32: aom_hadamard_32x32(src_diff, src_stride, coeff); break;
+ default: assert(0);
+ }
+}
+
void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info,
const int16_t *src_diff, int src_stride,
tran_low_t *coeff) {
if (use_hadamard) {
- switch (tx_size) {
- case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); break;
- case TX_8X8: aom_hadamard_8x8(src_diff, src_stride, coeff); break;
- case TX_16X16: aom_hadamard_16x16(src_diff, src_stride, coeff); break;
- case TX_32X32: aom_hadamard_32x32(src_diff, src_stride, coeff); break;
- default: assert(0);
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (bd_info.use_highbitdepth_buf) {
+ highbd_wht_fwd_txfm(tx_size, src_diff, src_stride, coeff);
+ } else {
+ wht_fwd_txfm(tx_size, src_diff, src_stride, coeff);
}
+#else
+ wht_fwd_txfm(tx_size, src_diff, src_stride, coeff);
+#endif // CONFIG_AV1_HIGHBITDEPTH
} else {
TxfmParam txfm_param;
txfm_param.tx_type = DCT_DCT;
diff --git a/av1/encoder/interp_search.c b/av1/encoder/interp_search.c
index 247fa3e9d..27235303c 100644
--- a/av1/encoder/interp_search.c
+++ b/av1/encoder/interp_search.c
@@ -662,8 +662,7 @@ int64_t av1_interpolation_filter_search(
const int num_planes = av1_num_planes(cm);
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
- const int need_search =
- av1_is_interp_needed(xd) && !cpi->sf.rt_sf.skip_interp_filter_search;
+ const int need_search = av1_is_interp_needed(xd);
const int ref_frame = xd->mi[0]->ref_frame[0];
RD_STATS rd_stats_luma, rd_stats;
diff --git a/av1/encoder/interp_search.h b/av1/encoder/interp_search.h
index bce494ed3..9815e0bcf 100644
--- a/av1/encoder/interp_search.h
+++ b/av1/encoder/interp_search.h
@@ -109,6 +109,11 @@ typedef struct HandleInterModeArgs {
*/
int skip_motion_mode;
/*!
+ * Initialized to false. If true, skips interpolation filter search and uses
+ * the default EIGHTTAP_REGULAR.
+ */
+ bool skip_ifs;
+ /*!
* A pointer to the first element in an array of INTERINTRA_MODE types. This
* contains the best inter_intra mode for each reference frame.
*/
diff --git a/av1/encoder/intra_mode_search.c b/av1/encoder/intra_mode_search.c
index 3b5dd758f..99b0af2f8 100644
--- a/av1/encoder/intra_mode_search.c
+++ b/av1/encoder/intra_mode_search.c
@@ -874,16 +874,17 @@ int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) {
int this_rate;
RD_STATS tokenonly_rd_stats;
- UV_PREDICTION_MODE mode = uv_rd_search_mode_order[mode_idx];
+ UV_PREDICTION_MODE uv_mode = uv_rd_search_mode_order[mode_idx];
// Skip the current mode evaluation if the RD cost derived using the mode
// signaling rate exceeds the best_rd so far.
const int mode_rate =
- mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][mode];
+ mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode];
if (RDCOST(x->rdmult, mode_rate, 0) > best_rd) continue;
- const int is_diagonal_mode = av1_is_diagonal_mode(get_uv_mode(mode));
- const int is_directional_mode = av1_is_directional_mode(get_uv_mode(mode));
+ PREDICTION_MODE intra_mode = get_uv_mode(uv_mode);
+ const int is_diagonal_mode = av1_is_diagonal_mode(intra_mode);
+ const int is_directional_mode = av1_is_directional_mode(intra_mode);
if (is_diagonal_mode && !cpi->oxcf.intra_mode_cfg.enable_diagonal_intra)
continue;
@@ -892,25 +893,26 @@ int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
continue;
if (!(cpi->sf.intra_sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] &
- (1 << mode)))
+ (1 << uv_mode)))
continue;
- if (!intra_mode_cfg->enable_smooth_intra && mode >= UV_SMOOTH_PRED &&
- mode <= UV_SMOOTH_H_PRED)
+ if (!intra_mode_cfg->enable_smooth_intra && uv_mode >= UV_SMOOTH_PRED &&
+ uv_mode <= UV_SMOOTH_H_PRED)
continue;
- if (!intra_mode_cfg->enable_paeth_intra && mode == UV_PAETH_PRED) continue;
+ if (!intra_mode_cfg->enable_paeth_intra && uv_mode == UV_PAETH_PRED)
+ continue;
assert(mbmi->mode < INTRA_MODES);
if (cpi->sf.intra_sf.prune_chroma_modes_using_luma_winner &&
- !(av1_derived_chroma_intra_mode_used_flag[mbmi->mode] & (1 << mode)))
+ !(av1_derived_chroma_intra_mode_used_flag[mbmi->mode] & (1 << uv_mode)))
continue;
- mbmi->uv_mode = mode;
+ mbmi->uv_mode = uv_mode;
// Init variables for cfl and angle delta
const SPEED_FEATURES *sf = &cpi->sf;
mbmi->angle_delta[PLANE_TYPE_UV] = 0;
- if (mode == UV_CFL_PRED) {
+ if (uv_mode == UV_CFL_PRED) {
if (!cfl_allowed || !intra_mode_cfg->enable_cfl_intra) continue;
assert(!is_directional_mode);
const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
@@ -936,18 +938,18 @@ int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
intra_search_state.directional_mode_skip_mask, is_chroma);
intra_search_state.dir_mode_skip_mask_ready = 1;
}
- if (intra_search_state.directional_mode_skip_mask[mode]) {
+ if (intra_search_state.directional_mode_skip_mask[uv_mode]) {
continue;
}
// Search through angle delta
const int rate_overhead =
- mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][mode];
+ mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode];
if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd,
&this_rate, &tokenonly_rd_stats))
continue;
} else {
- if (mode == UV_SMOOTH_PRED &&
+ if (uv_mode == UV_SMOOTH_PRED &&
should_prune_chroma_smooth_pred_based_on_source_variance(cpi, x,
bsize))
continue;
@@ -958,7 +960,7 @@ int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
}
}
const int mode_cost =
- mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][mode];
+ mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode];
this_rate = tokenonly_rd_stats.rate +
intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost);
this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
diff --git a/av1/encoder/intra_mode_search_utils.h b/av1/encoder/intra_mode_search_utils.h
index 4519e4629..107c2236f 100644
--- a/av1/encoder/intra_mode_search_utils.h
+++ b/av1/encoder/intra_mode_search_utils.h
@@ -576,13 +576,13 @@ static AOM_INLINE int intra_mode_info_cost_uv(const AV1_COMP *cpi,
int total_rate = mode_cost;
const ModeCosts *mode_costs = &x->mode_costs;
const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0;
- const UV_PREDICTION_MODE mode = mbmi->uv_mode;
+ const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
// Can only activate one mode.
- assert(((mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1);
+ assert(((uv_mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1);
const int try_palette = av1_allow_palette(
cpi->common.features.allow_screen_content_tools, mbmi->bsize);
- if (try_palette && mode == UV_DC_PRED) {
+ if (try_palette && uv_mode == UV_DC_PRED) {
const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
total_rate +=
mode_costs->palette_uv_mode_cost[pmi->palette_size[0] > 0][use_palette];
@@ -604,10 +604,11 @@ static AOM_INLINE int intra_mode_info_cost_uv(const AV1_COMP *cpi,
total_rate += palette_mode_cost;
}
}
- if (av1_is_directional_mode(get_uv_mode(mode))) {
+ const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode);
+ if (av1_is_directional_mode(intra_mode)) {
if (av1_use_angle_delta(bsize)) {
total_rate +=
- mode_costs->angle_delta_cost[mode - V_PRED]
+ mode_costs->angle_delta_cost[intra_mode - V_PRED]
[mbmi->angle_delta[PLANE_TYPE_UV] +
MAX_ANGLE_DELTA];
}
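A minimal worked example of the index change above, assuming the standard AV1 enum layout (concrete values illustrative):

    // For uv_mode = UV_D45_PRED:
    //   intra_mode = get_uv_mode(UV_D45_PRED)  ->  D45_PRED
    //   angle_delta_cost[intra_mode - V_PRED][...]
    // The cost table is indexed in the luma-mode domain, so converting via
    // get_uv_mode() first keeps the lookup correct by construction rather
    // than relying on UV_PREDICTION_MODE and PREDICTION_MODE lining up for
    // the directional modes.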
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index cc39c8163..2462f1b15 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -61,30 +61,6 @@ static INLINE void init_ms_buffers(MSBuffers *ms_buffers, const MACROBLOCK *x) {
ms_buffers->obmc_mask = x->obmc_buffer.mask;
}
-static AOM_INLINE SEARCH_METHODS
-get_faster_search_method(SEARCH_METHODS search_method) {
- // Note on search method's accuracy:
- // 1. NSTEP
- // 2. DIAMOND
- // 3. BIGDIA \approx SQUARE
- // 4. HEX.
- // 5. FAST_HEX \approx FAST_DIAMOND
- switch (search_method) {
- case NSTEP: return DIAMOND;
- case NSTEP_8PT: return DIAMOND;
- case DIAMOND: return BIGDIA;
- case CLAMPED_DIAMOND: return BIGDIA;
- case BIGDIA: return HEX;
- case SQUARE: return HEX;
- case HEX: return FAST_HEX;
- case FAST_HEX: return FAST_HEX;
- case FAST_DIAMOND: return VFAST_DIAMOND;
- case FAST_BIGDIA: return FAST_BIGDIA;
- case VFAST_DIAMOND: return VFAST_DIAMOND;
- default: assert(0 && "Invalid search method!"); return DIAMOND;
- }
-}
-
void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer) {
obmc_buffer->wsrc = NULL;
obmc_buffer->mask = NULL;
@@ -96,7 +72,7 @@ void av1_make_default_fullpel_ms_params(
FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv, FULLPEL_MV start_mv,
const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
- int fine_search_interval) {
+ SEARCH_METHODS search_method, int fine_search_interval) {
const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
const int is_key_frame =
cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE;
@@ -107,28 +83,6 @@ void av1_make_default_fullpel_ms_params(
init_ms_buffers(&ms_params->ms_buffers, x);
- SEARCH_METHODS search_method = mv_sf->search_method;
- const int sf_blk_search_method = mv_sf->use_bsize_dependent_search_method;
- const int min_dim = AOMMIN(block_size_wide[bsize], block_size_high[bsize]);
- const int qband = x->qindex >> (QINDEX_BITS - 2);
- const bool use_faster_search_method =
- (sf_blk_search_method == 1 && min_dim >= 32) ||
- (sf_blk_search_method >= 2 && min_dim >= 16 &&
- x->content_state_sb.source_sad_nonrd <= kMedSad && qband < 3);
-
- if (use_faster_search_method) {
- search_method = get_faster_search_method(search_method);
-
- // We might need to update the search site config since search_method
- // is changed here.
- const int ref_stride = ms_params->ms_buffers.ref->stride;
- if (ref_stride != search_sites[search_method].stride) {
- av1_refresh_search_site_config(x->search_site_cfg_buf, search_method,
- ref_stride);
- search_sites = x->search_site_cfg_buf;
- }
- }
-
av1_set_mv_search_method(ms_params, search_sites, search_method);
ms_params->mesh_patterns[0] = mv_sf->mesh_patterns;
@@ -700,7 +654,8 @@ static INLINE int check_bounds(const FullMvLimits *mv_limits, int row, int col,
}
static INLINE int get_mvpred_var_cost(
- const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv) {
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv,
+ FULLPEL_MV_STATS *mv_stats) {
const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
const MV sub_this_mv = get_mv_from_fullmv(this_mv);
const struct buf_2d *const src = ms_params->ms_buffers.src;
@@ -709,13 +664,14 @@ static INLINE int get_mvpred_var_cost(
const int src_stride = src->stride;
const int ref_stride = ref->stride;
- unsigned unused;
int bestsme;
bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv),
- ref_stride, &unused);
+ ref_stride, &mv_stats->sse);
+ mv_stats->distortion = bestsme;
- bestsme += mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params);
+ mv_stats->err_cost = mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params);
+ bestsme += mv_stats->err_cost;
return bestsme;
}
@@ -731,7 +687,8 @@ static INLINE int get_mvpred_sad(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
}
static INLINE int get_mvpred_compound_var_cost(
- const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv) {
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv,
+ FULLPEL_MV_STATS *mv_stats) {
const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
const struct buf_2d *const src = ms_params->ms_buffers.src;
const struct buf_2d *const ref = ms_params->ms_buffers.ref;
@@ -743,23 +700,24 @@ static INLINE int get_mvpred_compound_var_cost(
const uint8_t *second_pred = ms_params->ms_buffers.second_pred;
const int mask_stride = ms_params->ms_buffers.mask_stride;
const int invert_mask = ms_params->ms_buffers.inv_mask;
- unsigned unused;
int bestsme;
if (mask) {
bestsme = vfp->msvf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0,
src_buf, src_stride, second_pred, mask, mask_stride,
- invert_mask, &unused);
+ invert_mask, &mv_stats->sse);
} else if (second_pred) {
bestsme = vfp->svaf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0,
- src_buf, src_stride, &unused, second_pred);
+ src_buf, src_stride, &mv_stats->sse, second_pred);
} else {
bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv),
- ref_stride, &unused);
+ ref_stride, &mv_stats->sse);
}
+ mv_stats->distortion = bestsme;
const MV sub_this_mv = get_mv_from_fullmv(this_mv);
- bestsme += mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params);
+ mv_stats->err_cost = mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params);
+ bestsme += mv_stats->err_cost;
return bestsme;
}
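Both helpers now record the quantities they previously discarded. A minimal sketch of the invariant carried by the new out-parameter (ms_params and best_full_mv stand in for caller state; struct layout as declared in mcomp.h):

    FULLPEL_MV_STATS stats;
    const int cost = get_mvpred_var_cost(ms_params, &best_full_mv, &stats);
    // By construction: cost == stats.distortion + stats.err_cost, and
    // stats.sse is the raw SSE from the variance kernel. A subpel stage
    // starting at the same position can reuse all three values instead of
    // recomputing the center error.
    assert(cost == (int)stats.distortion + stats.err_cost);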
@@ -803,13 +761,15 @@ static AOM_FORCE_INLINE void calc_int_cost_list(
const int br = best_mv.row;
const int bc = best_mv.col;
- cost_list[0] = get_mvpred_var_cost(ms_params, &best_mv);
+ FULLPEL_MV_STATS mv_stats;
+ cost_list[0] = get_mvpred_var_cost(ms_params, &best_mv, &mv_stats);
if (check_bounds(&ms_params->mv_limits, br, bc, 1)) {
for (int i = 0; i < 4; i++) {
const FULLPEL_MV neighbor_mv = { br + neighbors[i].row,
bc + neighbors[i].col };
- cost_list[i + 1] = get_mvpred_var_cost(ms_params, &neighbor_mv);
+ cost_list[i + 1] =
+ get_mvpred_var_cost(ms_params, &neighbor_mv, &mv_stats);
}
} else {
for (int i = 0; i < 4; i++) {
@@ -818,7 +778,8 @@ static AOM_FORCE_INLINE void calc_int_cost_list(
if (!av1_is_fullmv_in_range(&ms_params->mv_limits, neighbor_mv)) {
cost_list[i + 1] = INT_MAX;
} else {
- cost_list[i + 1] = get_mvpred_var_cost(ms_params, &neighbor_mv);
+ cost_list[i + 1] =
+ get_mvpred_var_cost(ms_params, &neighbor_mv, &mv_stats);
}
}
}
@@ -1055,7 +1016,8 @@ static AOM_INLINE void calc_sad_update_bestmv_with_indices(
static int pattern_search(FULLPEL_MV start_mv,
const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
int search_step, const int do_init_search,
- int *cost_list, FULLPEL_MV *best_mv) {
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
static const int search_steps[MAX_MVSEARCH_STEPS] = {
10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
};
@@ -1278,7 +1240,7 @@ static int pattern_search(FULLPEL_MV start_mv,
}
}
- const int var_cost = get_mvpred_var_cost(ms_params, best_mv);
+ const int var_cost = get_mvpred_var_cost(ms_params, best_mv, best_mv_stats);
return var_cost;
}
@@ -1296,61 +1258,68 @@ static int pattern_search(FULLPEL_MV start_mv,
static int hex_search(const FULLPEL_MV start_mv,
const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
const int search_step, const int do_init_search,
- int *cost_list, FULLPEL_MV *best_mv) {
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
return pattern_search(start_mv, ms_params, search_step, do_init_search,
- cost_list, best_mv);
+ cost_list, best_mv, best_mv_stats);
}
static int bigdia_search(const FULLPEL_MV start_mv,
const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
const int search_step, const int do_init_search,
- int *cost_list, FULLPEL_MV *best_mv) {
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
return pattern_search(start_mv, ms_params, search_step, do_init_search,
- cost_list, best_mv);
+ cost_list, best_mv, best_mv_stats);
}
static int square_search(const FULLPEL_MV start_mv,
const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
const int search_step, const int do_init_search,
- int *cost_list, FULLPEL_MV *best_mv) {
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
return pattern_search(start_mv, ms_params, search_step, do_init_search,
- cost_list, best_mv);
+ cost_list, best_mv, best_mv_stats);
}
static int fast_hex_search(const FULLPEL_MV start_mv,
const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
const int search_step, const int do_init_search,
- int *cost_list, FULLPEL_MV *best_mv) {
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
return hex_search(start_mv, ms_params,
AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step), do_init_search,
- cost_list, best_mv);
+ cost_list, best_mv, best_mv_stats);
}
static int vfast_dia_search(const FULLPEL_MV start_mv,
const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
const int search_step, const int do_init_search,
- int *cost_list, FULLPEL_MV *best_mv) {
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
return bigdia_search(start_mv, ms_params,
AOMMAX(MAX_MVSEARCH_STEPS - 1, search_step),
- do_init_search, cost_list, best_mv);
+ do_init_search, cost_list, best_mv, best_mv_stats);
}
static int fast_dia_search(const FULLPEL_MV start_mv,
const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
const int search_step, const int do_init_search,
- int *cost_list, FULLPEL_MV *best_mv) {
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
return bigdia_search(start_mv, ms_params,
AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step),
- do_init_search, cost_list, best_mv);
+ do_init_search, cost_list, best_mv, best_mv_stats);
}
static int fast_bigdia_search(const FULLPEL_MV start_mv,
const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
const int search_step, const int do_init_search,
- int *cost_list, FULLPEL_MV *best_mv) {
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
return bigdia_search(start_mv, ms_params,
AOMMAX(MAX_MVSEARCH_STEPS - 3, search_step),
- do_init_search, cost_list, best_mv);
+ do_init_search, cost_list, best_mv, best_mv_stats);
}
static int diamond_search_sad(FULLPEL_MV start_mv, unsigned int start_mv_sad,
@@ -1528,7 +1497,9 @@ static INLINE unsigned int get_start_mvpred_sad_cost(
static int full_pixel_diamond(FULLPEL_MV start_mv,
const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
const int step_param, int *cost_list,
- FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) {
+ FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats,
+ FULLPEL_MV *second_best_mv) {
const search_site_config *cfg = ms_params->search_sites;
int thissme, n, num00 = 0;
@@ -1539,7 +1510,7 @@ static int full_pixel_diamond(FULLPEL_MV start_mv,
diamond_search_sad(start_mv, start_mv_sad, ms_params, step_param, &n, best_mv,
second_best_mv);
- int bestsme = get_mvpred_compound_var_cost(ms_params, best_mv);
+ int bestsme = get_mvpred_compound_var_cost(ms_params, best_mv, best_mv_stats);
// If there won't be more n-step search, check to see if refining search is
// needed.
@@ -1550,14 +1521,17 @@ static int full_pixel_diamond(FULLPEL_MV start_mv,
// TODO(chiyotsai@google.com): There is another bug here where the second
// best mv gets incorrectly overwritten. Fix it later.
FULLPEL_MV tmp_best_mv;
+ FULLPEL_MV_STATS tmp_best_mv_stats;
diamond_search_sad(start_mv, start_mv_sad, ms_params, step_param + n,
&num00, &tmp_best_mv, second_best_mv);
- thissme = get_mvpred_compound_var_cost(ms_params, &tmp_best_mv);
+ thissme = get_mvpred_compound_var_cost(ms_params, &tmp_best_mv,
+ &tmp_best_mv_stats);
if (thissme < bestsme) {
bestsme = thissme;
*best_mv = tmp_best_mv;
+ *best_mv_stats = tmp_best_mv_stats;
}
if (num00) {
@@ -1658,6 +1632,7 @@ static int full_pixel_exhaustive(const FULLPEL_MV start_mv,
const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
const struct MESH_PATTERN *const mesh_patterns,
int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *mv_stats,
FULLPEL_MV *second_best_mv) {
const int kMinRange = 7;
const int kMaxRange = 256;
@@ -1717,7 +1692,7 @@ static int full_pixel_exhaustive(const FULLPEL_MV start_mv,
}
if (bestsme < INT_MAX) {
- bestsme = get_mvpred_var_cost(ms_params, best_mv);
+ bestsme = get_mvpred_var_cost(ms_params, best_mv, mv_stats);
}
// Return cost list.
@@ -1809,7 +1784,8 @@ int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
int av1_full_pixel_search(const FULLPEL_MV start_mv,
const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
const int step_param, int *cost_list,
- FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) {
+ FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats,
+ FULLPEL_MV *second_best_mv) {
const BLOCK_SIZE bsize = ms_params->bsize;
const SEARCH_METHODS search_method = ms_params->search_method;
@@ -1835,37 +1811,38 @@ int av1_full_pixel_search(const FULLPEL_MV start_mv,
switch (search_method) {
case FAST_BIGDIA:
var = fast_bigdia_search(start_mv, ms_params, step_param, 0, cost_list,
- best_mv);
+ best_mv, best_mv_stats);
break;
case VFAST_DIAMOND:
var = vfast_dia_search(start_mv, ms_params, step_param, 0, cost_list,
- best_mv);
+ best_mv, best_mv_stats);
break;
case FAST_DIAMOND:
var = fast_dia_search(start_mv, ms_params, step_param, 0, cost_list,
- best_mv);
+ best_mv, best_mv_stats);
break;
case FAST_HEX:
var = fast_hex_search(start_mv, ms_params, step_param, 0, cost_list,
- best_mv);
+ best_mv, best_mv_stats);
break;
case HEX:
- var = hex_search(start_mv, ms_params, step_param, 1, cost_list, best_mv);
+ var = hex_search(start_mv, ms_params, step_param, 1, cost_list, best_mv,
+ best_mv_stats);
break;
case SQUARE:
- var =
- square_search(start_mv, ms_params, step_param, 1, cost_list, best_mv);
+ var = square_search(start_mv, ms_params, step_param, 1, cost_list,
+ best_mv, best_mv_stats);
break;
case BIGDIA:
- var =
- bigdia_search(start_mv, ms_params, step_param, 1, cost_list, best_mv);
+ var = bigdia_search(start_mv, ms_params, step_param, 1, cost_list,
+ best_mv, best_mv_stats);
break;
case NSTEP:
case NSTEP_8PT:
case DIAMOND:
case CLAMPED_DIAMOND:
var = full_pixel_diamond(start_mv, ms_params, step_param, cost_list,
- best_mv, second_best_mv);
+ best_mv, best_mv_stats, second_best_mv);
break;
default: assert(0 && "Invalid search method.");
}
@@ -1922,13 +1899,15 @@ int av1_full_pixel_search(const FULLPEL_MV start_mv,
new_ms_params.sdx3df = new_ms_params.vfp->sdx3df;
return av1_full_pixel_search(start_mv, &new_ms_params, step_param,
- cost_list, best_mv, second_best_mv);
+ cost_list, best_mv, best_mv_stats,
+ second_best_mv);
}
}
if (run_mesh_search) {
int var_ex;
FULLPEL_MV tmp_mv_ex;
+ FULLPEL_MV_STATS tmp_mv_stats;
// Pick the mesh pattern for exhaustive search based on the toolset (intraBC
// or non-intraBC)
// TODO(chiyotsai@google.com): There is a bug here where the second best mv
@@ -1937,10 +1916,12 @@ int av1_full_pixel_search(const FULLPEL_MV start_mv,
ms_params->mesh_patterns[is_intra_mode];
// TODO(chiyotsai@google.com): the second best mv is not set correctly by
// full_pixel_exhaustive, which can incorrectly override it.
- var_ex = full_pixel_exhaustive(*best_mv, ms_params, mesh_patterns,
- cost_list, &tmp_mv_ex, second_best_mv);
+ var_ex =
+ full_pixel_exhaustive(*best_mv, ms_params, mesh_patterns, cost_list,
+ &tmp_mv_ex, &tmp_mv_stats, second_best_mv);
if (var_ex < var) {
var = var_ex;
+ *best_mv_stats = tmp_mv_stats;
*best_mv = tmp_mv_ex;
}
}
@@ -2000,7 +1981,8 @@ int av1_intrabc_hash_search(const AV1_COMP *cpi, const MACROBLOCKD *xd,
hash_mv.col = ref_block_hash.x - x_pos;
hash_mv.row = ref_block_hash.y - y_pos;
if (!av1_is_fullmv_in_range(mv_limits, hash_mv)) continue;
- const int refCost = get_mvpred_var_cost(ms_params, &hash_mv);
+ FULLPEL_MV_STATS mv_stats;
+ const int refCost = get_mvpred_var_cost(ms_params, &hash_mv, &mv_stats);
if (refCost < best_hash_cost) {
best_hash_cost = refCost;
*best_mv = hash_mv;
@@ -2011,12 +1993,27 @@ int av1_intrabc_hash_search(const AV1_COMP *cpi, const MACROBLOCKD *xd,
return best_hash_cost;
}
-static int vector_match(int16_t *ref, int16_t *src, int bwl) {
+static int vector_match(int16_t *ref, int16_t *src, int bwl, int search_size,
+ int full_search, int *sad) {
int best_sad = INT_MAX;
int this_sad;
int d;
int center, offset = 0;
- int bw = 4 << bwl; // redundant variable, to be changed in the experiments.
+ int bw = search_size << 1;
+
+ if (full_search) {
+ for (d = 0; d <= bw; d++) {
+ this_sad = aom_vector_var(&ref[d], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ offset = d;
+ }
+ }
+ center = offset;
+ *sad = best_sad;
+ return (center - (bw >> 1));
+ }
+
for (d = 0; d <= bw; d += 16) {
this_sad = aom_vector_var(&ref[d], src, bwl);
if (this_sad < best_sad) {
@@ -2072,31 +2069,47 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
center = this_pos;
}
}
-
+ *sad = best_sad;
return (center - (bw >> 1));
}
-// A special fast version of motion search used in rt mode
+// A special fast version of motion search used in rt mode.
+// The search window along columns and rows is given by:
+// +/- me_search_size_col/row.
unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, int mi_row,
- int mi_col, const MV *ref_mv) {
+ int mi_col, const MV *ref_mv,
+ unsigned int *y_sad_zero,
+ int me_search_size_col,
+ int me_search_size_row) {
+ const AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mi = xd->mi[0];
struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
- DECLARE_ALIGNED(16, int16_t, hbuf[256]);
- DECLARE_ALIGNED(16, int16_t, vbuf[256]);
- DECLARE_ALIGNED(16, int16_t, src_hbuf[128]);
- DECLARE_ALIGNED(16, int16_t, src_vbuf[128]);
int idx;
- const int bw = 4 << mi_size_wide_log2[bsize];
- const int bh = 4 << mi_size_high_log2[bsize];
- const int search_width = bw << 1;
- const int search_height = bh << 1;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+ const int full_search = is_screen;
+ const bool screen_scroll_superblock =
+ is_screen && bsize == cm->seq_params->sb_size;
+ // Keep border a multiple of 16.
+ const int border = (cpi->oxcf.border_in_pixels >> 4) << 4;
+ int search_size_width = me_search_size_col;
+ int search_size_height = me_search_size_row;
+ // Adjust based on boundary.
+ if (((mi_col << 2) - search_size_width < -border) ||
+ ((mi_col << 2) + search_size_width > cm->width + border))
+ search_size_width = border;
+ if (((mi_row << 2) - search_size_height < -border) ||
+ ((mi_row << 2) + search_size_height > cm->height + border))
+ search_size_height = border;
const int src_stride = x->plane[0].src.stride;
const int ref_stride = xd->plane[0].pre[0].stride;
uint8_t const *ref_buf, *src_buf;
int_mv *best_int_mv = &xd->mi[0]->mv[0];
unsigned int best_sad, tmp_sad, this_sad[4];
+ int best_sad_col, best_sad_row;
const int row_norm_factor = mi_size_high_log2[bsize] + 1;
const int col_norm_factor = 3 + (bw >> 5);
const YV12_BUFFER_CONFIG *scaled_ref_frame =
@@ -2129,13 +2142,29 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
}
return best_sad;
}
-
- // Set up prediction 1-D reference set
- ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
- aom_int_pro_row(hbuf, ref_buf, ref_stride, search_width, bh, row_norm_factor);
-
- ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
- aom_int_pro_col(vbuf, ref_buf, ref_stride, bw, search_height,
+ const int width_ref_buf = (search_size_width << 1) + bw;
+ const int height_ref_buf = (search_size_height << 1) + bh;
+ int16_t *hbuf = (int16_t *)aom_malloc(width_ref_buf * sizeof(*hbuf));
+ int16_t *vbuf = (int16_t *)aom_malloc(height_ref_buf * sizeof(*vbuf));
+ int16_t *src_hbuf = (int16_t *)aom_malloc(bw * sizeof(*src_hbuf));
+ int16_t *src_vbuf = (int16_t *)aom_malloc(bh * sizeof(*src_vbuf));
+ if (!hbuf || !vbuf || !src_hbuf || !src_vbuf) {
+ aom_free(hbuf);
+ aom_free(vbuf);
+ aom_free(src_hbuf);
+ aom_free(src_vbuf);
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate hbuf, vbuf, src_hbuf, or src_vbuf");
+ }
+
+ // Set up prediction 1-D reference set for rows.
+ ref_buf = xd->plane[0].pre[0].buf - search_size_width;
+ aom_int_pro_row(hbuf, ref_buf, ref_stride, width_ref_buf, bh,
+ row_norm_factor);
+
+ // Set up prediction 1-D reference set for cols.
+ ref_buf = xd->plane[0].pre[0].buf - search_size_height * ref_stride;
+ aom_int_pro_col(vbuf, ref_buf, ref_stride, bw, height_ref_buf,
col_norm_factor);
// Set up src 1-D reference set
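With the window now caller-controlled, the projection buffers are sized at run time rather than as fixed 256-entry stack arrays. A worked sizing example with hypothetical values:

    // A 64x64 block with a +/-48-pixel column window and +/-32-pixel row
    // window needs
    //   width_ref_buf  = 2 * 48 + 64 = 160 entries (row projection)
    //   height_ref_buf = 2 * 32 + 64 = 128 entries (column projection)
    // The old hbuf[256]/vbuf[256] arrays happen to suffice here, but they
    // cannot cover arbitrary me_search_size_col/row values.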
@@ -2145,9 +2174,19 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
// Find the best match per 1-D search
best_int_mv->as_fullmv.col =
- vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize]);
+ vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize], search_size_width,
+ full_search, &best_sad_col);
best_int_mv->as_fullmv.row =
- vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize]);
+ vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize], search_size_height,
+ full_search, &best_sad_row);
+
+ // For screen content: select between horizontal and vertical motion.
+ if (is_screen) {
+ if (best_sad_col < best_sad_row)
+ best_int_mv->as_fullmv.row = 0;
+ else
+ best_int_mv->as_fullmv.col = 0;
+ }
FULLPEL_MV this_mv = best_int_mv->as_fullmv;
src_buf = x->plane[0].src.buf;
@@ -2159,16 +2198,18 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
if (best_int_mv->as_int != 0) {
tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
xd->plane[0].pre[0].buf, ref_stride);
-
+ *y_sad_zero = tmp_sad;
if (tmp_sad < best_sad) {
best_int_mv->as_fullmv = kZeroFullMv;
this_mv = best_int_mv->as_fullmv;
ref_buf = xd->plane[0].pre[0].buf;
best_sad = tmp_sad;
}
+ } else {
+ *y_sad_zero = best_sad;
}
- {
+ if (!screen_scroll_superblock) {
const uint8_t *const pos[4] = {
ref_buf - ref_stride,
ref_buf - 1,
@@ -2178,33 +2219,33 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
cpi->ppi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride,
this_sad);
- }
- for (idx = 0; idx < 4; ++idx) {
- if (this_sad[idx] < best_sad) {
- best_sad = this_sad[idx];
- best_int_mv->as_fullmv.row = search_pos[idx].row + this_mv.row;
- best_int_mv->as_fullmv.col = search_pos[idx].col + this_mv.col;
+ for (idx = 0; idx < 4; ++idx) {
+ if (this_sad[idx] < best_sad) {
+ best_sad = this_sad[idx];
+ best_int_mv->as_fullmv.row = search_pos[idx].row + this_mv.row;
+ best_int_mv->as_fullmv.col = search_pos[idx].col + this_mv.col;
+ }
}
- }
- if (this_sad[0] < this_sad[3])
- this_mv.row -= 1;
- else
- this_mv.row += 1;
+ if (this_sad[0] < this_sad[3])
+ this_mv.row -= 1;
+ else
+ this_mv.row += 1;
- if (this_sad[1] < this_sad[2])
- this_mv.col -= 1;
- else
- this_mv.col += 1;
+ if (this_sad[1] < this_sad[2])
+ this_mv.col -= 1;
+ else
+ this_mv.col += 1;
- ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv);
+ ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv);
- tmp_sad =
- cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
- if (best_sad > tmp_sad) {
- best_int_mv->as_fullmv = this_mv;
- best_sad = tmp_sad;
+ tmp_sad =
+ cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+ if (best_sad > tmp_sad) {
+ best_int_mv->as_fullmv = this_mv;
+ best_sad = tmp_sad;
+ }
}
FullMvLimits mv_limits = x->mv_limits;
@@ -2218,6 +2259,10 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
}
+ aom_free(hbuf);
+ aom_free(vbuf);
+ aom_free(src_hbuf);
+ aom_free(src_vbuf);
return best_sad;
}
@@ -2960,8 +3005,9 @@ static AOM_INLINE int setup_center_error_facade(
int av1_find_best_sub_pixel_tree_pruned_more(
MACROBLOCKD *xd, const AV1_COMMON *const cm,
- const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, MV *bestmv,
- int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) {
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion,
+ unsigned int *sse1, int_mv *last_mv_search_list) {
(void)cm;
const int allow_hp = ms_params->allow_hp;
const int forced_stop = ms_params->forced_stop;
@@ -2982,8 +3028,16 @@ int av1_find_best_sub_pixel_tree_pruned_more(
? &cm->sf_identity
: xd->block_ref_scale_factors[0];
const int is_scaled = av1_is_scaled(sf);
- besterr = setup_center_error_facade(
- xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion, is_scaled);
+
+ if (start_mv_stats != NULL && !is_scaled) {
+ besterr = start_mv_stats->distortion + start_mv_stats->err_cost;
+ *distortion = start_mv_stats->distortion;
+ *sse1 = start_mv_stats->sse;
+ } else {
+ besterr =
+ setup_center_error_facade(xd, cm, bestmv, var_params, mv_cost_params,
+ sse1, distortion, is_scaled);
+ }
// If forced_stop is FULL_PEL, return.
if (forced_stop == FULL_PEL) return besterr;
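The same seeding pattern appears in each subpel entry point below. The reasoning, sketched:

    // The full-pel stats were measured against the unscaled reference at
    // the integer start position -- exactly the predictor the subpel stage
    // would rebuild for its center point. When start_mv_stats != NULL and
    // the reference is not scaled:
    //   besterr     = stats->distortion + stats->err_cost;
    //   *distortion = stats->distortion;  *sse1 = stats->sse;
    // saving one interpolation + variance call per search. A scaled
    // reference (or a NULL stats pointer) falls back to
    // setup_center_error_facade().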
@@ -3045,9 +3099,11 @@ int av1_find_best_sub_pixel_tree_pruned_more(
int av1_find_best_sub_pixel_tree_pruned(
MACROBLOCKD *xd, const AV1_COMMON *const cm,
- const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, MV *bestmv,
- int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) {
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion,
+ unsigned int *sse1, int_mv *last_mv_search_list) {
(void)cm;
+ (void)start_mv_stats;
const int allow_hp = ms_params->allow_hp;
const int forced_stop = ms_params->forced_stop;
const int iters_per_step = ms_params->iters_per_step;
@@ -3067,8 +3123,16 @@ int av1_find_best_sub_pixel_tree_pruned(
? &cm->sf_identity
: xd->block_ref_scale_factors[0];
const int is_scaled = av1_is_scaled(sf);
- besterr = setup_center_error_facade(
- xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion, is_scaled);
+
+ if (start_mv_stats != NULL && !is_scaled) {
+ besterr = start_mv_stats->distortion + start_mv_stats->err_cost;
+ *distortion = start_mv_stats->distortion;
+ *sse1 = start_mv_stats->sse;
+ } else {
+ besterr =
+ setup_center_error_facade(xd, cm, bestmv, var_params, mv_cost_params,
+ sse1, distortion, is_scaled);
+ }
// If forced_stop is FULL_PEL, return.
if (forced_stop == FULL_PEL) return besterr;
@@ -3181,9 +3245,12 @@ int av1_find_best_sub_pixel_tree_pruned(
int av1_find_best_sub_pixel_tree(MACROBLOCKD *xd, const AV1_COMMON *const cm,
const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
- MV start_mv, MV *bestmv, int *distortion,
+ MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats,
+ MV *bestmv, int *distortion,
unsigned int *sse1,
int_mv *last_mv_search_list) {
+ (void)start_mv_stats;
const int allow_hp = ms_params->allow_hp;
const int forced_stop = ms_params->forced_stop;
const int iters_per_step = ms_params->iters_per_step;
@@ -3207,12 +3274,18 @@ int av1_find_best_sub_pixel_tree(MACROBLOCKD *xd, const AV1_COMMON *const cm,
: xd->block_ref_scale_factors[0];
const int is_scaled = av1_is_scaled(sf);
- if (subpel_search_type != USE_2_TAPS_ORIG) {
- besterr = upsampled_setup_center_error(xd, cm, bestmv, var_params,
- mv_cost_params, sse1, distortion);
+ if (start_mv_stats != NULL && !is_scaled) {
+ besterr = start_mv_stats->distortion + start_mv_stats->err_cost;
+ *distortion = start_mv_stats->distortion;
+ *sse1 = start_mv_stats->sse;
} else {
- besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
- distortion);
+ if (subpel_search_type != USE_2_TAPS_ORIG) {
+ besterr = upsampled_setup_center_error(xd, cm, bestmv, var_params,
+ mv_cost_params, sse1, distortion);
+ } else {
+ besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
+ distortion);
+ }
}
// If forced_stop is FULL_PEL, return.
@@ -3255,12 +3328,14 @@ int av1_find_best_sub_pixel_tree(MACROBLOCKD *xd, const AV1_COMMON *const cm,
// Returns the maximum MV.
int av1_return_max_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
- MV start_mv, MV *bestmv, int *distortion,
- unsigned int *sse1,
+ MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats,
+ MV *bestmv, int *distortion, unsigned int *sse1,
int_mv *last_mv_search_list) {
(void)xd;
(void)cm;
(void)start_mv;
+ (void)start_mv_stats;
(void)sse1;
(void)distortion;
(void)last_mv_search_list;
@@ -3282,12 +3357,14 @@ int av1_return_max_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
// Returns the minimum MV.
int av1_return_min_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
- MV start_mv, MV *bestmv, int *distortion,
- unsigned int *sse1,
+ MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats,
+ MV *bestmv, int *distortion, unsigned int *sse1,
int_mv *last_mv_search_list) {
(void)xd;
(void)cm;
(void)start_mv;
+ (void)start_mv_stats;
(void)sse1;
(void)distortion;
(void)last_mv_search_list;
@@ -3814,9 +3891,11 @@ static AOM_FORCE_INLINE void obmc_second_level_check_v2(
int av1_find_best_obmc_sub_pixel_tree_up(
MACROBLOCKD *xd, const AV1_COMMON *const cm,
- const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, MV *bestmv,
- int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) {
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion,
+ unsigned int *sse1, int_mv *last_mv_search_list) {
(void)last_mv_search_list;
+ (void)start_mv_stats;
const int allow_hp = ms_params->allow_hp;
const int forced_stop = ms_params->forced_stop;
const int iters_per_step = ms_params->iters_per_step;
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index 6b9af07bb..87b9309b6 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -140,13 +140,19 @@ typedef struct {
aom_sad_multi_d_fn_t sdx3df;
} FULLPEL_MOTION_SEARCH_PARAMS;
+typedef struct {
+ int err_cost;
+ unsigned int distortion;
+ unsigned int sse;
+} FULLPEL_MV_STATS;
+
void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer);
void av1_make_default_fullpel_ms_params(
FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv, FULLPEL_MV start_mv,
const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
- int fine_search_interval);
+ SEARCH_METHODS search_method, int fine_search_interval);
/*! Sets the \ref FULLPEL_MOTION_SEARCH_PARAMS to intra mode. */
void av1_set_ms_to_intra_mode(FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
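A usage sketch of the updated full-pel API, patterned on the call sites in motion_search_facade.c (surrounding setup elided):

    FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
    FULLPEL_MV_STATS best_mv_stats;
    FULLPEL_MV best_mv, second_best_mv;
    av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
                                       &ref_mv, start_mv, src_search_sites,
                                       search_method,
                                       /*fine_search_interval=*/0);
    const int bestsme =
        av1_full_pixel_search(start_mv, &full_ms_params, step_param,
                              cost_list, &best_mv, &best_mv_stats,
                              &second_best_mv);
    // best_mv_stats can then seed find_fractional_mv_step() so the subpel
    // center error is not recomputed when the reference is unscaled.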
@@ -256,10 +262,10 @@ void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv);
int av1_init_search_range(int size);
-unsigned int av1_int_pro_motion_estimation(const struct AV1_COMP *cpi,
- MACROBLOCK *x, BLOCK_SIZE bsize,
- int mi_row, int mi_col,
- const MV *ref_mv);
+unsigned int av1_int_pro_motion_estimation(
+ const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, const MV *ref_mv, unsigned int *y_sad_zero,
+ int me_search_size_col, int me_search_size_row);
int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
const FULLPEL_MV start_mv, FULLPEL_MV *best_mv);
@@ -267,7 +273,8 @@ int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
int av1_full_pixel_search(const FULLPEL_MV start_mv,
const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
const int step_param, int *cost_list,
- FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv);
+ FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats,
+ FULLPEL_MV *second_best_mv);
int av1_intrabc_hash_search(const struct AV1_COMP *cpi, const MACROBLOCKD *xd,
const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
@@ -325,7 +332,9 @@ void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
typedef int(fractional_mv_step_fp)(MACROBLOCKD *xd, const AV1_COMMON *const cm,
const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
- MV start_mv, MV *bestmv, int *distortion,
+ MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats,
+ MV *bestmv, int *distortion,
unsigned int *sse1,
int_mv *last_mv_search_list);
diff --git a/av1/encoder/motion_search_facade.c b/av1/encoder/motion_search_facade.c
index b8c000b03..e7eec29dc 100644
--- a/av1/encoder/motion_search_facade.c
+++ b/av1/encoder/motion_search_facade.c
@@ -239,14 +239,16 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
// the stride of the reference frame can be different from indicated by
// MotionVectorSearchParams::search_site_cfg. When this happens, we need to
// readjust the stride.
- const SEARCH_METHODS search_method = cpi->sf.mv_sf.search_method;
+ const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, mv_sf, bsize);
const search_site_config *src_search_site_cfg =
av1_get_search_site_config(cpi, x, search_method);
// Further reduce the search range.
if (search_range < INT_MAX) {
const search_site_config *search_site_cfg =
- &src_search_site_cfg[search_method_lookup[cpi->sf.mv_sf.search_method]];
+ &src_search_site_cfg[search_method_lookup[search_method]];
// Max step_param is search_site_cfg->num_search_steps.
if (search_range < 1) {
step_param = search_site_cfg->num_search_steps;
@@ -259,6 +261,7 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
}
int cost_list[5];
+ FULLPEL_MV_STATS best_mv_stats;
int_mv second_best_mv;
best_mv->as_int = second_best_mv.as_int = INVALID_MV;
@@ -273,21 +276,23 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
for (int m = 0; m < cand_cnt; m++) {
int_mv smv = cand[m].fmv;
FULLPEL_MV this_best_mv, this_second_best_mv;
+ FULLPEL_MV_STATS this_mv_stats;
if (smv.as_int == INVALID_MV) continue;
av1_make_default_fullpel_ms_params(
&full_ms_params, cpi, x, bsize, &ref_mv, smv.as_fullmv,
- src_search_site_cfg, fine_search_interval);
+ src_search_site_cfg, search_method, fine_search_interval);
const int thissme =
av1_full_pixel_search(smv.as_fullmv, &full_ms_params, step_param,
cond_cost_list(cpi, cost_list), &this_best_mv,
- &this_second_best_mv);
+ &this_mv_stats, &this_second_best_mv);
if (thissme < bestsme) {
bestsme = thissme;
best_mv->as_fullmv = this_best_mv;
+ best_mv_stats = this_mv_stats;
second_best_mv.as_fullmv = this_second_best_mv;
}
@@ -298,7 +303,7 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
case OBMC_CAUSAL:
av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
&ref_mv, start_mv, src_search_site_cfg,
- fine_search_interval);
+ search_method, fine_search_interval);
bestsme = av1_obmc_full_pixel_search(start_mv, &full_ms_params,
step_param, &best_mv->as_fullmv);
@@ -385,13 +390,13 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
switch (mbmi->motion_mode) {
case SIMPLE_TRANSLATION:
- if (cpi->sf.mv_sf.use_accurate_subpel_search) {
+ if (mv_sf->use_accurate_subpel_search) {
const int try_second = second_best_mv.as_int != INVALID_MV &&
second_best_mv.as_int != best_mv->as_int &&
- (cpi->sf.mv_sf.disable_second_mv <= 1);
+ (mv_sf->disable_second_mv <= 1);
const int best_mv_var = mv_search_params->find_fractional_mv_step(
- xd, cm, &ms_params, subpel_start_mv, &best_mv->as_mv, &dis,
- &x->pred_sse[ref], fractional_ms_list);
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats,
+ &best_mv->as_mv, &dis, &x->pred_sse[ref], fractional_ms_list);
if (try_second) {
struct macroblockd_plane *p = xd->plane;
@@ -400,7 +405,7 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
{ p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
};
int64_t rd = INT64_MAX;
- if (!cpi->sf.mv_sf.disable_second_mv) {
+ if (!mv_sf->disable_second_mv) {
// Calculate actual rd cost.
mbmi->mv[0].as_mv = best_mv->as_mv;
av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
@@ -423,10 +428,10 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
subpel_start_mv)) {
unsigned int sse;
const int this_var = mv_search_params->find_fractional_mv_step(
- xd, cm, &ms_params, subpel_start_mv, &this_best_mv, &dis,
- &sse, fractional_ms_list);
+ xd, cm, &ms_params, subpel_start_mv, NULL, &this_best_mv,
+ &dis, &sse, fractional_ms_list);
- if (!cpi->sf.mv_sf.disable_second_mv) {
+ if (!mv_sf->disable_second_mv) {
// If cpi->sf.mv_sf.disable_second_mv is 0, use actual rd cost
// to choose the better MV.
mbmi->mv[0].as_mv = this_best_mv;
@@ -459,14 +464,14 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
}
} else {
mv_search_params->find_fractional_mv_step(
- xd, cm, &ms_params, subpel_start_mv, &best_mv->as_mv, &dis,
- &x->pred_sse[ref], NULL);
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats,
+ &best_mv->as_mv, &dis, &x->pred_sse[ref], NULL);
}
break;
case OBMC_CAUSAL:
- av1_find_best_obmc_sub_pixel_tree_up(xd, cm, &ms_params,
- subpel_start_mv, &best_mv->as_mv,
- &dis, &x->pred_sse[ref], NULL);
+ av1_find_best_obmc_sub_pixel_tree_up(
+ xd, cm, &ms_params, subpel_start_mv, NULL, &best_mv->as_mv, &dis,
+ &x->pred_sse[ref], NULL);
break;
default: assert(0 && "Invalid motion mode!\n");
}
@@ -621,25 +626,28 @@ int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
// Make motion search params
FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
- const SEARCH_METHODS search_method = cpi->sf.mv_sf.search_method;
+ FULLPEL_MV_STATS best_mv_stats;
+ const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, mv_sf, bsize);
const search_site_config *src_search_sites =
av1_get_search_site_config(cpi, x, search_method);
// Use the mv result from the single mode as mv predictor.
const FULLPEL_MV start_fullmv = get_fullmv_from_mv(&cur_mv[id].as_mv);
av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
&ref_mv[id].as_mv, start_fullmv,
- src_search_sites,
+ src_search_sites, search_method,
/*fine_search_interval=*/0);
av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask,
mask_stride, id);
// Small-range full-pixel motion search.
- if (!cpi->sf.mv_sf.disable_extensive_joint_motion_search &&
+ if (!mv_sf->disable_extensive_joint_motion_search &&
mbmi->interinter_comp.type != COMPOUND_WEDGE) {
- bestsme =
- av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL,
- &best_mv.as_fullmv, &second_best_mv.as_fullmv);
+ bestsme = av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL,
+ &best_mv.as_fullmv, &best_mv_stats,
+ &second_best_mv.as_fullmv);
} else {
bestsme = av1_refining_search_8p_c(&full_ms_params, start_fullmv,
&best_mv.as_fullmv);
@@ -683,15 +691,15 @@ int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv));
bestsme = cpi->mv_search_params.find_fractional_mv_step(
- xd, cm, &ms_params, start_mv, &best_mv.as_mv, &dis, &sse, NULL);
+ xd, cm, &ms_params, start_mv, NULL, &best_mv.as_mv, &dis, &sse, NULL);
if (try_second) {
MV this_best_mv;
MV subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv);
if (av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)) {
const int thissme = cpi->mv_search_params.find_fractional_mv_step(
- xd, cm, &ms_params, subpel_start_mv, &this_best_mv, &dis, &sse,
- NULL);
+ xd, cm, &ms_params, subpel_start_mv, NULL, &this_best_mv, &dis,
+ &sse, NULL);
if (thissme < bestsme) {
best_mv.as_mv = this_best_mv;
bestsme = thissme;
@@ -775,14 +783,16 @@ int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
// Make motion search params
FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
- const SEARCH_METHODS search_method = cpi->sf.mv_sf.search_method;
+ FULLPEL_MV_STATS best_mv_stats;
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize);
const search_site_config *src_search_sites =
av1_get_search_site_config(cpi, x, search_method);
// Use the mv result from the single mode as mv predictor.
const FULLPEL_MV start_fullmv = get_fullmv_from_mv(this_mv);
av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
&ref_mv.as_mv, start_fullmv,
- src_search_sites,
+ src_search_sites, search_method,
/*fine_search_interval=*/0);
av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask,
@@ -790,7 +800,7 @@ int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
// Small-range full-pixel motion search.
bestsme = av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL,
- &best_mv.as_fullmv, NULL);
+ &best_mv.as_fullmv, &best_mv_stats, NULL);
if (scaled_ref_frame) {
// Swap back the original buffers for subpel motion search for the 0th slot.
@@ -816,7 +826,8 @@ int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv));
bestsme = cpi->mv_search_params.find_fractional_mv_step(
- xd, cm, &ms_params, start_mv, &best_mv.as_mv, &dis, &sse, NULL);
+ xd, cm, &ms_params, start_mv, &best_mv_stats, &best_mv.as_mv, &dis,
+ &sse, NULL);
}
// Restore the pointer to the first unscaled prediction buffer.
@@ -954,10 +965,12 @@ int av1_interinter_compound_motion_search(const AV1_COMP *const cpi,
return tmp_rate_mv;
}
-int_mv av1_simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
- int mi_col, BLOCK_SIZE bsize, int ref,
- FULLPEL_MV start_mv, int num_planes,
- int use_subpixel) {
+int_mv av1_simple_motion_search_sse_var(AV1_COMP *const cpi, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int ref,
+ FULLPEL_MV start_mv, int num_planes,
+ int use_subpixel, unsigned int *sse,
+ unsigned int *var) {
assert(num_planes == 1 &&
"Currently simple_motion_search only supports luma plane");
assert(!frame_is_intra_only(&cpi->common) &&
@@ -986,8 +999,9 @@ int_mv av1_simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
MAX_MVSEARCH_STEPS - 2);
int cost_list[5];
const int ref_idx = 0;
- int var;
+ int bestsme;
int_mv best_mv;
+ FULLPEL_MV_STATS best_mv_stats;
av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col,
get_ref_scale_factors(cm, ref), num_planes);
@@ -1001,20 +1015,23 @@ int_mv av1_simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
// Allow more mesh searches for screen content type on the ARF.
const int fine_search_interval = use_fine_search_interval(cpi);
FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
- const SEARCH_METHODS search_method = cpi->sf.mv_sf.search_method;
+ const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, mv_sf, bsize);
const search_site_config *src_search_sites =
av1_get_search_site_config(cpi, x, search_method);
av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv,
- start_mv, src_search_sites,
+ start_mv, src_search_sites, search_method,
fine_search_interval);
- var = av1_full_pixel_search(start_mv, &full_ms_params, step_param,
- cond_cost_list(cpi, cost_list),
- &best_mv.as_fullmv, NULL);
+ bestsme = av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+ cond_cost_list(cpi, cost_list),
+ &best_mv.as_fullmv, &best_mv_stats, NULL);
const int use_subpel_search =
- var < INT_MAX && !cpi->common.features.cur_frame_force_integer_mv &&
- use_subpixel;
+ bestsme < INT_MAX && !cpi->common.features.cur_frame_force_integer_mv &&
+ use_subpixel &&
+ (cpi->sf.mv_sf.simple_motion_subpel_force_stop != FULL_PEL);
if (scaled_ref_frame) {
xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
}
@@ -1025,50 +1042,30 @@ int_mv av1_simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
cost_list);
// TODO(yunqing): integrate this into av1_make_default_subpel_ms_params().
- ms_params.forced_stop = cpi->sf.mv_sf.simple_motion_subpel_force_stop;
+ ms_params.forced_stop = mv_sf->simple_motion_subpel_force_stop;
MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
cpi->mv_search_params.find_fractional_mv_step(
- xd, cm, &ms_params, subpel_start_mv, &best_mv.as_mv, &not_used,
- &x->pred_sse[ref], NULL);
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &best_mv.as_mv,
+ &not_used, &x->pred_sse[ref], NULL);
+
+ mbmi->mv[0] = best_mv;
+
+ // Get a copy of the prediction output
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ *var = cpi->ppi->fn_ptr[bsize].vf(
+ x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf,
+ xd->plane[0].dst.stride, sse);
} else {
// Manually convert from units of pixel to 1/8-pixels if we are not doing
// subpel search
convert_fullmv_to_mv(&best_mv);
+ *var = best_mv_stats.distortion;
+ *sse = best_mv_stats.sse;
}
- mbmi->mv[0] = best_mv;
-
- // Get a copy of the prediction output
- av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
- AOM_PLANE_Y, AOM_PLANE_Y);
-
- if (scaled_ref_frame) {
- xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
- }
-
- return best_mv;
-}
-
-int_mv av1_simple_motion_sse_var(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
- int mi_col, BLOCK_SIZE bsize,
- const FULLPEL_MV start_mv, int use_subpixel,
- unsigned int *sse, unsigned int *var) {
- MACROBLOCKD *xd = &x->e_mbd;
- const MV_REFERENCE_FRAME ref =
- cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
-
- int_mv best_mv = av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref,
- start_mv, 1, use_subpixel);
-
- const uint8_t *src = x->plane[0].src.buf;
- const int src_stride = x->plane[0].src.stride;
- const uint8_t *dst = xd->plane[0].dst.buf;
- const int dst_stride = xd->plane[0].dst.stride;
-
- *var = cpi->ppi->fn_ptr[bsize].vf(src, src_stride, dst, dst_stride, sse);
-
return best_mv;
}
diff --git a/av1/encoder/motion_search_facade.h b/av1/encoder/motion_search_facade.h
index d2996bc52..d1fa915bc 100644
--- a/av1/encoder/motion_search_facade.h
+++ b/av1/encoder/motion_search_facade.h
@@ -59,18 +59,14 @@ int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
int *rate_mv, int ref_idx);
// Performs a motion search in SIMPLE_TRANSLATION mode using reference frame
-// ref. Note that this sets the offset of mbmi, so we will need to reset it
-// after calling this function.
-int_mv av1_simple_motion_search(struct AV1_COMP *const cpi, MACROBLOCK *x,
- int mi_row, int mi_col, BLOCK_SIZE bsize,
- int ref, FULLPEL_MV start_mv, int num_planes,
- int use_subpixel);
-
-// Performs a simple motion search to calculate the sse and var of the residue
-int_mv av1_simple_motion_sse_var(struct AV1_COMP *cpi, MACROBLOCK *x,
- int mi_row, int mi_col, BLOCK_SIZE bsize,
- const FULLPEL_MV start_mv, int use_subpixel,
- unsigned int *sse, unsigned int *var);
+// ref and calculates the sse and var of the residue. Note that this sets the
+// offset of mbmi, so we will need to reset it after calling this function.
+int_mv av1_simple_motion_search_sse_var(struct AV1_COMP *cpi, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int ref,
+ const FULLPEL_MV start_mv,
+ int num_planes, int use_subpixel,
+ unsigned int *sse, unsigned int *var);
static AOM_INLINE const search_site_config *av1_get_search_site_config(
const AV1_COMP *cpi, MACROBLOCK *x, SEARCH_METHODS search_method) {
@@ -101,6 +97,47 @@ static AOM_INLINE const search_site_config *av1_get_search_site_config(
return x->search_site_cfg_buf;
}
+static AOM_INLINE SEARCH_METHODS
+av1_get_faster_search_method(SEARCH_METHODS search_method) {
+ // Note on search method's accuracy:
+ // 1. NSTEP
+ // 2. DIAMOND
+ // 3. BIGDIA \approx SQUARE
+ // 4. HEX
+ // 5. FAST_HEX \approx FAST_DIAMOND
+ switch (search_method) {
+ case NSTEP: return DIAMOND;
+ case NSTEP_8PT: return DIAMOND;
+ case DIAMOND: return BIGDIA;
+ case CLAMPED_DIAMOND: return BIGDIA;
+ case BIGDIA: return HEX;
+ case SQUARE: return HEX;
+ case HEX: return FAST_HEX;
+ case FAST_HEX: return FAST_HEX;
+ case FAST_DIAMOND: return VFAST_DIAMOND;
+ case FAST_BIGDIA: return FAST_BIGDIA;
+ case VFAST_DIAMOND: return VFAST_DIAMOND;
+ default: assert(0 && "Invalid search method!"); return DIAMOND;
+ }
+}
+
+static AOM_INLINE SEARCH_METHODS av1_get_default_mv_search_method(
+ const MACROBLOCK *x, const MV_SPEED_FEATURES *mv_sf, BLOCK_SIZE bsize) {
+ SEARCH_METHODS search_method = mv_sf->search_method;
+ const int sf_blk_search_method = mv_sf->use_bsize_dependent_search_method;
+ const int min_dim = AOMMIN(block_size_wide[bsize], block_size_high[bsize]);
+ const int qband = x->qindex >> (QINDEX_BITS - 2);
+ const bool use_faster_search_method =
+ (sf_blk_search_method == 1 && min_dim >= 32) ||
+ (sf_blk_search_method >= 2 && min_dim >= 16 &&
+ x->content_state_sb.source_sad_nonrd <= kMedSad && qband < 3);
+
+ if (use_faster_search_method) {
+ search_method = av1_get_faster_search_method(search_method);
+ }
+ return search_method;
+}
+
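A worked example of the gating in av1_get_default_mv_search_method(), with hypothetical inputs (QINDEX_BITS is 8 in AV1, so qband = qindex >> 6):

    //   mv_sf->search_method                     = NSTEP
    //   mv_sf->use_bsize_dependent_search_method = 2
    //   bsize = BLOCK_16X16                      -> min_dim = 16
    //   x->qindex = 100                          -> qband = 1 (< 3)
    //   x->content_state_sb.source_sad_nonrd    <= kMedSad
    // The second clause holds, so the method is demoted one accuracy
    // level: av1_get_faster_search_method(NSTEP) == DIAMOND.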
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/encoder/nonrd_opt.h b/av1/encoder/nonrd_opt.h
index 36a0c236f..a53578eba 100644
--- a/av1/encoder/nonrd_opt.h
+++ b/av1/encoder/nonrd_opt.h
@@ -103,6 +103,8 @@ typedef struct {
int use_ref_frame_mask[REF_FRAMES];
//! Array to hold flags of evaluated modes for each reference frame
uint8_t mode_checked[MB_MODE_COUNT][REF_FRAMES];
+ //! Array of flags indicating whether a scaled reference frame is used.
+ bool use_scaled_ref_frame[REF_FRAMES];
} InterModeSearchStateNonrd;
static const uint8_t b_width_log2_lookup[BLOCK_SIZES] = { 0, 0, 1, 1, 1, 2,
@@ -412,30 +414,35 @@ static INLINE int get_model_rd_flag(const AV1_COMP *cpi, const MACROBLOCKD *xd,
* data for the current macroblock
* \param[in] ref_frame Reference frame for which to find
* ref MVs
- * \param[in] frame_mv Predicted MVs for a block
+ * \param[out] frame_mv Predicted MVs for a block
* \param[in] yv12_mb Buffer to hold predicted block
* \param[in] bsize Current block size
* \param[in] force_skip_low_temp_var Flag indicating possible mode search
* prune for low temporal variance block
* \param[in] skip_pred_mv Flag indicating to skip av1_mv_pred
+ * \param[out] use_scaled_ref_frame Flag indicating whether a scaled
+ * reference frame is used.
*
* \remark Nothing is returned. Instead, predicted MVs are placed into
- * \c frame_mv array
+ * the \c frame_mv array, and \c use_scaled_ref_frame is set.
*/
-static INLINE void find_predictors(AV1_COMP *cpi, MACROBLOCK *x,
- MV_REFERENCE_FRAME ref_frame,
- int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES],
- struct buf_2d yv12_mb[8][MAX_MB_PLANE],
- BLOCK_SIZE bsize,
- int force_skip_low_temp_var,
- int skip_pred_mv) {
+static INLINE void find_predictors(
+ AV1_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
+ int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES],
+ struct buf_2d yv12_mb[8][MAX_MB_PLANE], BLOCK_SIZE bsize,
+ int force_skip_low_temp_var, int skip_pred_mv, bool *use_scaled_ref_frame) {
AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
- const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref_frame);
+ const YV12_BUFFER_CONFIG *ref = get_ref_frame_yv12_buf(cm, ref_frame);
+ const bool ref_is_scaled =
+ ref->y_crop_height != cm->height || ref->y_crop_width != cm->width;
+ const YV12_BUFFER_CONFIG *scaled_ref =
+ av1_get_scaled_ref_frame(cpi, ref_frame);
+ const YV12_BUFFER_CONFIG *yv12 =
+ ref_is_scaled && scaled_ref ? scaled_ref : ref;
const int num_planes = av1_num_planes(cm);
-
x->pred_mv_sad[ref_frame] = INT_MAX;
x->pred_mv0_sad[ref_frame] = INT_MAX;
x->pred_mv1_sad[ref_frame] = INT_MAX;
@@ -443,8 +450,8 @@ static INLINE void find_predictors(AV1_COMP *cpi, MACROBLOCK *x,
// TODO(kyslov): this needs various further optimizations; to be continued.
assert(yv12 != NULL);
if (yv12 != NULL) {
- const struct scale_factors *const sf =
- get_ref_scale_factors_const(cm, ref_frame);
+ struct scale_factors *const sf =
+ scaled_ref ? NULL : get_ref_scale_factors(cm, ref_frame);
av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes);
av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
@@ -457,7 +464,7 @@ static INLINE void find_predictors(AV1_COMP *cpi, MACROBLOCK *x,
&frame_mv[NEARESTMV][ref_frame], &frame_mv[NEARMV][ref_frame], 0);
frame_mv[GLOBALMV][ref_frame] = mbmi_ext->global_mvs[ref_frame];
// Early exit for non-LAST frame if force_skip_low_temp_var is set.
- if (!av1_is_scaled(sf) && bsize >= BLOCK_8X8 && !skip_pred_mv &&
+ if (!ref_is_scaled && bsize >= BLOCK_8X8 && !skip_pred_mv &&
!(force_skip_low_temp_var && ref_frame != LAST_FRAME)) {
av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
bsize);
@@ -467,6 +474,7 @@ static INLINE void find_predictors(AV1_COMP *cpi, MACROBLOCK *x,
av1_count_overlappable_neighbors(cm, xd);
}
mbmi->num_proj_ref = 1;
+ *use_scaled_ref_frame = ref_is_scaled && scaled_ref;
}
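find_predictors() now resolves the scaled-reference choice once, which is what lets combined_motion_search() below drop its per-call buffer swap. Condensed, the rule introduced above is:

    // Use the pre-scaled copy only when the reference resolution differs
    // from the current frame and such a copy exists; that copy already
    // matches the frame size, so no scale factors are needed.
    const YV12_BUFFER_CONFIG *yv12 =
        (ref_is_scaled && scaled_ref) ? scaled_ref : ref;
    struct scale_factors *const sf =
        scaled_ref ? NULL : get_ref_scale_factors(cm, ref_frame);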
static INLINE void init_mbmi_nonrd(MB_MODE_INFO *mbmi,
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index 24a526496..9be3237eb 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -179,8 +179,6 @@ static bool use_aggressive_subpel_search_method(MACROBLOCK *x,
* \param[in] x Pointer to structure holding all the
* data for the current macroblock
* \param[in] bsize Current block size
- * \param[in] mi_row Row index in 4x4 units
- * \param[in] mi_col Column index in 4x4 units
* \param[in] tmp_mv Pointer to best found New MV
* \param[in] rate_mv Pointer to Rate of the best new MV
* \param[in] best_rd_sofar RD Cost of the best mode found so far
@@ -192,15 +190,13 @@ static bool use_aggressive_subpel_search_method(MACROBLOCK *x,
* Rate estimation for this vector is placed to \c rate_mv
*/
static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE bsize, int mi_row, int mi_col,
- int_mv *tmp_mv, int *rate_mv,
- int64_t best_rd_sofar, int use_base_mv) {
+ BLOCK_SIZE bsize, int_mv *tmp_mv,
+ int *rate_mv, int64_t best_rd_sofar,
+ int use_base_mv) {
MACROBLOCKD *xd = &x->e_mbd;
const AV1_COMMON *cm = &cpi->common;
- const int num_planes = av1_num_planes(cm);
const SPEED_FEATURES *sf = &cpi->sf;
MB_MODE_INFO *mi = xd->mi[0];
- struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
int step_param = (sf->rt_sf.fullpel_search_step_param)
? sf->rt_sf.fullpel_search_step_param
: cpi->mv_search_params.mv_step_param;
@@ -212,19 +208,6 @@ static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
int rv = 0;
int cost_list[5];
int search_subpel = 1;
- const YV12_BUFFER_CONFIG *scaled_ref_frame =
- av1_get_scaled_ref_frame(cpi, ref);
-
- if (scaled_ref_frame) {
- int plane;
- // Swap out the reference frame for a version that's been scaled to
- // match the resolution of the current frame, allowing the existing
- // motion search code to be used without additional modifications.
- for (plane = 0; plane < MAX_MB_PLANE; plane++)
- backup_yv12[plane] = xd->plane[plane].pre[0];
- av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL,
- num_planes);
- }
start_mv = get_fullmv_from_mv(&ref_mv);
@@ -233,17 +216,19 @@ static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
else
center_mv = tmp_mv->as_mv;
- const SEARCH_METHODS search_method = sf->mv_sf.search_method;
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize);
const search_site_config *src_search_sites =
av1_get_search_site_config(cpi, x, search_method);
FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ FULLPEL_MV_STATS best_mv_stats;
av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &center_mv,
- start_mv, src_search_sites,
+ start_mv, src_search_sites, search_method,
/*fine_search_interval=*/0);
const unsigned int full_var_rd = av1_full_pixel_search(
start_mv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list),
- &tmp_mv->as_fullmv, NULL);
+ &tmp_mv->as_fullmv, &best_mv_stats, NULL);
  // Calculate the bit cost of the motion vector.
MV mvp_full = get_mv_from_fullmv(&tmp_mv->as_fullmv);
@@ -272,22 +257,17 @@ static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
// adaptively downgrade subpel search method based on block properties
if (use_aggressive_subpel_search_method(
x, sf->rt_sf.use_adaptive_subpel_search, fullpel_performed_well))
- av1_find_best_sub_pixel_tree_pruned_more(xd, cm, &ms_params,
- subpel_start_mv, &tmp_mv->as_mv,
- &dis, &x->pred_sse[ref], NULL);
+ av1_find_best_sub_pixel_tree_pruned_more(
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &tmp_mv->as_mv,
+ &dis, &x->pred_sse[ref], NULL);
else
cpi->mv_search_params.find_fractional_mv_step(
- xd, cm, &ms_params, subpel_start_mv, &tmp_mv->as_mv, &dis,
- &x->pred_sse[ref], NULL);
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &tmp_mv->as_mv,
+ &dis, &x->pred_sse[ref], NULL);
*rate_mv =
av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->mv_costs->nmv_joint_cost,
x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
}
-
- if (scaled_ref_frame) {
- for (int plane = 0; plane < MAX_MB_PLANE; plane++)
- xd->plane[plane].pre[0] = backup_yv12[plane];
- }
  // The final MV cannot be equal to the reference MV, as this will trigger an
// assert later. This can happen if both NEAREST and NEAR modes were skipped.
rv = (tmp_mv->as_mv.col != ref_mv.col || tmp_mv->as_mv.row != ref_mv.row);
@@ -331,6 +311,7 @@ static int search_new_mv(AV1_COMP *cpi, MACROBLOCK *x,
MB_MODE_INFO *const mi = xd->mi[0];
AV1_COMMON *cm = &cpi->common;
int_mv *this_ref_frm_newmv = &frame_mv[NEWMV][ref_frame];
+ unsigned int y_sad_zero;
if (ref_frame > LAST_FRAME && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
gf_temporal_ref) {
int tmp_sad;
@@ -338,9 +319,12 @@ static int search_new_mv(AV1_COMP *cpi, MACROBLOCK *x,
if (bsize < BLOCK_16X16) return -1;
+ int me_search_size_col = block_size_wide[bsize] >> 1;
+ int me_search_size_row = block_size_high[bsize] >> 1;
tmp_sad = av1_int_pro_motion_estimation(
cpi, x, bsize, mi_row, mi_col,
- &x->mbmi_ext.ref_mv_stack[ref_frame][0].this_mv.as_mv);
+ &x->mbmi_ext.ref_mv_stack[ref_frame][0].this_mv.as_mv, &y_sad_zero,
+ me_search_size_col, me_search_size_row);
if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1;
@@ -363,7 +347,7 @@ static int search_new_mv(AV1_COMP *cpi, MACROBLOCK *x,
MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv));
cpi->mv_search_params.find_fractional_mv_step(
- xd, cm, &ms_params, start_mv, &best_mv.as_mv, &dis,
+ xd, cm, &ms_params, start_mv, NULL, &best_mv.as_mv, &dis,
&x->pred_sse[ref_frame], NULL);
this_ref_frm_newmv->as_int = best_mv.as_int;
@@ -378,9 +362,8 @@ static int search_new_mv(AV1_COMP *cpi, MACROBLOCK *x,
*rate_mv = av1_mv_bit_cost(&this_ref_frm_newmv->as_mv, &ref_mv,
x->mv_costs->nmv_joint_cost,
x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
- } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
- &frame_mv[NEWMV][ref_frame], rate_mv,
- best_rdc->rdcost, 0)) {
+ } else if (!combined_motion_search(cpi, x, bsize, &frame_mv[NEWMV][ref_frame],
+ rate_mv, best_rdc->rdcost, 0)) {
return -1;
}
@@ -1689,7 +1672,7 @@ static AOM_INLINE int is_same_gf_and_last_scale(AV1_COMMON *cm) {
static AOM_INLINE void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x,
MB_MODE_INFO *mi, int mi_row,
- int mi_col, int bsize,
+ int mi_col, BLOCK_SIZE bsize,
int gf_temporal_ref,
int use_ref_frame[],
int *force_skip_low_temp_var) {
@@ -1804,10 +1787,10 @@ static AOM_INLINE void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x,
use_ref_frame[ALTREF_FRAME] = use_alt_ref_frame;
use_ref_frame[GOLDEN_FRAME] = use_golden_ref_frame;
use_ref_frame[LAST_FRAME] = use_last_ref_frame;
- // For now keep this assert on, but we should remove it for svc mode,
- // as the user may want to generate an intra-only frame (no inter-modes).
- // Remove this assert in subsequent CL when nonrd_pickmode is tested for the
- // case of intra-only frame (no references enabled).
+  // Keep this assert on: only 3 references are used in nonrd_pickmode
+  // (LAST, GOLDEN, ALTREF). If none of them is enabled by the user, the
+  // frame must be intra-only and should never reach this inter-frame
+  // pickmode.
assert(use_last_ref_frame || use_golden_ref_frame || use_alt_ref_frame);
}
@@ -1918,6 +1901,14 @@ static AOM_INLINE int skip_mode_by_bsize_and_ref_frame(
return 0;
}
+static void set_block_source_sad(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ struct buf_2d *yv12_mb) {
+ struct macroblock_plane *const p = &x->plane[0];
+ const int y_sad = cpi->ppi->fn_ptr[bsize].sdf(p->src.buf, p->src.stride,
+ yv12_mb->buf, yv12_mb->stride);
+ if (y_sad == 0) x->block_is_zero_sad = 1;
+}
+
static void set_color_sensitivity(AV1_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, int y_sad,
unsigned int source_variance,
@@ -1925,10 +1916,24 @@ static void set_color_sensitivity(AV1_COMP *cpi, MACROBLOCK *x,
const int subsampling_x = cpi->common.seq_params->subsampling_x;
const int subsampling_y = cpi->common.seq_params->subsampling_y;
const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd;
+ const int high_res = cpi->common.width * cpi->common.height >= 640 * 360;
+ if (bsize == cpi->common.seq_params->sb_size) {
+    // At the superblock level color_sensitivity is already set to 0, 1, or
+    // 2, where 2 is the middle/uncertain level. To avoid additional SAD
+    // computations when bsize == sb_size, collapse level 2 to 1 (color
+    // certain) for motion areas and to 0 otherwise.
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 2) {
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] =
+ source_sad_nonrd >= kMedSad ? 1 : 0;
+ }
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 2) {
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] =
+ source_sad_nonrd >= kMedSad ? 1 : 0;
+ }
+ return;
+ }
int shift = 3;
- if (source_sad_nonrd >= kMedSad &&
- cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
- cpi->common.width * cpi->common.height >= 640 * 360)
+ if (source_sad_nonrd >= kMedSad && x->source_variance > 0 && high_res)
shift = 4;
if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
cpi->rc.high_source_sad) {
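The branch above collapses the superblock-level "uncertain" color state (2) into a binary decision before any further SAD work is done. A minimal standalone sketch of that mapping; the enum names are illustrative stand-ins for the encoder's SAD levels (kMedSad etc.), not the library's definitions:

    /* Resolve an uncertain color-sensitivity level (2) to 0 or 1 based on
     * measured block motion, mirroring the superblock-size branch above. */
    enum { SAD_ZERO, SAD_LOW, SAD_MED, SAD_HIGH }; /* stand-in levels */

    static int resolve_color_level(int level, int source_sad_level) {
      if (level != 2) return level;                /* already certain */
      return source_sad_level >= SAD_MED ? 1 : 0;  /* motion => keep color on */
    }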
@@ -1953,8 +1958,12 @@ static void set_color_sensitivity(AV1_COMP *cpi, MACROBLOCK *x,
const int num_planes = av1_num_planes(&cpi->common);
for (int plane = AOM_PLANE_U; plane < num_planes; ++plane) {
+    // Always check when the level is 2. If the level is 0, check again for
+    // motion areas at higher resolutions, where color artifacts are more
+    // noticeable.
if (x->color_sensitivity[COLOR_SENS_IDX(plane)] == 2 ||
- source_variance < 50) {
+ (x->color_sensitivity[COLOR_SENS_IDX(plane)] == 0 &&
+ source_sad_nonrd >= kMedSad && high_res)) {
struct macroblock_plane *const p = &x->plane[plane];
const BLOCK_SIZE bs =
get_plane_block_size(bsize, subsampling_x, subsampling_y);
@@ -2213,9 +2222,8 @@ static AOM_INLINE bool prune_compoundmode_with_singlemode_var(
// Function to setup parameters used for inter mode evaluation in non-rd.
static AOM_FORCE_INLINE void set_params_nonrd_pick_inter_mode(
AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state,
- RD_STATS *rd_cost, int *force_skip_low_temp_var, int *skip_pred_mv,
- int mi_row, int mi_col, int gf_temporal_ref, unsigned char segment_id,
- BLOCK_SIZE bsize
+ RD_STATS *rd_cost, int *force_skip_low_temp_var, int mi_row, int mi_col,
+ int gf_temporal_ref, unsigned char segment_id, BLOCK_SIZE bsize
#if CONFIG_AV1_TEMPORAL_DENOISING
,
PICK_MODE_CONTEXT *ctx, int denoise_svc_pickmode
@@ -2226,6 +2234,7 @@ static AOM_FORCE_INLINE void set_params_nonrd_pick_inter_mode(
TxfmSearchInfo *txfm_info = &x->txfm_search_info;
MB_MODE_INFO *const mi = xd->mi[0];
const ModeCosts *mode_costs = &x->mode_costs;
+ int skip_pred_mv = 0;
// Initialize variance and distortion (chroma) for all modes and reference
// frames
@@ -2272,20 +2281,21 @@ static AOM_FORCE_INLINE void set_params_nonrd_pick_inter_mode(
#endif
  // Populate predicted motion vectors for LAST_FRAME
- if (cpi->ref_frame_flags & AOM_LAST_FLAG)
+ if (cpi->ref_frame_flags & AOM_LAST_FLAG) {
find_predictors(cpi, x, LAST_FRAME, search_state->frame_mv,
search_state->yv12_mb, bsize, *force_skip_low_temp_var,
- x->force_zeromv_skip_for_blk);
-
+ x->force_zeromv_skip_for_blk,
+ &search_state->use_scaled_ref_frame[LAST_FRAME]);
+ }
  // Update mask to use all reference frames
get_ref_frame_use_mask(cpi, x, mi, mi_row, mi_col, bsize, gf_temporal_ref,
search_state->use_ref_frame_mask,
force_skip_low_temp_var);
- *skip_pred_mv = x->force_zeromv_skip_for_blk ||
- (x->nonrd_prune_ref_frame_search > 2 &&
- x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] != 2 &&
- x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] != 2);
+ skip_pred_mv = x->force_zeromv_skip_for_blk ||
+ (x->nonrd_prune_ref_frame_search > 2 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] != 2 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] != 2);
  // Populate predicted motion vectors for the other single reference frames
// Start at LAST_FRAME + 1.
@@ -2294,7 +2304,8 @@ static AOM_FORCE_INLINE void set_params_nonrd_pick_inter_mode(
if (search_state->use_ref_frame_mask[ref_frame_iter]) {
find_predictors(cpi, x, ref_frame_iter, search_state->frame_mv,
search_state->yv12_mb, bsize, *force_skip_low_temp_var,
- *skip_pred_mv);
+ skip_pred_mv,
+ &search_state->use_scaled_ref_frame[ref_frame_iter]);
}
}
}
@@ -2334,6 +2345,27 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd(
*ref_frame2 = NONE_FRAME;
}
+ if (x->sb_me_block && *ref_frame == LAST_FRAME) {
+    // Make sure the superblock MV gets tested: don't skip (return false)
+    // NEAREST_LAST or NEAR_LAST if they carry the sb MV. Also don't skip
+    // NEWMV_LAST: handle_inter_mode_nonrd() will set it to the sb MV in
+    // case NEAREST and NEAR don't have it.
+ if (*this_mode == NEARESTMV &&
+ search_state->frame_mv[NEARESTMV][LAST_FRAME].as_int ==
+ x->sb_me_mv.as_int) {
+ return false;
+ }
+ if (*this_mode == NEARMV &&
+ search_state->frame_mv[NEARMV][LAST_FRAME].as_int ==
+ x->sb_me_mv.as_int) {
+ return false;
+ }
+ if (*this_mode == NEWMV) {
+ return false;
+ }
+ }
+
// Skip the single reference mode for which mode check flag is set.
if (*is_single_pred && search_state->mode_checked[*this_mode][*ref_frame]) {
return true;
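The sb_me_block checks above guarantee that the superblock MV is evaluated at least once on LAST_FRAME. A compact standalone sketch of the same skip decision, with simplified stand-in types:

    /* Return 1 if a mode must not be skipped because it may carry the
     * superblock MV: NEAREST/NEAR only when they equal the sb MV, and
     * NEWMV always, since it will later be overwritten with the sb MV. */
    typedef struct { int row, col; } MvSketch;

    static int must_test_sb_mv(int is_newmv, int is_near_or_nearest,
                               MvSketch mode_mv, MvSketch sb_mv) {
      if (is_newmv) return 1;
      if (is_near_or_nearest)
        return mode_mv.row == sb_mv.row && mode_mv.col == sb_mv.col;
      return 0;
    }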
@@ -2397,13 +2429,12 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd(
get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)(*ref_frame))
return true;
- // For screen content: for base spatial layer only for now.
- if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
- cpi->svc.spatial_layer_id == 0) {
+ // For screen content: skip mode testing based on source_sad.
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
// If source_sad is computed: skip non-zero motion
// check for stationary (super)blocks. Otherwise if superblock
- // has motion skip the modes with zero motion for flat blocks,
- // and color is not set.
+    // has motion, skip the zero-motion modes on the last reference for
+    // flat blocks when color sensitivity is not set.
// For the latter condition: the same condition should apply
// to newmv if (0, 0), so this latter condition is repeated
// below after search_new_mv.
@@ -2411,9 +2442,9 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd(
if ((search_state->frame_mv[*this_mode][*ref_frame].as_int != 0 &&
x->content_state_sb.source_sad_nonrd == kZeroSad) ||
(search_state->frame_mv[*this_mode][*ref_frame].as_int == 0 &&
- x->content_state_sb.source_sad_nonrd != kZeroSad &&
- ((x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
- x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) ||
+ x->block_is_zero_sad == 0 && *ref_frame == LAST_FRAME &&
+ ((x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) ||
cpi->rc.high_source_sad) &&
x->source_variance == 0))
return true;
@@ -2479,10 +2510,11 @@ static AOM_FORCE_INLINE bool handle_inter_mode_nonrd(
#if CONFIG_AV1_TEMPORAL_DENOISING
int64_t *zero_last_cost_orig, int denoise_svc_pickmode,
#endif
- int idx, int force_mv_inter_layer, int is_single_pred, int skip_pred_mv,
- int gf_temporal_ref, int use_model_yrd_large, int filter_search_enabled_blk,
- BLOCK_SIZE bsize, PREDICTION_MODE this_mode, InterpFilter filt_select,
- int cb_pred_filter_search, int reuse_inter_pred) {
+ int idx, int force_mv_inter_layer, int is_single_pred, int gf_temporal_ref,
+ int use_model_yrd_large, int filter_search_enabled_blk, BLOCK_SIZE bsize,
+ PREDICTION_MODE this_mode, InterpFilter filt_select,
+ int cb_pred_filter_search, int reuse_inter_pred,
+ int *sb_me_has_been_tested) {
AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mi = xd->mi[0];
@@ -2511,7 +2543,10 @@ static AOM_FORCE_INLINE bool handle_inter_mode_nonrd(
RD_STATS nonskip_rdc;
av1_invalid_rd_stats(&nonskip_rdc);
- if (this_mode == NEWMV && !force_mv_inter_layer) {
+ if (x->sb_me_block && this_mode == NEWMV && ref_frame == LAST_FRAME) {
+ // Set the NEWMV_LAST to the sb MV.
+ search_state->frame_mv[NEWMV][LAST_FRAME].as_int = x->sb_me_mv.as_int;
+ } else if (this_mode == NEWMV && !force_mv_inter_layer) {
#if COLLECT_NONRD_PICK_MODE_STAT
aom_usec_timer_start(&x->ms_stat_nonrd.timer2);
#endif
@@ -2552,13 +2587,13 @@ static AOM_FORCE_INLINE bool handle_inter_mode_nonrd(
if (skip_this_mv && is_single_pred) return true;
// For screen: for spatially flat blocks with non-zero motion,
- // skip newmv if the motion vector is (0, 0), and color is not set.
+  // skip newmv if the motion vector is (0, 0) on LAST and color is not set.
if (this_mode == NEWMV && cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
cpi->svc.spatial_layer_id == 0 && rt_sf->source_metrics_sb_nonrd) {
- if (this_mv->as_int == 0 &&
- x->content_state_sb.source_sad_nonrd != kZeroSad &&
- ((x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
- x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) ||
+ if (this_mv->as_int == 0 && ref_frame == LAST_FRAME &&
+ x->block_is_zero_sad == 0 &&
+ ((x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) ||
cpi->rc.high_source_sad) &&
x->source_variance == 0)
return true;
@@ -2581,32 +2616,6 @@ static AOM_FORCE_INLINE bool handle_inter_mode_nonrd(
}
}
- if (idx == 0 && !skip_pred_mv) {
- // Set color sensitivity on first tested mode only.
- // Use y-sad already computed in find_predictors: take the sad with motion
- // vector closest to 0; the uv-sad computed below in set_color_sensitivity
- // is for zeromv.
- // For screen: first check if golden reference is being used, if so,
- // force color_sensitivity on if the color sensitivity for sb_g is on.
- if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
- search_state->use_ref_frame_mask[GOLDEN_FRAME]) {
- if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1)
- x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] = 1;
- if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
- x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] = 1;
- } else {
- int y_sad = x->pred_mv0_sad[LAST_FRAME];
- if (x->pred_mv1_sad[LAST_FRAME] != INT_MAX &&
- (abs(search_state->frame_mv[NEARMV][LAST_FRAME].as_mv.col) +
- abs(search_state->frame_mv[NEARMV][LAST_FRAME].as_mv.row)) <
- (abs(search_state->frame_mv[NEARESTMV][LAST_FRAME].as_mv.col) +
- abs(search_state->frame_mv[NEARESTMV][LAST_FRAME].as_mv.row)))
- y_sad = x->pred_mv1_sad[LAST_FRAME];
- set_color_sensitivity(cpi, x, bsize, y_sad, x->source_variance,
- search_state->yv12_mb[LAST_FRAME]);
- }
- }
-
mi->motion_mode = SIMPLE_TRANSLATION;
#if !CONFIG_REALTIME_ONLY
if (cpi->oxcf.motion_mode_cfg.allow_warped_motion) {
@@ -2786,6 +2795,8 @@ static AOM_FORCE_INLINE bool handle_inter_mode_nonrd(
// Compute sse for chroma planes.
const int64_t sse_uv = av1_model_rd_for_sb_uv(
cpi, uv_bsize, x, xd, &rdc_uv, AOM_PLANE_U, AOM_PLANE_V);
+ if (rdc_uv.dist < x->min_dist_inter_uv)
+ x->min_dist_inter_uv = rdc_uv.dist;
search_state->this_rdc.sse += sse_uv;
// Restore Y rdc if UV rdc disallows txfm skip
if (search_state->this_rdc.skip_txfm && !rdc_uv.skip_txfm &&
@@ -2875,6 +2886,11 @@ static AOM_FORCE_INLINE bool handle_inter_mode_nonrd(
aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer1);
#endif
+ if (x->sb_me_block && ref_frame == LAST_FRAME &&
+ search_state->frame_mv[this_best_mode][ref_frame].as_int ==
+ x->sb_me_mv.as_int)
+ *sb_me_has_been_tested = 1;
+
// Copy best mode params to search state
if (search_state->this_rdc.rdcost < search_state->best_rdc.rdcost) {
search_state->best_rdc = search_state->this_rdc;
@@ -2900,7 +2916,7 @@ static AOM_FORCE_INLINE bool handle_inter_mode_nonrd(
if (*best_early_term && (idx > 0 || rt_sf->nonrd_aggressive_skip)) {
txfm_info->skip_txfm = 1;
- return false;
+ if (!x->sb_me_block || *sb_me_has_been_tested) return false;
}
return true;
}
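With this change, early termination no longer exits the mode loop until the superblock MV has actually been tested. A standalone sketch of the deferred-stop decision (names are illustrative; the idx/aggressive-skip condition is omitted for brevity):

    /* Stop the mode loop on early termination only once the superblock MV
     * has been tested, or when the sb-ME feature is off for this block. */
    static int should_stop_mode_loop(int best_early_term, int sb_me_block,
                                     int sb_me_has_been_tested) {
      if (!best_early_term) return 0;
      return !sb_me_block || sb_me_has_been_tested;
    }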
@@ -2960,6 +2976,8 @@ static AOM_FORCE_INLINE void handle_screen_content_mode_nonrd(
}
av1_model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, AOM_PLANE_U,
AOM_PLANE_V);
+ if (rdc_uv.dist < x->min_dist_inter_uv)
+ x->min_dist_inter_uv = rdc_uv.dist;
idtx_rdc.rate += rdc_uv.rate;
idtx_rdc.dist += rdc_uv.dist;
idtx_rdc.skip_txfm = idtx_rdc.skip_txfm && rdc_uv.skip_txfm;
@@ -3071,7 +3089,6 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
int best_early_term = 0;
int force_skip_low_temp_var = 0;
unsigned int sse_zeromv_norm = UINT_MAX;
- int skip_pred_mv = 0;
const int num_inter_modes = NUM_INTER_MODES;
const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
bool check_globalmv = rt_sf->check_globalmv_on_single_ref;
@@ -3082,6 +3099,7 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
rt_sf->reuse_inter_pred_nonrd && cm->seq_params->bit_depth == AOM_BITS_8;
InterModeSearchStateNonrd search_state;
av1_zero(search_state.use_ref_frame_mask);
+ av1_zero(search_state.use_scaled_ref_frame);
BEST_PICKMODE *const best_pickmode = &search_state.best_pickmode;
(void)tile_data;
@@ -3111,7 +3129,9 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
const int resize_pending = is_frame_resize_pending(cpi);
#endif
const ModeCosts *mode_costs = &x->mode_costs;
-
+ struct scale_factors sf_no_scale;
+ av1_setup_scale_factors_for_frame(&sf_no_scale, cm->width, cm->height,
+ cm->width, cm->height);
if (reuse_inter_pred) {
for (int buf_idx = 0; buf_idx < 3; buf_idx++) {
tmp_buffer[buf_idx].data = &pred_buf[pixels_in_block * buf_idx];
@@ -3130,7 +3150,9 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
// to source, so use subpel motion vector to compensate. The nonzero motion
// is half pixel shifted to left and top, so (-4, -4). This has more effect
// on higher resolutions, so condition it on that for now.
+ // Exclude quality layers, which have the same resolution and hence no shift.
if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
+ !svc->has_lower_quality_layer &&
svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8 &&
cm->width * cm->height > 640 * 480) {
svc_mv.as_mv.row = -4;
@@ -3138,12 +3160,12 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
}
// Setup parameters used for inter mode evaluation.
- set_params_nonrd_pick_inter_mode(
- cpi, x, &search_state, rd_cost, &force_skip_low_temp_var, &skip_pred_mv,
- mi_row, mi_col, gf_temporal_ref, segment_id, bsize
+ set_params_nonrd_pick_inter_mode(cpi, x, &search_state, rd_cost,
+ &force_skip_low_temp_var, mi_row, mi_col,
+ gf_temporal_ref, segment_id, bsize
#if CONFIG_AV1_TEMPORAL_DENOISING
- ,
- ctx, denoise_svc_pickmode
+ ,
+ ctx, denoise_svc_pickmode
#endif
);
@@ -3207,6 +3229,28 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
inter_pred_params_sr.conv_params =
get_conv_params(/*do_average=*/0, AOM_PLANE_Y, xd->bd);
+ x->block_is_zero_sad = x->content_state_sb.source_sad_nonrd == kZeroSad;
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ !x->force_zeromv_skip_for_blk &&
+ x->content_state_sb.source_sad_nonrd != kZeroSad &&
+ x->source_variance == 0 && bsize < cm->seq_params->sb_size &&
+ search_state.yv12_mb[LAST_FRAME][0].width == cm->width &&
+ search_state.yv12_mb[LAST_FRAME][0].height == cm->height) {
+ set_block_source_sad(cpi, x, bsize, &search_state.yv12_mb[LAST_FRAME][0]);
+ }
+
+ int sb_me_has_been_tested = 0;
+ x->sb_me_block = x->sb_me_partition;
+  // Only use this feature (forced testing of the superblock MV) if the
+  // coding block size is large.
+ if (x->sb_me_block) {
+ if (cm->seq_params->sb_size == BLOCK_128X128 && bsize < BLOCK_64X64)
+ x->sb_me_block = 0;
+ else if (cm->seq_params->sb_size == BLOCK_64X64 && bsize < BLOCK_32X32)
+ x->sb_me_block = 0;
+ }
+
+ x->min_dist_inter_uv = INT64_MAX;
for (int idx = 0; idx < num_inter_modes + tot_num_comp_modes; ++idx) {
// If we are at the first compound mode, and the single modes already
// perform well, then end the search.
@@ -3218,6 +3262,36 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
int is_single_pred = 1;
PREDICTION_MODE this_mode;
+ if (idx == 0 && !x->force_zeromv_skip_for_blk) {
+      // Set color sensitivity on the first tested mode only.
+      // Use the y-sad already computed in find_predictors: take the SAD for
+      // the motion vector closest to 0; the uv-sad computed below in
+      // set_color_sensitivity is for zeromv.
+      // For screen: first check if the golden reference is being used; if
+      // so, force color_sensitivity on (=1) when the sensitivity for sb_g
+      // is 1. set_color_sensitivity() then runs and may still set the flag
+      // if the level is 2 or 0.
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ search_state.use_ref_frame_mask[GOLDEN_FRAME]) {
+ if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1)
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] = 1;
+ if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] = 1;
+ }
+ if (search_state.use_ref_frame_mask[LAST_FRAME] &&
+ x->pred_mv0_sad[LAST_FRAME] != INT_MAX) {
+ int y_sad = x->pred_mv0_sad[LAST_FRAME];
+ if (x->pred_mv1_sad[LAST_FRAME] != INT_MAX &&
+ (abs(search_state.frame_mv[NEARMV][LAST_FRAME].as_mv.col) +
+ abs(search_state.frame_mv[NEARMV][LAST_FRAME].as_mv.row)) <
+ (abs(search_state.frame_mv[NEARESTMV][LAST_FRAME].as_mv.col) +
+ abs(search_state.frame_mv[NEARESTMV][LAST_FRAME].as_mv.row)))
+ y_sad = x->pred_mv1_sad[LAST_FRAME];
+ set_color_sensitivity(cpi, x, bsize, y_sad, x->source_variance,
+ search_state.yv12_mb[LAST_FRAME]);
+ }
+ }
+
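The y_sad choice above prefers the predicted MV closest to zero, so it stays comparable to the chroma zero-MV SAD computed later in set_color_sensitivity(). A standalone sketch of the selection, with INT_MAX marking an unavailable NEAR SAD and a simplified MV type:

    #include <limits.h>
    #include <stdlib.h>

    typedef struct { int row, col; } MvSketch;

    /* Pick the luma SAD whose predicted MV has the smaller L1 magnitude. */
    static int pick_y_sad(int sad_nearest, int sad_near, MvSketch mv_nearest,
                          MvSketch mv_near) {
      if (sad_near != INT_MAX &&
          abs(mv_near.row) + abs(mv_near.col) <
              abs(mv_nearest.row) + abs(mv_nearest.col))
        return sad_near;
      return sad_nearest;
    }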
    // Check whether the inter mode can be skipped based on mode statistics
    // and speed feature settings.
if (skip_inter_mode_nonrd(cpi, x, &search_state, &thresh_sad_pred,
@@ -3239,6 +3313,16 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
mi->ref_frame[1] = ref_frame2;
set_ref_ptrs(cm, xd, ref_frame, ref_frame2);
+    // Check if the scaled reference frame should be used; this is set in
+    // find_predictors() for each usable reference. If so, point
+    // block_ref_scale_factors[] at the no-scaling factors.
+ if (search_state.use_scaled_ref_frame[ref_frame]) {
+ xd->block_ref_scale_factors[0] = &sf_no_scale;
+ }
+ if (!is_single_pred && search_state.use_scaled_ref_frame[ref_frame2]) {
+ xd->block_ref_scale_factors[1] = &sf_no_scale;
+ }
+
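Pointing block_ref_scale_factors[] at sf_no_scale works because the library treats a reference as scaled only when its fixed-point factors differ from the 1:1 value. A simplified standalone model of that test; the shift value is an assumption based on the library's REF_SCALE_SHIFT convention, not a verbatim copy:

    /* Model of the "is this reference scaled?" check. */
    #define REF_NO_SCALE_FP (1 << 14) /* assumed 1:1 fixed-point scale */

    struct sf_sketch { int x_scale_fp, y_scale_fp; };

    static int is_scaled_sketch(const struct sf_sketch *sf) {
      return sf->x_scale_fp != REF_NO_SCALE_FP ||
             sf->y_scale_fp != REF_NO_SCALE_FP;
    }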
// Perform inter mode evaluation for non-rd
if (!handle_inter_mode_nonrd(
cpi, x, &search_state, ctx, &this_mode_pred, tmp_buffer,
@@ -3247,10 +3331,10 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
#if CONFIG_AV1_TEMPORAL_DENOISING
&zero_last_cost_orig, denoise_svc_pickmode,
#endif
- idx, force_mv_inter_layer, is_single_pred, skip_pred_mv,
- gf_temporal_ref, use_model_yrd_large, filter_search_enabled_blk,
- bsize, this_mode, filt_select, cb_pred_filter_search,
- reuse_inter_pred)) {
+ idx, force_mv_inter_layer, is_single_pred, gf_temporal_ref,
+ use_model_yrd_large, filter_search_enabled_blk, bsize, this_mode,
+ filt_select, cb_pred_filter_search, reuse_inter_pred,
+ &sb_me_has_been_tested)) {
break;
}
}
@@ -3292,8 +3376,8 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
x->content_state_sb.source_sad_nonrd != kZeroSad &&
bsize <= BLOCK_16X16) {
- unsigned int thresh_sse = cpi->rc.high_source_sad ? 15000 : 250000;
- unsigned int thresh_source_var = cpi->rc.high_source_sad ? 50 : 1000;
+ unsigned int thresh_sse = cpi->rc.high_source_sad ? 15000 : 200000;
+ unsigned int thresh_source_var = cpi->rc.high_source_sad ? 50 : 200;
unsigned int best_sse_inter_motion =
(unsigned int)(search_state.best_rdc.sse >>
(b_width_log2_lookup[bsize] +
@@ -3324,7 +3408,7 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
try_palette &&
(is_mode_intra(best_pickmode->best_mode) || force_palette_test) &&
x->source_variance > 0 && !x->force_zeromv_skip_for_blk &&
- (cpi->rc.high_source_sad || x->source_variance > 500);
+ (cpi->rc.high_source_sad || x->source_variance > 300);
if (rt_sf->prune_palette_nonrd && bsize > BLOCK_16X16) try_palette = 0;
@@ -3360,6 +3444,14 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
if (!is_inter_block(mi)) {
mi->interp_filters = av1_broadcast_interp_filter(SWITCHABLE_FILTERS);
+ } else {
+    // If an inter mode is selected and its ref_frame uses the scaled
+    // reference frame, then reuse_inter_pred cannot be used.
+ if (search_state.use_scaled_ref_frame[best_pickmode->best_ref_frame] ||
+ (has_second_ref(mi) &&
+ search_state
+ .use_scaled_ref_frame[best_pickmode->best_second_ref_frame]))
+ x->reuse_inter_pred = 0;
}
// Restore the predicted samples of best mode to final buffer
@@ -3425,4 +3517,9 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
#endif // COLLECT_NONRD_PICK_MODE_STAT
*rd_cost = search_state.best_rdc;
+
+  // Reset xd->block_ref_scale_factors[i], as they may have been set to
+  // point at the local sf_no_scale, which becomes invalid after this
+  // function returns.
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
}
diff --git a/av1/encoder/palette.c b/av1/encoder/palette.c
index b1a73e465..7f79e9596 100644
--- a/av1/encoder/palette.c
+++ b/av1/encoder/palette.c
@@ -564,7 +564,24 @@ void av1_rd_pick_palette_intra_sby(
}
uint8_t *const color_map = xd->plane[0].color_index_map;
- if (colors_threshold > 1 && colors_threshold <= 64) {
+ int color_thresh_palette = 64;
+  // Allow a larger color threshold for the palette search, based on color
+  // sensitivity, scene change, and block source variance.
+  // Since the palette is Y based, only allow the larger threshold if the
+  // block's chroma distortion (color_dist) is below a threshold.
+ if (cpi->sf.rt_sf.use_nonrd_pick_mode &&
+ cpi->sf.rt_sf.increase_color_thresh_palette && cpi->rc.high_source_sad &&
+ x->source_variance > 50) {
+ int64_t norm_color_dist = 0;
+ if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
+ norm_color_dist = x->min_dist_inter_uv >>
+ (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+ if (x->color_sensitivity[0] && x->color_sensitivity[1])
+ norm_color_dist = norm_color_dist >> 1;
+ }
+ if (norm_color_dist < 8000) color_thresh_palette += 20;
+ }
+ if (colors_threshold > 1 && colors_threshold <= color_thresh_palette) {
int16_t *const data = x->palette_buffer->kmeans_data_buf;
int16_t centroids[PALETTE_MAX_SIZE];
int lower_bound, upper_bound;
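The min_dist_inter_uv values recorded earlier in nonrd_pickmode feed this palette gate: the best inter chroma distortion is normalized to a per-4x4 value and, when small, the palette color threshold is relaxed. A standalone sketch of the computation, mirroring the shifts and the 8000/20 constants in the hunk above:

    #include <stdint.h>

    /* Return the colors_threshold limit for the palette search. */
    static int palette_color_thresh(int64_t min_dist_inter_uv, int log2_w4,
                                    int log2_h4, int sens_u, int sens_v) {
      int thresh = 64;
      int64_t norm_color_dist = 0; /* stays 0 when color is not flagged */
      if (sens_u || sens_v) {
        norm_color_dist = min_dist_inter_uv >> (log2_w4 + log2_h4);
        if (sens_u && sens_v) norm_color_dist >>= 1;
      }
      if (norm_color_dist < 8000) thresh += 20;
      return thresh;
    }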
diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c
index 96567dd48..1e3d9804c 100644
--- a/av1/encoder/partition_search.c
+++ b/av1/encoder/partition_search.c
@@ -40,7 +40,6 @@
#endif
#define COLLECT_MOTION_SEARCH_FEATURE_SB 0
-#define ML_PARTITION_WHOLE_TREE_DECISION 0
void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf) {
part_sf->partition_search_type = SEARCH_PARTITION;
@@ -73,6 +72,7 @@ void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf) {
part_sf->intra_cnn_based_part_prune_level = 0;
part_sf->ext_partition_eval_thresh = BLOCK_8X8;
part_sf->rect_partition_eval_thresh = BLOCK_128X128;
+ part_sf->ext_part_eval_based_on_cur_best = 0;
part_sf->prune_ext_part_using_split_info = 0;
part_sf->prune_rectangular_split_based_on_qidx = 0;
part_sf->early_term_after_none_split = 0;
@@ -1772,6 +1772,9 @@ void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
if (pc_tree->none == NULL) {
pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
}
PICK_MODE_CONTEXT *ctx_none = pc_tree->none;
@@ -1832,6 +1835,9 @@ void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
pc_tree->split[i]->index = i;
}
switch (partition) {
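This hunk starts a pattern repeated through the rest of partition_search.c: every av1_alloc_pmc()/av1_alloc_pc_tree_node() result is now checked, and failure reports AOM_CODEC_MEM_ERROR through aom_internal_error(), which longjmps to the encoder's error handler. A simplified standalone model of the pattern, exiting where the real code longjmps:

    #include <stdio.h>
    #include <stdlib.h>

    /* Allocate zero-initialized memory or fail loudly, mirroring the
     * checked-allocation pattern added in this file. */
    static void *checked_alloc(size_t size, const char *what) {
      void *p = calloc(1, size);
      if (!p) {
        fprintf(stderr, "Failed to allocate %s\n", what);
        exit(EXIT_FAILURE);
      }
      return p;
    }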
@@ -1848,6 +1854,9 @@ void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
pc_tree->horizontal[i] =
av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->horizontal[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
}
pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
PARTITION_HORZ, subsize, pc_tree->horizontal[0],
@@ -1881,6 +1890,9 @@ void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
pc_tree->vertical[i] =
av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->vertical[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
}
pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
PARTITION_VERT, subsize, pc_tree->vertical[0], invalid_rdc);
@@ -1981,6 +1993,9 @@ void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
if (pc_tree->split[i]->none == NULL)
pc_tree->split[i]->none =
av1_alloc_pmc(cpi, split_subsize, &td->shared_coeff_buf);
+ if (!pc_tree->split[i]->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
PARTITION_SPLIT, split_subsize, pc_tree->split[i]->none,
invalid_rdc);
@@ -2264,10 +2279,13 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data,
x->force_zeromv_skip_for_blk =
get_force_zeromv_skip_flag_for_blk(cpi, x, bsize);
- if (!x->force_zeromv_skip_for_blk) {
+  // Source variance may already be computed at the superblock level, so
+  // there is no need to recompute it unless bsize < sb_size or
+  // source_variance is not yet set.
+ if (!x->force_zeromv_skip_for_blk &&
+ (x->source_variance == UINT_MAX || bsize < cm->seq_params->sb_size))
x->source_variance = av1_get_perpixel_variance_facade(
cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y);
- }
+
// Save rdmult before it might be changed, so it can be restored later.
const int orig_rdmult = x->rdmult;
setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi);
@@ -2396,6 +2414,9 @@ static int try_split_partition(AV1_COMP *const cpi, ThreadData *const td,
av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
if (!pc_tree->none) {
pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
} else {
av1_reset_pmc(pc_tree->none);
}
@@ -2416,6 +2437,9 @@ static int try_split_partition(AV1_COMP *const cpi, ThreadData *const td,
for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
if (!pc_tree->split[i]) {
pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
}
pc_tree->split[i]->index = i;
}
@@ -2434,6 +2458,9 @@ static int try_split_partition(AV1_COMP *const cpi, ThreadData *const td,
if (!pc_tree->split[i]->none) {
pc_tree->split[i]->none =
av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->split[i]->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
} else {
av1_reset_pmc(pc_tree->split[i]->none);
}
@@ -2568,6 +2595,9 @@ static void try_merge(AV1_COMP *const cpi, ThreadData *td,
pc_tree->partitioning = PARTITION_NONE;
if (!pc_tree->none) {
pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
} else {
av1_reset_pmc(pc_tree->none);
}
@@ -2599,6 +2629,9 @@ static void try_merge(AV1_COMP *const cpi, ThreadData *td,
if (!pc_tree->split[i]->none) {
pc_tree->split[i]->none =
av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->split[i]->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
} else {
av1_reset_pmc(pc_tree->split[i]->none);
}
@@ -2659,6 +2692,9 @@ static void try_merge(AV1_COMP *const cpi, ThreadData *td,
if (!pc_tree->split[i]->none) {
pc_tree->split[i]->none =
av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->split[i]->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
}
encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 0,
subsize, PARTITION_NONE, pc_tree->split[i]->none, NULL);
@@ -2762,6 +2798,7 @@ static void direct_partition_merging(AV1_COMP *cpi, ThreadData *td,
struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
int force_skip_low_temp_var = 0;
int skip_pred_mv = 0;
+ bool use_scaled_ref;
for (int i = 0; i < MB_MODE_COUNT; ++i) {
for (int j = 0; j < REF_FRAMES; ++j) {
@@ -2774,7 +2811,7 @@ static void direct_partition_merging(AV1_COMP *cpi, ThreadData *td,
x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] != 2);
find_predictors(cpi, x, ref_frame, frame_mv, yv12_mb, bsize,
- force_skip_low_temp_var, skip_pred_mv);
+ force_skip_low_temp_var, skip_pred_mv, &use_scaled_ref);
int continue_merging = 1;
if (frame_mv[NEARESTMV][ref_frame].as_mv.row != b0[0]->mv[0].as_mv.row ||
@@ -2790,7 +2827,7 @@ static void direct_partition_merging(AV1_COMP *cpi, ThreadData *td,
av1_set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row,
mi_col, this_mi[0]->bsize);
find_predictors(cpi, x, ref_frame, frame_mv, yv12_mb, this_mi[0]->bsize,
- force_skip_low_temp_var, skip_pred_mv);
+ force_skip_low_temp_var, skip_pred_mv, &use_scaled_ref);
} else {
struct scale_factors *sf = get_ref_scale_factors(cm, ref_frame);
const int is_scaled = av1_is_scaled(sf);
@@ -2945,6 +2982,9 @@ void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
case PARTITION_NONE:
if (!pc_tree->none) {
pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
} else {
av1_reset_pmc(pc_tree->none);
}
@@ -2958,6 +2998,9 @@ void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
if (!pc_tree->vertical[i]) {
pc_tree->vertical[i] =
av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->vertical[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
} else {
av1_reset_pmc(pc_tree->vertical[i]);
}
@@ -2978,6 +3021,9 @@ void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
if (!pc_tree->horizontal[i]) {
pc_tree->horizontal[i] =
av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->horizontal[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
} else {
av1_reset_pmc(pc_tree->horizontal[i]);
}
@@ -2998,6 +3044,9 @@ void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
if (!pc_tree->split[i]) {
pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
}
pc_tree->split[i]->index = i;
}
@@ -3494,6 +3543,9 @@ static void rectangular_partition_search(
if (cur_ctx[i][j][0] == NULL) {
cur_ctx[i][j][0] =
av1_alloc_pmc(cpi, blk_params.subsize, &td->shared_coeff_buf);
+ if (!cur_ctx[i][j][0])
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
}
}
sum_rdc->rate = part_search_state->partition_cost[partition_type];
@@ -3573,7 +3625,7 @@ static void rd_pick_ab_part(
PartitionBlkParams blk_params = part_search_state->part_blk_params;
const int mi_row = blk_params.mi_row;
const int mi_col = blk_params.mi_col;
- const int bsize = blk_params.bsize;
+ const BLOCK_SIZE bsize = blk_params.bsize;
int64_t this_rdcost = 0;
#if CONFIG_COLLECT_PARTITION_STATS
@@ -3683,7 +3735,7 @@ static void ab_partitions_search(
PartitionBlkParams blk_params = part_search_state->part_blk_params;
const int mi_row = blk_params.mi_row;
const int mi_col = blk_params.mi_col;
- const int bsize = blk_params.bsize;
+ const BLOCK_SIZE bsize = blk_params.bsize;
if (part_search_state->terminate_partition_search) {
return;
@@ -3759,6 +3811,9 @@ static void ab_partitions_search(
// Set AB partition context.
cur_part_ctxs[ab_part_type][i] = av1_alloc_pmc(
cpi, ab_subsize[ab_part_type][i], &td->shared_coeff_buf);
+ if (!cur_part_ctxs[ab_part_type][i])
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
// Set mode as not ready.
cur_part_ctxs[ab_part_type][i]->rd_mode_is_ready = 0;
}
@@ -3818,8 +3873,12 @@ static void set_4_part_ctx_and_rdcost(
part_search_state->partition_cost[partition_type];
part_search_state->sum_rdc.rdcost =
RDCOST(x->rdmult, part_search_state->sum_rdc.rate, 0);
- for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; ++i)
+ for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; ++i) {
cur_part_ctx[i] = av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!cur_part_ctx[i])
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ }
}
// Partition search of HORZ4 / VERT4 partition types.
@@ -3882,6 +3941,50 @@ static void rd_pick_4partition(
blk_params.bsize, av1_num_planes(cm));
}
+// Do not evaluate extended partitions if NONE partition is skippable.
+static INLINE int prune_ext_part_none_skippable(
+ PICK_MODE_CONTEXT *part_none, int must_find_valid_partition,
+ int skip_non_sq_part_based_on_none, BLOCK_SIZE bsize) {
+ if ((skip_non_sq_part_based_on_none >= 1) && (part_none != NULL)) {
+ if (part_none->skippable && !must_find_valid_partition &&
+ bsize >= BLOCK_16X16) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+// Decide whether the AB partition search is allowed.
+static int allow_ab_partition_search(PartitionSearchState *part_search_state,
+ PARTITION_SPEED_FEATURES *part_sf,
+ PARTITION_TYPE curr_best_part,
+ int must_find_valid_partition,
+ int prune_ext_part_state,
+ int64_t best_rdcost) {
+ const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+
+ // Do not prune if there is no valid partition
+ if (best_rdcost == INT64_MAX) return 1;
+
+ // Determine bsize threshold to evaluate ab partitions
+ BLOCK_SIZE ab_bsize_thresh = part_sf->ext_partition_eval_thresh;
+ if (part_sf->ext_part_eval_based_on_cur_best && !must_find_valid_partition &&
+ !(curr_best_part == PARTITION_HORZ || curr_best_part == PARTITION_VERT))
+ ab_bsize_thresh = BLOCK_128X128;
+
+ // ab partitions are only allowed for square block sizes BLOCK_16X16 or
+ // higher, so ab_bsize_thresh must be large enough to exclude BLOCK_4X4 and
+ // BLOCK_8X8.
+ assert(ab_bsize_thresh >= BLOCK_8X8);
+
+ int ab_partition_allowed =
+ part_search_state->do_rectangular_split && bsize > ab_bsize_thresh &&
+ av1_blk_has_rows_and_cols(&blk_params) && !prune_ext_part_state;
+
+ return ab_partition_allowed;
+}
+
// Prune 4-way partitions based on the number of horz/vert wins
// in the current block and sub-blocks in PARTITION_SPLIT.
static void prune_4_partition_using_split_info(
@@ -3915,9 +4018,28 @@ static void prune_4_partition_using_split_info(
static void prune_4_way_partition_search(
AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree,
PartitionSearchState *part_search_state, RD_STATS *best_rdc,
- int pb_source_variance, int ext_partition_allowed,
+ int pb_source_variance, int prune_ext_part_state,
int part4_search_allowed[NUM_PART4_TYPES]) {
- PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+
+ // Do not prune if there is no valid partition
+ if (best_rdc->rdcost == INT64_MAX) return;
+
+ // Determine bsize threshold to evaluate 4-way partitions
+ BLOCK_SIZE part4_bsize_thresh = cpi->sf.part_sf.ext_partition_eval_thresh;
+ if (cpi->sf.part_sf.ext_part_eval_based_on_cur_best &&
+ !x->must_find_valid_partition && pc_tree->partitioning == PARTITION_NONE)
+ part4_bsize_thresh = BLOCK_128X128;
+
+ // 4-way partitions are only allowed for BLOCK_16X16, BLOCK_32X32, and
+ // BLOCK_64X64, so part4_bsize_thresh must be large enough to exclude
+ // BLOCK_4X4 and BLOCK_8X8.
+ assert(part4_bsize_thresh >= BLOCK_8X8);
+
+ bool partition4_allowed =
+ part_search_state->do_rectangular_split && bsize > part4_bsize_thresh &&
+ av1_blk_has_rows_and_cols(&blk_params) && !prune_ext_part_state;
// Disable 4-way partition search flags for width less than a multiple of the
// minimum partition width.
@@ -3928,17 +4050,15 @@ static void prune_4_way_partition_search(
return;
}
- const int bsize = blk_params.bsize;
PARTITION_TYPE cur_part[NUM_PART4_TYPES] = { PARTITION_HORZ_4,
PARTITION_VERT_4 };
const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
// partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or
// PARTITION_VERT_4 for this block. This is almost the same as
- // ext_partition_allowed, except that we don't allow 128x32 or 32x128
+  // the gate computed above, except that we don't allow 128x32 or 32x128
// blocks, so we require that bsize is not BLOCK_128X128.
- const int partition4_allowed = part_cfg->enable_1to4_partitions &&
- ext_partition_allowed &&
- bsize != BLOCK_128X128;
+ partition4_allowed &=
+ part_cfg->enable_1to4_partitions && bsize != BLOCK_128X128;
for (PART4_TYPES i = HORZ4; i < NUM_PART4_TYPES; i++) {
part4_search_allowed[i] =
@@ -3988,6 +4108,9 @@ static void set_none_partition_params(const AV1_COMP *const cpi, ThreadData *td,
// Set PARTITION_NONE context.
if (pc_tree->none == NULL)
pc_tree->none = av1_alloc_pmc(cpi, blk_params.bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
// Set PARTITION_NONE type cost.
if (part_search_state->partition_none_allowed) {
@@ -4215,7 +4338,7 @@ static void split_partition_search(
const CommonModeInfoParams *const mi_params = &cm->mi_params;
const int mi_row = blk_params.mi_row;
const int mi_col = blk_params.mi_col;
- const int bsize = blk_params.bsize;
+ const BLOCK_SIZE bsize = blk_params.bsize;
assert(bsize < BLOCK_SIZES_ALL);
RD_STATS sum_rdc = part_search_state->sum_rdc;
const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
@@ -4228,6 +4351,9 @@ static void split_partition_search(
for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
if (pc_tree->split[i] == NULL)
pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
pc_tree->split[i]->index = i;
}
@@ -4400,6 +4526,7 @@ static void write_partition_tree(AV1_COMP *const cpi,
fclose(pfile);
}
+#if CONFIG_PARTITION_SEARCH_ORDER
static void verify_write_partition_tree(const AV1_COMP *const cpi,
const PC_TREE *const pc_tree,
const BLOCK_SIZE bsize,
@@ -4463,6 +4590,7 @@ static void verify_write_partition_tree(const AV1_COMP *const cpi,
}
static int read_partition_tree(AV1_COMP *const cpi, PC_TREE *const pc_tree,
+ struct aom_internal_error_info *error_info,
const int config_id) {
const AV1_COMMON *const cm = &cpi->common;
const char *path = cpi->oxcf.partition_info_path;
@@ -4502,6 +4630,9 @@ static int read_partition_tree(AV1_COMP *const cpi, PC_TREE *const pc_tree,
for (int i = 0; i < 4; ++i) {
if (node != NULL) { // Suppress warning
node->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!node->split[i])
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
node->split[i]->index = i;
tree_node_queue[last_idx] = node->split[i];
++last_idx;
@@ -4665,7 +4796,8 @@ static void update_partition_stats(const RD_STATS *const this_rdcost,
static void build_pc_tree_from_part_decision(
const aom_partition_decision_t *partition_decision,
- const BLOCK_SIZE this_bsize, PC_TREE *pc_tree) {
+ const BLOCK_SIZE this_bsize, PC_TREE *pc_tree,
+ struct aom_internal_error_info *error_info) {
BLOCK_SIZE bsize = this_bsize;
int num_nodes = partition_decision->num_nodes;
PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
@@ -4686,6 +4818,9 @@ static void build_pc_tree_from_part_decision(
for (int i = 0; i < 4; ++i) {
if (node != NULL) { // Suppress warning
node->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!node->split[i])
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
node->split[i]->index = i;
tree_node_queue[last_idx] = node->split[i];
++last_idx;
@@ -4707,6 +4842,7 @@ static bool ml_partition_search_whole_tree(AV1_COMP *const cpi, ThreadData *td,
AV1_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &td->mb;
ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+ struct aom_internal_error_info *error_info = x->e_mbd.error_info;
aom_partition_features_t features;
prepare_sb_features_before_search(cpi, td, tile_data, mi_row, mi_col, bsize,
&features);
@@ -4716,7 +4852,6 @@ static bool ml_partition_search_whole_tree(AV1_COMP *const cpi, ThreadData *td,
features.frame_height = cpi->frame_info.frame_height;
features.block_size = bsize;
av1_ext_part_send_features(ext_part_controller, &features);
- PC_TREE *pc_tree;
// rd mode search (dry run) for a valid partition decision from the ml model.
aom_partition_decision_t partition_decision;
@@ -4728,26 +4863,32 @@ static bool ml_partition_search_whole_tree(AV1_COMP *const cpi, ThreadData *td,
    // First, take the easy approach: require the ML model to provide
    // partition decisions for the whole superblock.
- pc_tree = av1_alloc_pc_tree_node(bsize);
- build_pc_tree_from_part_decision(&partition_decision, bsize, pc_tree);
+ td->pc_root = av1_alloc_pc_tree_node(bsize);
+ if (!td->pc_root)
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ build_pc_tree_from_part_decision(&partition_decision, bsize, td->pc_root,
+ error_info);
const RD_STATS this_rdcost = rd_search_for_fixed_partition(
- cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, pc_tree);
+ cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, td->pc_root);
aom_partition_stats_t stats;
update_partition_stats(&this_rdcost, &stats);
av1_ext_part_send_partition_stats(ext_part_controller, &stats);
if (!partition_decision.is_final_decision) {
- av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0,
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
cpi->sf.part_sf.partition_search_type);
+ td->pc_root = NULL;
}
} while (!partition_decision.is_final_decision);
// Encode with the selected mode and partition.
set_cb_offsets(x->cb_offset, 0, 0);
encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
- pc_tree, NULL);
- av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0,
+ td->pc_root, NULL);
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
cpi->sf.part_sf.partition_search_type);
+ td->pc_root = NULL;
return true;
}
@@ -4978,6 +5119,9 @@ static bool recursive_partition(AV1_COMP *const cpi, ThreadData *td,
av1_init_rd_stats(&split_rdc[i]);
if (pc_tree->split[i] == NULL)
pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
pc_tree->split[i]->index = i;
}
const int orig_rdmult_tmp = x->rdmult;
@@ -5050,12 +5194,14 @@ static bool ml_partition_search_partial(AV1_COMP *const cpi, ThreadData *td,
features.frame_height = cpi->frame_info.frame_height;
features.block_size = bsize;
av1_ext_part_send_features(ext_part_controller, &features);
- PC_TREE *pc_tree;
- pc_tree = av1_alloc_pc_tree_node(bsize);
+ td->pc_root = av1_alloc_pc_tree_node(bsize);
+ if (!td->pc_root)
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
RD_STATS rdcost;
const bool valid_partition =
- recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree, mi_row,
+ recursive_partition(cpi, td, tile_data, tp, sms_root, td->pc_root, mi_row,
mi_col, bsize, &rdcost);
if (!valid_partition) {
return false;
@@ -5064,9 +5210,10 @@ static bool ml_partition_search_partial(AV1_COMP *const cpi, ThreadData *td,
// Encode with the selected mode and partition.
set_cb_offsets(x->cb_offset, 0, 0);
encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
- pc_tree, NULL);
- av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0,
+ td->pc_root, NULL);
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
cpi->sf.part_sf.partition_search_type);
+ td->pc_root = NULL;
return true;
}
@@ -5100,54 +5247,65 @@ bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td,
}
MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
int best_idx = 0;
int64_t min_rdcost = INT64_MAX;
int num_configs;
- RD_STATS *rdcost = NULL;
int i = 0;
do {
- PC_TREE *const pc_tree = av1_alloc_pc_tree_node(bsize);
- num_configs = read_partition_tree(cpi, pc_tree, i);
- if (i == 0) {
- CHECK_MEM_ERROR(cm, rdcost, aom_calloc(num_configs, sizeof(*rdcost)));
- }
+ td->pc_root = av1_alloc_pc_tree_node(bsize);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ num_configs = read_partition_tree(cpi, td->pc_root, xd->error_info, i);
if (num_configs <= 0) {
- av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0,
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
cpi->sf.part_sf.partition_search_type);
- if (rdcost != NULL) aom_free(rdcost);
- aom_internal_error(cm->error, AOM_CODEC_ERROR, "Invalid configs.");
+ td->pc_root = NULL;
+ aom_internal_error(xd->error_info, AOM_CODEC_ERROR, "Invalid configs.");
+ }
+ verify_write_partition_tree(cpi, td->pc_root, bsize, i, mi_row, mi_col);
+ if (i == 0) {
+ AOM_CHECK_MEM_ERROR(xd->error_info, x->rdcost,
+ aom_calloc(num_configs, sizeof(*x->rdcost)));
}
- verify_write_partition_tree(cpi, pc_tree, bsize, i, mi_row, mi_col);
// Encode the block with the given partition tree. Get rdcost and encoding
// time.
- rdcost[i] = rd_search_for_fixed_partition(cpi, td, tile_data, tp, sms_root,
- mi_row, mi_col, bsize, pc_tree);
+ x->rdcost[i] = rd_search_for_fixed_partition(
+ cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, td->pc_root);
- if (rdcost[i].rdcost < min_rdcost) {
- min_rdcost = rdcost[i].rdcost;
+ if (x->rdcost[i].rdcost < min_rdcost) {
+ min_rdcost = x->rdcost[i].rdcost;
best_idx = i;
- *best_rd_cost = rdcost[i];
+ *best_rd_cost = x->rdcost[i];
}
- av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0,
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
cpi->sf.part_sf.partition_search_type);
+ td->pc_root = NULL;
++i;
} while (i < num_configs);
+ aom_free(x->rdcost);
+ x->rdcost = NULL;
// Encode with the partition configuration with the smallest rdcost.
- PC_TREE *const pc_tree = av1_alloc_pc_tree_node(bsize);
- read_partition_tree(cpi, pc_tree, best_idx);
+ td->pc_root = av1_alloc_pc_tree_node(bsize);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ read_partition_tree(cpi, td->pc_root, xd->error_info, best_idx);
rd_search_for_fixed_partition(cpi, td, tile_data, tp, sms_root, mi_row,
- mi_col, bsize, pc_tree);
+ mi_col, bsize, td->pc_root);
set_cb_offsets(x->cb_offset, 0, 0);
encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
- pc_tree, NULL);
- av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0,
+ td->pc_root, NULL);
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
cpi->sf.part_sf.partition_search_type);
- aom_free(rdcost);
+ td->pc_root = NULL;
++cpi->sb_counter;
return true;
}
+#endif // CONFIG_PARTITION_SEARCH_ORDER
static AOM_INLINE bool should_do_dry_run_encode_for_current_block(
BLOCK_SIZE sb_size, BLOCK_SIZE max_partition_size, int curr_block_index,
@@ -5493,25 +5651,21 @@ BEGIN_PARTITION_SEARCH:
assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
!part_search_state.do_rectangular_split));
- int ext_partition_allowed =
- part_search_state.do_rectangular_split &&
- bsize > cpi->sf.part_sf.ext_partition_eval_thresh &&
- av1_blk_has_rows_and_cols(&blk_params);
+ const int prune_ext_part_state = prune_ext_part_none_skippable(
+ pc_tree->none, x->must_find_valid_partition,
+ cpi->sf.part_sf.skip_non_sq_part_based_on_none, bsize);
+
+ const int ab_partition_allowed = allow_ab_partition_search(
+ &part_search_state, &cpi->sf.part_sf, pc_tree->partitioning,
+ x->must_find_valid_partition, prune_ext_part_state, best_rdc.rdcost);
- // Do not evaluate extended partitions if NONE partition is skippable.
- if ((cpi->sf.part_sf.skip_non_sq_part_based_on_none >= 1) &&
- (pc_tree->none != NULL)) {
- if (pc_tree->none->skippable && !x->must_find_valid_partition &&
- bsize >= BLOCK_16X16)
- ext_partition_allowed = 0;
- }
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, ab_partitions_search_time);
#endif
// AB partitions search stage.
ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
&part_search_state, &best_rdc, rect_part_win_info,
- pb_source_variance, ext_partition_allowed, HORZ_A,
+ pb_source_variance, ab_partition_allowed, HORZ_A,
VERT_B);
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, ab_partitions_search_time);
@@ -5521,7 +5675,7 @@ BEGIN_PARTITION_SEARCH:
int part4_search_allowed[NUM_PART4_TYPES] = { 1, 1 };
// Prune 4-way partition search.
prune_4_way_partition_search(cpi, x, pc_tree, &part_search_state, &best_rdc,
- pb_source_variance, ext_partition_allowed,
+ pb_source_variance, prune_ext_part_state,
part4_search_allowed);
#if CONFIG_COLLECT_COMPONENT_TIMING
@@ -5618,9 +5772,12 @@ BEGIN_PARTITION_SEARCH:
set_cb_offsets(x->cb_offset, 0, 0);
encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, run_type, bsize,
pc_tree, NULL);
+ assert(pc_tree == td->pc_root);
// Dealloc the whole PC_TREE after a superblock is done.
av1_free_pc_tree_recursive(pc_tree, num_planes, 0, 0,
cpi->sf.part_sf.partition_search_type);
+ pc_tree = NULL;
+ td->pc_root = NULL;
pc_tree_dealloc = 1;
} else if (should_do_dry_run_encode_for_current_block(
cm->seq_params->sb_size, x->sb_enc.max_partition_size,
@@ -5941,6 +6098,9 @@ void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td,
// PARTITION_NONE
if (partition_none_allowed) {
pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
PICK_MODE_CONTEXT *ctx = pc_tree->none;
// Flip for RDO based pick mode
@@ -5974,6 +6134,9 @@ void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td,
for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
pc_tree->split[i]->index = i;
}
diff --git a/av1/encoder/partition_search.h b/av1/encoder/partition_search.h
index 2577e79f1..1b5d71b7d 100644
--- a/av1/encoder/partition_search.h
+++ b/av1/encoder/partition_search.h
@@ -42,11 +42,14 @@ void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td,
void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf);
void av1_reset_sf_for_ext_part(AV1_COMP *const cpi);
+#if CONFIG_PARTITION_SEARCH_ORDER
bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td,
TileDataEnc *tile_data, TokenExtra **tp,
SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row,
int mi_col, BLOCK_SIZE bsize,
RD_STATS *best_rd_cost);
+#endif
+
bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost,
diff --git a/av1/encoder/partition_strategy.c b/av1/encoder/partition_strategy.c
index 080587b3c..ce0631357 100644
--- a/av1/encoder/partition_strategy.c
+++ b/av1/encoder/partition_strategy.c
@@ -108,7 +108,7 @@ static void write_features_to_file(const char *const path,
const bool is_test_mode,
const float *features,
const int feature_size, const int id,
- const int bsize, const int mi_row,
+ const BLOCK_SIZE bsize, const int mi_row,
const int mi_col) {
if (!WRITE_FEATURE_TO_FILE && !is_test_mode) return;
@@ -118,7 +118,8 @@ static void write_features_to_file(const char *const path,
FILE *pfile = fopen(filename, "a");
if (pfile == NULL) return;
if (!is_test_mode) {
- fprintf(pfile, "%d,%d,%d,%d,%d\n", id, bsize, mi_row, mi_col, feature_size);
+ fprintf(pfile, "%d,%d,%d,%d,%d\n", id, (int)bsize, mi_row, mi_col,
+ feature_size);
}
for (int i = 0; i < feature_size; ++i) {
fprintf(pfile, "%.6f", features[i]);
@@ -203,7 +204,7 @@ void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
if (!av1_cnn_predict_img_multi_out_highbd(image, width, height, stride,
cnn_config, &thread_data,
bit_depth, &output)) {
- aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
"Error allocating CNN data");
return;
}
@@ -212,7 +213,7 @@ void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
if (!av1_cnn_predict_img_multi_out(image, width, height, stride,
cnn_config, &thread_data, &output)) {
- aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
"Error allocating CNN data");
return;
}
@@ -471,8 +472,6 @@ static int simple_motion_search_get_best_ref(
// Otherwise do loop through the reference frames and find the one with the
// minimum SSE
- const MACROBLOCKD *xd = &x->e_mbd;
-
const int num_planes = 1;
*best_sse = INT_MAX;
@@ -483,12 +482,9 @@ static int simple_motion_search_get_best_ref(
if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref]) {
const FULLPEL_MV *start_mvs = sms_tree->start_mvs;
unsigned int curr_sse = 0, curr_var = 0;
- int_mv best_mv =
- av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref,
- start_mvs[ref], num_planes, use_subpixel);
- curr_var = cpi->ppi->fn_ptr[bsize].vf(
- x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf,
- xd->plane[0].dst.stride, &curr_sse);
+ const int_mv best_mv = av1_simple_motion_search_sse_var(
+ cpi, x, mi_row, mi_col, bsize, ref, start_mvs[ref], num_planes,
+ use_subpixel, &curr_sse, &curr_var);
if (curr_sse < *best_sse) {
*best_sse = curr_sse;
*best_var = curr_var;
@@ -840,8 +836,11 @@ void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x,
unsigned int sse = 0;
unsigned int var = 0;
const FULLPEL_MV start_mv = kZeroFullMv;
- int_mv best_mv = av1_simple_motion_sse_var(
- cpi, x, this_mi_row, this_mi_col, mb_size, start_mv, 0, &sse, &var);
+ const MV_REFERENCE_FRAME ref =
+ cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+ const int_mv best_mv = av1_simple_motion_search_sse_var(
+ cpi, x, this_mi_row, this_mi_col, mb_size, ref, start_mv, 1, 0, &sse,
+ &var);
const float mv_row = (float)(best_mv.as_mv.row / 8);
const float mv_col = (float)(best_mv.as_mv.col / 8);
@@ -1214,7 +1213,7 @@ void av1_ml_prune_ab_partition(AV1_COMP *const cpi, int part_ctx, int var_ctx,
const PartitionBlkParams blk_params = part_state->part_blk_params;
const int mi_row = blk_params.mi_row;
const int mi_col = blk_params.mi_col;
- const int bsize = blk_params.bsize;
+ const BLOCK_SIZE bsize = blk_params.bsize;
if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
const NN_CONFIG *nn_config = NULL;
@@ -1317,7 +1316,7 @@ void av1_ml_prune_4_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
const PartitionBlkParams blk_params = part_state->part_blk_params;
const int mi_row = blk_params.mi_row;
const int mi_col = blk_params.mi_col;
- const int bsize = blk_params.bsize;
+ const BLOCK_SIZE bsize = blk_params.bsize;
int64_t(*rect_part_rd)[SUB_PARTITIONS_RECT] = part_state->rect_part_rd;
int64_t *split_rd = part_state->split_rd;
@@ -1331,6 +1330,7 @@ void av1_ml_prune_4_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
int64_t *horz_rd = rect_part_rd[HORZ4];
int64_t *vert_rd = rect_part_rd[VERT4];
const NN_CONFIG *nn_config = NULL;
+ // 4-way partitions are only allowed for these three square block sizes.
switch (bsize) {
case BLOCK_16X16: nn_config = &av1_4_partition_nnconfig_16; break;
case BLOCK_32X32: nn_config = &av1_4_partition_nnconfig_32; break;
@@ -1377,6 +1377,10 @@ void av1_ml_prune_4_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
{
BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+
+ assert(horz_4_bs != BLOCK_INVALID);
+ assert(vert_4_bs != BLOCK_INVALID);
+
av1_setup_src_planes(x, cpi->source, mi_row, mi_col,
av1_num_planes(&cpi->common), bsize);
const int src_stride = x->plane[0].src.stride;
@@ -1650,17 +1654,15 @@ void av1_prune_partitions_before_search(AV1_COMP *const cpi,
if (cpi->sf.part_sf.prune_sub_8x8_partition_level && (bsize == BLOCK_8X8)) {
const MACROBLOCKD *const xd = &x->e_mbd;
- int prune_sub_8x8 = 1;
- if (cpi->sf.part_sf.prune_sub_8x8_partition_level == 1) {
- int num_neighbors_lt_8x8 = 0;
- if (xd->left_available)
- num_neighbors_lt_8x8 += (xd->left_mbmi->bsize <= BLOCK_8X8);
- if (xd->up_available)
- num_neighbors_lt_8x8 += (xd->above_mbmi->bsize <= BLOCK_8X8);
- // Evaluate only if both left and above blocks are of size <= BLOCK_8X8.
- if (num_neighbors_lt_8x8 == 2) {
- prune_sub_8x8 = 0;
- }
+ int prune_sub_8x8;
+ if (cpi->sf.part_sf.prune_sub_8x8_partition_level == 2) {
+ prune_sub_8x8 = 1;
+ } else {
+ assert(cpi->sf.part_sf.prune_sub_8x8_partition_level == 1);
+ // Prune if both neighbors are available and either is > BLOCK_8X8
+ prune_sub_8x8 = xd->left_available && xd->up_available &&
+ (xd->left_mbmi->bsize > BLOCK_8X8 ||
+ xd->above_mbmi->bsize > BLOCK_8X8);
}
if (prune_sub_8x8) {
av1_disable_all_splits(part_state);
@@ -1962,12 +1964,19 @@ static void prepare_features_after_part_ab(
features->after_part_ab.f[feature_index++] = rd_ratio;
}
+ // 4-way partitions are only allowed for these three square block sizes.
+ assert(bsize == BLOCK_16X16 || bsize == BLOCK_32X32 || bsize == BLOCK_64X64);
+
// Get variance of the 1:4 and 4:1 sub-blocks.
unsigned int horz_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
unsigned int vert_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
{
BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+
+ assert(horz_4_bs != BLOCK_INVALID);
+ assert(vert_4_bs != BLOCK_INVALID);
+
av1_setup_src_planes(x, cpi->source, mi_row, mi_col,
av1_num_planes(&cpi->common), bsize);
const int src_stride = x->plane[0].src.stride;
diff --git a/av1/encoder/pass2_strategy.c b/av1/encoder/pass2_strategy.c
index 46bc6b0ec..d85440df5 100644
--- a/av1/encoder/pass2_strategy.c
+++ b/av1/encoder/pass2_strategy.c
@@ -181,7 +181,7 @@ static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) {
// Based on recent history adjust expectations of bits per macroblock.
double damp_fac = AOMMAX(5.0, rate_err_tol / 10.0);
double rate_err_factor = 1.0;
- const double adj_limit = AOMMAX(0.20, (double)(100 - rate_err_tol) / 200.0);
+ const double adj_limit = AOMMAX(0.2, (double)(100 - rate_err_tol) / 200.0);
const double min_fac = 1.0 - adj_limit;
const double max_fac = 1.0 + adj_limit;
@@ -255,7 +255,6 @@ static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) {
rate_err_factor = 1.0 - ((double)(bits_off_target) /
AOMMAX(total_actual_bits, bits_left));
}
- rate_err_factor = AOMMAX(min_fac, AOMMIN(max_fac, rate_err_factor));
// Adjustment is damped if this is 1 pass with look ahead processing
// (as there are only ever a few frames of data) and for all but the first
@@ -263,6 +262,7 @@ static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) {
if ((twopass->bpm_factor != 1.0) || cpi->ppi->lap_enabled) {
rate_err_factor = 1.0 + ((rate_err_factor - 1.0) / damp_fac);
}
+ rate_err_factor = AOMMAX(min_fac, AOMMIN(max_fac, rate_err_factor));
}
// Is the rate control trending in the right direction? Only make
@@ -270,7 +270,12 @@ static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) {
if ((rate_err_factor < 1.0 && err_estimate >= 0) ||
(rate_err_factor > 1.0 && err_estimate <= 0)) {
twopass->bpm_factor *= rate_err_factor;
- twopass->bpm_factor = AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor));
+ if (rate_err_tol >= 100) {
+ twopass->bpm_factor =
+ AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor));
+ } else {
+ twopass->bpm_factor = AOMMAX(0.1, AOMMIN(10.0, twopass->bpm_factor));
+ }
}
}
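
The two hunks above change the order of operations: the error factor is now damped first and clamped afterwards, so the clamp bounds apply to the damped value. A compact sketch of that ordering, omitting the condition that gates damping and using illustrative names:

static double adjust_rate_err_factor(double raw_factor, double damp_fac,
                                     double min_fac, double max_fac) {
  /* Damp the raw deviation from 1.0 first... */
  double f = 1.0 + (raw_factor - 1.0) / damp_fac;
  /* ...then clamp the damped value to [min_fac, max_fac]. */
  if (f < min_fac) f = min_fac;
  if (f > max_fac) f = max_fac;
  return f;
}
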
@@ -994,10 +999,9 @@ static INLINE int detect_gf_cut(AV1_COMP *cpi, int frame_index, int cur_start,
GF_GROUP_STATS *gf_stats) {
RATE_CONTROL *const rc = &cpi->rc;
TWO_PASS *const twopass = &cpi->ppi->twopass;
- InitialDimensions *const initial_dimensions = &cpi->initial_dimensions;
+ AV1_COMMON *const cm = &cpi->common;
// Motion breakout threshold for loop below depends on image size.
- const double mv_ratio_accumulator_thresh =
- (initial_dimensions->height + initial_dimensions->width) / 4.0;
+ const double mv_ratio_accumulator_thresh = (cm->height + cm->width) / 4.0;
if (!flash_detected) {
// Break clause to detect very still sections after motion. For example,
@@ -1748,22 +1752,42 @@ static void cleanup_blendings(REGIONS *regions, int *num_regions) {
cleanup_regions(regions, num_regions);
}
-void av1_identify_regions(const FIRSTPASS_STATS *const stats_start,
- int total_frames, int offset, REGIONS *regions,
- int *total_regions) {
+static void free_firstpass_stats_buffers(REGIONS *temp_regions,
+ double *filt_intra_err,
+ double *filt_coded_err,
+ double *grad_coded) {
+ aom_free(temp_regions);
+ aom_free(filt_intra_err);
+ aom_free(filt_coded_err);
+ aom_free(grad_coded);
+}
+
+// Identify stable and unstable regions from first pass stats.
+// stats_start points to the first frame to analyze.
+// |offset| is the offset from the current frame to the frame stats_start is
+// pointing to.
+// Returns 0 on success, -1 on memory allocation failure.
+static int identify_regions(const FIRSTPASS_STATS *const stats_start,
+ int total_frames, int offset, REGIONS *regions,
+ int *total_regions) {
int k;
- if (total_frames <= 1) return;
+ if (total_frames <= 1) return 0;
// store the initial decisions
REGIONS *temp_regions =
(REGIONS *)aom_malloc(total_frames * sizeof(temp_regions[0]));
- av1_zero_array(temp_regions, total_frames);
// buffers for filtered stats
double *filt_intra_err =
(double *)aom_calloc(total_frames, sizeof(*filt_intra_err));
double *filt_coded_err =
(double *)aom_calloc(total_frames, sizeof(*filt_coded_err));
double *grad_coded = (double *)aom_calloc(total_frames, sizeof(*grad_coded));
+ if (!(temp_regions && filt_intra_err && filt_coded_err && grad_coded)) {
+ free_firstpass_stats_buffers(temp_regions, filt_intra_err, filt_coded_err,
+ grad_coded);
+ return -1;
+ }
+ av1_zero_array(temp_regions, total_frames);
int cur_region = 0, this_start = 0, this_last;
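
identify_regions() now follows an allocate-everything-then-check pattern: all buffers are requested up front, and a single combined NULL check releases whatever did succeed. A stand-alone sketch of the same pattern, relying on free(NULL) being a no-op:

#include <stdlib.h>

static int alloc_all_or_nothing(double **a, double **b, double **c, int n) {
  *a = (double *)calloc(n, sizeof(**a));
  *b = (double *)calloc(n, sizeof(**b));
  *c = (double *)calloc(n, sizeof(**c));
  if (!(*a && *b && *c)) {
    free(*a);  /* free(NULL) is a no-op, so partial failure is safe */
    free(*b);
    free(*c);
    *a = *b = *c = NULL;
    return -1;  /* matches identify_regions(): -1 on allocation failure */
  }
  return 0;
}
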
@@ -1853,10 +1877,9 @@ void av1_identify_regions(const FIRSTPASS_STATS *const stats_start,
regions[k].last += offset;
}
- aom_free(temp_regions);
- aom_free(filt_coded_err);
- aom_free(filt_intra_err);
- aom_free(grad_coded);
+ free_firstpass_stats_buffers(temp_regions, filt_intra_err, filt_coded_err,
+ grad_coded);
+ return 0;
}
static int find_regions_index(const REGIONS *regions, int num_regions,
@@ -3794,6 +3817,7 @@ void av1_get_second_pass_params(AV1_COMP *cpi,
(rc->frames_since_key == 0)));
p_rc->frames_till_regions_update = rest_frames;
+ int ret;
if (cpi->ppi->lap_enabled) {
av1_mark_flashes(twopass->stats_buf_ctx->stats_in_start,
twopass->stats_buf_ctx->stats_in_end);
@@ -3801,14 +3825,18 @@ void av1_get_second_pass_params(AV1_COMP *cpi,
twopass->stats_buf_ctx->stats_in_end);
av1_estimate_coeff(twopass->stats_buf_ctx->stats_in_start,
twopass->stats_buf_ctx->stats_in_end);
- av1_identify_regions(cpi->twopass_frame.stats_in, rest_frames,
- (rc->frames_since_key == 0), p_rc->regions,
- &p_rc->num_regions);
+ ret = identify_regions(cpi->twopass_frame.stats_in, rest_frames,
+ (rc->frames_since_key == 0), p_rc->regions,
+ &p_rc->num_regions);
} else {
- av1_identify_regions(
+ ret = identify_regions(
cpi->twopass_frame.stats_in - (rc->frames_since_key == 0),
rest_frames, 0, p_rc->regions, &p_rc->num_regions);
}
+ if (ret == -1) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+ "Error allocating buffers in identify_regions");
+ }
}
int cur_region_idx =
diff --git a/av1/encoder/pass2_strategy.h b/av1/encoder/pass2_strategy.h
index e34454e94..ff1591ceb 100644
--- a/av1/encoder/pass2_strategy.h
+++ b/av1/encoder/pass2_strategy.h
@@ -134,14 +134,6 @@ int av1_calc_arf_boost(const TWO_PASS *twopass,
int *num_fpstats_used, int *num_fpstats_required,
int project_gfu_boost);
-// Identify stable and unstable regions from first pass stats.
-// stats_start points to the first frame to analyze.
-// |offset| is the offset from the current frame to the frame stats_start is
-// pointing to.
-void av1_identify_regions(const FIRSTPASS_STATS *const stats_start,
- int total_frames, int offset, REGIONS *regions,
- int *total_regions);
-
void av1_mark_flashes(FIRSTPASS_STATS *first_stats,
FIRSTPASS_STATS *last_stats);
void av1_estimate_noise(FIRSTPASS_STATS *first_stats,
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index 293dafa8a..232a2f9ed 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -515,8 +515,12 @@ static INLINE uint64_t get_filt_error(
// fbc: Column index in units of 64x64 block
// Returns:
// Nothing will be returned. Contents of cdef_search_ctx will be modified.
-void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, int fbr, int fbc,
- int sb_count) {
+void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx,
+ struct aom_internal_error_info *error_info,
+ int fbr, int fbc, int sb_count) {
+ // TODO(aomedia:3276): Pass error_info to the low-level functions as required
+ // in the future to handle error propagation.

+ (void)error_info;
const CommonModeInfoParams *const mi_params = cdef_search_ctx->mi_params;
const YV12_BUFFER_CONFIG *ref = cdef_search_ctx->ref;
const int coeff_shift = cdef_search_ctx->coeff_shift;
@@ -614,14 +618,15 @@ void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, int fbr, int fbc,
// CDEF search context.
// Returns:
// Nothing will be returned. Contents of cdef_search_ctx will be modified.
-static void cdef_mse_calc_frame(CdefSearchCtx *cdef_search_ctx) {
+static void cdef_mse_calc_frame(CdefSearchCtx *cdef_search_ctx,
+ struct aom_internal_error_info *error_info) {
// Loop over each sb.
for (int fbr = 0; fbr < cdef_search_ctx->nvfb; ++fbr) {
for (int fbc = 0; fbc < cdef_search_ctx->nhfb; ++fbc) {
// Checks if cdef processing can be skipped for particular sb.
if (cdef_sb_skip(cdef_search_ctx->mi_params, fbr, fbc)) continue;
// Calculate mse for each sb and store the relevant sb index.
- av1_cdef_mse_calc_block(cdef_search_ctx, fbr, fbc,
+ av1_cdef_mse_calc_block(cdef_search_ctx, error_info, fbr, fbc,
cdef_search_ctx->sb_count);
cdef_search_ctx->sb_count++;
}
@@ -634,24 +639,17 @@ static void cdef_mse_calc_frame(CdefSearchCtx *cdef_search_ctx) {
// related to CDEF search context.
// Returns:
// Nothing will be returned. Contents of cdef_search_ctx will be modified.
-static AOM_INLINE bool cdef_alloc_data(CdefSearchCtx *cdef_search_ctx) {
+static void cdef_alloc_data(AV1_COMMON *cm, CdefSearchCtx *cdef_search_ctx) {
const int nvfb = cdef_search_ctx->nvfb;
const int nhfb = cdef_search_ctx->nhfb;
- cdef_search_ctx->sb_index =
- aom_malloc(nvfb * nhfb * sizeof(cdef_search_ctx->sb_index[0]));
+ CHECK_MEM_ERROR(
+ cm, cdef_search_ctx->sb_index,
+ aom_malloc(nvfb * nhfb * sizeof(cdef_search_ctx->sb_index[0])));
cdef_search_ctx->sb_count = 0;
- cdef_search_ctx->mse[0] =
- aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb);
- cdef_search_ctx->mse[1] =
- aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb);
- if (!(cdef_search_ctx->sb_index && cdef_search_ctx->mse[0] &&
- cdef_search_ctx->mse[1])) {
- aom_free(cdef_search_ctx->sb_index);
- aom_free(cdef_search_ctx->mse[0]);
- aom_free(cdef_search_ctx->mse[1]);
- return false;
- }
- return true;
+ CHECK_MEM_ERROR(cm, cdef_search_ctx->mse[0],
+ aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb));
+ CHECK_MEM_ERROR(cm, cdef_search_ctx->mse[1],
+ aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb));
}
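
CHECK_MEM_ERROR is libaom's standard allocate-or-abort macro: it assigns the allocation result and raises aom_internal_error() (which longjmps back to the API boundary) on NULL, which is why cdef_alloc_data() no longer needs a bool return or a manual cleanup path. A rough stand-alone equivalent of the mechanism, with hypothetical names:

#include <setjmp.h>
#include <stddef.h>

struct err_ctx { jmp_buf jmp; };  /* stand-in for aom_internal_error_info */

/* Hypothetical equivalent of CHECK_MEM_ERROR: assign, then longjmp out on
 * failure so callers need no per-allocation cleanup code. */
#define CHECK_ALLOC(ctx, lval, expr)            \
  do {                                          \
    (lval) = (expr);                            \
    if ((lval) == NULL) longjmp((ctx)->jmp, 1); \
  } while (0)
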
// Deallocates the memory allocated for members of CdefSearchCtx.
@@ -660,10 +658,15 @@ static AOM_INLINE bool cdef_alloc_data(CdefSearchCtx *cdef_search_ctx) {
// related to CDEF search context.
// Returns:
// Nothing will be returned.
-static AOM_INLINE void cdef_dealloc_data(CdefSearchCtx *cdef_search_ctx) {
- aom_free(cdef_search_ctx->mse[0]);
- aom_free(cdef_search_ctx->mse[1]);
- aom_free(cdef_search_ctx->sb_index);
+void av1_cdef_dealloc_data(CdefSearchCtx *cdef_search_ctx) {
+ if (cdef_search_ctx) {
+ aom_free(cdef_search_ctx->mse[0]);
+ cdef_search_ctx->mse[0] = NULL;
+ aom_free(cdef_search_ctx->mse[1]);
+ cdef_search_ctx->mse[1] = NULL;
+ aom_free(cdef_search_ctx->sb_index);
+ cdef_search_ctx->sb_index = NULL;
+ }
}
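
Note the shape of the new deallocator: every pointer is set to NULL after being freed, so av1_cdef_dealloc_data() is idempotent and safe to call again on error paths. The same pattern in isolation:

#include <stdlib.h>

/* Called once per owned pointer; repeated calls are harmless because the
 * pointer is cleared after the free. */
static void dealloc_and_clear(void **p) {
  free(*p);
  *p = NULL;
}
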
// Initialize the parameters related to CDEF search context.
@@ -818,14 +821,12 @@ void av1_pick_cdef_from_qp(AV1_COMMON *const cm, int skip_cdef,
}
}
-void av1_cdef_search(MultiThreadInfo *mt_info, const YV12_BUFFER_CONFIG *frame,
- const YV12_BUFFER_CONFIG *ref, AV1_COMMON *cm,
- MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method, int rdmult,
- int skip_cdef_feature, CDEF_CONTROL cdef_control,
- const int is_screen_content, int non_reference_frame,
- int rtc_ext_rc) {
+void av1_cdef_search(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ CDEF_CONTROL cdef_control = cpi->oxcf.tool_cfg.cdef_control;
+
assert(cdef_control != CDEF_NONE);
- if (cdef_control == CDEF_REFERENCE && non_reference_frame) {
+ if (cdef_control == CDEF_REFERENCE && cpi->ppi->rtc_ref.non_reference_frame) {
CdefInfo *const cdef_info = &cm->cdef_info;
cdef_info->nb_cdef_strengths = 1;
cdef_info->cdef_bits = 0;
@@ -834,12 +835,21 @@ void av1_cdef_search(MultiThreadInfo *mt_info, const YV12_BUFFER_CONFIG *frame,
return;
}
+ // Indicates whether external RC is used for testing.
+ const int rtc_ext_rc = cpi->rc.rtc_external_ratectrl;
if (rtc_ext_rc) {
av1_pick_cdef_from_qp(cm, 0, 0);
return;
}
+ CDEF_PICK_METHOD pick_method = cpi->sf.lpf_sf.cdef_pick_method;
if (pick_method == CDEF_PICK_FROM_Q) {
- av1_pick_cdef_from_qp(cm, skip_cdef_feature, is_screen_content);
+ const int use_screen_content_model =
+ cm->quant_params.base_qindex >
+ AOMMAX(cpi->sf.rt_sf.screen_content_cdef_filter_qindex_thresh,
+ cpi->rc.best_quality + 5) &&
+ cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+ av1_pick_cdef_from_qp(cm, cpi->sf.rt_sf.skip_cdef_sb,
+ use_screen_content_model);
return;
}
const CommonModeInfoParams *const mi_params = &cm->mi_params;
@@ -847,33 +857,33 @@ void av1_cdef_search(MultiThreadInfo *mt_info, const YV12_BUFFER_CONFIG *frame,
const int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 &&
pick_method <= CDEF_FAST_SEARCH_LVL5);
const int num_planes = av1_num_planes(cm);
- CdefSearchCtx cdef_search_ctx;
+ MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+
+ if (!cpi->cdef_search_ctx)
+ CHECK_MEM_ERROR(cm, cpi->cdef_search_ctx,
+ aom_malloc(sizeof(*cpi->cdef_search_ctx)));
+ CdefSearchCtx *cdef_search_ctx = cpi->cdef_search_ctx;
+
// Initialize parameters related to CDEF search context.
- cdef_params_init(frame, ref, cm, xd, &cdef_search_ctx, pick_method);
+ cdef_params_init(&cm->cur_frame->buf, cpi->source, cm, xd, cdef_search_ctx,
+ pick_method);
// Allocate CDEF search context buffers.
- if (!cdef_alloc_data(&cdef_search_ctx)) {
- CdefInfo *const cdef_info = &cm->cdef_info;
- cdef_info->nb_cdef_strengths = 0;
- cdef_info->cdef_bits = 0;
- cdef_info->cdef_strengths[0] = 0;
- cdef_info->cdef_uv_strengths[0] = 0;
- return;
- }
+ cdef_alloc_data(cm, cdef_search_ctx);
// Frame level mse calculation.
- if (mt_info->num_workers > 1) {
- av1_cdef_mse_calc_frame_mt(cm, mt_info, &cdef_search_ctx);
+ if (cpi->mt_info.num_workers > 1) {
+ av1_cdef_mse_calc_frame_mt(cpi);
} else {
- cdef_mse_calc_frame(&cdef_search_ctx);
+ cdef_mse_calc_frame(cdef_search_ctx, cm->error);
}
/* Search for different number of signaling bits. */
int nb_strength_bits = 0;
uint64_t best_rd = UINT64_MAX;
CdefInfo *const cdef_info = &cm->cdef_info;
- int sb_count = cdef_search_ctx.sb_count;
+ int sb_count = cdef_search_ctx->sb_count;
uint64_t(*mse[2])[TOTAL_STRENGTHS];
- mse[0] = cdef_search_ctx.mse[0];
- mse[1] = cdef_search_ctx.mse[1];
+ mse[0] = cdef_search_ctx->mse[0];
+ mse[1] = cdef_search_ctx->mse[1];
/* Calculate the maximum number of bits required to signal CDEF strengths at
* block level */
const int total_strengths = nb_cdef_strengths[pick_method];
@@ -881,6 +891,7 @@ void av1_cdef_search(MultiThreadInfo *mt_info, const YV12_BUFFER_CONFIG *frame,
num_planes > 1 ? total_strengths * total_strengths : total_strengths;
const int max_signaling_bits =
joint_strengths == 1 ? 0 : get_msb(joint_strengths - 1) + 1;
+ int rdmult = cpi->td.mb.rdmult;
for (int i = 0; i <= 3; i++) {
if (i > max_signaling_bits) break;
int best_lev0[CDEF_MAX_STRENGTHS];
@@ -925,7 +936,7 @@ void av1_cdef_search(MultiThreadInfo *mt_info, const YV12_BUFFER_CONFIG *frame,
best_mse = curr;
}
}
- mi_params->mi_grid_base[cdef_search_ctx.sb_index[i]]->cdef_strength =
+ mi_params->mi_grid_base[cdef_search_ctx->sb_index[i]]->cdef_strength =
best_gi;
}
if (fast) {
@@ -943,5 +954,5 @@ void av1_cdef_search(MultiThreadInfo *mt_info, const YV12_BUFFER_CONFIG *frame,
cdef_info->cdef_damping = damping;
// Deallocate CDEF search context buffers.
- cdef_dealloc_data(&cdef_search_ctx);
+ av1_cdef_dealloc_data(cdef_search_ctx);
}
diff --git a/av1/encoder/pickcdef.h b/av1/encoder/pickcdef.h
index bdd8233c8..192e734fb 100644
--- a/av1/encoder/pickcdef.h
+++ b/av1/encoder/pickcdef.h
@@ -213,8 +213,11 @@ static INLINE int cdef_sb_skip(const CommonModeInfoParams *const mi_params,
return 0;
}
-void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, int fbr, int fbc,
- int sb_count);
+void av1_cdef_dealloc_data(CdefSearchCtx *cdef_search_ctx);
+
+void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx,
+ struct aom_internal_error_info *error_info,
+ int fbr, int fbc, int sb_count);
/*!\endcond */
/*!\brief AV1 CDEF parameter search
@@ -223,19 +226,7 @@ void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, int fbr, int fbc,
*
* Searches for optimal CDEF parameters for frame
*
- * \param[in] mt_info Pointer to multi-threading parameters
- * \param[in] frame Compressed frame buffer
- * \param[in] ref Source frame buffer
- * \param[in,out] cm Pointer to top level common structure
- * \param[in] xd Pointer to common current coding block structure
- * \param[in] pick_method The method used to select params
- * \param[in] rdmult rd multiplier to use in making param choices
- * \param[in] skip_cdef_feature Speed feature to skip cdef
- * \param[in] cdef_control Parameter that controls CDEF application
- * \param[in] is_screen_content Whether it is screen content type
- * \param[in] non_reference_frame Indicates if current frame is
- * non-reference
- * \param[in] rtc_ext_rc Indicate if external RC is used for testing
+ * \param[in,out] cpi Top level encoder structure
*
* \remark Nothing is returned. Instead, optimal CDEF parameters are stored
* in the \c cdef_info structure of type \ref CdefInfo inside \c cm:
@@ -248,13 +239,7 @@ void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, int fbr, int fbc,
* \arg \c damping_factor: CDEF damping factor.
*
*/
-void av1_cdef_search(struct MultiThreadInfo *mt_info,
- const YV12_BUFFER_CONFIG *frame,
- const YV12_BUFFER_CONFIG *ref, AV1_COMMON *cm,
- MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method, int rdmult,
- int skip_cdef_feature, CDEF_CONTROL cdef_control,
- const int is_screen_content, int non_reference_frame,
- int rtc_ext_rc);
+void av1_cdef_search(struct AV1_COMP *cpi);
/*!\brief AV1 CDEF level from QP
*
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index 72124699d..642906417 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -52,6 +52,11 @@ static const int sgproj_ep_grp2_3[SGRPROJ_EP_GRP2_3_SEARCH_COUNT][14] = {
{ 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15 }
};
+#if DEBUG_LR_COSTING
+RestorationUnitInfo lr_ref_params[RESTORE_TYPES][MAX_MB_PLANE]
+ [MAX_LR_UNITS_W * MAX_LR_UNITS_H];
+#endif // DEBUG_LR_COSTING
+
typedef int64_t (*sse_extractor_type)(const YV12_BUFFER_CONFIG *a,
const YV12_BUFFER_CONFIG *b);
typedef int64_t (*sse_part_extractor_type)(const YV12_BUFFER_CONFIG *a,
@@ -100,31 +105,14 @@ static uint64_t var_restoration_unit(const RestorationTileLimits *limits,
}
typedef struct {
- // The best coefficients for Wiener or Sgrproj restoration
- WienerInfo wiener;
- SgrprojInfo sgrproj;
-
- // The sum of squared errors for this rtype.
- int64_t sse[RESTORE_SWITCHABLE_TYPES];
-
- // The rtype to use for this unit given a frame rtype as
- // index. Indices: WIENER, SGRPROJ, SWITCHABLE.
- RestorationType best_rtype[RESTORE_TYPES - 1];
-
- // This flag will be set based on the speed feature
- // 'prune_sgr_based_on_wiener'. 0 implies no pruning and 1 implies pruning.
- uint8_t skip_sgr_eval;
-} RestUnitSearchInfo;
-
-typedef struct {
const YV12_BUFFER_CONFIG *src;
YV12_BUFFER_CONFIG *dst;
const AV1_COMMON *cm;
const MACROBLOCK *x;
int plane;
- int plane_width;
- int plane_height;
+ int plane_w;
+ int plane_h;
RestUnitSearchInfo *rusi;
// Speed features
@@ -135,16 +123,32 @@ typedef struct {
const uint8_t *src_buffer;
int src_stride;
- // sse and bits are initialised by reset_rsc in search_rest_type
- int64_t sse;
- int64_t bits;
- int tile_y0, tile_stripe0;
+ // SSE values for each restoration mode for the current RU
+ // These are saved by each search function for use in search_switchable()
+ int64_t sse[RESTORE_SWITCHABLE_TYPES];
- // sgrproj and wiener are initialised by rsc_on_tile when starting the first
- // tile in the frame.
- SgrprojInfo sgrproj;
- WienerInfo wiener;
- PixelRect tile_rect;
+ // This flag will be set based on the speed feature
+ // 'prune_sgr_based_on_wiener'. 0 implies no pruning and 1 implies pruning.
+ uint8_t skip_sgr_eval;
+
+ // Total rate and distortion so far for each restoration type
+ // These are initialised by reset_rsc in search_rest_type
+ int64_t total_sse[RESTORE_TYPES];
+ int64_t total_bits[RESTORE_TYPES];
+
+ // Reference parameters for delta-coding
+ //
+ // For each restoration type, we need to store the latest parameter set which
+ // has been used, so that we can properly cost up the next parameter set.
+ // Note that we have two sets of these - one for the single-restoration-mode
+ // search (i.e., frame_restoration_type = RESTORE_WIENER or RESTORE_SGRPROJ)
+ // and one for the switchable mode. This is because these two cases can lead
+ // to different sets of parameters being signaled, but we don't know which
+ // we will pick for sure until the end of the search process.
+ WienerInfo ref_wiener;
+ SgrprojInfo ref_sgrproj;
+ WienerInfo switchable_ref_wiener;
+ SgrprojInfo switchable_ref_sgrproj;
// Buffers used to hold dgd-avg and src-avg data respectively during SIMD
// call of Wiener filter.
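
The reference-parameter comment above describes delta coding: each unit's filter coefficients are costed relative to the last signaled set, so the search keeps one running reference per frame restoration type (plus a separate pair for the switchable search). A toy illustration of why the reference must only advance when a unit actually selects that filter; the linear cost below is a placeholder for the real entropy costing:

typedef struct { int taps[3]; } ToyFilter;

/* Toy delta-coded costing: the bit cost of |cur| depends on the previously
 * signaled reference, so references must track the signaled stream exactly. */
static int toy_delta_bits(const ToyFilter *cur, const ToyFilter *ref) {
  int bits = 0;
  for (int i = 0; i < 3; i++) {
    const int d = cur->taps[i] - ref->taps[i];
    bits += (d < 0 ? -d : d) + 1;
  }
  return bits;
}

static void maybe_advance_ref(ToyFilter *ref, const ToyFilter *cur,
                              int selected) {
  if (selected) *ref = *cur;  /* only signaled units update the reference */
}
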
@@ -154,14 +158,15 @@ typedef struct {
static AOM_INLINE void rsc_on_tile(void *priv) {
RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
- set_default_sgrproj(&rsc->sgrproj);
- set_default_wiener(&rsc->wiener);
- rsc->tile_stripe0 = 0;
+ set_default_wiener(&rsc->ref_wiener);
+ set_default_sgrproj(&rsc->ref_sgrproj);
+ set_default_wiener(&rsc->switchable_ref_wiener);
+ set_default_sgrproj(&rsc->switchable_ref_sgrproj);
}
static AOM_INLINE void reset_rsc(RestSearchCtxt *rsc) {
- rsc->sse = 0;
- rsc->bits = 0;
+ memset(rsc->total_sse, 0, sizeof(rsc->total_sse));
+ memset(rsc->total_bits, 0, sizeof(rsc->total_bits));
}
static AOM_INLINE void init_rsc(const YV12_BUFFER_CONFIG *src,
@@ -179,20 +184,23 @@ static AOM_INLINE void init_rsc(const YV12_BUFFER_CONFIG *src,
const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf;
const int is_uv = plane != AOM_PLANE_Y;
- rsc->plane_width = src->crop_widths[is_uv];
- rsc->plane_height = src->crop_heights[is_uv];
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+ assert(plane_w == src->crop_widths[is_uv]);
+ assert(plane_h == src->crop_heights[is_uv]);
+ assert(src->crop_widths[is_uv] == dgd->crop_widths[is_uv]);
+ assert(src->crop_heights[is_uv] == dgd->crop_heights[is_uv]);
+
+ rsc->plane_w = plane_w;
+ rsc->plane_h = plane_h;
rsc->src_buffer = src->buffers[plane];
rsc->src_stride = src->strides[is_uv];
rsc->dgd_buffer = dgd->buffers[plane];
rsc->dgd_stride = dgd->strides[is_uv];
- rsc->tile_rect = av1_whole_frame_rect(cm, is_uv);
- assert(src->crop_widths[is_uv] == dgd->crop_widths[is_uv]);
- assert(src->crop_heights[is_uv] == dgd->crop_heights[is_uv]);
}
static int64_t try_restoration_unit(const RestSearchCtxt *rsc,
const RestorationTileLimits *limits,
- const PixelRect *tile_rect,
const RestorationUnitInfo *rui) {
const AV1_COMMON *const cm = rsc->cm;
const int plane = rsc->plane;
@@ -208,11 +216,11 @@ static int64_t try_restoration_unit(const RestSearchCtxt *rsc,
const int optimized_lr = 0;
av1_loop_restoration_filter_unit(
- limits, rui, &rsi->boundaries, &rlbs, tile_rect, rsc->tile_stripe0,
+ limits, rui, &rsi->boundaries, &rlbs, rsc->plane_w, rsc->plane_h,
is_uv && cm->seq_params->subsampling_x,
is_uv && cm->seq_params->subsampling_y, highbd, bit_depth,
fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane],
- rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr);
+ rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr, cm->error);
return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd);
}
@@ -746,7 +754,8 @@ static AOM_INLINE void apply_sgr(int sgr_params_idx, const uint8_t *dat8,
int width, int height, int dat_stride,
int use_highbd, int bit_depth, int pu_width,
int pu_height, int32_t *flt0, int32_t *flt1,
- int flt_stride) {
+ int flt_stride,
+ struct aom_internal_error_info *error_info) {
for (int i = 0; i < height; i += pu_height) {
const int h = AOMMIN(pu_height, height - i);
int32_t *flt0_row = flt0 + i * flt_stride;
@@ -756,11 +765,13 @@ static AOM_INLINE void apply_sgr(int sgr_params_idx, const uint8_t *dat8,
// Iterate over the stripe in blocks of width pu_width
for (int j = 0; j < width; j += pu_width) {
const int w = AOMMIN(pu_width, width - j);
- const int ret = av1_selfguided_restoration(
- dat8_row + j, w, h, dat_stride, flt0_row + j, flt1_row + j,
- flt_stride, sgr_params_idx, bit_depth, use_highbd);
- (void)ret;
- assert(!ret);
+ if (av1_selfguided_restoration(
+ dat8_row + j, w, h, dat_stride, flt0_row + j, flt1_row + j,
+ flt_stride, sgr_params_idx, bit_depth, use_highbd) != 0) {
+ aom_internal_error(
+ error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating buffer in av1_selfguided_restoration");
+ }
}
}
}
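
The hunk above promotes a debug-only assert on av1_selfguided_restoration()'s return value to a real runtime check, since its internal allocation can fail in release builds too. The general shape of that conversion, with report_fatal() standing in for aom_internal_error():

#include <stdio.h>
#include <stdlib.h>

static void report_fatal(const char *msg) {
  fprintf(stderr, "%s\n", msg);
  exit(EXIT_FAILURE);  /* libaom instead longjmps via aom_internal_error() */
}

static void apply_fallible_step(int (*op)(void)) {
  if (op() != 0) {  /* previously assert(!ret), compiled out in release */
    report_fatal("Error allocating buffer in av1_selfguided_restoration");
  }
}
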
@@ -770,10 +781,11 @@ static AOM_INLINE void compute_sgrproj_err(
const int dat_stride, const uint8_t *src8, const int src_stride,
const int use_highbitdepth, const int bit_depth, const int pu_width,
const int pu_height, const int ep, int32_t *flt0, int32_t *flt1,
- const int flt_stride, int *exqd, int64_t *err) {
+ const int flt_stride, int *exqd, int64_t *err,
+ struct aom_internal_error_info *error_info) {
int exq[2];
apply_sgr(ep, dat8, width, height, dat_stride, use_highbitdepth, bit_depth,
- pu_width, pu_height, flt0, flt1, flt_stride);
+ pu_width, pu_height, flt0, flt1, flt_stride, error_info);
const sgr_params_type *const params = &av1_sgr_params[ep];
get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride,
use_highbitdepth, flt0, flt_stride, flt1, flt_stride, exq,
@@ -798,7 +810,8 @@ static AOM_INLINE void get_best_error(int64_t *besterr, const int64_t err,
static SgrprojInfo search_selfguided_restoration(
const uint8_t *dat8, int width, int height, int dat_stride,
const uint8_t *src8, int src_stride, int use_highbitdepth, int bit_depth,
- int pu_width, int pu_height, int32_t *rstbuf, int enable_sgr_ep_pruning) {
+ int pu_width, int pu_height, int32_t *rstbuf, int enable_sgr_ep_pruning,
+ struct aom_internal_error_info *error_info) {
int32_t *flt0 = rstbuf;
int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
int ep, idx, bestep = 0;
@@ -814,7 +827,7 @@ static SgrprojInfo search_selfguided_restoration(
int64_t err;
compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
use_highbitdepth, bit_depth, pu_width, pu_height, ep,
- flt0, flt1, flt_stride, exqd, &err);
+ flt0, flt1, flt_stride, exqd, &err, error_info);
get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
}
} else {
@@ -824,7 +837,7 @@ static SgrprojInfo search_selfguided_restoration(
int64_t err;
compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
use_highbitdepth, bit_depth, pu_width, pu_height, ep,
- flt0, flt1, flt_stride, exqd, &err);
+ flt0, flt1, flt_stride, exqd, &err, error_info);
get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
}
// evaluate left and right ep of winner in seed ep
@@ -835,7 +848,7 @@ static SgrprojInfo search_selfguided_restoration(
int64_t err;
compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
use_highbitdepth, bit_depth, pu_width, pu_height, ep,
- flt0, flt1, flt_stride, exqd, &err);
+ flt0, flt1, flt_stride, exqd, &err, error_info);
get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
}
// evaluate last two group
@@ -844,7 +857,7 @@ static SgrprojInfo search_selfguided_restoration(
int64_t err;
compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
use_highbitdepth, bit_depth, pu_width, pu_height, ep,
- flt0, flt1, flt_stride, exqd, &err);
+ flt0, flt1, flt_stride, exqd, &err, error_info);
get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
}
}
@@ -873,10 +886,10 @@ static int count_sgrproj_bits(SgrprojInfo *sgrproj_info,
return bits;
}
-static AOM_INLINE void search_sgrproj(const RestorationTileLimits *limits,
- const PixelRect *tile, int rest_unit_idx,
- void *priv, int32_t *tmpbuf,
- RestorationLineBuffers *rlbs) {
+static AOM_INLINE void search_sgrproj(
+ const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+ int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+ struct aom_internal_error_info *error_info) {
(void)rlbs;
RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
@@ -888,11 +901,11 @@ static AOM_INLINE void search_sgrproj(const RestorationTileLimits *limits,
const int64_t bits_none = x->mode_costs.sgrproj_restore_cost[0];
// Prune evaluation of RESTORE_SGRPROJ if 'skip_sgr_eval' is set
- if (rusi->skip_sgr_eval) {
- rsc->bits += bits_none;
- rsc->sse += rusi->sse[RESTORE_NONE];
+ if (rsc->skip_sgr_eval) {
+ rsc->total_bits[RESTORE_SGRPROJ] += bits_none;
+ rsc->total_sse[RESTORE_SGRPROJ] += rsc->sse[RESTORE_NONE];
rusi->best_rtype[RESTORE_SGRPROJ - 1] = RESTORE_NONE;
- rusi->sse[RESTORE_SGRPROJ] = INT64_MAX;
+ rsc->sse[RESTORE_SGRPROJ] = INT64_MAX;
return;
}
@@ -911,21 +924,22 @@ static AOM_INLINE void search_sgrproj(const RestorationTileLimits *limits,
dgd_start, limits->h_end - limits->h_start,
limits->v_end - limits->v_start, rsc->dgd_stride, src_start,
rsc->src_stride, highbd, bit_depth, procunit_width, procunit_height,
- tmpbuf, rsc->lpf_sf->enable_sgr_ep_pruning);
+ tmpbuf, rsc->lpf_sf->enable_sgr_ep_pruning, error_info);
RestorationUnitInfo rui;
rui.restoration_type = RESTORE_SGRPROJ;
rui.sgrproj_info = rusi->sgrproj;
- rusi->sse[RESTORE_SGRPROJ] = try_restoration_unit(rsc, limits, tile, &rui);
+ rsc->sse[RESTORE_SGRPROJ] = try_restoration_unit(rsc, limits, &rui);
- const int64_t bits_sgr = x->mode_costs.sgrproj_restore_cost[1] +
- (count_sgrproj_bits(&rusi->sgrproj, &rsc->sgrproj)
- << AV1_PROB_COST_SHIFT);
+ const int64_t bits_sgr =
+ x->mode_costs.sgrproj_restore_cost[1] +
+ (count_sgrproj_bits(&rusi->sgrproj, &rsc->ref_sgrproj)
+ << AV1_PROB_COST_SHIFT);
double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST(
- x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE], bit_depth);
+ x->rdmult, bits_none >> 4, rsc->sse[RESTORE_NONE], bit_depth);
double cost_sgr = RDCOST_DBL_WITH_NATIVE_BD_DIST(
- x->rdmult, bits_sgr >> 4, rusi->sse[RESTORE_SGRPROJ], bit_depth);
+ x->rdmult, bits_sgr >> 4, rsc->sse[RESTORE_SGRPROJ], bit_depth);
if (rusi->sgrproj.ep < 10)
cost_sgr *=
(1 + DUAL_SGR_PENALTY_MULT * rsc->lpf_sf->dual_sgr_penalty_level);
@@ -934,9 +948,16 @@ static AOM_INLINE void search_sgrproj(const RestorationTileLimits *limits,
(cost_sgr < cost_none) ? RESTORE_SGRPROJ : RESTORE_NONE;
rusi->best_rtype[RESTORE_SGRPROJ - 1] = rtype;
- rsc->sse += rusi->sse[rtype];
- rsc->bits += (cost_sgr < cost_none) ? bits_sgr : bits_none;
- if (cost_sgr < cost_none) rsc->sgrproj = rusi->sgrproj;
+#if DEBUG_LR_COSTING
+ // Store ref params for later checking
+ lr_ref_params[RESTORE_SGRPROJ][rsc->plane][rest_unit_idx].sgrproj_info =
+ rsc->ref_sgrproj;
+#endif // DEBUG_LR_COSTING
+
+ rsc->total_sse[RESTORE_SGRPROJ] += rsc->sse[rtype];
+ rsc->total_bits[RESTORE_SGRPROJ] +=
+ (cost_sgr < cost_none) ? bits_sgr : bits_none;
+ if (cost_sgr < cost_none) rsc->ref_sgrproj = rusi->sgrproj;
}
static void acc_stat_one_line(const uint8_t *dgd, const uint8_t *src,
@@ -1455,13 +1476,11 @@ static int count_wiener_bits(int wiener_win, WienerInfo *wiener_info,
return bits;
}
-static int64_t finer_tile_search_wiener(const RestSearchCtxt *rsc,
- const RestorationTileLimits *limits,
- const PixelRect *tile,
- RestorationUnitInfo *rui,
- int wiener_win) {
+static int64_t finer_search_wiener(const RestSearchCtxt *rsc,
+ const RestorationTileLimits *limits,
+ RestorationUnitInfo *rui, int wiener_win) {
const int plane_off = (WIENER_WIN - wiener_win) >> 1;
- int64_t err = try_restoration_unit(rsc, limits, tile, rui);
+ int64_t err = try_restoration_unit(rsc, limits, rui);
if (rsc->lpf_sf->disable_wiener_coeff_refine_search) return err;
@@ -1484,7 +1503,7 @@ static int64_t finer_tile_search_wiener(const RestSearchCtxt *rsc,
plane_wiener->hfilter[p] -= s;
plane_wiener->hfilter[WIENER_WIN - p - 1] -= s;
plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s;
- err2 = try_restoration_unit(rsc, limits, tile, rui);
+ err2 = try_restoration_unit(rsc, limits, rui);
if (err2 > err) {
plane_wiener->hfilter[p] += s;
plane_wiener->hfilter[WIENER_WIN - p - 1] += s;
@@ -1504,7 +1523,7 @@ static int64_t finer_tile_search_wiener(const RestSearchCtxt *rsc,
plane_wiener->hfilter[p] += s;
plane_wiener->hfilter[WIENER_WIN - p - 1] += s;
plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s;
- err2 = try_restoration_unit(rsc, limits, tile, rui);
+ err2 = try_restoration_unit(rsc, limits, rui);
if (err2 > err) {
plane_wiener->hfilter[p] -= s;
plane_wiener->hfilter[WIENER_WIN - p - 1] -= s;
@@ -1525,7 +1544,7 @@ static int64_t finer_tile_search_wiener(const RestSearchCtxt *rsc,
plane_wiener->vfilter[p] -= s;
plane_wiener->vfilter[WIENER_WIN - p - 1] -= s;
plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s;
- err2 = try_restoration_unit(rsc, limits, tile, rui);
+ err2 = try_restoration_unit(rsc, limits, rui);
if (err2 > err) {
plane_wiener->vfilter[p] += s;
plane_wiener->vfilter[WIENER_WIN - p - 1] += s;
@@ -1545,7 +1564,7 @@ static int64_t finer_tile_search_wiener(const RestSearchCtxt *rsc,
plane_wiener->vfilter[p] += s;
plane_wiener->vfilter[WIENER_WIN - p - 1] += s;
plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s;
- err2 = try_restoration_unit(rsc, limits, tile, rui);
+ err2 = try_restoration_unit(rsc, limits, rui);
if (err2 > err) {
plane_wiener->vfilter[p] -= s;
plane_wiener->vfilter[WIENER_WIN - p - 1] -= s;
@@ -1564,13 +1583,13 @@ static int64_t finer_tile_search_wiener(const RestSearchCtxt *rsc,
return err;
}
-static AOM_INLINE void search_wiener(const RestorationTileLimits *limits,
- const PixelRect *tile_rect,
- int rest_unit_idx, void *priv,
- int32_t *tmpbuf,
- RestorationLineBuffers *rlbs) {
+static AOM_INLINE void search_wiener(
+ const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+ int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+ struct aom_internal_error_info *error_info) {
(void)tmpbuf;
(void)rlbs;
+ (void)error_info;
RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
@@ -1592,13 +1611,13 @@ static AOM_INLINE void search_wiener(const RestorationTileLimits *limits,
var_restoration_unit(limits, rsc->src, rsc->plane, highbd);
// Do not perform Wiener search if source variance is lower than threshold
// or if the reconstruction error is zero
- int prune_wiener = (src_var < thresh) || (rusi->sse[RESTORE_NONE] == 0);
+ int prune_wiener = (src_var < thresh) || (rsc->sse[RESTORE_NONE] == 0);
if (prune_wiener) {
- rsc->bits += bits_none;
- rsc->sse += rusi->sse[RESTORE_NONE];
+ rsc->total_bits[RESTORE_WIENER] += bits_none;
+ rsc->total_sse[RESTORE_WIENER] += rsc->sse[RESTORE_NONE];
rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
- rusi->sse[RESTORE_WIENER] = INT64_MAX;
- if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rusi->skip_sgr_eval = 1;
+ rsc->sse[RESTORE_WIENER] = INT64_MAX;
+ if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rsc->skip_sgr_eval = 1;
return;
}
}
@@ -1654,16 +1673,16 @@ static AOM_INLINE void search_wiener(const RestorationTileLimits *limits,
// reduction in the function, the filter is reverted back to identity
if (compute_score(reduced_wiener_win, M, H, rui.wiener_info.vfilter,
rui.wiener_info.hfilter) > 0) {
- rsc->bits += bits_none;
- rsc->sse += rusi->sse[RESTORE_NONE];
+ rsc->total_bits[RESTORE_WIENER] += bits_none;
+ rsc->total_sse[RESTORE_WIENER] += rsc->sse[RESTORE_NONE];
rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
- rusi->sse[RESTORE_WIENER] = INT64_MAX;
- if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rusi->skip_sgr_eval = 1;
+ rsc->sse[RESTORE_WIENER] = INT64_MAX;
+ if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rsc->skip_sgr_eval = 1;
return;
}
- rusi->sse[RESTORE_WIENER] = finer_tile_search_wiener(
- rsc, limits, tile_rect, &rui, reduced_wiener_win);
+ rsc->sse[RESTORE_WIENER] =
+ finer_search_wiener(rsc, limits, &rui, reduced_wiener_win);
rusi->wiener = rui.wiener_info;
if (reduced_wiener_win != WIENER_WIN) {
@@ -1675,14 +1694,14 @@ static AOM_INLINE void search_wiener(const RestorationTileLimits *limits,
const int64_t bits_wiener =
x->mode_costs.wiener_restore_cost[1] +
- (count_wiener_bits(wiener_win, &rusi->wiener, &rsc->wiener)
+ (count_wiener_bits(wiener_win, &rusi->wiener, &rsc->ref_wiener)
<< AV1_PROB_COST_SHIFT);
double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST(
- x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE],
+ x->rdmult, bits_none >> 4, rsc->sse[RESTORE_NONE],
rsc->cm->seq_params->bit_depth);
double cost_wiener = RDCOST_DBL_WITH_NATIVE_BD_DIST(
- x->rdmult, bits_wiener >> 4, rusi->sse[RESTORE_WIENER],
+ x->rdmult, bits_wiener >> 4, rsc->sse[RESTORE_WIENER],
rsc->cm->seq_params->bit_depth);
RestorationType rtype =
@@ -1692,44 +1711,49 @@ static AOM_INLINE void search_wiener(const RestorationTileLimits *limits,
// Set 'skip_sgr_eval' based on rdcost ratio of RESTORE_WIENER and
// RESTORE_NONE or based on best_rtype
if (rsc->lpf_sf->prune_sgr_based_on_wiener == 1) {
- rusi->skip_sgr_eval = cost_wiener > (1.01 * cost_none);
+ rsc->skip_sgr_eval = cost_wiener > (1.01 * cost_none);
} else if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) {
- rusi->skip_sgr_eval = rusi->best_rtype[RESTORE_WIENER - 1] == RESTORE_NONE;
+ rsc->skip_sgr_eval = rusi->best_rtype[RESTORE_WIENER - 1] == RESTORE_NONE;
}
- rsc->sse += rusi->sse[rtype];
- rsc->bits += (cost_wiener < cost_none) ? bits_wiener : bits_none;
- if (cost_wiener < cost_none) rsc->wiener = rusi->wiener;
+#if DEBUG_LR_COSTING
+ // Store ref params for later checking
+ lr_ref_params[RESTORE_WIENER][rsc->plane][rest_unit_idx].wiener_info =
+ rsc->ref_wiener;
+#endif // DEBUG_LR_COSTING
+
+ rsc->total_sse[RESTORE_WIENER] += rsc->sse[rtype];
+ rsc->total_bits[RESTORE_WIENER] +=
+ (cost_wiener < cost_none) ? bits_wiener : bits_none;
+ if (cost_wiener < cost_none) rsc->ref_wiener = rusi->wiener;
}
-static AOM_INLINE void search_norestore(const RestorationTileLimits *limits,
- const PixelRect *tile_rect,
- int rest_unit_idx, void *priv,
- int32_t *tmpbuf,
- RestorationLineBuffers *rlbs) {
- (void)tile_rect;
+static AOM_INLINE void search_norestore(
+ const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+ int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+ struct aom_internal_error_info *error_info) {
+ (void)rest_unit_idx;
(void)tmpbuf;
(void)rlbs;
+ (void)error_info;
RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
- RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
const int highbd = rsc->cm->seq_params->use_highbitdepth;
- rusi->sse[RESTORE_NONE] = sse_restoration_unit(
+ rsc->sse[RESTORE_NONE] = sse_restoration_unit(
limits, rsc->src, &rsc->cm->cur_frame->buf, rsc->plane, highbd);
- rsc->sse += rusi->sse[RESTORE_NONE];
+ rsc->total_sse[RESTORE_NONE] += rsc->sse[RESTORE_NONE];
}
-static AOM_INLINE void search_switchable(const RestorationTileLimits *limits,
- const PixelRect *tile_rect,
- int rest_unit_idx, void *priv,
- int32_t *tmpbuf,
- RestorationLineBuffers *rlbs) {
+static AOM_INLINE void search_switchable(
+ const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+ int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+ struct aom_internal_error_info *error_info) {
(void)limits;
- (void)tile_rect;
(void)tmpbuf;
(void)rlbs;
+ (void)error_info;
RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
@@ -1743,24 +1767,32 @@ static AOM_INLINE void search_switchable(const RestorationTileLimits *limits,
RestorationType best_rtype = RESTORE_NONE;
for (RestorationType r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) {
- // Check for the condition that wiener or sgrproj search could not
- // find a solution or the solution was worse than RESTORE_NONE.
- // In either case the best_rtype will be set as RESTORE_NONE. These
- // should be skipped from the test below.
+ // If this restoration mode was skipped, or could not find a solution
+ // that was better than RESTORE_NONE, then we can't select it here either.
+ //
+ // Note: It is possible for the restoration search functions to find a
+ // filter which is better than RESTORE_NONE when looking purely at SSE, but
+ // for it to be rejected overall due to its rate cost. In this case, there
+ // is a chance that it may have a lower rate cost when looking at
+ // RESTORE_SWITCHABLE, and so it might be acceptable here.
+ //
+ // Therefore we prune based on SSE, rather than on whether or not the
+ // previous search function selected this mode.
if (r > RESTORE_NONE) {
- if (rusi->best_rtype[r - 1] == RESTORE_NONE) continue;
+ if (rsc->sse[r] > rsc->sse[RESTORE_NONE]) continue;
}
- const int64_t sse = rusi->sse[r];
+ const int64_t sse = rsc->sse[r];
int64_t coeff_pcost = 0;
switch (r) {
case RESTORE_NONE: coeff_pcost = 0; break;
case RESTORE_WIENER:
- coeff_pcost =
- count_wiener_bits(wiener_win, &rusi->wiener, &rsc->wiener);
+ coeff_pcost = count_wiener_bits(wiener_win, &rusi->wiener,
+ &rsc->switchable_ref_wiener);
break;
case RESTORE_SGRPROJ:
- coeff_pcost = count_sgrproj_bits(&rusi->sgrproj, &rsc->sgrproj);
+ coeff_pcost =
+ count_sgrproj_bits(&rusi->sgrproj, &rsc->switchable_ref_sgrproj);
break;
default: assert(0); break;
}
@@ -1779,10 +1811,19 @@ static AOM_INLINE void search_switchable(const RestorationTileLimits *limits,
rusi->best_rtype[RESTORE_SWITCHABLE - 1] = best_rtype;
- rsc->sse += rusi->sse[best_rtype];
- rsc->bits += best_bits;
- if (best_rtype == RESTORE_WIENER) rsc->wiener = rusi->wiener;
- if (best_rtype == RESTORE_SGRPROJ) rsc->sgrproj = rusi->sgrproj;
+#if DEBUG_LR_COSTING
+ // Store ref params for later checking
+ lr_ref_params[RESTORE_SWITCHABLE][rsc->plane][rest_unit_idx].wiener_info =
+ rsc->switchable_ref_wiener;
+ lr_ref_params[RESTORE_SWITCHABLE][rsc->plane][rest_unit_idx].sgrproj_info =
+ rsc->switchable_ref_sgrproj;
+#endif // DEBUG_LR_COSTING
+
+ rsc->total_sse[RESTORE_SWITCHABLE] += rsc->sse[best_rtype];
+ rsc->total_bits[RESTORE_SWITCHABLE] += best_bits;
+ if (best_rtype == RESTORE_WIENER) rsc->switchable_ref_wiener = rusi->wiener;
+ if (best_rtype == RESTORE_SGRPROJ)
+ rsc->switchable_ref_sgrproj = rusi->sgrproj;
}
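
A toy sketch of the switchable selection loop above, showing the SSE-only pruning rule the new comment explains. TOY_RDCOST and the cost scaling are simplified placeholders for the real RD costing:

#include <stdint.h>

#define TOY_RDCOST(mult, rate, dist) \
  ((int64_t)(mult) * (rate) + ((int64_t)(dist) << 7))

/* Index 0 plays the role of RESTORE_NONE. Candidates are pruned purely on
 * SSE, not on whether the earlier per-type search happened to pick them,
 * because rate costs can differ in the switchable context. */
static int toy_pick_switchable(const int64_t sse[3], const int rate[3],
                               int rdmult) {
  int best = 0;
  int64_t best_cost = TOY_RDCOST(rdmult, rate[0], sse[0]);
  for (int r = 1; r < 3; r++) {
    if (sse[r] > sse[0]) continue;  /* prune on SSE alone */
    const int64_t cost = TOY_RDCOST(rdmult, rate[r], sse[r]);
    if (cost < best_cost) {
      best_cost = cost;
      best = r;
    }
  }
  return best;
}
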
static AOM_INLINE void copy_unit_info(RestorationType frame_rtype,
@@ -1796,23 +1837,96 @@ static AOM_INLINE void copy_unit_info(RestorationType frame_rtype,
rui->sgrproj_info = rusi->sgrproj;
}
-static double search_rest_type(RestSearchCtxt *rsc, RestorationType rtype) {
+static void restoration_search(AV1_COMMON *cm, int plane, RestSearchCtxt *rsc,
+ bool *disable_lr_filter) {
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ const CommonTileParams *tiles = &cm->tiles;
+ const int is_uv = plane > 0;
+ const int ss_y = is_uv && cm->seq_params->subsampling_y;
+ RestorationInfo *rsi = &cm->rst_info[plane];
+ const int ru_size = rsi->restoration_unit_size;
+ const int ext_size = ru_size * 3 / 2;
+
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+
static const rest_unit_visitor_t funs[RESTORE_TYPES] = {
search_norestore, search_wiener, search_sgrproj, search_switchable
};
- reset_rsc(rsc);
- rsc_on_tile(rsc);
+ const int plane_num_units = rsi->num_rest_units;
+ const RestorationType num_rtypes =
+ (plane_num_units > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES;
- av1_foreach_rest_unit_in_plane(rsc->cm, rsc->plane, funs[rtype], rsc,
- &rsc->tile_rect, rsc->cm->rst_tmpbuf, NULL);
- return RDCOST_DBL_WITH_NATIVE_BD_DIST(
- rsc->x->rdmult, rsc->bits >> 4, rsc->sse, rsc->cm->seq_params->bit_depth);
-}
+ reset_rsc(rsc);
-static int rest_tiles_in_plane(const AV1_COMMON *cm, int plane) {
- const RestorationInfo *rsi = &cm->rst_info[plane];
- return rsi->units_per_tile;
+ // Iterate over restoration units in encoding order, so that each RU gets
+ // the correct reference parameters when we cost it up. This is effectively
+ // a nested iteration over:
+ // * Each tile, order does not matter
+ // * Each superblock within that tile, in raster order
+ // * Each LR unit which is coded within that superblock, in raster order
+ for (int tile_row = 0; tile_row < tiles->rows; tile_row++) {
+ int sb_row_start = tiles->row_start_sb[tile_row];
+ int sb_row_end = tiles->row_start_sb[tile_row + 1];
+ for (int tile_col = 0; tile_col < tiles->cols; tile_col++) {
+ int sb_col_start = tiles->col_start_sb[tile_col];
+ int sb_col_end = tiles->col_start_sb[tile_col + 1];
+
+ // Reset reference parameters for delta-coding at the start of each tile
+ rsc_on_tile(rsc);
+
+ for (int sb_row = sb_row_start; sb_row < sb_row_end; sb_row++) {
+ int mi_row = sb_row << mib_size_log2;
+ for (int sb_col = sb_col_start; sb_col < sb_col_end; sb_col++) {
+ int mi_col = sb_col << mib_size_log2;
+
+ int rcol0, rcol1, rrow0, rrow1;
+ int has_lr_info = av1_loop_restoration_corners_in_sb(
+ cm, plane, mi_row, mi_col, sb_size, &rcol0, &rcol1, &rrow0,
+ &rrow1);
+
+ if (!has_lr_info) continue;
+
+ RestorationTileLimits limits;
+ for (int rrow = rrow0; rrow < rrow1; rrow++) {
+ int y0 = rrow * ru_size;
+ int remaining_h = plane_h - y0;
+ int h = (remaining_h < ext_size) ? remaining_h : ru_size;
+
+ limits.v_start = y0;
+ limits.v_end = y0 + h;
+ assert(limits.v_end <= plane_h);
+ // Offset upwards to align with the restoration processing stripe
+ const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
+ limits.v_start = AOMMAX(0, limits.v_start - voffset);
+ if (limits.v_end < plane_h) limits.v_end -= voffset;
+
+ for (int rcol = rcol0; rcol < rcol1; rcol++) {
+ int x0 = rcol * ru_size;
+ int remaining_w = plane_w - x0;
+ int w = (remaining_w < ext_size) ? remaining_w : ru_size;
+
+ limits.h_start = x0;
+ limits.h_end = x0 + w;
+ assert(limits.h_end <= plane_w);
+
+ const int unit_idx = rrow * rsi->horz_units + rcol;
+
+ rsc->skip_sgr_eval = 0;
+ for (RestorationType r = RESTORE_NONE; r < num_rtypes; r++) {
+ if (disable_lr_filter[r]) continue;
+
+ funs[r](&limits, unit_idx, rsc, rsc->cm->rst_tmpbuf, NULL,
+ cm->error);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
}
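
The limits computation inside restoration_search() merges a trailing partial unit into its neighbour whenever the remainder is smaller than 1.5 units (ext_size = ru_size * 3 / 2). That rule in isolation, before the stripe-offset adjustment is applied:

static void unit_extent(int idx, int ru_size, int plane_size, int *start,
                        int *end) {
  const int ext_size = ru_size * 3 / 2;
  const int p0 = idx * ru_size;
  const int remaining = plane_size - p0;
  /* The final unit absorbs the remainder when it is under 1.5 units, so no
   * separate undersized unit is ever created. */
  const int len = (remaining < ext_size) ? remaining : ru_size;
  *start = p0;
  *end = p0 + len;
}
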
static INLINE void av1_derive_flags_for_lr_processing(
@@ -1833,31 +1947,101 @@ static INLINE void av1_derive_flags_for_lr_processing(
(is_wiener_disabled || is_sgr_disabled);
}
-void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
- AV1_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->td.mb;
- const SequenceHeader *const seq_params = cm->seq_params;
- const int num_planes = av1_num_planes(cm);
- assert(!cm->features.all_lossless);
+#define COUPLED_CHROMA_FROM_LUMA_RESTORATION 0
+// Allocate both decoder-side and encoder-side info structs for a single plane.
+// The unit size passed in should be the minimum size which we are going to
+// search; before each search, set_restoration_unit_size() must be called to
+// configure the actual size.
+static RestUnitSearchInfo *allocate_search_structs(AV1_COMMON *cm,
+ RestorationInfo *rsi,
+ int is_uv,
+ int min_luma_unit_size) {
+#if COUPLED_CHROMA_FROM_LUMA_RESTORATION
+ int sx = cm->seq_params.subsampling_x;
+ int sy = cm->seq_params.subsampling_y;
+ int s = (p > 0) ? AOMMIN(sx, sy) : 0;
+#else
+ int s = 0;
+#endif // !COUPLED_CHROMA_FROM_LUMA_RESTORATION
+ int min_unit_size = min_luma_unit_size >> s;
- av1_fill_lr_rates(&x->mode_costs, x->e_mbd.tile_ctx);
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
- int ntiles[2];
- for (int is_uv = 0; is_uv < 2; ++is_uv)
- ntiles[is_uv] = rest_tiles_in_plane(cm, is_uv);
+ const int max_horz_units = av1_lr_count_units(min_unit_size, plane_w);
+ const int max_vert_units = av1_lr_count_units(min_unit_size, plane_h);
+ const int max_num_units = max_horz_units * max_vert_units;
+
+ aom_free(rsi->unit_info);
+ CHECK_MEM_ERROR(cm, rsi->unit_info,
+ (RestorationUnitInfo *)aom_memalign(
+ 16, sizeof(*rsi->unit_info) * max_num_units));
- assert(ntiles[1] <= ntiles[0]);
RestUnitSearchInfo *rusi;
CHECK_MEM_ERROR(
cm, rusi,
- (RestUnitSearchInfo *)aom_memalign(16, sizeof(*rusi) * ntiles[0]));
+ (RestUnitSearchInfo *)aom_memalign(16, sizeof(*rusi) * max_num_units));
// If the restoration unit dimensions are not multiples of
// rsi->restoration_unit_size then some elements of the rusi array may be
// left uninitialised when we reach copy_unit_info(...). This is not a
// problem, as these elements are ignored later, but in order to quiet
// Valgrind's warnings we initialise the array below.
- memset(rusi, 0, sizeof(*rusi) * ntiles[0]);
+ memset(rusi, 0, sizeof(*rusi) * max_num_units);
+
+ return rusi;
+}
+
+static void set_restoration_unit_size(AV1_COMMON *cm, RestorationInfo *rsi,
+ int is_uv, int luma_unit_size) {
+#if COUPLED_CHROMA_FROM_LUMA_RESTORATION
+ int sx = cm->seq_params.subsampling_x;
+ int sy = cm->seq_params.subsampling_y;
+ int s = (p > 0) ? AOMMIN(sx, sy) : 0;
+#else
+ int s = 0;
+#endif // !COUPLED_CHROMA_FROM_LUMA_RESTORATION
+ int unit_size = luma_unit_size >> s;
+
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+
+ const int horz_units = av1_lr_count_units(unit_size, plane_w);
+ const int vert_units = av1_lr_count_units(unit_size, plane_h);
+
+ rsi->restoration_unit_size = unit_size;
+ rsi->num_rest_units = horz_units * vert_units;
+ rsi->horz_units = horz_units;
+ rsi->vert_units = vert_units;
+}
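
set_restoration_unit_size() derives the unit grid via av1_lr_count_units(); a plausible counting rule consistent with the 1.5x merge shown earlier is round-to-nearest division with a floor of one unit. This is a hypothetical sketch and the exact upstream definition may differ:

static int count_units_sketch(int unit_size, int plane_size) {
  /* Hypothetical: nearest-integer division, minimum one unit. Together
   * with the 1.5x merge rule this keeps every unit's extent between 0.5
   * and 1.5 units. */
  int count = (plane_size + (unit_size >> 1)) / unit_size;
  return count < 1 ? 1 : count;
}
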
+
+void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const LOOP_FILTER_SPEED_FEATURES *lpf_sf = &cpi->sf.lpf_sf;
+ const int num_planes = av1_num_planes(cm);
+ const int highbd = cm->seq_params->use_highbitdepth;
+ assert(!cm->features.all_lossless);
+
+ av1_fill_lr_rates(&x->mode_costs, x->e_mbd.tile_ctx);
+
+ // Select unit size based on speed feature settings, and allocate
+ // rui structs based on this size
+ int min_lr_unit_size = cpi->sf.lpf_sf.min_lr_unit_size;
+ int max_lr_unit_size = cpi->sf.lpf_sf.max_lr_unit_size;
+
+ // The minimum allowed unit size at a syntax level is 1 superblock.
+ // Apply this constraint here so that the speed features code which sets
+ // cpi->sf.lpf_sf.min_lr_unit_size does not need to know the superblock size.
+ min_lr_unit_size =
+ AOMMAX(min_lr_unit_size, block_size_wide[cm->seq_params->sb_size]);
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ cpi->pick_lr_ctxt.rusi[plane] = allocate_search_structs(
+ cm, &cm->rst_info[plane], plane > 0, min_lr_unit_size);
+ }
+
x->rdmult = cpi->rd.RDMULT;
// Allocate the frame buffer trial_frame_rst, which is used to temporarily
@@ -1865,33 +2049,32 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
if (aom_realloc_frame_buffer(
&cpi->trial_frame_rst, cm->superres_upscaled_width,
cm->superres_upscaled_height, seq_params->subsampling_x,
- seq_params->subsampling_y, seq_params->use_highbitdepth,
- AOM_RESTORATION_FRAME_BORDER, cm->features.byte_alignment, NULL, NULL,
- NULL, 0, 0))
+ seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
+ cm->features.byte_alignment, NULL, NULL, NULL, 0, 0))
aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate trial restored frame buffer");
RestSearchCtxt rsc;
// The buffers 'src_avg' and 'dgd_avg' are used to compute H and M buffers.
- // These buffers are required for AVX2 SIMD purpose only. Hence, allocated the
- // same if AVX2 variant of SIMD for av1_compute_stats() is enabled. The buffer
- // size required is calculated based on maximum width and height of the LRU
- // (i.e., from foreach_rest_unit_in_tile() 1.5 times the
- // RESTORATION_UNITSIZE_MAX) allowed for Wiener filtering. The width and
- // height aligned to multiple of 16 is considered for intrinsic purpose.
+ // These buffers are only required for the AVX2 and NEON implementations of
+ // av1_compute_stats. The required buffer size is derived from the maximum
+ // width and height of an LRU allowed for Wiener filtering (1.5 times
+ // RESTORATION_UNITSIZE_MAX, as used in foreach_rest_unit_in_plane()), with
+ // the width and height aligned to a multiple of 16 for the SIMD intrinsics.
rsc.dgd_avg = NULL;
rsc.src_avg = NULL;
-#if HAVE_AVX2
+#if HAVE_AVX2 || HAVE_NEON
// The buffers allocated below are used during Wiener filter processing of the
// low bitdepth path, so they are only allocated when the Wiener filter is
// enabled in the low bitdepth path.
- if (!cpi->sf.lpf_sf.disable_wiener_filter &&
- !cm->seq_params->use_highbitdepth) {
- const int buf_size = sizeof(*rsc.dgd_avg) * 6 * RESTORATION_UNITSIZE_MAX *
- RESTORATION_UNITSIZE_MAX;
- CHECK_MEM_ERROR(cm, rsc.dgd_avg, (int16_t *)aom_memalign(32, buf_size));
+ if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) {
+ const int buf_size = sizeof(*cpi->pick_lr_ctxt.dgd_avg) * 6 *
+ RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX;
+ CHECK_MEM_ERROR(cm, cpi->pick_lr_ctxt.dgd_avg,
+ (int16_t *)aom_memalign(32, buf_size));
+ rsc.dgd_avg = cpi->pick_lr_ctxt.dgd_avg;
// When the LRU width isn't a multiple of 16, the 256-bit load instruction
// used in the AVX2 intrinsics can read data beyond the valid LRU. Hence, to
// silence Valgrind warnings this buffer is initialized with zero. Overhead
@@ -1904,59 +2087,131 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
}
#endif
- const int plane_start = AOM_PLANE_Y;
- const int plane_end = num_planes > 1 ? AOM_PLANE_V : AOM_PLANE_Y;
+ // Initialize all planes, so that any planes we skip searching will still
+ // have valid data.
+ for (int plane = 0; plane < num_planes; plane++) {
+ cm->rst_info[plane].frame_restoration_type = RESTORE_NONE;
+ }
+
+ // Decide which planes to search
+ int plane_start, plane_end;
+
+ if (lpf_sf->disable_loop_restoration_luma) {
+ plane_start = AOM_PLANE_U;
+ } else {
+ plane_start = AOM_PLANE_Y;
+ }
+
+ if (num_planes == 1 || lpf_sf->disable_loop_restoration_chroma) {
+ plane_end = AOM_PLANE_Y;
+ } else {
+ plane_end = AOM_PLANE_V;
+ }
// Derive the flags to enable/disable Loop restoration filters based on the
// speed features 'disable_wiener_filter' and 'disable_sgr_filter'.
bool disable_lr_filter[RESTORE_TYPES] = { false };
- const LOOP_FILTER_SPEED_FEATURES *lpf_sf = &cpi->sf.lpf_sf;
av1_derive_flags_for_lr_processing(lpf_sf, disable_lr_filter);
- for (int plane = plane_start; plane <= plane_end; ++plane) {
- init_rsc(src, &cpi->common, x, lpf_sf, plane, rusi, &cpi->trial_frame_rst,
- &rsc);
-
- const int plane_ntiles = ntiles[plane > 0];
- const RestorationType num_rtypes =
- (plane_ntiles > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES;
-
- double best_cost = 0;
- RestorationType best_rtype = RESTORE_NONE;
-
- const int highbd = rsc.cm->seq_params->use_highbitdepth;
- if ((plane && !lpf_sf->disable_loop_restoration_chroma) ||
- (!plane && !lpf_sf->disable_loop_restoration_luma)) {
- av1_extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height,
- rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER,
- highbd);
+ for (int plane = plane_start; plane <= plane_end; plane++) {
+ const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf;
+ const int is_uv = plane != AOM_PLANE_Y;
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+ av1_extend_frame(dgd->buffers[plane], plane_w, plane_h, dgd->strides[is_uv],
+ RESTORATION_BORDER, RESTORATION_BORDER, highbd);
+ }
+ double best_cost = DBL_MAX;
+ int best_luma_unit_size = max_lr_unit_size;
+ for (int luma_unit_size = max_lr_unit_size;
+ luma_unit_size >= min_lr_unit_size; luma_unit_size >>= 1) {
+ int64_t bits_this_size = 0;
+ int64_t sse_this_size = 0;
+ RestorationType best_rtype[MAX_MB_PLANE] = { RESTORE_NONE, RESTORE_NONE,
+ RESTORE_NONE };
+ for (int plane = plane_start; plane <= plane_end; ++plane) {
+ set_restoration_unit_size(cm, &cm->rst_info[plane], plane > 0,
+ luma_unit_size);
+ init_rsc(src, &cpi->common, x, lpf_sf, plane,
+ cpi->pick_lr_ctxt.rusi[plane], &cpi->trial_frame_rst, &rsc);
+
+ restoration_search(cm, plane, &rsc, disable_lr_filter);
+
+ const int plane_num_units = cm->rst_info[plane].num_rest_units;
+ const RestorationType num_rtypes =
+ (plane_num_units > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES;
+ double best_cost_this_plane = DBL_MAX;
for (RestorationType r = 0; r < num_rtypes; ++r) {
// Disable Loop restoration filter based on the flags set using speed
// feature 'disable_wiener_filter' and 'disable_sgr_filter'.
if (disable_lr_filter[r]) continue;
- double cost = search_rest_type(&rsc, r);
+ double cost_this_plane = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ x->rdmult, rsc.total_bits[r] >> 4, rsc.total_sse[r],
+ cm->seq_params->bit_depth);
- if (r == 0 || cost < best_cost) {
- best_cost = cost;
- best_rtype = r;
+ if (cost_this_plane < best_cost_this_plane) {
+ best_cost_this_plane = cost_this_plane;
+ best_rtype[plane] = r;
}
}
+
+ bits_this_size += rsc.total_bits[best_rtype[plane]];
+ sse_this_size += rsc.total_sse[best_rtype[plane]];
}
- cm->rst_info[plane].frame_restoration_type = best_rtype;
- if (best_rtype != RESTORE_NONE) {
- for (int u = 0; u < plane_ntiles; ++u) {
- copy_unit_info(best_rtype, &rusi[u], &cm->rst_info[plane].unit_info[u]);
+ double cost_this_size = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ x->rdmult, bits_this_size >> 4, sse_this_size,
+ cm->seq_params->bit_depth);
+
+ if (cost_this_size < best_cost) {
+ best_cost = cost_this_size;
+ best_luma_unit_size = luma_unit_size;
+ // Copy parameters out of the rusi structs before we overwrite them at
+ // the start of the next iteration.
+ bool all_none = true;
+ for (int plane = plane_start; plane <= plane_end; ++plane) {
+ cm->rst_info[plane].frame_restoration_type = best_rtype[plane];
+ if (best_rtype[plane] != RESTORE_NONE) {
+ all_none = false;
+ const int plane_num_units = cm->rst_info[plane].num_rest_units;
+ for (int u = 0; u < plane_num_units; ++u) {
+ copy_unit_info(best_rtype[plane], &cpi->pick_lr_ctxt.rusi[plane][u],
+ &cm->rst_info[plane].unit_info[u]);
+ }
+ }
+ }
+ // Heuristic: If all best_rtype entries are RESTORE_NONE, we could not find
+ // any good filters at this size, so we are unlikely to find any at a
+ // smaller size either; stop searching.
+ if (all_none) {
+ break;
}
+ } else {
+ // Heuristic: If this size is worse than the previous (larger) size, the
+ // next size down will likely be even worse; stop searching.
}
}
-#if HAVE_AVX2
- if (!cpi->sf.lpf_sf.disable_wiener_filter &&
- !cm->seq_params->use_highbitdepth) {
- aom_free(rsc.dgd_avg);
+
+ // Final fixup to set the correct unit size. We set this for all planes,
+ // even ones we skipped searching, so that other code does not need to care
+ // which planes were and weren't searched.
+ for (int plane = 0; plane < num_planes; ++plane) {
+ set_restoration_unit_size(cm, &cm->rst_info[plane], plane > 0,
+ best_luma_unit_size);
+ }
+
+#if HAVE_AVX2 || HAVE_NEON
+ if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) {
+ aom_free(cpi->pick_lr_ctxt.dgd_avg);
+ cpi->pick_lr_ctxt.dgd_avg = NULL;
}
#endif
- aom_free(rusi);
+ for (int plane = 0; plane < num_planes; plane++) {
+ aom_free(cpi->pick_lr_ctxt.rusi[plane]);
+ cpi->pick_lr_ctxt.rusi[plane] = NULL;
+ }
}
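Stripped of the per-plane bookkeeping, the new size loop above is a coarse-to-fine halving search with two early exits. A compilable sketch of just that skeleton, with a hypothetical stand-in cost function (in the encoder, one pass is the plane loop calling restoration_search() and converting bits/SSE into a double-precision RD cost):

#include <float.h>

// Hypothetical stand-in for one pass of the per-size search.
static double cost_at_size(int size) {
  return 1000.0 + (size >= 128 ? -size : 0);
}

// Sketch of the unit-size loop in av1_pick_filter_restoration (assumed form).
static int pick_unit_size_sketch(int max_size, int min_size) {
  double best_cost = DBL_MAX;
  int best_size = max_size;
  for (int size = max_size; size >= min_size; size >>= 1) {
    const double cost = cost_at_size(size);
    // Early exit: if this size is worse than the larger one, smaller sizes
    // are assumed to be worse still.
    if (cost >= best_cost) break;
    best_cost = cost;
    best_size = size;
  }
  return best_size;
}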
diff --git a/av1/encoder/pickrst.h b/av1/encoder/pickrst.h
index 94a6932de..d1d0b0cec 100644
--- a/av1/encoder/pickrst.h
+++ b/av1/encoder/pickrst.h
@@ -20,6 +20,34 @@ extern "C" {
struct yv12_buffer_config;
struct AV1_COMP;
+// Enable extra debugging for loop restoration costing?
+//
+// If this is set to 1, then we record not just the selected LR parameters, but
+// also the values which the search process thinks they should be delta-coded
+// against. Then, when writing out the bitstream, we verify this information,
+// to help ensure that the search code is costing things properly
+#define DEBUG_LR_COSTING 0
+
+#if DEBUG_LR_COSTING
+#define MAX_LR_UNITS_W 64
+#define MAX_LR_UNITS_H 64
+
+// Storage for reference parameters.
+//
+// The storage size is determined by:
+// * This is always written and then checked within the same frame encode pass,
+// so we do not need to buffer multiple frames of data
+// * The parameters can be different per plane within one frame
+// * The relevant set of ref parameters can differ between the search where
+// we set the frame restoration mode to RESTORE_WIENER, and the search where
+// we set it to RESTORE_SWITCHABLE.
+// So we need to store at least two sets of Wiener params and two sets of
+// SGR params, and the easiest way to do this is to index by
+// frame_restoration_type
+extern RestorationUnitInfo lr_ref_params[RESTORE_TYPES][MAX_MB_PLANE]
+ [MAX_LR_UNITS_W * MAX_LR_UNITS_H];
+#endif // DEBUG_LR_COSTING
+
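Under DEBUG_LR_COSTING, the search records the parameters it delta-coded against and the bitstream writer re-checks them. A small illustration of that record/verify pattern, reduced to a single slot (the helper names are hypothetical; the real array is indexed by restoration type, plane, and unit position as declared above):

#include <assert.h>
#include <string.h>

// One-slot illustration of the write-then-verify idea behind lr_ref_params.
static RestorationUnitInfo recorded_ref;  // stands in for one array slot

static void record_ref_sketch(const RestorationUnitInfo *ref) {
  recorded_ref = *ref;  // search side: remember what we costed against
}

static void verify_ref_sketch(const RestorationUnitInfo *ref) {
  // Writer side: the params actually coded against must match the search.
  assert(memcmp(&recorded_ref, ref, sizeof(recorded_ref)) == 0);
}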
static const uint8_t g_shuffle_stats_data[16] = {
0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
};
diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c
index fdf1495af..9062136aa 100644
--- a/av1/encoder/ratectrl.c
+++ b/av1/encoder/ratectrl.c
@@ -187,8 +187,8 @@ int av1_rc_bits_per_mb(const AV1_COMP *cpi, FRAME_TYPE frame_type, int qindex,
assert(correction_factor <= MAX_BPB_FACTOR &&
correction_factor >= MIN_BPB_FACTOR);
- if (frame_type != KEY_FRAME && accurate_estimate) {
- assert(cpi->rec_sse != UINT64_MAX);
+ if (cpi->oxcf.rc_cfg.mode == AOM_CBR && frame_type != KEY_FRAME &&
+ accurate_estimate && cpi->rec_sse != UINT64_MAX) {
const int mbs = cm->mi_params.MBs;
const double sse_sqrt =
(double)((int)sqrt((double)(cpi->rec_sse)) << BPER_MB_NORMBITS) /
@@ -334,7 +334,7 @@ int av1_rc_get_default_min_gf_interval(int width, int height,
double framerate) {
// Assume we do not need any constraint lower than 4K 20 fps
static const double factor_safe = 3840 * 2160 * 20.0;
- const double factor = width * height * framerate;
+ const double factor = (double)width * height * framerate;
const int default_interval =
clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL);
@@ -453,12 +453,19 @@ int av1_rc_drop_frame(AV1_COMP *cpi) {
#else
int64_t buffer_level = p_rc->buffer_level;
#endif
-
- if (!oxcf->rc_cfg.drop_frames_water_mark) {
+ // Never drop on a key frame, or on a frame whose base layer is a key frame.
+ // If drop_count_consec reaches or exceeds max_consec_drop, don't drop.
+ if (cpi->common.current_frame.frame_type == KEY_FRAME ||
+ (cpi->ppi->use_svc &&
+ cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) ||
+ !oxcf->rc_cfg.drop_frames_water_mark ||
+ (rc->max_consec_drop > 0 &&
+ rc->drop_count_consec >= rc->max_consec_drop)) {
return 0;
} else {
if (buffer_level < 0) {
// Always drop if buffer is below 0.
+ rc->drop_count_consec++;
return 1;
} else {
// If buffer is below drop_mark, for now just drop every other frame
@@ -473,6 +480,7 @@ int av1_rc_drop_frame(AV1_COMP *cpi) {
if (rc->decimation_factor > 0) {
if (rc->decimation_count > 0) {
--rc->decimation_count;
+ rc->drop_count_consec++;
return 1;
} else {
rc->decimation_count = rc->decimation_factor;
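Taken together, the new guards reduce to a simple gate in front of the buffer-level logic: never drop key frames (or SVC frames whose base layer is a key frame), and cap runs of consecutive drops at max_consec_drop. A self-contained sketch of that gate (the function name is hypothetical):

// Condensed, hypothetical form of the drop gate added to av1_rc_drop_frame().
static int drop_allowed(int is_key_frame, int drop_frames_water_mark,
                        int max_consec_drop, int drop_count_consec) {
  if (is_key_frame) return 0;             // never drop key frames
  if (!drop_frames_water_mark) return 0;  // frame dropping disabled
  if (max_consec_drop > 0 && drop_count_consec >= max_consec_drop)
    return 0;                             // cap consecutive drops
  return 1;  // the buffer-level checks may still decide to drop
}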
@@ -493,8 +501,16 @@ static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality,
const AV1_COMMON *const cm = &cpi->common;
const SVC *const svc = &cpi->svc;
const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ // Flag to indicate that the previous frame overshot and that the buffer
+ // level for the current frame is low (less than ~half of optimal). For such
+ // (inter) frames, if the source_sad is non-zero, relax max_delta_up and the
+ // clamp applied below.
+ const bool overshoot_buffer_low =
+ cpi->rc.rc_1_frame == -1 && rc->frame_source_sad > 1000 &&
+ p_rc->buffer_level < (p_rc->optimal_buffer_level >> 1) &&
+ rc->frames_since_key > 4;
int max_delta_down;
- int max_delta_up = 20;
+ int max_delta_up = overshoot_buffer_low ? 60 : 20;
const int change_avg_frame_bandwidth =
abs(rc->avg_frame_bandwidth - rc->prev_avg_frame_bandwidth) >
0.1 * (rc->avg_frame_bandwidth);
@@ -543,7 +559,7 @@ static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality,
// not been set due to dropped frames.
if (rc->rc_1_frame * rc->rc_2_frame == -1 &&
rc->q_1_frame != rc->q_2_frame && rc->q_1_frame > 0 &&
- rc->q_2_frame > 0) {
+ rc->q_2_frame > 0 && !overshoot_buffer_low) {
int qclamp = clamp(q, AOMMIN(rc->q_1_frame, rc->q_2_frame),
AOMMAX(rc->q_1_frame, rc->q_2_frame));
// If the previous frame had overshoot and the current q needs to
@@ -2069,6 +2085,13 @@ static void rc_compute_variance_onepass_rt(AV1_COMP *cpi) {
// TODO(yunqing): support scaled reference frames.
if (cpi->scaled_ref_buf[LAST_FRAME - 1]) return;
+ for (int i = 0; i < 2; ++i) {
+ if (unscaled_src->widths[i] != yv12->widths[i] ||
+ unscaled_src->heights[i] != yv12->heights[i]) {
+ return;
+ }
+ }
+
const int num_mi_cols = cm->mi_params.mi_cols;
const int num_mi_rows = cm->mi_params.mi_rows;
const BLOCK_SIZE bsize = BLOCK_64X64;
@@ -2231,7 +2254,8 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
av1_rc_update_rate_correction_factors(cpi, 0, cm->width, cm->height);
// Update bit estimation ratio.
- if (cm->current_frame.frame_type != KEY_FRAME &&
+ if (cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+ cm->current_frame.frame_type != KEY_FRAME &&
cpi->sf.hl_sf.accurate_bit_estimate) {
const double q = av1_convert_qindex_to_q(cm->quant_params.base_qindex,
cm->seq_params->bit_depth);
@@ -2343,6 +2367,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
rc->prev_coded_height = cm->height;
rc->frame_number_encoded++;
rc->prev_frame_is_dropped = 0;
+ rc->drop_count_consec = 0;
// if (current_frame->frame_number == 1 && cm->show_frame)
/*
rc->this_frame_target =
@@ -2957,10 +2982,8 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi,
}
if (width != cm->render_width || height != cm->render_height ||
unscaled_src == NULL || unscaled_last_src == NULL) {
- if (cpi->src_sad_blk_64x64) {
- aom_free(cpi->src_sad_blk_64x64);
- cpi->src_sad_blk_64x64 = NULL;
- }
+ aom_free(cpi->src_sad_blk_64x64);
+ cpi->src_sad_blk_64x64 = NULL;
}
if (unscaled_src == NULL || unscaled_last_src == NULL) return;
src_y = unscaled_src->y_buffer;
@@ -2972,10 +2995,8 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi,
last_src_width = unscaled_last_src->y_width;
last_src_height = unscaled_last_src->y_height;
if (src_width != last_src_width || src_height != last_src_height) {
- if (cpi->src_sad_blk_64x64) {
- aom_free(cpi->src_sad_blk_64x64);
- cpi->src_sad_blk_64x64 = NULL;
- }
+ aom_free(cpi->src_sad_blk_64x64);
+ cpi->src_sad_blk_64x64 = NULL;
return;
}
rc->high_source_sad = 0;
@@ -2990,13 +3011,18 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi,
}
int num_zero_temp_sad = 0;
uint32_t min_thresh = 10000;
- if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) min_thresh = 100000;
+ if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) {
+ min_thresh = cm->width * cm->height <= 320 * 240 && cpi->framerate < 10.0
+ ? 50000
+ : 100000;
+ }
const BLOCK_SIZE bsize = BLOCK_64X64;
// Loop over sub-sample of frame, compute average sad over 64x64 blocks.
uint64_t avg_sad = 0;
uint64_t tmp_sad = 0;
int num_samples = 0;
- const int thresh = 6;
+ const int thresh =
+ cm->width * cm->height <= 320 * 240 && cpi->framerate < 10.0 ? 5 : 6;
// SAD is computed on 64x64 blocks
const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
? (cm->seq_params->mib_size >> 1)
@@ -3127,6 +3153,10 @@ static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height,
int qindex;
double tot_scale_change = (double)(resize_width * resize_height) /
(double)(prev_width * prev_height);
+ // Disable the skip mv search for svc on resize frame.
+ svc->skip_mvsearch_last = 0;
+ svc->skip_mvsearch_gf = 0;
+ svc->skip_mvsearch_altref = 0;
// Reset buffer level to optimal, update target size.
p_rc->buffer_level = p_rc->optimal_buffer_level;
p_rc->bits_off_target = p_rc->optimal_buffer_level;
@@ -3386,7 +3416,7 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi, FRAME_TYPE *const frame_type,
if (rc->prev_coded_width == cm->width &&
rc->prev_coded_height == cm->height) {
rc_scene_detection_onepass_rt(cpi, frame_input);
- } else if (cpi->src_sad_blk_64x64) {
+ } else {
aom_free(cpi->src_sad_blk_64x64);
cpi->src_sad_blk_64x64 = NULL;
}
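Several hunks in this file (and in svc_layercontext.c below) drop the "if (ptr)" guard before aom_free(). Like free(), aom_free() is a no-op when passed NULL, so the guard is redundant and the simplified pattern is:

// The pattern applied throughout: free unconditionally (aom_free(NULL) is a
// no-op, matching free()), then reset the pointer so later checks stay valid.
aom_free(cpi->src_sad_blk_64x64);
cpi->src_sad_blk_64x64 = NULL;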
diff --git a/av1/encoder/ratectrl.h b/av1/encoder/ratectrl.h
index 4fb11791c..6802ad42d 100644
--- a/av1/encoder/ratectrl.h
+++ b/av1/encoder/ratectrl.h
@@ -205,6 +205,8 @@ typedef struct {
int decimation_factor;
int decimation_count;
int prev_frame_is_dropped;
+ int drop_count_consec;
+ int max_consec_drop;
/*!
* Frame number for encoded frames (non-dropped).
diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c
index 8bc7d1b29..c2d76e7a9 100644
--- a/av1/encoder/rd.c
+++ b/av1/encoder/rd.c
@@ -809,10 +809,11 @@ void av1_initialize_rd_consts(AV1_COMP *cpi) {
// Frame level dv cost update
if (av1_need_dv_costs(cpi)) {
- if (cpi->td.mb.dv_costs == NULL) {
+ if (cpi->td.dv_costs_alloc == NULL) {
CHECK_MEM_ERROR(
- cm, cpi->td.mb.dv_costs,
- (IntraBCMVCosts *)aom_malloc(sizeof(*cpi->td.mb.dv_costs)));
+ cm, cpi->td.dv_costs_alloc,
+ (IntraBCMVCosts *)aom_malloc(sizeof(*cpi->td.dv_costs_alloc)));
+ cpi->td.mb.dv_costs = cpi->td.dv_costs_alloc;
}
av1_fill_dv_costs(&cm->fc->ndvc, x->dv_costs);
}
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 8620087db..c17fbccf8 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1322,7 +1322,8 @@ static int64_t motion_mode_rd(
const int mi_col = xd->mi_col;
int mode_index_start, mode_index_end;
const int txfm_rd_gate_level =
- get_txfm_rd_gate_level(cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
+ get_txfm_rd_gate_level(cm->seq_params->enable_masked_compound,
+ cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
TX_SEARCH_MOTION_MODE, eval_motion_mode);
// Modify the start and end index according to speed features. For example,
@@ -1656,16 +1657,16 @@ static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi,
// Call av1_enc_build_inter_predictor() for one plane at a time.
av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
plane, plane);
- const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
const BLOCK_SIZE plane_bsize =
get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
- const int bw = block_size_wide[plane_bsize];
- const int bh = block_size_high[plane_bsize];
av1_subtract_plane(x, plane_bsize, plane);
- int64_t sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh) << 4;
- sse >>= ((cpi->frame_info.bit_depth - 8) * 2);
+
+ int64_t sse =
+ av1_pixel_diff_dist(x, plane, 0, 0, plane_bsize, plane_bsize, NULL);
+ if (is_cur_buf_hbd(xd)) sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
+ sse <<= 4;
total_sse += sse;
// When current rd cost is more than the best rd, skip evaluation of
// remaining planes.
@@ -2055,6 +2056,9 @@ static int ref_mv_idx_to_search(AV1_COMP *const cpi, MACROBLOCK *x,
HandleInterModeArgs *const args,
int64_t ref_best_rd, BLOCK_SIZE bsize,
const int ref_set) {
+ // If there is only one ref mv candidate, do not prune it: it is better to
+ // evaluate the lone candidate than to prune it.
+ if (ref_set == 1) return 1;
AV1_COMMON *const cm = &cpi->common;
const MACROBLOCKD *const xd = &x->e_mbd;
const MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -2833,17 +2837,6 @@ static int64_t handle_inter_mode(
const int base_rate =
args->ref_frame_cost + args->single_comp_cost + ref_mv_cost;
- // As per the experiments, in real-time preset impact of model rd based
- // breakouts is less on encoding time if the following conditions are true.
- // (1) compound mode is disabled
- // (2) interpolation filter search is disabled
- // TODO(any): Check the impact of model rd based breakouts in other presets
- const int skip_interp_search_modelrd_calc =
- cpi->oxcf.mode == REALTIME &&
- cm->current_frame.reference_mode == SINGLE_REFERENCE &&
- (cpi->sf.rt_sf.skip_interp_filter_search ||
- cpi->sf.winner_mode_sf.winner_mode_ifs);
-
for (i = 0; i < MAX_REF_MV_SEARCH - 1; ++i) {
save_mv[i][0].as_int = INVALID_MV;
save_mv[i][1].as_int = INVALID_MV;
@@ -2993,7 +2986,7 @@ static int64_t handle_inter_mode(
if (not_best_mode) continue;
}
- if (!skip_interp_search_modelrd_calc) {
+ if (!args->skip_ifs) {
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, interpolation_filter_search_time);
#endif
@@ -3125,7 +3118,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
int64_t best_rd) {
const AV1_COMMON *const cm = &cpi->common;
if (!av1_allow_intrabc(cm) || !cpi->oxcf.kf_cfg.enable_intrabc ||
- cpi->sf.rt_sf.use_nonrd_pick_mode)
+ !cpi->sf.mv_sf.use_intrabc || cpi->sf.rt_sf.use_nonrd_pick_mode)
return INT64_MAX;
const int num_planes = av1_num_planes(cm);
@@ -3188,12 +3181,14 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
FULLPEL_MOTION_SEARCH_PARAMS fullms_params;
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize);
const search_site_config *lookahead_search_sites =
cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD];
const FULLPEL_MV start_mv = get_fullmv_from_mv(&dv_ref.as_mv);
av1_make_default_fullpel_ms_params(&fullms_params, cpi, x, bsize,
&dv_ref.as_mv, start_mv,
- lookahead_search_sites,
+ lookahead_search_sites, search_method,
/*fine_search_interval=*/0);
const IntraBCMVCosts *const dv_costs = x->dv_costs;
av1_set_ms_to_intra_mode(&fullms_params, dv_costs);
@@ -3242,9 +3237,11 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
const int step_param = cpi->mv_search_params.mv_step_param;
IntraBCHashInfo *intrabc_hash_info = &x->intrabc_hash_info;
int_mv best_mv, best_hash_mv;
+ FULLPEL_MV_STATS best_mv_stats;
- int bestsme = av1_full_pixel_search(start_mv, &fullms_params, step_param,
- NULL, &best_mv.as_fullmv, NULL);
+ int bestsme =
+ av1_full_pixel_search(start_mv, &fullms_params, step_param, NULL,
+ &best_mv.as_fullmv, &best_mv_stats, NULL);
const int hashsme = av1_intrabc_hash_search(
cpi, xd, &fullms_params, intrabc_hash_info, &best_hash_mv.as_fullmv);
if (hashsme < bestsme) {
@@ -3780,6 +3777,7 @@ static AOM_INLINE void init_mode_skip_mask(mode_skip_mask_t *mask,
MB_MODE_INFO *const mbmi = xd->mi[0];
unsigned char segment_id = mbmi->segment_id;
const SPEED_FEATURES *const sf = &cpi->sf;
+ const INTER_MODE_SPEED_FEATURES *const inter_sf = &sf->inter_sf;
REF_SET ref_set = REF_SET_FULL;
if (sf->rt_sf.use_real_time_ref_set)
@@ -3852,7 +3850,7 @@ static AOM_INLINE void init_mode_skip_mask(mode_skip_mask_t *mask,
}
if (cpi->rc.is_src_frame_alt_ref) {
- if (sf->inter_sf.alt_ref_search_fp &&
+ if (inter_sf->alt_ref_search_fp &&
(cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME])) {
mask->pred_modes[ALTREF_FRAME] = 0;
disable_inter_references_except_altref(mask->ref_combo);
@@ -3860,19 +3858,19 @@ static AOM_INLINE void init_mode_skip_mask(mode_skip_mask_t *mask,
}
}
- if (sf->inter_sf.alt_ref_search_fp) {
+ if (inter_sf->alt_ref_search_fp) {
if (!cm->show_frame && x->best_pred_mv_sad[0] < INT_MAX) {
int sad_thresh = x->best_pred_mv_sad[0] + (x->best_pred_mv_sad[0] >> 3);
// Conservatively skip the modes w.r.t. BWDREF, ALTREF2 and ALTREF, if
// those are past frames
MV_REFERENCE_FRAME start_frame =
- sf->inter_sf.alt_ref_search_fp == 1 ? ALTREF2_FRAME : BWDREF_FRAME;
+ inter_sf->alt_ref_search_fp == 1 ? ALTREF2_FRAME : BWDREF_FRAME;
for (ref_frame = start_frame; ref_frame <= ALTREF_FRAME; ref_frame++) {
if (cpi->ref_frame_dist_info.ref_relative_dist[ref_frame - LAST_FRAME] <
0) {
// Prune inter modes when relative dist of ALTREF2 and ALTREF is close
// to the relative dist of LAST_FRAME.
- if (sf->inter_sf.alt_ref_search_fp == 1 &&
+ if (inter_sf->alt_ref_search_fp == 1 &&
(abs(cpi->ref_frame_dist_info
.ref_relative_dist[ref_frame - LAST_FRAME]) >
1.5 * abs(cpi->ref_frame_dist_info
@@ -3913,6 +3911,33 @@ static AOM_INLINE void init_mode_skip_mask(mode_skip_mask_t *mask,
mask->pred_modes[INTRA_FRAME] |=
~(uint32_t)sf->intra_sf.intra_y_mode_mask[max_txsize_lookup[bsize]];
+
+ // Prune reference frames which are not the closest to the current
+ // frame and with large pred_mv_sad.
+ if (inter_sf->prune_single_ref) {
+ assert(inter_sf->prune_single_ref > 0 && inter_sf->prune_single_ref < 3);
+ const double prune_threshes[2] = { 1.20, 1.05 };
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const RefFrameDistanceInfo *const ref_frame_dist_info =
+ &cpi->ref_frame_dist_info;
+ const int is_closest_ref =
+ (ref_frame == ref_frame_dist_info->nearest_past_ref) ||
+ (ref_frame == ref_frame_dist_info->nearest_future_ref);
+
+ if (!is_closest_ref) {
+ const int dir =
+ (ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] < 0)
+ ? 0
+ : 1;
+ if (x->best_pred_mv_sad[dir] < INT_MAX &&
+ x->pred_mv_sad[ref_frame] >
+ prune_threshes[inter_sf->prune_single_ref - 1] *
+ x->best_pred_mv_sad[dir])
+ mask->pred_modes[ref_frame] |= INTER_SINGLE_ALL;
+ }
+ }
+ }
}
static AOM_INLINE void init_neighbor_pred_buf(
@@ -4025,6 +4050,7 @@ static AOM_INLINE void set_params_rd_pick_inter_mode(
setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, yv12_mb);
}
if (cpi->sf.inter_sf.alt_ref_search_fp ||
+ cpi->sf.inter_sf.prune_single_ref ||
cpi->sf.rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad) {
// Store the best pred_mv_sad across all past frames
if (cpi->ref_frame_dist_info.ref_relative_dist[ref_frame - LAST_FRAME] <
@@ -4995,8 +5021,16 @@ static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
*(args->skip_motion_mode) = (ret == 2);
// We've reached the first compound prediction mode, get stats from the
- // single reference predictors to help with pruning
- if (sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred &&
+ // single reference predictors to help with pruning.
+ // Disable this pruning logic if interpolation filter search was skipped for
+ // single prediction modes as it can result in aggressive pruning of compound
+ // prediction modes due to the absence of modelled_rd populated by
+ // av1_interpolation_filter_search().
+ // TODO(Remya): Check the impact of the sf
+ // 'prune_comp_search_by_single_result' if compound prediction modes are
+ // enabled in future for REALTIME encode.
+ if (!sf->interp_sf.skip_interp_filter_search &&
+ sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred &&
args->reach_first_comp_mode == 0) {
analyze_single_states(cpi, args->search_state);
args->reach_first_comp_mode = 1;
@@ -5014,7 +5048,8 @@ static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
// Skip this compound mode based on the RD results from the single prediction
// modes
- if (sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred) {
+ if (!sf->interp_sf.skip_interp_filter_search &&
+ sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred) {
if (compound_skip_by_single_states(cpi, args->search_state, this_mode,
ref_frame, second_ref_frame, x))
return 1;
@@ -5203,6 +5238,7 @@ static void tx_search_best_inter_candidates(
const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
int64_t skip_rd = INT64_MAX;
const int txfm_rd_gate_level = get_txfm_rd_gate_level(
+ cm->seq_params->enable_masked_compound,
cpi->sf.inter_sf.txfm_rd_gate_level, bsize, TX_SEARCH_DEFAULT,
/*eval_motion_mode=*/0);
if (txfm_rd_gate_level) {
@@ -5665,6 +5701,20 @@ static AOM_INLINE void skip_intra_modes_in_interframe(
}
}
+static AOM_INLINE bool skip_interp_filter_search(const AV1_COMP *cpi,
+ int is_single_pred) {
+ const MODE encoding_mode = cpi->oxcf.mode;
+ if (encoding_mode == REALTIME) {
+ return (cpi->common.current_frame.reference_mode == SINGLE_REFERENCE &&
+ (cpi->sf.interp_sf.skip_interp_filter_search ||
+ cpi->sf.winner_mode_sf.winner_mode_ifs));
+ } else if (encoding_mode == GOOD) {
+ // Skip interpolation filter search for single prediction modes.
+ return (cpi->sf.interp_sf.skip_interp_filter_search && is_single_pred);
+ }
+ return false;
+}
+
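The helper folds the old inline REALTIME-only check and the new GOOD-mode behaviour into one place. An illustrative summary of its outcomes (the flag combinations shown are assumptions for illustration):

// Illustrative outcomes of skip_interp_filter_search():
//   REALTIME, SINGLE_REFERENCE, and skip_interp_filter_search or
//     winner_mode_ifs set            -> skip for all prediction modes
//   GOOD with skip_interp_filter_search -> skip only for single prediction
//   any other encoding mode             -> never skip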
static AOM_INLINE int get_block_temp_var(const AV1_COMP *cpi,
const MACROBLOCK *x,
BLOCK_SIZE bsize) {
@@ -5727,6 +5777,7 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
INT_MAX,
search_state.simple_rd,
0,
+ false,
interintra_modes,
{ { { 0 }, { { 0 } }, { 0 }, 0, 0, 0, 0 } },
{ { 0, 0 } },
@@ -5960,6 +6011,7 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
args.single_comp_cost = real_compmode_cost;
args.ref_frame_cost = ref_frame_cost;
args.best_pred_sse = search_state.best_pred_sse;
+ args.skip_ifs = skip_interp_filter_search(cpi, is_single_pred);
int64_t skip_rd[2] = { search_state.best_skip_rd[0],
search_state.best_skip_rd[1] };
@@ -5976,7 +6028,8 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
end_timing(cpi, handle_inter_mode_time);
#endif
if (current_frame->reference_mode != SINGLE_REFERENCE) {
- if (sf->inter_sf.prune_comp_search_by_single_result > 0 &&
+ if (!args.skip_ifs &&
+ sf->inter_sf.prune_comp_search_by_single_result > 0 &&
is_inter_singleref_mode(this_mode)) {
collect_single_states(x, &search_state, mbmi);
}
@@ -6007,6 +6060,10 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
update_search_state(&search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
&rd_stats_uv, mode_enum, x, do_tx_search);
if (do_tx_search) search_state.best_skip_rd[0] = skip_rd[0];
+ // skip_rd[0] is the best total rd for a skip mode so far.
+ // skip_rd[1] is the best total rd for a skip mode so far in luma.
+ // When do_tx_search = 1, both skip_rd[0] and skip_rd[1] are updated.
+ // When do_tx_search = 0, skip_rd[1] is updated.
search_state.best_skip_rd[1] = skip_rd[1];
}
if (sf->winner_mode_sf.motion_mode_for_winner_cand) {
diff --git a/av1/encoder/rdopt_utils.h b/av1/encoder/rdopt_utils.h
index 1c5b3db37..b6bc4927e 100644
--- a/av1/encoder/rdopt_utils.h
+++ b/av1/encoder/rdopt_utils.h
@@ -774,12 +774,18 @@ static INLINE void av1_copy_usable_ref_mv_stack_and_weight(
// Get transform rd gate level for the given transform search case.
static INLINE int get_txfm_rd_gate_level(
+ const int is_masked_compound_enabled,
const int txfm_rd_gate_level[TX_SEARCH_CASES], BLOCK_SIZE bsize,
TX_SEARCH_CASE tx_search_case, int eval_motion_mode) {
assert(tx_search_case < TX_SEARCH_CASES);
if (tx_search_case == TX_SEARCH_MOTION_MODE && !eval_motion_mode &&
num_pels_log2_lookup[bsize] > 8)
return txfm_rd_gate_level[TX_SEARCH_MOTION_MODE];
+ // Enable aggressive gating of transform search only when masked compound type
+ // is enabled.
+ else if (tx_search_case == TX_SEARCH_COMP_TYPE_MODE &&
+ is_masked_compound_enabled)
+ return txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE];
return txfm_rd_gate_level[TX_SEARCH_DEFAULT];
}
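With the extra flag threaded through, TX_SEARCH_COMP_TYPE_MODE only gets its own gating level when masked compound is available; otherwise it falls through to the default level. A hedged usage sketch (the block size and flag values are illustrative):

// Illustrative call: with masked compound disabled, the compound-type case
// falls through to txfm_rd_gate_level[TX_SEARCH_DEFAULT].
const int gate_level = get_txfm_rd_gate_level(
    /*is_masked_compound_enabled=*/0, sf->inter_sf.txfm_rd_gate_level,
    BLOCK_16X16, TX_SEARCH_COMP_TYPE_MODE, /*eval_motion_mode=*/0);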
diff --git a/av1/encoder/saliency_map.c b/av1/encoder/saliency_map.c
index 33768464b..30019bbec 100644
--- a/av1/encoder/saliency_map.c
+++ b/av1/encoder/saliency_map.c
@@ -1261,7 +1261,7 @@ int av1_setup_sm_rdmult_scaling_factor(AV1_COMP *cpi, double motion_ratio) {
return 0;
}
- const int bsize = cm->seq_params->sb_size;
+ const BLOCK_SIZE bsize = cm->seq_params->sb_size;
const int num_mi_w = mi_size_wide[bsize];
const int num_mi_h = mi_size_high[bsize];
const int block_width = block_size_wide[bsize];
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index c432e4275..830d2c6df 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -847,6 +847,14 @@ static void set_good_speed_feature_framesize_dependent(
} else {
sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL2;
}
+
+ if (is_720p_or_larger)
+ sf->part_sf.ext_part_eval_based_on_cur_best =
+ (allow_screen_content_tools || frame_is_intra_only(cm)) ? 0 : 1;
+
+ if (is_480p_or_larger) {
+ sf->tpl_sf.reduce_num_frames = 1;
+ }
}
if (speed >= 6) {
@@ -863,6 +871,10 @@ static void set_good_speed_feature_framesize_dependent(
sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
}
+ if (is_480p_or_larger) {
+ sf->hl_sf.allow_sub_blk_me_in_tf = 1;
+ }
+
if (is_1080p_or_larger) {
sf->part_sf.default_min_partition_size = BLOCK_8X8;
}
@@ -972,6 +984,9 @@ static void set_good_speed_features_framesize_independent(
sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_DUAL;
if (speed >= 1) {
+ sf->hl_sf.adjust_num_frames_for_arf_filtering =
+ allow_screen_content_tools ? 0 : 1;
+
sf->part_sf.intra_cnn_based_part_prune_level =
allow_screen_content_tools ? 0 : 2;
sf->part_sf.simple_motion_search_early_term_none = 1;
@@ -1100,6 +1115,7 @@ static void set_good_speed_features_framesize_independent(
sf->mv_sf.search_method = DIAMOND;
sf->mv_sf.disable_second_mv = 2;
sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_1;
+ sf->mv_sf.use_intrabc = 0;
sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1;
sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW;
@@ -1173,6 +1189,7 @@ static void set_good_speed_features_framesize_independent(
sf->inter_sf.alt_ref_search_fp = 2;
sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_DEFAULT] = boosted ? 0 : 3;
sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_MOTION_MODE] = boosted ? 0 : 5;
+ sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE] = boosted ? 0 : 3;
sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 2;
sf->inter_sf.prune_ext_comp_using_neighbors = 2;
@@ -1197,6 +1214,7 @@ static void set_good_speed_features_framesize_independent(
sf->tpl_sf.subpel_force_stop = HALF_PEL;
sf->tpl_sf.search_method = FAST_BIGDIA;
+ sf->tpl_sf.use_sad_for_mode_decision = 1;
sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
@@ -1213,6 +1231,8 @@ static void set_good_speed_features_framesize_independent(
if (speed >= 5) {
sf->hl_sf.weight_calc_level_in_tf = 1;
+ sf->hl_sf.adjust_num_frames_for_arf_filtering =
+ allow_screen_content_tools ? 0 : 2;
sf->fp_sf.reduce_mv_step_param = 4;
@@ -1223,15 +1243,18 @@ static void set_good_speed_features_framesize_independent(
sf->part_sf.ext_partition_eval_thresh =
allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16;
sf->part_sf.prune_sub_8x8_partition_level =
- (allow_screen_content_tools || frame_is_intra_only(&cpi->common)) ? 0
- : 2;
+ allow_screen_content_tools ? 1 : 2;
sf->mv_sf.warp_search_method = WARP_SEARCH_DIAMOND;
sf->inter_sf.prune_inter_modes_if_skippable = 1;
+ sf->inter_sf.prune_single_ref = is_boosted_arf2_bwd_type ? 0 : 1;
sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_DEFAULT] = boosted ? 0 : 4;
+ sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE] = boosted ? 0 : 5;
sf->inter_sf.enable_fast_compound_mode_search = 2;
+ sf->interp_sf.skip_interp_filter_search = boosted ? 0 : 1;
+
sf->intra_sf.chroma_intra_pruning_with_hog = 3;
// TODO(any): Extend multi-winner mode processing support for inter frames
@@ -1247,6 +1270,7 @@ static void set_good_speed_features_framesize_independent(
sf->tpl_sf.use_y_only_rate_distortion = 1;
sf->tpl_sf.subpel_force_stop = FULL_PEL;
sf->tpl_sf.gop_length_decision_method = 2;
+ sf->tpl_sf.use_sad_for_mode_decision = 2;
sf->winner_mode_sf.dc_blk_pred_level = 2;
@@ -1256,11 +1280,10 @@ static void set_good_speed_features_framesize_independent(
if (speed >= 6) {
sf->hl_sf.disable_extra_sc_testing = 1;
sf->hl_sf.second_alt_ref_filtering = 0;
- sf->hl_sf.adjust_num_frames_for_arf_filtering =
- allow_screen_content_tools ? 0 : 1;
sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 3;
sf->inter_sf.selective_ref_frame = 6;
+ sf->inter_sf.prune_single_ref = is_boosted_arf2_bwd_type ? 0 : 2;
sf->inter_sf.prune_ext_comp_using_neighbors = 3;
sf->intra_sf.chroma_intra_pruning_with_hog = 4;
@@ -1273,10 +1296,7 @@ static void set_good_speed_features_framesize_independent(
sf->part_sf.prune_rectangular_split_based_on_qidx =
boosted || allow_screen_content_tools ? 0 : 2;
- sf->part_sf.prune_sub_8x8_partition_level =
- allow_screen_content_tools ? 0
- : frame_is_intra_only(&cpi->common) ? 1
- : 2;
+
sf->part_sf.prune_part4_search = 3;
sf->mv_sf.simple_motion_subpel_force_stop = FULL_PEL;
@@ -1488,11 +1508,7 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
cpi->svc.number_temporal_layers > 1)
sf->hl_sf.accurate_bit_estimate = 0;
- // TODO(yunqingwang@google.com): test to see if
- // estimate_motion_for_var_based_partition == 2 helps here.
- if (sf->rt_sf.estimate_motion_for_var_based_partition == 2)
- sf->rt_sf.estimate_motion_for_var_based_partition = 1;
- if (speed >= 9) sf->rt_sf.estimate_motion_for_var_based_partition = 0;
+ sf->rt_sf.estimate_motion_for_var_based_partition = 1;
// For single layers RPS: bias/adjustment for recovery frame.
if (cpi->ppi->rtc_ref.bias_recovery_frame) {
@@ -1509,6 +1525,8 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
sf->rt_sf.reduce_mv_pel_precision_highmotion = 1;
sf->mv_sf.use_bsize_dependent_search_method = 0;
sf->rt_sf.skip_cdef_sb = 1;
+ sf->rt_sf.increase_color_thresh_palette = 1;
+ if (!frame_is_intra_only(cm)) sf->rt_sf.dct_only_palette_nonrd = 1;
}
if (speed >= 8) {
sf->rt_sf.nonrd_check_partition_merge_mode = 3;
@@ -1542,6 +1560,7 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
sf->rt_sf.part_early_exit_zeromv = 2;
sf->rt_sf.prune_palette_nonrd = 1;
sf->rt_sf.set_zeromv_skip_based_on_source_sad = 2;
+ sf->rt_sf.increase_color_thresh_palette = 0;
}
sf->rt_sf.use_nonrd_altref_frame = 0;
sf->rt_sf.use_rtc_tf = 0;
@@ -1565,12 +1584,11 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
}
sf->rt_sf.partition_direct_merging = 0;
sf->hl_sf.accurate_bit_estimate = 0;
-
- // "sf->rt_sf.estimate_motion_for_var_based_partition = 2" doesn't work well
- // for screen contents.
- if (sf->rt_sf.estimate_motion_for_var_based_partition == 2)
+ // This feature is for nonrd_pickmode and non-svc for now.
+ if (sf->rt_sf.use_nonrd_pick_mode && !cpi->ppi->use_svc)
sf->rt_sf.estimate_motion_for_var_based_partition = 1;
- if (speed >= 9) sf->rt_sf.estimate_motion_for_var_based_partition = 0;
+ else
+ sf->rt_sf.estimate_motion_for_var_based_partition = 0;
}
if (is_lossless_requested(&cpi->oxcf.rc_cfg)) {
sf->rt_sf.use_rtc_tf = 0;
@@ -1608,6 +1626,7 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi,
sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW;
sf->inter_sf.disable_interinter_wedge_var_thresh = 100;
sf->interp_sf.cb_pred_filter_search = 0;
+ sf->interp_sf.skip_interp_filter_search = 1;
sf->part_sf.ml_prune_partition = 1;
sf->part_sf.reuse_prev_rd_results_for_part_ab = 1;
sf->part_sf.prune_ext_partition_types_search_level = 2;
@@ -1619,7 +1638,6 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi,
// Disable Wiener and Self-guided Loop restoration filters.
sf->lpf_sf.disable_wiener_filter = true;
sf->lpf_sf.disable_sgr_filter = true;
- sf->rt_sf.skip_interp_filter_search = 1;
sf->intra_sf.prune_palette_search_level = 2;
sf->intra_sf.prune_luma_palette_size_search_level = 2;
sf->intra_sf.early_term_chroma_palette_size_search = 1;
@@ -1785,6 +1803,8 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi,
// This sf is not applicable in non-rd path.
sf->inter_sf.skip_newmv_in_drl = 0;
+ sf->interp_sf.skip_interp_filter_search = 0;
+
// Disable intra_y_mode_mask pruning since the performance at speed 7 isn't
// good. May need more study.
for (int i = 0; i < TX_SIZES; ++i) {
@@ -1810,7 +1830,6 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi,
sf->rt_sf.reuse_inter_pred_nonrd = (cpi->oxcf.noise_sensitivity == 0);
#endif
sf->rt_sf.short_circuit_low_temp_var = 0;
- sf->rt_sf.skip_interp_filter_search = 0;
// For spatial layers, only LAST and GOLDEN are currently used in the SVC
// for nonrd. The flag use_nonrd_altref_frame can disable GOLDEN in the
// get_ref_frame_flags() for some patterns, so disable it here for
@@ -1872,6 +1891,10 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi,
sf->rt_sf.var_part_split_threshold_shift = 10;
sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
}
+ if (speed >= 11 && !frame_is_intra_only(cm) &&
+ cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ sf->winner_mode_sf.dc_blk_pred_level = 3;
+ }
}
static AOM_INLINE void init_hl_sf(HIGH_LEVEL_SPEED_FEATURES *hl_sf) {
@@ -1887,6 +1910,7 @@ static AOM_INLINE void init_hl_sf(HIGH_LEVEL_SPEED_FEATURES *hl_sf) {
hl_sf->adjust_num_frames_for_arf_filtering = 0;
hl_sf->accurate_bit_estimate = 0;
hl_sf->weight_calc_level_in_tf = 0;
+ hl_sf->allow_sub_blk_me_in_tf = 0;
}
static AOM_INLINE void init_fp_sf(FIRST_PASS_SPEED_FEATURES *fp_sf) {
@@ -1907,6 +1931,8 @@ static AOM_INLINE void init_tpl_sf(TPL_SPEED_FEATURES *tpl_sf) {
tpl_sf->prune_ref_frames_in_tpl = 0;
tpl_sf->allow_compound_pred = 1;
tpl_sf->use_y_only_rate_distortion = 0;
+ tpl_sf->use_sad_for_mode_decision = 0;
+ tpl_sf->reduce_num_frames = 0;
}
static AOM_INLINE void init_gm_sf(GLOBAL_MOTION_SPEED_FEATURES *gm_sf) {
@@ -1948,6 +1974,7 @@ static AOM_INLINE void init_part_sf(PARTITION_SPEED_FEATURES *part_sf) {
part_sf->intra_cnn_based_part_prune_level = 0;
part_sf->ext_partition_eval_thresh = BLOCK_8X8;
part_sf->rect_partition_eval_thresh = BLOCK_128X128;
+ part_sf->ext_part_eval_based_on_cur_best = 0;
part_sf->prune_ext_part_using_split_info = 0;
part_sf->prune_rectangular_split_based_on_qidx = 0;
part_sf->prune_rect_part_using_4x4_var_deviation = false;
@@ -1983,6 +2010,7 @@ static AOM_INLINE void init_mv_sf(MV_SPEED_FEATURES *mv_sf) {
mv_sf->skip_fullpel_search_using_startmv = 0;
mv_sf->warp_search_method = WARP_SEARCH_SQUARE;
mv_sf->warp_search_iters = 8;
+ mv_sf->use_intrabc = 1;
}
static AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) {
@@ -1990,6 +2018,7 @@ static AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) {
inter_sf->model_based_post_interp_filter_breakout = 0;
inter_sf->reduce_inter_modes = 0;
inter_sf->alt_ref_search_fp = 0;
+ inter_sf->prune_single_ref = 0;
inter_sf->prune_comp_ref_frames = 0;
inter_sf->selective_ref_frame = 0;
inter_sf->prune_ref_frame_for_rect_partitions = 0;
@@ -2042,6 +2071,7 @@ static AOM_INLINE void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) {
interp_sf->skip_sharp_interp_filter_search = 0;
interp_sf->use_fast_interpolation_filter_search = 0;
interp_sf->use_interp_filter = 0;
+ interp_sf->skip_interp_filter_search = 0;
}
static AOM_INLINE void init_intra_sf(INTRA_MODE_SPEED_FEATURES *intra_sf) {
@@ -2139,6 +2169,8 @@ static AOM_INLINE void init_winner_mode_sf(
static AOM_INLINE void init_lpf_sf(LOOP_FILTER_SPEED_FEATURES *lpf_sf) {
lpf_sf->disable_loop_restoration_chroma = 0;
lpf_sf->disable_loop_restoration_luma = 0;
+ lpf_sf->min_lr_unit_size = RESTORATION_PROC_UNIT_SIZE;
+ lpf_sf->max_lr_unit_size = RESTORATION_UNITSIZE_MAX;
lpf_sf->prune_wiener_based_on_src_var = 0;
lpf_sf->prune_sgr_based_on_wiener = 0;
lpf_sf->enable_sgr_ep_pruning = 0;
@@ -2172,7 +2204,6 @@ static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) {
rt_sf->num_inter_modes_for_tx_search = INT_MAX;
rt_sf->use_nonrd_filter_search = 0;
rt_sf->use_simple_rd_model = 0;
- rt_sf->skip_interp_filter_search = 0;
rt_sf->hybrid_intra_pickmode = 0;
rt_sf->source_metrics_sb_nonrd = 0;
rt_sf->overshoot_detection_cbr = NO_DETECTION;
@@ -2200,6 +2231,7 @@ static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) {
rt_sf->use_rtc_tf = 0;
rt_sf->prune_idtx_nonrd = 0;
rt_sf->prune_palette_nonrd = 0;
+ rt_sf->dct_only_palette_nonrd = 0;
rt_sf->part_early_exit_zeromv = 0;
rt_sf->sse_early_term_inter_search = EARLY_TERM_DISABLED;
rt_sf->skip_lf_screen = 0;
@@ -2222,6 +2254,7 @@ static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) {
rt_sf->screen_content_cdef_filter_qindex_thresh = 0;
rt_sf->enable_ref_short_signaling = false;
rt_sf->check_globalmv_on_single_ref = true;
+ rt_sf->increase_color_thresh_palette = false;
}
static fractional_mv_step_fp
@@ -2454,9 +2487,11 @@ void av1_set_speed_features_qindex_dependent(AV1_COMP *cpi, int speed) {
SPEED_FEATURES *const sf = &cpi->sf;
WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params;
const int boosted = frame_is_boosted(cpi);
+ const int is_480p_or_lesser = AOMMIN(cm->width, cm->height) <= 480;
const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
+ const int is_1440p_or_larger = AOMMIN(cm->width, cm->height) >= 1440;
const int is_arf2_bwd_type =
cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
@@ -2508,23 +2543,36 @@ void av1_set_speed_features_qindex_dependent(AV1_COMP *cpi, int speed) {
if (speed >= 2) {
// Disable extended partitions for lower quantizers
- const int aggr = AOMMIN(3, speed - 2);
+ const int aggr = AOMMIN(4, speed - 2);
const int qindex_thresh1[4] = { 50, 50, 80, 100 };
const int qindex_thresh2[4] = { 80, 100, 120, 160 };
int qindex_thresh;
- int disable_ext_part;
if (aggr <= 1) {
const int qthresh2 =
(!aggr && !is_480p_or_larger) ? 70 : qindex_thresh2[aggr];
qindex_thresh = cm->features.allow_screen_content_tools
? qindex_thresh1[aggr]
: qthresh2;
- disable_ext_part = !boosted;
- } else {
+ if (cm->quant_params.base_qindex <= qindex_thresh && !boosted)
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ } else if (aggr <= 2) {
qindex_thresh = boosted ? qindex_thresh1[aggr] : qindex_thresh2[aggr];
- disable_ext_part = !frame_is_intra_only(cm);
- }
- if (cm->quant_params.base_qindex <= qindex_thresh && disable_ext_part) {
+ if (cm->quant_params.base_qindex <= qindex_thresh &&
+ !frame_is_intra_only(cm))
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ } else if (aggr <= 3) {
+ if (!is_480p_or_larger) {
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ } else if (!is_720p_or_larger && !frame_is_intra_only(cm) &&
+ !cm->features.allow_screen_content_tools) {
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ } else {
+ qindex_thresh = boosted ? qindex_thresh1[aggr] : qindex_thresh2[aggr];
+ if (cm->quant_params.base_qindex <= qindex_thresh &&
+ !frame_is_intra_only(cm))
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ }
+ } else {
sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
}
}
@@ -2593,6 +2641,45 @@ void av1_set_speed_features_qindex_dependent(AV1_COMP *cpi, int speed) {
}
}
+ if (speed >= 5) {
+ // Disable the sf for low quantizers in case of low resolution screen
+ // contents.
+ if (cm->features.allow_screen_content_tools &&
+ cm->quant_params.base_qindex < 128 && is_480p_or_lesser) {
+ sf->part_sf.prune_sub_8x8_partition_level = 0;
+ }
+ }
+
+ // Loop restoration unit size search.
+ // At speed 0, always search all available sizes for the maximum possible
+ // gain.
+ sf->lpf_sf.min_lr_unit_size = RESTORATION_PROC_UNIT_SIZE;
+ sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+
+ if (speed >= 1) {
+ // For large frames, small restoration units are almost never useful,
+ // so prune them away.
+ if (is_1440p_or_larger) {
+ sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+ } else if (is_720p_or_larger) {
+ sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1;
+ }
+ }
+
+ if (speed >= 3 || (cpi->oxcf.mode == ALLINTRA && speed >= 1)) {
+ // At this speed, a full search is too expensive. Instead, pick a single
+ // size based on frame size and qindex. Note that higher quantizers
+ // (== lower quality) and larger frames generally favor larger
+ // restoration units.
+ int qindex_thresh = 96;
+ if (cm->quant_params.base_qindex <= qindex_thresh && !is_1440p_or_larger) {
+ sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1;
+ sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1;
+ } else {
+ sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+ sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+ }
+ }
+
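Concretely, assuming the usual libaom constants (RESTORATION_PROC_UNIT_SIZE == 32 and RESTORATION_UNITSIZE_MAX == 256; these values are an assumption here), the resulting luma unit-size search ranges are:

// Illustrative [min, max] luma unit-size ranges under assumed constants:
//   speed 0:                                  [32, 256]   full search
//   speed 1-2, < 720p:                        [32, 256]
//   speed 1-2, 720p up to < 1440p:            [128, 256]
//   speed 1-2, >= 1440p:                      [256, 256]
//   speed >= 3 (or ALLINTRA speed >= 1):
//     qindex <= 96 and < 1440p:               [128, 128]  single size
//     otherwise:                              [256, 256]  single size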
set_subpel_search_method(&cpi->mv_search_params,
cpi->oxcf.unit_test_cfg.motion_vector_unit_test,
sf->mv_sf.subpel_search_method);
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 27a07c5f1..14cd8743a 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -92,6 +92,8 @@ enum {
(1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) |
(1 << NEW_NEARMV) | (1 << NEAR_NEWMV) |
(1 << NEAR_NEARMV),
+ INTER_SINGLE_ALL =
+ (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | (1 << NEWMV),
};
enum {
@@ -240,11 +242,14 @@ enum {
} UENUM1BYTE(PRUNE_NEARMV_LEVEL);
enum {
- // Default Transform search case - used in evaluation of compound type mode
- // and best inter candidates
+ // Default transform search used in evaluation of best inter candidates
+ // (MODE_EVAL stage) and motion mode winner processing (WINNER_MODE_EVAL
+ // stage).
TX_SEARCH_DEFAULT = 0,
- // Transform search in motion mode rd
+ // Transform search in motion mode rd during MODE_EVAL stage.
TX_SEARCH_MOTION_MODE,
+ // Transform search in compound type mode rd during MODE_EVAL stage.
+ TX_SEARCH_COMP_TYPE_MODE,
// All transform search cases
TX_SEARCH_CASES
} UENUM1BYTE(TX_SEARCH_CASE);
@@ -448,7 +453,11 @@ typedef struct HIGH_LEVEL_SPEED_FEATURES {
/*!
* The number of frames to be used during temporal filtering of an ARF frame
- * is adjusted based on noise level of the current frame.
+ * is adjusted based on noise level of the current frame. The sf has three
+ * levels to decide number of frames to be considered for filtering:
+ * 0 : Use default number of frames
+ * 1 and 2 : Reduce the number of frames based on noise level with varied
+ * aggressiveness
*/
int adjust_num_frames_for_arf_filtering;
@@ -465,6 +474,14 @@ typedef struct HIGH_LEVEL_SPEED_FEATURES {
* 1: Calculate weight using a lookup table that approximates exp().
*/
int weight_calc_level_in_tf;
+
+ /*!
+ * Decide whether to perform motion estimation at split block (i.e. 16x16)
+ * level or not.
+ * 0: Always allow motion estimation.
+ * 1: Conditionally allow motion estimation based on 4x4 sub-blocks variance.
+ */
+ int allow_sub_blk_me_in_tf;
} HIGH_LEVEL_SPEED_FEATURES;
/*!
@@ -536,6 +553,19 @@ typedef struct TPL_SPEED_FEATURES {
// Calculate rate and distortion based on Y plane only.
int use_y_only_rate_distortion;
+
+ // Use SAD instead of SATD during intra/inter mode search.
+ // If set to 0, use SATD always.
+ // If set to 1, use SAD during intra/inter mode search for frames in the
+ // higher temporal layers of the hierarchical prediction structure.
+ // If set to 2, use SAD during intra/inter mode search for all frames.
+ // This sf is disabled for the first GF group of the key-frame interval,
+ // i.e., SATD is used during intra/inter mode search of the first GF group.
+ int use_sad_for_mode_decision;
+
+ // Skip tpl processing for frames of type LF_UPDATE.
+ // This sf is disabled for the first GF group of the key-frame interval.
+ int reduce_num_frames;
} TPL_SPEED_FEATURES;
typedef struct GLOBAL_MOTION_SPEED_FEATURES {
@@ -567,9 +597,10 @@ typedef struct PARTITION_SPEED_FEATURES {
// Used if partition_search_type = FIXED_PARTITION
BLOCK_SIZE fixed_partition_size;
- // Prune extended partition types search
- // Can take values 0 - 2, 0 referring to no pruning, and 1 - 2 increasing
- // aggressiveness of pruning in order.
+ // Prune extended partition types search based on the current best partition
+ // and the combined rdcost of the subblocks estimated from previous
+ // partitions. Can take values 0 - 2, 0 referring to no pruning, and 1 - 2
+ // increasing aggressiveness of pruning in order.
int prune_ext_partition_types_search_level;
// Prune part4 based on block size
@@ -654,13 +685,18 @@ typedef struct PARTITION_SPEED_FEATURES {
// 2: Prune none, split and rectangular partitions
int intra_cnn_based_part_prune_level;
- // Disable extended partition search for lower block sizes.
- int ext_partition_eval_thresh;
+ // Disable extended partition search if the current bsize is greater than the
+ // threshold. Must be a square block size BLOCK_8X8 or higher.
+ BLOCK_SIZE ext_partition_eval_thresh;
+
+ // Use best partition decision so far to tune 'ext_partition_eval_thresh'
+ int ext_part_eval_based_on_cur_best;
// Disable rectangular partitions for larger block sizes.
int rect_partition_eval_thresh;
- // prune extended partition search
+ // Prune extended partition search based on whether the split/rect partitions
+ // provided an improvement in the previous search.
// 0 : no pruning
// 1 : prune 1:4 partition search using winner info from split partitions
// 2 : prune 1:4 and AB partition search using split and HORZ/VERT info
@@ -822,6 +858,9 @@ typedef struct MV_SPEED_FEATURES {
// Accurate full pixel motion search based on TPL stats.
int full_pixel_search_level;
+ // Allow intrabc motion search
+ int use_intrabc;
+
// Whether to downsample the rows in sad calculation during motion search.
// This is only active when there are at least 16 rows. When this sf is
// active, if there is a large discrepancy in the SAD values for the final
@@ -900,6 +939,12 @@ typedef struct INTER_MODE_SPEED_FEATURES {
// 2 prune inter modes w.r.t BWDREF, ALTREF2 and ALTREF reference frames
int alt_ref_search_fp;
+ // Prune reference frames for single prediction modes based on temporal
+ // distance and pred MV SAD. Feasible values are 0, 1, 2; the feature is
+ // disabled at 0, and a higher value applies a more aggressive pruning
+ // threshold.
+ int prune_single_ref;
+
// Prune compound reference frames
// 0 no pruning
// 1 prune compound references which do not satisfy the two conditions:
@@ -1123,6 +1168,10 @@ typedef struct INTERP_FILTER_SPEED_FEATURES {
// adaptive interp_filter search to allow skip of certain filter types.
int adaptive_interp_filter_search;
+
+ // Forces interpolation filter to EIGHTTAP_REGULAR and skips interpolation
+ // filter search.
+ int skip_interp_filter_search;
} INTERP_FILTER_SPEED_FEATURES;
typedef struct INTRA_MODE_SPEED_FEATURES {
@@ -1441,6 +1490,13 @@ typedef struct LOOP_FILTER_SPEED_FEATURES {
// Disable loop restoration for luma plane
int disable_loop_restoration_luma;
+ // Range of loop restoration unit sizes to search
+ // The minimum size is clamped against the superblock size in
+ // av1_pick_filter_restoration, so that the code which sets this value does
+ // not need to know the superblock size ahead of time.
+ int min_lr_unit_size;
+ int max_lr_unit_size;
+
// Prune RESTORE_WIENER evaluation based on source variance
// 0 : no pruning
// 1 : conservative pruning
@@ -1541,9 +1597,6 @@ typedef struct REAL_TIME_SPEED_FEATURES {
// Use simplified RD model for interpolation search and Intra
int use_simple_rd_model;
- // If set forces interpolation filter to EIGHTTAP_REGULAR
- int skip_interp_filter_search;
-
// For nonrd mode: use hybrid intra mode search for intra only frames based on
// block properties.
// 0 : use nonrd pick intra for all blocks
@@ -1671,6 +1724,9 @@ typedef struct REAL_TIME_SPEED_FEATURES {
// Prune the use of palette mode in nonrd pickmode.
int prune_palette_nonrd;
+ // Force to only use dct for palette search in nonrd pickmode.
+ int dct_only_palette_nonrd;
+
// Skip loopfilter, for static content after slide change
// or key frame, once quality has ramped up.
// 0: disabled
@@ -1798,6 +1854,11 @@ typedef struct REAL_TIME_SPEED_FEATURES {
// A flag that controls if we check or bypass GLOBALMV in rtc single ref frame
// case.
bool check_globalmv_on_single_ref;
+
+ // Allows for increasing the color_threshold for palette prediction.
+ // This generally leads to better coding efficiency but with some speed loss.
+ // Only used for screen content and for nonrd_pickmode.
+ bool increase_color_thresh_palette;
} REAL_TIME_SPEED_FEATURES;
/*!\endcond */
diff --git a/av1/encoder/svc_layercontext.c b/av1/encoder/svc_layercontext.c
index 85678dccc..ae0c27612 100644
--- a/av1/encoder/svc_layercontext.c
+++ b/av1/encoder/svc_layercontext.c
@@ -33,6 +33,7 @@ void av1_init_layer_context(AV1_COMP *const cpi) {
svc->force_zero_mode_spatial_ref = 1;
svc->num_encoded_top_layer = 0;
svc->use_flexible_mode = 0;
+ svc->has_lower_quality_layer = 0;
for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
@@ -69,7 +70,7 @@ void av1_init_layer_context(AV1_COMP *const cpi) {
lc->actual_num_seg1_blocks = 0;
lc->actual_num_seg2_blocks = 0;
lc->counter_encode_maxq_scene_change = 0;
- if (lc->map) aom_free(lc->map);
+ aom_free(lc->map);
CHECK_MEM_ERROR(cm, lc->map,
aom_calloc(mi_rows * mi_cols, sizeof(*lc->map)));
}
@@ -155,7 +156,7 @@ void av1_update_layer_context_change_config(AV1_COMP *const cpi,
lc->actual_num_seg1_blocks = 0;
lc->actual_num_seg2_blocks = 0;
lc->counter_encode_maxq_scene_change = 0;
- if (lc->map) aom_free(lc->map);
+ aom_free(lc->map);
CHECK_MEM_ERROR(cm, lc->map,
aom_calloc(mi_rows * mi_cols, sizeof(*lc->map)));
}
@@ -215,6 +216,7 @@ void av1_restore_layer_context(AV1_COMP *const cpi) {
LAYER_CONTEXT *const lc = get_layer_context(cpi);
const int old_frame_since_key = cpi->rc.frames_since_key;
const int old_frame_to_key = cpi->rc.frames_to_key;
+ const int max_consec_drop = cpi->rc.max_consec_drop;
// Restore layer rate control.
cpi->rc = lc->rc;
cpi->ppi->p_rc = lc->p_rc;
@@ -227,6 +229,8 @@ void av1_restore_layer_context(AV1_COMP *const cpi) {
// before the layer restore. Keep these defined for the stream (not layer).
cpi->rc.frames_since_key = old_frame_since_key;
cpi->rc.frames_to_key = old_frame_to_key;
+ // Reset to value before the layer restore.
+ cpi->rc.max_consec_drop = max_consec_drop;
// For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
// for the base temporal layer.
if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
@@ -245,7 +249,8 @@ void av1_restore_layer_context(AV1_COMP *const cpi) {
// This is to skip searching mv for that reference if it was last
// refreshed (i.e., buffer slot holding that reference was refreshed) on the
// previous spatial layer(s) at the same time (current_superframe).
- if (rtc_ref->set_ref_frame_config && svc->force_zero_mode_spatial_ref) {
+ if (rtc_ref->set_ref_frame_config && svc->force_zero_mode_spatial_ref &&
+ cpi->sf.rt_sf.use_nonrd_pick_mode) {
if (check_ref_is_low_spatial_res_super_frame(LAST_FRAME, svc, rtc_ref)) {
svc->skip_mvsearch_last = 1;
}
@@ -357,7 +362,8 @@ void av1_free_svc_cyclic_refresh(AV1_COMP *const cpi) {
for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
LAYER_CONTEXT *const lc = &svc->layer_context[layer];
- if (lc->map) aom_free(lc->map);
+ aom_free(lc->map);
+ lc->map = NULL;
}
}
}
@@ -395,6 +401,16 @@ void av1_one_pass_cbr_svc_start_layer(AV1_COMP *const cpi) {
int width = 0, height = 0;
lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers +
svc->temporal_layer_id];
+ // Set the lower quality layer flag.
+ svc->has_lower_quality_layer = 0;
+ if (cpi->svc.spatial_layer_id > 0) {
+ const LAYER_CONTEXT *lc_prev =
+ &svc->layer_context[(svc->spatial_layer_id - 1) *
+ svc->number_temporal_layers +
+ svc->temporal_layer_id];
+ if (lc_prev->scaling_factor_den == 1 && lc_prev->scaling_factor_num == 1)
+ svc->has_lower_quality_layer = 1;
+ }
av1_get_layer_resolution(cpi->oxcf.frm_dim_cfg.width,
cpi->oxcf.frm_dim_cfg.height, lc->scaling_factor_num,
lc->scaling_factor_den, &width, &height);
@@ -499,7 +515,8 @@ void av1_set_svc_fixed_mode(AV1_COMP *const cpi) {
// Set all buffer_idx to 0.
// Set GOLDEN to slot 5 and update slot 5.
for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
- if (svc->temporal_layer_id < svc->number_temporal_layers - 1) {
+ if (svc->temporal_layer_id < svc->number_temporal_layers - 1 ||
+ svc->spatial_layer_id < svc->number_spatial_layers - 1) {
rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 5;
rtc_ref->refresh[5] = 1;
}
@@ -509,7 +526,8 @@ void av1_set_svc_fixed_mode(AV1_COMP *const cpi) {
// Set LAST3 to slot 6 and update slot 6.
for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 5;
rtc_ref->ref_idx[SVC_LAST_FRAME] = 1;
- if (svc->temporal_layer_id < svc->number_temporal_layers - 1) {
+ if (svc->temporal_layer_id < svc->number_temporal_layers - 1 ||
+ svc->spatial_layer_id < svc->number_spatial_layers - 1) {
rtc_ref->ref_idx[SVC_LAST3_FRAME] = 6;
rtc_ref->refresh[6] = 1;
}
diff --git a/av1/encoder/svc_layercontext.h b/av1/encoder/svc_layercontext.h
index 3a6e0fc6e..bfde33d54 100644
--- a/av1/encoder/svc_layercontext.h
+++ b/av1/encoder/svc_layercontext.h
@@ -139,6 +139,14 @@ typedef struct SVC {
* Force zero-mv in mode search for the spatial/inter-layer reference.
*/
int force_zero_mode_spatial_ref;
+
+ /*!
+ * Flag to indicate that the current spatial layer has a lower quality
+ * layer (at the same timestamp) that can be used as a reference.
+ * A lower quality layer has the same resolution but is encoded at a
+ * lower bitrate.
+ */
+ int has_lower_quality_layer;
} SVC;
struct AV1_COMP;
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 91a0c788e..d6ae667a8 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -9,6 +9,7 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include <float.h>
#include <math.h>
#include <limits.h>
@@ -33,6 +34,7 @@
#include "av1/encoder/extend.h"
#include "av1/encoder/firstpass.h"
#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/intra_mode_search_utils.h"
#include "av1/encoder/mcomp.h"
#include "av1/encoder/motion_search_facade.h"
#include "av1/encoder/pass2_strategy.h"
@@ -49,6 +51,39 @@
static void tf_determine_block_partition(const MV block_mv, const int block_mse,
MV *subblock_mvs, int *subblock_mses);
+// This function returns the minimum and maximum log variances of the 4x4
+// sub-blocks in the current block.
+static INLINE void get_log_var_4x4sub_blk(
+ AV1_COMP *cpi, const YV12_BUFFER_CONFIG *const frame_to_filter, int mb_row,
+ int mb_col, BLOCK_SIZE block_size, double *blk_4x4_var_min,
+ double *blk_4x4_var_max, int is_hbd) {
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ int var_min = INT_MAX;
+ int var_max = 0;
+
+ // Derive the source buffer.
+ const int src_stride = frame_to_filter->y_stride;
+ const int y_offset = mb_row * mb_height * src_stride + mb_col * mb_width;
+ const uint8_t *src_buf = frame_to_filter->y_buffer + y_offset;
+
+ for (int i = 0; i < mb_height; i += MI_SIZE) {
+ for (int j = 0; j < mb_width; j += MI_SIZE) {
+ // Calculate the 4x4 sub-block variance.
+ const int var = av1_calc_normalized_variance(
+ cpi->ppi->fn_ptr[BLOCK_4X4].vf, src_buf + (i * src_stride) + j,
+ src_stride, is_hbd);
+
+ // Record the min and max for the overarching block.
+ var_min = AOMMIN(var_min, var);
+ var_max = AOMMAX(var_max, var);
+ }
+ }
+
+ *blk_4x4_var_min = log1p(var_min / 16.0);
+ *blk_4x4_var_max = log1p(var_max / 16.0);
+}
+
/*!\endcond */
/*!\brief Does motion search for blocks in temporal filtering. This is
* the first step for temporal filtering. More specifically, given a frame to
@@ -68,19 +103,22 @@ static void tf_determine_block_partition(const MV block_mv, const int block_mse,
* the entire block.
*
* \ingroup src_frame_proc
- * \param[in] cpi Top level encoder instance structure
- * \param[in] mb Pointer to macroblock
- * \param[in] frame_to_filter Pointer to the frame to be filtered
- * \param[in] ref_frame Pointer to the reference frame
- * \param[in] block_size Block size used for motion search
- * \param[in] mb_row Row index of the block in the frame
- * \param[in] mb_col Column index of the block in the frame
- * \param[in] ref_mv Reference motion vector, which is commonly
- * inherited from the motion search result of
- * previous frame.
- * \param[out] subblock_mvs Pointer to the motion vectors for 4 sub-blocks
- * \param[out] subblock_mses Pointer to the search errors (MSE) for 4
- * sub-blocks
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] mb Pointer to macroblock
+ * \param[in] frame_to_filter Pointer to the frame to be filtered
+ * \param[in] ref_frame Pointer to the reference frame
+ * \param[in] block_size Block size used for motion search
+ * \param[in] mb_row Row index of the block in the frame
+ * \param[in] mb_col Column index of the block in the frame
+ * \param[in] ref_mv Reference motion vector, which is commonly
+ * inherited from the motion search result of
+ * previous frame.
+ * \param[in] allow_me_for_sub_blks Flag indicating whether motion search
+ * is performed at the 16x16 sub-block level.
+ * \param[out] subblock_mvs Pointer to the motion vectors for
+ * 4 sub-blocks
+ * \param[out] subblock_mses Pointer to the search errors (MSE) for
+ * 4 sub-blocks
*
* \remark Nothing will be returned. Results are saved in subblock_mvs and
* subblock_mses
@@ -89,7 +127,8 @@ static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
const YV12_BUFFER_CONFIG *frame_to_filter,
const YV12_BUFFER_CONFIG *ref_frame,
const BLOCK_SIZE block_size, const int mb_row,
- const int mb_col, MV *ref_mv, MV *subblock_mvs,
+ const int mb_col, MV *ref_mv,
+ bool allow_me_for_sub_blks, MV *subblock_mvs,
int *subblock_mses) {
// Frame information
const int min_frame_size = AOMMIN(cpi->common.width, cpi->common.height);
@@ -99,7 +138,10 @@ static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
const int mb_width = block_size_wide[block_size];
const int mb_pels = mb_height * mb_width;
const int y_stride = frame_to_filter->y_stride;
+ const int src_width = frame_to_filter->y_width;
+ const int ref_width = ref_frame->y_width;
assert(y_stride == ref_frame->y_stride);
+ assert(src_width == ref_width);
const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
// Save input state.
@@ -127,8 +169,10 @@ static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
// Setup.
mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset;
mb->plane[0].src.stride = y_stride;
+ mb->plane[0].src.width = src_width;
mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset;
mbd->plane[0].pre[0].stride = y_stride;
+ mbd->plane[0].pre[0].width = ref_width;
const SEARCH_METHODS search_method = NSTEP;
const search_site_config *search_site_cfg =
@@ -141,14 +185,15 @@ static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
// Do motion search.
int_mv best_mv; // Searched motion vector.
+ FULLPEL_MV_STATS best_mv_stats;
int block_mse = INT_MAX;
MV block_mv = kZeroMv;
const int q = av1_get_q(cpi);
av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
&baseline_mv, start_mv, search_site_cfg,
+ search_method,
/*fine_search_interval=*/0);
- av1_set_mv_search_method(&full_ms_params, search_site_cfg, search_method);
full_ms_params.run_mesh_search = 1;
full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;
@@ -160,7 +205,7 @@ static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
av1_full_pixel_search(start_mv, &full_ms_params, step_param,
cond_cost_list(cpi, cost_list), &best_mv.as_fullmv,
- NULL);
+ &best_mv_stats, NULL);
if (force_integer_mv == 1) { // Only do full search on the entire block.
const int mv_row = best_mv.as_mv.row;
@@ -181,63 +226,66 @@ static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
// Since we are merely refining the result from full pixel search, we don't
// need regularization for subpel search
ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+ best_mv_stats.err_cost = 0;
MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
error = cpi->mv_search_params.find_fractional_mv_step(
- &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, &best_mv.as_mv,
- &distortion, &sse, NULL);
+ &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, &best_mv_stats,
+ &best_mv.as_mv, &distortion, &sse, NULL);
block_mse = DIVIDE_AND_ROUND(error, mb_pels);
block_mv = best_mv.as_mv;
*ref_mv = best_mv.as_mv;
- // On 4 sub-blocks.
- const BLOCK_SIZE subblock_size = av1_ss_size_lookup[block_size][1][1];
- const int subblock_height = block_size_high[subblock_size];
- const int subblock_width = block_size_wide[subblock_size];
- const int subblock_pels = subblock_height * subblock_width;
- start_mv = get_fullmv_from_mv(ref_mv);
- int subblock_idx = 0;
- for (int i = 0; i < mb_height; i += subblock_height) {
- for (int j = 0; j < mb_width; j += subblock_width) {
- const int offset = i * y_stride + j;
- mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + offset;
- mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset;
- av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb,
- subblock_size, &baseline_mv,
- start_mv, search_site_cfg,
- /*fine_search_interval=*/0);
- av1_set_mv_search_method(&full_ms_params, search_site_cfg,
- search_method);
- full_ms_params.run_mesh_search = 1;
- full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;
-
- if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) {
- // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1.
- full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1;
- full_ms_params.mesh_search_mv_diff_threshold = 2;
+ if (allow_me_for_sub_blks) {
+ // On 4 sub-blocks.
+ const BLOCK_SIZE subblock_size = av1_ss_size_lookup[block_size][1][1];
+ const int subblock_height = block_size_high[subblock_size];
+ const int subblock_width = block_size_wide[subblock_size];
+ const int subblock_pels = subblock_height * subblock_width;
+ start_mv = get_fullmv_from_mv(ref_mv);
+
+ int subblock_idx = 0;
+ for (int i = 0; i < mb_height; i += subblock_height) {
+ for (int j = 0; j < mb_width; j += subblock_width) {
+ const int offset = i * y_stride + j;
+ mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + offset;
+ mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset;
+ av1_make_default_fullpel_ms_params(
+ &full_ms_params, cpi, mb, subblock_size, &baseline_mv, start_mv,
+ search_site_cfg, search_method,
+ /*fine_search_interval=*/0);
+ full_ms_params.run_mesh_search = 1;
+ full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;
+
+ if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) {
+ // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1.
+ full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1;
+ full_ms_params.mesh_search_mv_diff_threshold = 2;
+ }
+ av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+ cond_cost_list(cpi, cost_list),
+ &best_mv.as_fullmv, &best_mv_stats, NULL);
+
+ av1_make_default_subpel_ms_params(&ms_params, cpi, mb, subblock_size,
+ &baseline_mv, cost_list);
+ ms_params.forced_stop = EIGHTH_PEL;
+ ms_params.var_params.subpel_search_type = subpel_search_type;
+ // Since we are merely refining the result from full pixel search, we
+ // don't need regularization for subpel search
+ ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+ best_mv_stats.err_cost = 0;
+
+ subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ assert(
+ av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
+ error = cpi->mv_search_params.find_fractional_mv_step(
+ &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv,
+ &best_mv_stats, &best_mv.as_mv, &distortion, &sse, NULL);
+ subblock_mses[subblock_idx] = DIVIDE_AND_ROUND(error, subblock_pels);
+ subblock_mvs[subblock_idx] = best_mv.as_mv;
+ ++subblock_idx;
}
-
- av1_full_pixel_search(start_mv, &full_ms_params, step_param,
- cond_cost_list(cpi, cost_list),
- &best_mv.as_fullmv, NULL);
-
- av1_make_default_subpel_ms_params(&ms_params, cpi, mb, subblock_size,
- &baseline_mv, cost_list);
- ms_params.forced_stop = EIGHTH_PEL;
- ms_params.var_params.subpel_search_type = subpel_search_type;
- // Since we are merely refining the result from full pixel search, we
- // don't need regularization for subpel search
- ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
-
- subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
- assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
- error = cpi->mv_search_params.find_fractional_mv_step(
- &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv,
- &best_mv.as_mv, &distortion, &sse, NULL);
- subblock_mses[subblock_idx] = DIVIDE_AND_ROUND(error, subblock_pels);
- subblock_mvs[subblock_idx] = best_mv.as_mv;
- ++subblock_idx;
}
}
}
@@ -247,9 +295,16 @@ static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
mbd->plane[0].pre[0] = ori_pre_buf;
// Make partition decision.
- tf_determine_block_partition(block_mv, block_mse, subblock_mvs,
- subblock_mses);
-
+ if (allow_me_for_sub_blks) {
+ tf_determine_block_partition(block_mv, block_mse, subblock_mvs,
+ subblock_mses);
+ } else {
+ // Copy the 32x32 block MV and MSE values to the sub-blocks.
+ for (int i = 0; i < 4; ++i) {
+ subblock_mvs[i] = block_mv;
+ subblock_mses[i] = block_mse;
+ }
+ }
// Do not pass down the reference motion vector if error is too large.
const int thresh = (min_frame_size >= 720) ? 12 : 3;
if (block_mse > (thresh << (mbd->bd - 8))) {
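A quick arithmetic check on the bit-depth scaling above:

// thresh << (bd - 8): for 8-bit content, 12 << 0 = 12 (>= 720p) or
// 3 << 0 = 3; for 10-bit, 12 << 2 = 48 or 3 << 2 = 12; for 12-bit,
// 12 << 4 = 192, scaling the MSE cutoff up with the wider pixel range.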
@@ -842,6 +897,26 @@ void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) {
memset(count, 0, num_pels * sizeof(count[0]));
MV ref_mv = kZeroMv; // Reference motion vector passed down along frames.
// Perform temporal filtering frame by frame.
+
+ // Decide whether to perform motion search at the 16x16 sub-block level
+ // based on the source variance of the 4x4 sub-blocks. Allow motion search
+ // for the split partition only if the difference between the max and min
+ // source variance of the 4x4 blocks exceeds an empirically derived
+ // threshold.
+ bool allow_me_for_sub_blks = true;
+ if (cpi->sf.hl_sf.allow_sub_blk_me_in_tf) {
+ const int is_hbd = is_frame_high_bitdepth(frame_to_filter);
+ // Initialize minimum variance to a large value and maximum variance to 0.
+ double blk_4x4_var_min = DBL_MAX;
+ double blk_4x4_var_max = 0;
+ get_log_var_4x4sub_blk(cpi, frame_to_filter, mb_row, mb_col,
+ TF_BLOCK_SIZE, &blk_4x4_var_min, &blk_4x4_var_max,
+ is_hbd);
+ // TODO(sanampudi.venkatarao@ittiam.com): Experiment and adjust the
+ // threshold for high bit depth.
+ if ((blk_4x4_var_max - blk_4x4_var_min) <= 4.0)
+ allow_me_for_sub_blks = false;
+ }
+
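A worked example of the log-variance gate above (illustrative numbers):

// get_log_var_4x4sub_blk() maps variances through log1p(var / 16.0):
//   var_min = 4    -> log1p(4 / 16.0)    ~= 0.22
//   var_max = 2000 -> log1p(2000 / 16.0) ~= 4.84  -> spread ~= 4.61 > 4.0,
//                     so sub-block motion search stays enabled.
//   var_max = 100  -> log1p(100 / 16.0)  ~= 1.98  -> spread ~= 1.76 <= 4.0,
//                     so all four sub-blocks reuse the 32x32 block MV/MSE.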
for (int frame = 0; frame < num_frames; frame++) {
if (frames[frame] == NULL) continue;
@@ -855,7 +930,8 @@ void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) {
ref_mv.col *= -1;
} else { // Other reference frames.
tf_motion_search(cpi, mb, frame_to_filter, frames[frame], block_size,
- mb_row, mb_col, &ref_mv, subblock_mvs, subblock_mses);
+ mb_row, mb_col, &ref_mv, allow_me_for_sub_blks,
+ subblock_mvs, subblock_mses);
}
// Perform weighted averaging.
@@ -887,8 +963,9 @@ void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) {
filter_strength, weight_calc_level_in_tf, pred, accum, count);
#if CONFIG_AV1_HIGHBITDEPTH
}
-#endif // CONFIG_AV1_HIGHBITDEPTH
- } else { // for 8-bit
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ } else {
+ // for 8-bit
if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) {
av1_apply_temporal_filter(
frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
@@ -1047,22 +1124,32 @@ static void tf_setup_filtering_buffer(AV1_COMP *cpi,
// change the number of frames for key frame filtering, which is to avoid
// visual quality drop.
int adjust_num = 6;
+ const int adjust_num_frames_for_arf_filtering =
+ cpi->sf.hl_sf.adjust_num_frames_for_arf_filtering;
if (num_frames == 1) { // `arnr_max_frames = 1` is used to disable filtering.
adjust_num = 0;
} else if ((update_type == KF_UPDATE) && q <= 10) {
adjust_num = 0;
- } else if (cpi->sf.hl_sf.adjust_num_frames_for_arf_filtering &&
- update_type != KF_UPDATE) {
+ } else if (adjust_num_frames_for_arf_filtering > 0 &&
+ update_type != KF_UPDATE && (cpi->rc.frames_since_key > 0)) {
+ // Since screen content detection happens after temporal filtering, the
+ // 'frames_since_key' check ensures the sf is disabled for the first
+ // alt-ref frame.
// Adjust the number of frames to be considered for filtering based on the
// noise level of the current frame. For a low-noise frame, use more frames
// to filter, such that the filtered frame can provide better predictions
// for subsequent frames, and vice versa.
+ const uint8_t av1_adjust_num_using_noise_lvl[2][3] = { { 6, 4, 2 },
+ { 4, 2, 0 } };
+ const uint8_t *adjust_num_frames =
+ av1_adjust_num_using_noise_lvl[adjust_num_frames_for_arf_filtering - 1];
+
if (noise_levels[AOM_PLANE_Y] < 0.5)
- adjust_num = 4;
+ adjust_num = adjust_num_frames[0];
else if (noise_levels[AOM_PLANE_Y] < 1.0)
- adjust_num = 2;
+ adjust_num = adjust_num_frames[1];
else
- adjust_num = 0;
+ adjust_num = adjust_num_frames[2];
}
num_frames = AOMMIN(num_frames + adjust_num, lookahead_depth);
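A worked lookup for the table above:

// adjust_num_frames_for_arf_filtering == 1 selects { 6, 4, 2 } and == 2
// selects { 4, 2, 0 }. For noise_levels[AOM_PLANE_Y] == 0.7 (i.e. in
// [0.5, 1.0)), the middle entry applies: adjust_num = 4 at level 1 and
// adjust_num = 2 at level 2. Lower-noise frames thus filter with more
// frames, as the comment above explains.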
@@ -1166,11 +1253,11 @@ double av1_estimate_noise_from_single_plane_c(const uint8_t *src, int height,
}
#if CONFIG_AV1_HIGHBITDEPTH
-double av1_highbd_estimate_noise_from_single_plane(const uint16_t *src16,
- int height, int width,
- const int stride,
- int bit_depth,
- int edge_thresh) {
+double av1_highbd_estimate_noise_from_single_plane_c(const uint16_t *src16,
+ int height, int width,
+ const int stride,
+ int bit_depth,
+ int edge_thresh) {
int64_t accum = 0;
int count = 0;
for (int i = 1; i < height - 1; ++i) {
diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h
index 8aa473167..0b00c886e 100644
--- a/av1/encoder/temporal_filter.h
+++ b/av1/encoder/temporal_filter.h
@@ -21,9 +21,9 @@ extern "C" {
struct AV1_COMP;
struct AV1EncoderConfig;
struct ThreadData;
-// TODO(any): These two variables are only used in avx2, sse2, sse4
-// implementations, where the block size is still hard coded. This should be
-// fixed to align with the c implementation.
+// TODO(wtc): These two variables are only used in the avx2, sse2 and neon
+// implementations, where the block size is still hard-coded to
+// TF_BLOCK_SIZE. This should be fixed to align with the C implementation.
#define BH 32
#define BW 32
@@ -261,6 +261,9 @@ typedef struct {
#endif // CONFIG_MULTITHREAD
// Next temporal filter block row to be filtered.
int next_tf_row;
+ // Initialized to false, set to true by the worker thread that encounters an
+ // error in order to abort the processing of other worker threads.
+ bool tf_mt_exit;
} AV1TemporalFilterSync;
// Estimates noise level from a given frame using a single plane (Y, U, or V).
@@ -353,29 +356,26 @@ int av1_get_q(const struct AV1_COMP *cpi);
// num_pels: Number of pixels in the block across all planes.
// is_high_bitdepth: Whether the frame is high-bitdepth or not.
// Returns:
-// Nothing will be returned. But the contents of tf_data will be modified.
+// True if allocation is successful and false otherwise.
static AOM_INLINE bool tf_alloc_and_reset_data(TemporalFilterData *tf_data,
int num_pels,
int is_high_bitdepth) {
- tf_data->tmp_mbmi = (MB_MODE_INFO *)malloc(sizeof(*tf_data->tmp_mbmi));
- memset(tf_data->tmp_mbmi, 0, sizeof(*tf_data->tmp_mbmi));
+ tf_data->tmp_mbmi = (MB_MODE_INFO *)aom_calloc(1, sizeof(*tf_data->tmp_mbmi));
tf_data->accum =
(uint32_t *)aom_memalign(16, num_pels * sizeof(*tf_data->accum));
tf_data->count =
(uint16_t *)aom_memalign(16, num_pels * sizeof(*tf_data->count));
- memset(&tf_data->diff, 0, sizeof(tf_data->diff));
if (is_high_bitdepth)
tf_data->pred = CONVERT_TO_BYTEPTR(
aom_memalign(32, num_pels * 2 * sizeof(*tf_data->pred)));
else
tf_data->pred =
(uint8_t *)aom_memalign(32, num_pels * sizeof(*tf_data->pred));
- if (!(tf_data->accum && tf_data->count && tf_data->pred)) {
- aom_free(tf_data->accum);
- aom_free(tf_data->count);
- aom_free(tf_data->pred);
+ // In case of an allocation failure, other successfully allocated buffers will
+ // be freed by the tf_dealloc_data() call in encoder_destroy().
+ if (!(tf_data->tmp_mbmi && tf_data->accum && tf_data->count && tf_data->pred))
return false;
- }
+ memset(&tf_data->diff, 0, sizeof(tf_data->diff));
return true;
}
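A minimal sketch of the intended calling pattern for the allocator above (the caller shown is illustrative; the error message simply mirrors the style used elsewhere in the encoder):

TemporalFilterData tf_data;
if (!tf_alloc_and_reset_data(&tf_data, num_pels, is_high_bitdepth)) {
  aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                     "Error allocating temporal filter buffers");
}
// ... run the temporal filter ...
tf_dealloc_data(&tf_data, is_high_bitdepth);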
@@ -405,10 +405,14 @@ static AOM_INLINE void tf_dealloc_data(TemporalFilterData *tf_data,
int is_high_bitdepth) {
if (is_high_bitdepth)
tf_data->pred = (uint8_t *)CONVERT_TO_SHORTPTR(tf_data->pred);
- free(tf_data->tmp_mbmi);
+ aom_free(tf_data->tmp_mbmi);
+ tf_data->tmp_mbmi = NULL;
aom_free(tf_data->accum);
+ tf_data->accum = NULL;
aom_free(tf_data->count);
+ tf_data->count = NULL;
aom_free(tf_data->pred);
+ tf_data->pred = NULL;
}
// Saves the state prior to temporal filter process.
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index 3aeb51172..ca60e4981 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -252,13 +252,15 @@ static AOM_INLINE void txfm_quant_rdcost(
static uint32_t motion_estimation(AV1_COMP *cpi, MACROBLOCK *x,
uint8_t *cur_frame_buf,
uint8_t *ref_frame_buf, int stride,
- int stride_ref, BLOCK_SIZE bsize,
- MV center_mv, int_mv *best_mv) {
+ int ref_stride, int width, int ref_width,
+ BLOCK_SIZE bsize, MV center_mv,
+ int_mv *best_mv) {
AV1_COMMON *cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf;
int step_param;
uint32_t bestsme = UINT_MAX;
+ FULLPEL_MV_STATS best_mv_stats;
int distortion;
uint32_t sse;
int cost_list[5];
@@ -267,28 +269,29 @@ static uint32_t motion_estimation(AV1_COMP *cpi, MACROBLOCK *x,
// Setup frame pointers
x->plane[0].src.buf = cur_frame_buf;
x->plane[0].src.stride = stride;
+ x->plane[0].src.width = width;
xd->plane[0].pre[0].buf = ref_frame_buf;
- xd->plane[0].pre[0].stride = stride_ref;
+ xd->plane[0].pre[0].stride = ref_stride;
+ xd->plane[0].pre[0].width = ref_width;
step_param = tpl_sf->reduce_first_step_size;
step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
const search_site_config *search_site_cfg =
cpi->mv_search_params.search_site_cfg[SS_CFG_SRC];
- if (search_site_cfg->stride != stride_ref)
+ if (search_site_cfg->stride != ref_stride)
search_site_cfg = cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD];
- assert(search_site_cfg->stride == stride_ref);
+ assert(search_site_cfg->stride == ref_stride);
FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &center_mv,
start_mv, search_site_cfg,
+ tpl_sf->search_method,
/*fine_search_interval=*/0);
- av1_set_mv_search_method(&full_ms_params, search_site_cfg,
- tpl_sf->search_method);
bestsme = av1_full_pixel_search(start_mv, &full_ms_params, step_param,
cond_cost_list(cpi, cost_list),
- &best_mv->as_fullmv, NULL);
+ &best_mv->as_fullmv, &best_mv_stats, NULL);
// When sub-pel motion search is skipped, populate sub-pel precision MV and
// return.
@@ -303,11 +306,12 @@ static uint32_t motion_estimation(AV1_COMP *cpi, MACROBLOCK *x,
ms_params.forced_stop = tpl_sf->subpel_force_stop;
ms_params.var_params.subpel_search_type = USE_2_TAPS;
ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+ best_mv_stats.err_cost = 0;
MV subpel_start_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
bestsme = cpi->mv_search_params.find_fractional_mv_step(
- xd, cm, &ms_params, subpel_start_mv, &best_mv->as_mv, &distortion, &sse,
- NULL);
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &best_mv->as_mv,
+ &distortion, &sse, NULL);
return bestsme;
}
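The same mechanical migration recurs throughout this patch: av1_make_default_fullpel_ms_params() now takes the search method directly, replacing the separate av1_set_mv_search_method() call, and av1_full_pixel_search() gains a FULLPEL_MV_STATS out-parameter. In sketch form:

// Before:
//   av1_make_default_fullpel_ms_params(&ms, cpi, x, bsize, &center_mv,
//                                      start_mv, cfg,
//                                      /*fine_search_interval=*/0);
//   av1_set_mv_search_method(&ms, cfg, search_method);
//   av1_full_pixel_search(start_mv, &ms, step, cost_list, &best_mv, NULL);
// After:
//   FULLPEL_MV_STATS best_mv_stats;
//   av1_make_default_fullpel_ms_params(&ms, cpi, x, bsize, &center_mv,
//                                      start_mv, cfg, search_method,
//                                      /*fine_search_interval=*/0);
//   av1_full_pixel_search(start_mv, &ms, step, cost_list, &best_mv,
//                         &best_mv_stats, NULL);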
@@ -451,8 +455,74 @@ static void get_rate_distortion(
}
}
+static AOM_INLINE int32_t get_inter_cost(const AV1_COMP *cpi, MACROBLOCKD *xd,
+ const uint8_t *src_mb_buffer,
+ int src_stride,
+ TplBuffers *tpl_tmp_buffers,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ int mi_row, int mi_col, int rf_idx,
+ MV *rfidx_mv, int use_pred_sad) {
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+ TplParams *tpl_data = &cpi->ppi->tpl_data;
+ const YV12_BUFFER_CONFIG *const ref_frame_ptr =
+ tpl_data->src_ref_frame[rf_idx];
+ int16_t *src_diff = tpl_tmp_buffers->src_diff;
+ tran_low_t *coeff = tpl_tmp_buffers->coeff;
+ const int bw = 4 << mi_size_wide_log2[bsize];
+ const int bh = 4 << mi_size_high_log2[bsize];
+ int32_t inter_cost;
+
+ if (cpi->sf.tpl_sf.subpel_force_stop != FULL_PEL) {
+ const int_interpfilters kernel =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ uint8_t *predictor8 = tpl_tmp_buffers->predictor8;
+ uint8_t *predictor =
+ is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8;
+ struct buf_2d ref_buf = { NULL, ref_frame_ptr->y_buffer,
+ ref_frame_ptr->y_width, ref_frame_ptr->y_height,
+ ref_frame_ptr->y_stride };
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE,
+ mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0,
+ &tpl_data->sf, &ref_buf, kernel);
+ inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd);
+
+ av1_enc_build_one_inter_predictor(predictor, bw, rfidx_mv,
+ &inter_pred_params);
+
+ if (use_pred_sad) {
+ inter_cost = (int)cpi->ppi->fn_ptr[bsize].sdf(src_mb_buffer, src_stride,
+ predictor, bw);
+ } else {
+ inter_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ predictor, bw, coeff, bw, bh, tx_size);
+ }
+ } else {
+ int ref_mb_offset =
+ mi_row * MI_SIZE * ref_frame_ptr->y_stride + mi_col * MI_SIZE;
+ uint8_t *ref_mb = ref_frame_ptr->y_buffer + ref_mb_offset;
+ int ref_stride = ref_frame_ptr->y_stride;
+ const FULLPEL_MV fullmv = get_fullmv_from_mv(rfidx_mv);
+ // Since sub-pel motion search is not performed, use the prediction pixels
+ // directly from the reference block ref_mb.
+ if (use_pred_sad) {
+ inter_cost = (int)cpi->ppi->fn_ptr[bsize].sdf(
+ src_mb_buffer, src_stride,
+ &ref_mb[fullmv.row * ref_stride + fullmv.col], ref_stride);
+ } else {
+ inter_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ &ref_mb[fullmv.row * ref_stride + fullmv.col],
+ ref_stride, coeff, bw, bh, tx_size);
+ }
+ }
+ return inter_cost;
+}
+
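For readability, the two cost paths implemented by get_inter_cost() above can be summarized as:

// subpel search enabled (subpel_force_stop != FULL_PEL):
//   build the sub-pel inter predictor for rfidx_mv, then take SAD
//   (use_pred_sad) or SATD against the source.
// subpel search disabled:
//   skip interpolation and read the reference block directly at the
//   full-pel offset ref_mb[fullmv.row * ref_stride + fullmv.col], then
//   take SAD or SATD on those pixels.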
static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
TplTxfmStats *tpl_txfm_stats,
+ TplBuffers *tpl_tmp_buffers,
MACROBLOCK *x, int mi_row, int mi_col,
BLOCK_SIZE bsize, TX_SIZE tx_size,
TplDepStats *tpl_stats) {
@@ -470,8 +540,6 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
const int bw = 4 << mi_size_wide_log2[bsize];
const int bh = 4 << mi_size_high_log2[bsize];
- const int_interpfilters kernel =
- av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
int frame_offset = tpl_data->frame_idx - cpi->gf_frame_index;
@@ -479,9 +547,11 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
int32_t intra_cost;
PREDICTION_MODE best_mode = DC_PRED;
- int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+ const int mb_y_offset =
+ mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
uint8_t *src_mb_buffer = xd->cur_buf->y_buffer + mb_y_offset;
- int src_stride = xd->cur_buf->y_stride;
+ const int src_stride = xd->cur_buf->y_stride;
+ const int src_width = xd->cur_buf->y_width;
int dst_mb_offset =
mi_row * MI_SIZE * tpl_frame->rec_picture->y_stride + mi_col * MI_SIZE;
@@ -507,29 +577,16 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
pd->subsampling_y = xd->cur_buf->subsampling_y;
}
- // Number of pixels in a tpl block
- const int tpl_block_pels = tpl_data->tpl_bsize_1d * tpl_data->tpl_bsize_1d;
- // Allocate temporary buffers used in motion estimation.
- uint8_t *predictor8 = aom_memalign(32, tpl_block_pels * 2 * sizeof(uint8_t));
- int16_t *src_diff = aom_memalign(32, tpl_block_pels * sizeof(int16_t));
- tran_low_t *coeff = aom_memalign(32, tpl_block_pels * sizeof(tran_low_t));
- tran_low_t *qcoeff = aom_memalign(32, tpl_block_pels * sizeof(tran_low_t));
- tran_low_t *dqcoeff = aom_memalign(32, tpl_block_pels * sizeof(tran_low_t));
+ uint8_t *predictor8 = tpl_tmp_buffers->predictor8;
+ int16_t *src_diff = tpl_tmp_buffers->src_diff;
+ tran_low_t *coeff = tpl_tmp_buffers->coeff;
+ tran_low_t *qcoeff = tpl_tmp_buffers->qcoeff;
+ tran_low_t *dqcoeff = tpl_tmp_buffers->dqcoeff;
uint8_t *predictor =
is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8;
int64_t recon_error = 1;
int64_t pred_error = 1;
- if (!(predictor8 && src_diff && coeff && qcoeff && dqcoeff)) {
- aom_free(predictor8);
- aom_free(src_diff);
- aom_free(coeff);
- aom_free(qcoeff);
- aom_free(dqcoeff);
- aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
- "Error allocating tpl data");
- }
-
memset(tpl_stats, 0, sizeof(*tpl_stats));
tpl_stats->ref_frame_index[0] = -1;
tpl_stats->ref_frame_index[1] = -1;
@@ -576,15 +633,32 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
tx_size, mode, 0, 0, FILTER_INTRA_MODES, dst_buffer,
dst_buffer_stride, predictor, bw, 0, 0, 0);
- intra_cost =
- tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
- predictor, bw, coeff, bw, bh, tx_size);
+ if (tpl_frame->use_pred_sad) {
+ intra_cost = (int32_t)cpi->ppi->fn_ptr[bsize].sdf(
+ src_mb_buffer, src_stride, predictor, bw);
+ } else {
+ intra_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ predictor, bw, coeff, bw, bh, tx_size);
+ }
if (intra_cost < best_intra_cost) {
best_intra_cost = intra_cost;
best_mode = mode;
}
}
+ // Calculate the SATD of the best intra mode if SAD was used for the mode
+ // decision, since best_intra_cost is used by the ML model that skips
+ // intra mode evaluation.
+ if (tpl_frame->use_pred_sad) {
+ av1_predict_intra_block(
+ xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
+ block_size_wide[bsize], block_size_high[bsize], tx_size, best_mode, 0,
+ 0, FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, predictor, bw, 0,
+ 0, 0);
+ best_intra_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ predictor, bw, coeff, bw, bh, tx_size);
+ }
int rate_cost = 1;
@@ -653,10 +727,11 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
}
const YV12_BUFFER_CONFIG *ref_frame_ptr = tpl_data->src_ref_frame[rf_idx];
- int ref_mb_offset =
+ const int ref_mb_offset =
mi_row * MI_SIZE * ref_frame_ptr->y_stride + mi_col * MI_SIZE;
uint8_t *ref_mb = ref_frame_ptr->y_buffer + ref_mb_offset;
- int ref_stride = ref_frame_ptr->y_stride;
+ const int ref_stride = ref_frame_ptr->y_stride;
+ const int ref_width = ref_frame_ptr->y_width;
int_mv best_rfidx_mv = { 0 };
uint32_t bestsme = UINT32_MAX;
@@ -743,9 +818,9 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
for (idx = 0; idx < refmv_count; ++idx) {
int_mv this_mv;
- uint32_t thissme = motion_estimation(cpi, x, src_mb_buffer, ref_mb,
- src_stride, ref_stride, bsize,
- center_mvs[idx].mv.as_mv, &this_mv);
+ uint32_t thissme = motion_estimation(
+ cpi, x, src_mb_buffer, ref_mb, src_stride, ref_stride, src_width,
+ ref_width, bsize, center_mvs[idx].mv.as_mv, &this_mv);
if (thissme < bestsme) {
bestsme = thissme;
@@ -756,32 +831,10 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
tpl_stats->mv[rf_idx].as_int = best_rfidx_mv.as_int;
single_mv[rf_idx] = best_rfidx_mv;
- if (tpl_sf->subpel_force_stop != FULL_PEL) {
- struct buf_2d ref_buf = { NULL, ref_frame_ptr->y_buffer,
- ref_frame_ptr->y_width, ref_frame_ptr->y_height,
- ref_frame_ptr->y_stride };
- InterPredParams inter_pred_params;
- av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE,
- mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd),
- 0, &tpl_data->sf, &ref_buf, kernel);
- inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd);
-
- av1_enc_build_one_inter_predictor(predictor, bw, &best_rfidx_mv.as_mv,
- &inter_pred_params);
-
- inter_cost =
- tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
- predictor, bw, coeff, bw, bh, tx_size);
- } else {
- const FULLPEL_MV best_fullmv = get_fullmv_from_mv(&best_rfidx_mv.as_mv);
- // Since sub-pel motion search is not performed, use the prediction pixels
- // directly from the reference block ref_mb
- inter_cost = tpl_get_satd_cost(
- bd_info, src_diff, bw, src_mb_buffer, src_stride,
- &ref_mb[best_fullmv.row * ref_stride + best_fullmv.col], ref_stride,
- coeff, bw, bh, tx_size);
- }
- // Store inter cost for each ref frame
+ inter_cost = get_inter_cost(
+ cpi, xd, src_mb_buffer, src_stride, tpl_tmp_buffers, bsize, tx_size,
+ mi_row, mi_col, rf_idx, &best_rfidx_mv.as_mv, tpl_frame->use_pred_sad);
+ // Store inter cost for each ref frame. This is used to prune inter modes.
tpl_stats->pred_error[rf_idx] = AOMMAX(1, inter_cost);
if (inter_cost < best_inter_cost) {
@@ -791,6 +844,14 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
best_mv[0].as_int = best_rfidx_mv.as_int;
}
}
+ // Calculate the SATD of the best inter mode if SAD was used for the mode
+ // decision, since best_inter_cost is used by the ML model that skips
+ // intra mode evaluation.
+ if (best_inter_cost < INT32_MAX && tpl_frame->use_pred_sad) {
+ assert(best_rf_idx != -1);
+ best_inter_cost = get_inter_cost(
+ cpi, xd, src_mb_buffer, src_stride, tpl_tmp_buffers, bsize, tx_size,
+ mi_row, mi_col, best_rf_idx, &best_mv[0].as_mv, 0 /* use_pred_sad */);
+ }
if (best_rf_idx != -1 && best_inter_cost < best_intra_cost) {
best_mode = NEWMV;
@@ -841,6 +902,8 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
xd->mi_row = mi_row;
xd->mi_col = mi_col;
int best_cmp_rf_idx = -1;
+ const int_interpfilters kernel =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
for (int cmp_rf_idx = start_rf; cmp_rf_idx < end_rf; ++cmp_rf_idx) {
int rf_idx0 = comp_ref_frames[cmp_rf_idx][0];
int rf_idx1 = comp_ref_frames[cmp_rf_idx][1];
@@ -1039,13 +1102,6 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
}
}
}
-
- // Free temporary buffers.
- aom_free(predictor8);
- aom_free(src_diff);
- aom_free(coeff);
- aom_free(qcoeff);
- aom_free(dqcoeff);
}
static int round_floor(int ref_pos, int bsize_pix) {
@@ -1231,9 +1287,10 @@ static AOM_INLINE void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx,
const YV12_BUFFER_CONFIG *ref_frames_ordered[INTER_REFS_PER_FRAME];
uint32_t ref_frame_display_indices[INTER_REFS_PER_FRAME];
const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf;
int ref_pruning_enabled = is_frame_eligible_for_ref_pruning(
gf_group, cpi->sf.inter_sf.selective_ref_frame,
- cpi->sf.tpl_sf.prune_ref_frames_in_tpl, frame_idx);
+ tpl_sf->prune_ref_frames_in_tpl, frame_idx);
int gop_length = get_gop_length(gf_group);
int ref_frame_flags;
AV1_COMMON *cm = &cpi->common;
@@ -1341,18 +1398,27 @@ static AOM_INLINE void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx,
av1_init_tpl_txfm_stats(tpl_txfm_stats);
// Initialize x->mbmi_ext when compound predictions are enabled.
- if (cpi->sf.tpl_sf.allow_compound_pred) av1_zero(x->mbmi_ext);
+ if (tpl_sf->allow_compound_pred) av1_zero(x->mbmi_ext);
// Set the pointer to null since mbmi is only allocated inside this function.
assert(xd->mi == &mbmi_ptr);
xd->mi = NULL;
+
+ // The TPL module is called before speed features are set at the frame
+ // level. Thus, turning off this speed feature for key frames is done here
+ // rather than in the speed feature setting itself.
+ const int layer_depth_th = (tpl_sf->use_sad_for_mode_decision == 1) ? 5 : 0;
+ tpl_frame->use_pred_sad =
+ tpl_sf->use_sad_for_mode_decision &&
+ gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE &&
+ gf_group->layer_depth[frame_idx] >= layer_depth_th;
}
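A worked reading of the use_pred_sad gate above:

// use_sad_for_mode_decision == 1 -> layer_depth_th == 5, so SAD is used
// only for non-key frames at layer depth >= 5; any other nonzero value
// gives layer_depth_th == 0, enabling SAD for every non-key frame in the
// group. KF_UPDATE frames always keep the SATD path.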
// This function stores the motion estimation dependencies of all the blocks in
// a row
void av1_mc_flow_dispenser_row(AV1_COMP *cpi, TplTxfmStats *tpl_txfm_stats,
- MACROBLOCK *x, int mi_row, BLOCK_SIZE bsize,
- TX_SIZE tx_size) {
+ TplBuffers *tpl_tmp_buffers, MACROBLOCK *x,
+ int mi_row, BLOCK_SIZE bsize, TX_SIZE tx_size) {
AV1_COMMON *const cm = &cpi->common;
MultiThreadInfo *const mt_info = &cpi->mt_info;
AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt;
@@ -1372,6 +1438,17 @@ void av1_mc_flow_dispenser_row(AV1_COMP *cpi, TplTxfmStats *tpl_txfm_stats,
mi_col += mi_width, tplb_col_in_tile++) {
(*tpl_row_mt->sync_read_ptr)(&tpl_data->tpl_mt_sync, tplb_row,
tplb_col_in_tile);
+
+#if CONFIG_MULTITHREAD
+ if (mt_info->num_workers > 1) {
+ pthread_mutex_lock(tpl_row_mt->mutex_);
+ const bool tpl_mt_exit = tpl_row_mt->tpl_mt_exit;
+ pthread_mutex_unlock(tpl_row_mt->mutex_);
+ // Exit in case any worker has encountered an error.
+ if (tpl_mt_exit) return;
+ }
+#endif
+
TplDepStats tpl_stats;
// Motion estimation column boundary
@@ -1380,8 +1457,8 @@ void av1_mc_flow_dispenser_row(AV1_COMP *cpi, TplTxfmStats *tpl_txfm_stats,
xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE);
xd->mb_to_right_edge =
GET_MV_SUBPEL(mi_params->mi_cols - mi_width - mi_col);
- mode_estimation(cpi, tpl_txfm_stats, x, mi_row, mi_col, bsize, tx_size,
- &tpl_stats);
+ mode_estimation(cpi, tpl_txfm_stats, tpl_tmp_buffers, x, mi_row, mi_col,
+ bsize, tx_size, &tpl_stats);
// Motion flow dependency dispenser.
tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, tpl_frame->stride,
@@ -1408,8 +1485,8 @@ static AOM_INLINE void mc_flow_dispenser(AV1_COMP *cpi) {
xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
xd->mb_to_bottom_edge =
GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
- av1_mc_flow_dispenser_row(cpi, &td->tpl_txfm_stats, x, mi_row, bsize,
- tx_size);
+ av1_mc_flow_dispenser_row(cpi, &td->tpl_txfm_stats, &td->tpl_tmp_buffers, x,
+ mi_row, bsize, tx_size);
}
}
@@ -1703,6 +1780,34 @@ void av1_tpl_preload_rc_estimate(AV1_COMP *cpi,
}
}
+static AOM_INLINE int skip_tpl_for_frame(const GF_GROUP *gf_group,
+ int frame_idx, int gop_eval,
+ int approx_gop_eval,
+ int reduce_num_frames) {
+ // When gop_eval is set to 2, tpl stats calculation is done for ARFs from base
+ // layer, (base+1) layer and (base+2) layer. When gop_eval is set to 3,
+ // tpl stats calculation is limited to ARFs from base layer and (base+1)
+ // layer.
+ const int num_arf_layers = (gop_eval == 2) ? 3 : 2;
+ const int gop_length = get_gop_length(gf_group);
+
+ if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE ||
+ gf_group->update_type[frame_idx] == OVERLAY_UPDATE)
+ return 1;
+
+ // When approx_gop_eval = 1, skip tpl stats calculation for higher layer
+ // frames and for frames beyond gop length.
+ if (approx_gop_eval && (gf_group->layer_depth[frame_idx] > num_arf_layers ||
+ frame_idx >= gop_length))
+ return 1;
+
+ if (reduce_num_frames && gf_group->update_type[frame_idx] == LF_UPDATE &&
+ frame_idx < gop_length)
+ return 1;
+
+ return 0;
+}
+
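A worked example of skip_tpl_for_frame() above (illustrative): with gop_eval == 2 (so num_arf_layers == 3) and approx_gop_eval set,

// an ARF at layer_depth 2 is processed, while a frame at layer_depth 4 or
// one beyond the gop length is skipped; overlay and internal-overlay
// frames are always skipped; and with reduce_num_frames set, LF_UPDATE
// frames inside the gop length are skipped as well.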
int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval,
const EncodeFrameParams *const frame_params) {
#if CONFIG_COLLECT_COMPONENT_TIMING
@@ -1716,13 +1821,6 @@ int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval,
EncodeFrameParams this_frame_params = *frame_params;
TplParams *const tpl_data = &cpi->ppi->tpl_data;
int approx_gop_eval = (gop_eval > 1);
- int num_arf_layers = MAX_ARF_LAYERS;
-
- // When gop_eval is set to 2, tpl stats calculation is done for ARFs from base
- // layer, (base+1) layer and (base+2) layer. When gop_eval is set to 3,
- // tpl stats calculation is limited to ARFs from base layer and (base+1)
- // layer.
- if (approx_gop_eval) num_arf_layers = (gop_eval == 2) ? 3 : 2;
if (cpi->superres_mode != AOM_SUPERRES_NONE) {
assert(cpi->superres_mode != AOM_SUPERRES_AUTO);
@@ -1751,6 +1849,12 @@ int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval,
av1_init_tpl_stats(tpl_data);
+ TplBuffers *tpl_tmp_buffers = &cpi->td.tpl_tmp_buffers;
+ if (!tpl_alloc_temp_buffers(tpl_tmp_buffers, tpl_data->tpl_bsize_1d)) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+ "Error allocating tpl data");
+ }
+
tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read_dummy;
tpl_row_mt->sync_write_ptr = av1_tpl_row_mt_sync_write_dummy;
@@ -1763,20 +1867,26 @@ int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval,
av1_fill_mv_costs(&cm->fc->nmvc, cm->features.cur_frame_force_integer_mv,
cm->features.allow_high_precision_mv, cpi->td.mb.mv_costs);
- const int gop_length = get_gop_length(gf_group);
const int num_planes =
cpi->sf.tpl_sf.use_y_only_rate_distortion ? 1 : av1_num_planes(cm);
+ // As the TPL module is called before speed features are set at the frame
+ // level, turning off this speed feature for the first GF group of the
+ // key-frame interval is done here.
+ int reduce_num_frames =
+ cpi->sf.tpl_sf.reduce_num_frames &&
+ gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE &&
+ gf_group->max_layer_depth > 2;
+ // TPL processing is skipped for frames of type LF_UPDATE when
+ // 'reduce_num_frames' is 1, which affects the r0 calculation. Thus, a
+ // factor is used to adjust r0. The value of 1.6 corresponds to using ~60%
+ // of the frames in the gf group on average.
+ tpl_data->r0_adjust_factor = reduce_num_frames ? 1.6 : 1.0;
+
// Backward propagation from tpl_group_frames to 1.
for (int frame_idx = cpi->gf_frame_index; frame_idx < tpl_gf_group_frames;
++frame_idx) {
- if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE ||
- gf_group->update_type[frame_idx] == OVERLAY_UPDATE)
- continue;
-
- // When approx_gop_eval = 1, skip tpl stats calculation for higher layer
- // frames and for frames beyond gop length.
- if (approx_gop_eval && (gf_group->layer_depth[frame_idx] > num_arf_layers ||
- frame_idx >= gop_length))
+ if (skip_tpl_for_frame(gf_group, frame_idx, gop_eval, approx_gop_eval,
+ reduce_num_frames))
continue;
init_mc_flow_dispenser(cpi, frame_idx, pframe_qindex);
@@ -1806,12 +1916,8 @@ int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval,
for (int frame_idx = tpl_gf_group_frames - 1;
frame_idx >= cpi->gf_frame_index; --frame_idx) {
- if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE ||
- gf_group->update_type[frame_idx] == OVERLAY_UPDATE)
- continue;
-
- if (approx_gop_eval && (gf_group->layer_depth[frame_idx] > num_arf_layers ||
- frame_idx >= gop_length))
+ if (skip_tpl_for_frame(gf_group, frame_idx, gop_eval, approx_gop_eval,
+ reduce_num_frames))
continue;
mc_flow_synthesizer(tpl_data, frame_idx, cm->mi_params.mi_rows,
@@ -1831,6 +1937,8 @@ int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval,
end_timing(cpi, av1_tpl_setup_stats_time);
#endif
+ tpl_dealloc_temp_buffers(tpl_tmp_buffers);
+
if (!approx_gop_eval) {
tpl_data->ready = 1;
}
diff --git a/av1/encoder/tpl_model.h b/av1/encoder/tpl_model.h
index 36c3ae059..bcd58216c 100644
--- a/av1/encoder/tpl_model.h
+++ b/av1/encoder/tpl_model.h
@@ -24,6 +24,7 @@ struct AV1_SEQ_CODING_TOOLS;
struct EncodeFrameParams;
struct EncodeFrameInput;
struct GF_GROUP;
+struct ThreadData;
struct TPL_INFO;
#include "config/aom_config.h"
@@ -70,6 +71,13 @@ typedef struct AV1TplRowMultiThreadSync {
} AV1TplRowMultiThreadSync;
typedef struct AV1TplRowMultiThreadInfo {
+ // Initialized to false, set to true by the worker thread that encounters an
+ // error in order to abort the processing of other worker threads.
+ bool tpl_mt_exit;
+#if CONFIG_MULTITHREAD
+ // Mutex lock object used for error handling.
+ pthread_mutex_t *mutex_;
+#endif
// Row synchronization related function pointers.
void (*sync_read_ptr)(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c);
void (*sync_write_ptr)(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c,
@@ -103,6 +111,14 @@ typedef struct TplTxfmStats {
int coeff_num;
} TplTxfmStats;
+typedef struct {
+ uint8_t *predictor8;
+ int16_t *src_diff;
+ tran_low_t *coeff;
+ tran_low_t *qcoeff;
+ tran_low_t *dqcoeff;
+} TplBuffers;
+
typedef struct TplDepStats {
int64_t srcrf_sse;
int64_t srcrf_dist;
@@ -137,6 +153,8 @@ typedef struct TplDepFrame {
int mi_cols;
int base_rdmult;
uint32_t frame_display_index;
+ // When set, SAD metric is used for intra and inter mode decision.
+ int use_pred_sad;
} TplDepFrame;
/*!\endcond */
@@ -227,6 +245,10 @@ typedef struct TplParams {
*/
int border_in_pixels;
+ /*!
+ * Factor to adjust r0 if TPL uses a subset of frames in the gf group.
+ */
+ double r0_adjust_factor;
} TplParams;
#if CONFIG_BITRATE_ACCURACY || CONFIG_RATECTRL_LOG
@@ -393,6 +415,45 @@ void av1_setup_tpl_buffers(struct AV1_PRIMARY *const ppi,
CommonModeInfoParams *const mi_params, int width,
int height, int byte_alignment, int lag_in_frames);
+static AOM_INLINE void tpl_dealloc_temp_buffers(TplBuffers *tpl_tmp_buffers) {
+ aom_free(tpl_tmp_buffers->predictor8);
+ tpl_tmp_buffers->predictor8 = NULL;
+ aom_free(tpl_tmp_buffers->src_diff);
+ tpl_tmp_buffers->src_diff = NULL;
+ aom_free(tpl_tmp_buffers->coeff);
+ tpl_tmp_buffers->coeff = NULL;
+ aom_free(tpl_tmp_buffers->qcoeff);
+ tpl_tmp_buffers->qcoeff = NULL;
+ aom_free(tpl_tmp_buffers->dqcoeff);
+ tpl_tmp_buffers->dqcoeff = NULL;
+}
+
+static AOM_INLINE bool tpl_alloc_temp_buffers(TplBuffers *tpl_tmp_buffers,
+ uint8_t tpl_bsize_1d) {
+ // Number of pixels in a tpl block
+ const int tpl_block_pels = tpl_bsize_1d * tpl_bsize_1d;
+
+ // Allocate temporary buffers used in mode estimation.
+ tpl_tmp_buffers->predictor8 = (uint8_t *)aom_memalign(
+ 32, tpl_block_pels * 2 * sizeof(*tpl_tmp_buffers->predictor8));
+ tpl_tmp_buffers->src_diff = (int16_t *)aom_memalign(
+ 32, tpl_block_pels * sizeof(*tpl_tmp_buffers->src_diff));
+ tpl_tmp_buffers->coeff = (tran_low_t *)aom_memalign(
+ 32, tpl_block_pels * sizeof(*tpl_tmp_buffers->coeff));
+ tpl_tmp_buffers->qcoeff = (tran_low_t *)aom_memalign(
+ 32, tpl_block_pels * sizeof(*tpl_tmp_buffers->qcoeff));
+ tpl_tmp_buffers->dqcoeff = (tran_low_t *)aom_memalign(
+ 32, tpl_block_pels * sizeof(*tpl_tmp_buffers->dqcoeff));
+
+ if (!(tpl_tmp_buffers->predictor8 && tpl_tmp_buffers->src_diff &&
+ tpl_tmp_buffers->coeff && tpl_tmp_buffers->qcoeff &&
+ tpl_tmp_buffers->dqcoeff)) {
+ tpl_dealloc_temp_buffers(tpl_tmp_buffers);
+ return false;
+ }
+ return true;
+}
+
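A sketch of the alloc/use/free pairing for the two helpers above, mirroring the call sites this patch adds in av1_tpl_setup_stats():

TplBuffers *tpl_tmp_buffers = &cpi->td.tpl_tmp_buffers;
if (!tpl_alloc_temp_buffers(tpl_tmp_buffers, tpl_data->tpl_bsize_1d)) {
  aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
                     "Error allocating tpl data");
}
// ... per-frame mode estimation uses tpl_tmp_buffers ...
tpl_dealloc_temp_buffers(tpl_tmp_buffers);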
/*!\brief Implements temporal dependency modelling for a GOP (GF/ARF
* group) and selects between 16 and 32 frame GOP structure.
*
@@ -424,7 +485,8 @@ void av1_tpl_rdmult_setup_sb(struct AV1_COMP *cpi, MACROBLOCK *const x,
BLOCK_SIZE sb_size, int mi_row, int mi_col);
void av1_mc_flow_dispenser_row(struct AV1_COMP *cpi,
- TplTxfmStats *tpl_txfm_stats, MACROBLOCK *x,
+ TplTxfmStats *tpl_txfm_stats,
+ TplBuffers *tpl_tmp_buffers, MACROBLOCK *x,
int mi_row, BLOCK_SIZE bsize, TX_SIZE tx_size);
/*!\brief Compute the entropy of an exponential probability distribution
diff --git a/av1/encoder/tune_butteraugli.c b/av1/encoder/tune_butteraugli.c
index 8f593739f..92fc4b2a9 100644
--- a/av1/encoder/tune_butteraugli.c
+++ b/av1/encoder/tune_butteraugli.c
@@ -220,8 +220,11 @@ void av1_setup_butteraugli_source(AV1_COMP *cpi) {
cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
cm->features.byte_alignment, 0, 0);
}
- av1_resize_and_extend_frame_nonnormative(cpi->source, resized_dst, bit_depth,
- av1_num_planes(cm));
+ if (!av1_resize_and_extend_frame_nonnormative(
+ cpi->source, resized_dst, bit_depth, av1_num_planes(cm))) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating buffers during resize");
+ }
zero_img(cpi->source);
copy_img(resized_dst, cpi->source, width / resize_factor,
diff --git a/av1/encoder/tune_vmaf.c b/av1/encoder/tune_vmaf.c
index 9c7c112ac..4e5ffa387 100644
--- a/av1/encoder/tune_vmaf.c
+++ b/av1/encoder/tune_vmaf.c
@@ -42,6 +42,7 @@ static void motion_search(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *src,
// Parameters used for motion search.
FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ FULLPEL_MV_STATS best_mv_stats;
const SEARCH_METHODS search_method = NSTEP;
const search_site_config *search_site_cfg =
cpi->mv_search_params.search_site_cfg[SS_CFG_FPF];
@@ -64,10 +65,11 @@ static void motion_search(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *src,
// Only do full search on the entire block.
av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
&baseline_mv, *ref_mv, search_site_cfg,
+ search_method,
/*fine_search_interval=*/0);
- av1_set_mv_search_method(&full_ms_params, search_site_cfg, search_method);
av1_full_pixel_search(*ref_mv, &full_ms_params, step_param,
- cond_cost_list(cpi, cost_list), ref_mv, NULL);
+ cond_cost_list(cpi, cost_list), ref_mv, &best_mv_stats,
+ NULL);
// Restore input state.
mb->plane[0].src = ori_src_buf;
@@ -621,8 +623,11 @@ void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) {
&resized_source, y_width / resize_factor, y_height / resize_factor, ss_x,
ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
cm->features.byte_alignment, 0, 0);
- av1_resize_and_extend_frame_nonnormative(cpi->source, &resized_source,
- bit_depth, av1_num_planes(cm));
+ if (!av1_resize_and_extend_frame_nonnormative(
+ cpi->source, &resized_source, bit_depth, av1_num_planes(cm))) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating buffers during resize");
+ }
const int resized_y_width = resized_source.y_width;
const int resized_y_height = resized_source.y_height;
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index d6217b707..7292c0119 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -119,13 +119,10 @@ static AOM_INLINE void fetch_mb_rd_info(int n4,
*rd_stats = mb_rd_info->rd_stats;
}
-// Compute the pixel domain distortion from diff on all visible 4x4s in the
-// transform block.
-static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
- int blk_row, int blk_col,
- const BLOCK_SIZE plane_bsize,
- const BLOCK_SIZE tx_bsize,
- unsigned int *block_mse_q8) {
+int64_t av1_pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row,
+ int blk_col, const BLOCK_SIZE plane_bsize,
+ const BLOCK_SIZE tx_bsize,
+ unsigned int *block_mse_q8) {
int visible_rows, visible_cols;
const MACROBLOCKD *xd = &x->e_mbd;
get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
@@ -188,7 +185,7 @@ static int predict_skip_txfm(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
const MACROBLOCKD *xd = &x->e_mbd;
const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
- *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize, NULL);
+ *dist = av1_pixel_diff_dist(x, 0, 0, 0, bsize, bsize, NULL);
const int64_t mse = *dist / bw / bh;
// Normalized quantizer takes the transform upscaling factor (8 for tx size
@@ -243,7 +240,7 @@ static int predict_skip_txfm(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
// Used to set proper context for early termination with skip = 1.
static AOM_INLINE void set_skip_txfm(MACROBLOCK *x, RD_STATS *rd_stats,
- int bsize, int64_t dist) {
+ BLOCK_SIZE bsize, int64_t dist) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
const int n4 = bsize_to_num_blk(bsize);
@@ -644,7 +641,7 @@ static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) {
get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
unsigned int sse;
- if (x->skip_chroma_rd && plane) continue;
+ if (plane) continue;
cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
pd->dst.stride, &sse);
@@ -2030,12 +2027,9 @@ static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
uint16_t best_eob = 0;
TX_TYPE best_tx_type = DCT_DCT;
int rate_cost = 0;
- // The buffer used to swap dqcoeff in macroblockd_plane so we can keep dqcoeff
- // of the best tx_type
- DECLARE_ALIGNED(32, tran_low_t, this_dqcoeff[MAX_SB_SQUARE]);
struct macroblock_plane *const p = &x->plane[plane];
tran_low_t *orig_dqcoeff = p->dqcoeff;
- tran_low_t *best_dqcoeff = this_dqcoeff;
+ tran_low_t *best_dqcoeff = x->dqcoeff_buf;
const int tx_type_map_idx =
plane ? 0 : blk_row * xd->tx_type_map_stride + blk_col;
av1_invalid_rd_stats(best_rd_stats);
@@ -2071,8 +2065,8 @@ static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
return;
}
} else {
- block_sse = pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize,
- txsize_to_bsize[tx_size], &block_mse_q8);
+ block_sse = av1_pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize,
+ txsize_to_bsize[tx_size], &block_mse_q8);
assert(block_mse_q8 != UINT_MAX);
}
@@ -2080,7 +2074,7 @@ static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
uint16_t tx_mask;
// Use DCT_DCT transform for DC only block.
- if (dc_only_blk)
+ if (dc_only_blk || cpi->sf.rt_sf.dct_only_palette_nonrd == 1)
tx_mask = 1 << DCT_DCT;
else
tx_mask = get_tx_mask(cpi, x, plane, block, blk_row, blk_col, plane_bsize,
diff --git a/av1/encoder/tx_search.h b/av1/encoder/tx_search.h
index b3689cf7d..ed95c1cd9 100644
--- a/av1/encoder/tx_search.h
+++ b/av1/encoder/tx_search.h
@@ -47,6 +47,27 @@ static AOM_INLINE int tx_size_cost(const MACROBLOCK *const x, BLOCK_SIZE bsize,
return x->mode_costs.tx_size_cost[tx_size_cat][tx_size_ctx][depth];
}
+/*!\brief Compute the pixel domain distortion.
+ *
+ * \ingroup transform_search
+ * Compute the pixel domain distortion from diff on all visible 4x4s in the
+ * transform block.
+ *
+ * \param[in]  x             Pointer to structure holding the data for the
+ *                           current encoding macroblock
+ * \param[in]  plane         Plane index
+ * \param[in]  blk_row       Block row index
+ * \param[in]  blk_col       Block col index
+ * \param[in]  plane_bsize   Current plane block size
+ * \param[in]  tx_bsize      Transform size
+ * \param[out] block_mse_q8  Pointer to output the block MSE in Q8 precision;
+ *                           may be NULL
+ * \return An int64_t value that is the block sse.
+ */
+int64_t av1_pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row,
+ int blk_col, const BLOCK_SIZE plane_bsize,
+ const BLOCK_SIZE tx_bsize,
+ unsigned int *block_mse_q8);
+
int64_t av1_estimate_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
RD_STATS *rd_stats, int64_t ref_best_rd,
BLOCK_SIZE bs, TX_SIZE tx_size);
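
For reference, a scalar sketch of what av1_pixel_diff_dist computes: the sum of squared differences over the visible rows and columns of the transform block, with the block MSE optionally reported in Q8 precision. The residual layout and the exact Q8 scaling are assumptions of this sketch, not copied from libaom:

    #include <cstdint>

    // Sum of squared differences over the visible part of a transform block.
    // 'diff' is the source-minus-prediction residual, 'diff_stride' its
    // stride. The Q8 MSE output mirrors the block_mse_q8 parameter; the
    // exact scaling used by libaom is an assumption here.
    static int64_t pixel_diff_dist_sketch(const int16_t *diff, int diff_stride,
                                          int visible_rows, int visible_cols,
                                          unsigned int *block_mse_q8) {
      int64_t sse = 0;
      for (int r = 0; r < visible_rows; ++r) {
        for (int c = 0; c < visible_cols; ++c) {
          const int d = diff[r * diff_stride + c];
          sse += (int64_t)d * d;
        }
      }
      if (block_mse_q8 != nullptr) {
        *block_mse_q8 = (unsigned int)((256 * (uint64_t)sse) /
                                       (visible_rows * visible_cols));
      }
      return sse;
    }
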
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index 5b8f59832..5505db282 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -29,6 +29,7 @@
#include "av1/encoder/encodeframe.h"
#include "av1/encoder/var_based_part.h"
#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/rdopt_utils.h"
// Possible values for the force_split variable while evaluating variance based
// partitioning.
@@ -1021,8 +1022,7 @@ static AOM_INLINE void chroma_check(AV1_COMP *cpi, MACROBLOCK *x,
if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
cpi->rc.high_source_sad) {
shift_lower_limit = 7;
- } else if (source_sad_nonrd >= kMedSad &&
- cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
+ } else if (source_sad_nonrd >= kMedSad && x->source_variance > 500 &&
cpi->common.width * cpi->common.height >= 640 * 360) {
shift_upper_limit = 2;
shift_lower_limit = source_sad_nonrd > kMedSad ? 5 : 4;
@@ -1243,6 +1243,7 @@ static AOM_INLINE void set_ref_frame_for_partition(
*y_sad = *y_sad_g;
*ref_frame_partition = GOLDEN_FRAME;
x->nonrd_prune_ref_frame_search = 0;
+ x->sb_me_partition = 0;
} else if (is_set_altref_ref_frame) {
av1_setup_pre_planes(xd, 0, yv12_alt, mi_row, mi_col,
get_ref_scale_factors(cm, ALTREF_FRAME), num_planes);
@@ -1251,6 +1252,7 @@ static AOM_INLINE void set_ref_frame_for_partition(
*y_sad = *y_sad_alt;
*ref_frame_partition = ALTREF_FRAME;
x->nonrd_prune_ref_frame_search = 0;
+ x->sb_me_partition = 0;
} else {
*ref_frame_partition = LAST_FRAME;
x->nonrd_prune_ref_frame_search =
@@ -1339,7 +1341,8 @@ static AOM_INLINE void evaluate_neighbour_mvs(AV1_COMP *cpi, MACROBLOCK *x,
static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad,
unsigned int *y_sad_g, unsigned int *y_sad_alt,
unsigned int *y_sad_last,
- MV_REFERENCE_FRAME *ref_frame_partition, int mi_row,
+ MV_REFERENCE_FRAME *ref_frame_partition,
+ struct scale_factors *sf_no_scale, int mi_row,
int mi_col, bool is_small_sb, bool scaled_ref_last) {
AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
@@ -1420,9 +1423,37 @@ static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad,
if (est_motion == 1 || est_motion == 2) {
if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
- const MV dummy_mv = { 0, 0 };
- *y_sad = av1_int_pro_motion_estimation(cpi, x, cm->seq_params->sb_size,
- mi_row, mi_col, &dummy_mv);
+      // For screen content, only do int_pro_motion when the spatial variance
+      // is above threshold and the motion level is above kLowSad.
+ if (x->source_variance > 100 && source_sad_nonrd > kLowSad) {
+ int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+ int me_search_size_col =
+ is_screen ? 96 : block_size_wide[cm->seq_params->sb_size] >> 1;
+        // For screen content, use a larger row search size to capture
+        // vertical scrolling, which can involve larger motion.
+ int me_search_size_row =
+ is_screen ? 192 : block_size_high[cm->seq_params->sb_size] >> 1;
+ unsigned int y_sad_zero;
+ *y_sad = av1_int_pro_motion_estimation(
+ cpi, x, cm->seq_params->sb_size, mi_row, mi_col, &kZeroMv,
+ &y_sad_zero, me_search_size_col, me_search_size_row);
+        // The logic below selects whether the motion estimated by
+        // int_pro_motion() will be used in nonrd_pickmode. Only do this
+        // for screen content for now.
+ if (is_screen) {
+ unsigned int thresh_sad =
+ (cm->seq_params->sb_size == BLOCK_128X128) ? 50000 : 20000;
+ if (*y_sad < (y_sad_zero >> 1) && *y_sad < thresh_sad) {
+ x->sb_me_partition = 1;
+ x->sb_me_mv.as_int = mi->mv[0].as_int;
+ } else {
+ x->sb_me_partition = 0;
+ // Fall back to using zero motion.
+ *y_sad = y_sad_zero;
+ mi->mv[0].as_int = 0;
+ }
+ }
+ }
}
}
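
The screen-content gating above keeps the estimated superblock motion vector only when its SAD is less than half the zero-MV SAD and below an absolute cap that depends on superblock size; otherwise it falls back to zero motion. A compact sketch of that decision rule, with the thresholds taken from the hunk and everything else simplified:

    // Decide whether the superblock-level motion vector from int-pro motion
    // estimation should drive partitioning/pickmode, per the rule above:
    // keep it only if it halves the zero-MV SAD and stays under an absolute
    // cap (50000 for 128x128 superblocks, 20000 otherwise).
    static int use_sb_me_mv(unsigned int y_sad, unsigned int y_sad_zero,
                            int is_128x128_sb) {
      const unsigned int thresh_sad = is_128x128_sb ? 50000 : 20000;
      return y_sad < (y_sad_zero >> 1) && y_sad < thresh_sad;
    }
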
@@ -1450,7 +1481,12 @@ static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad,
// Only calculate the predictor for non-zero MV.
if (mi->mv[0].as_int != 0) {
- set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+ if (!scaled_ref_last) {
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+ } else {
+ xd->block_ref_scale_factors[0] = sf_no_scale;
+ xd->block_ref_scale_factors[1] = sf_no_scale;
+ }
av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL,
cm->seq_params->sb_size, AOM_PLANE_Y,
num_planes - 1);
@@ -1517,8 +1553,8 @@ static AOM_INLINE bool set_force_zeromv_skip_for_sb(
uv_sad[0] < thresh_exit_part_uv && uv_sad[1] < thresh_exit_part_uv) {
set_block_size(cpi, mi_row, mi_col, bsize);
x->force_zeromv_skip_for_sb = 1;
- if (vt2) aom_free(vt2);
- if (vt) aom_free(vt);
+ aom_free(vt2);
+ aom_free(vt);
// Partition shape is set here at SB level.
// Exit needs to happen from av1_choose_var_based_partitioning().
return true;
@@ -1558,6 +1594,9 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
NOISE_LEVEL noise_level = kLow;
bool is_zero_motion = true;
bool scaled_ref_last = false;
+ struct scale_factors sf_no_scale;
+ av1_setup_scale_factors_for_frame(&sf_no_scale, cm->width, cm->height,
+ cm->width, cm->height);
bool is_key_frame =
(frame_is_intra_only(cm) ||
@@ -1578,7 +1617,7 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
// Ref frame used in partitioning.
MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME;
- CHECK_MEM_ERROR(cm, vt, aom_malloc(sizeof(*vt)));
+ AOM_CHECK_MEM_ERROR(xd->error_info, vt, aom_malloc(sizeof(*vt)));
vt->split = td->vt64x64;
@@ -1645,10 +1684,19 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
}
}
+ x->source_variance = UINT_MAX;
+  // For nonrd_pickmode: compute source_variance, only for superblocks with
+  // some motion for now. This input can then be used to bias the
+  // partitioning or the chroma_check.
+ if (cpi->sf.rt_sf.use_nonrd_pick_mode &&
+ x->content_state_sb.source_sad_nonrd > kLowSad)
+ x->source_variance = av1_get_perpixel_variance_facade(
+ cpi, xd, &x->plane[0].src, cm->seq_params->sb_size, AOM_PLANE_Y);
+
if (!is_key_frame) {
setup_planes(cpi, x, &y_sad, &y_sad_g, &y_sad_alt, &y_sad_last,
- &ref_frame_partition, mi_row, mi_col, is_small_sb,
- scaled_ref_last);
+ &ref_frame_partition, &sf_no_scale, mi_row, mi_col,
+ is_small_sb, scaled_ref_last);
MB_MODE_INFO *mi = xd->mi[0];
// Use reference SB directly for zero mv.
@@ -1690,8 +1738,14 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
if (cpi->noise_estimate.enabled)
noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate);
- if (low_res && threshold_4x4avg < INT64_MAX)
- CHECK_MEM_ERROR(cm, vt2, aom_malloc(sizeof(*vt2)));
+ if (low_res && threshold_4x4avg < INT64_MAX) {
+ vt2 = aom_malloc(sizeof(*vt2));
+ if (!vt2) {
+ aom_free(vt);
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating partition buffer vt2");
+ }
+ }
// Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
// for splits.
fill_variance_tree_leaves(cpi, x, vt, force_split, avg_16x16, maxvar_16x16,
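
The vt2 allocation above replaces CHECK_MEM_ERROR with an explicit failure path so that the previously allocated vt buffer is freed before the error is reported through the per-thread error_info. A minimal sketch of the pattern, assuming an error reporter that, like aom_internal_error(), does not return:

    #include <cstdlib>

    // Sketch of the allocate-or-clean-up pattern used for vt2 above: on
    // failure, release the buffer that was already allocated (vt), then
    // report the error. report_error() is a stand-in for
    // aom_internal_error(), which longjmps and does not return.
    static void *alloc_or_cleanup(size_t size, void *already_allocated,
                                  void (*report_error)(const char *msg)) {
      void *p = malloc(size);
      if (p == nullptr) {
        free(already_allocated);
        report_error("Error allocating partition buffer");
      }
      return p;
    }
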
@@ -1869,8 +1923,8 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
ref_frame_partition, mi_col, mi_row, is_small_sb);
}
- if (vt2) aom_free(vt2);
- if (vt) aom_free(vt);
+ aom_free(vt2);
+ aom_free(vt);
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, choose_var_based_partitioning_time);
#endif
diff --git a/av1/encoder/x86/ml_avx2.c b/av1/encoder/x86/ml_avx2.c
new file mode 100644
index 000000000..643270841
--- /dev/null
+++ b/av1/encoder/x86/ml_avx2.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdbool.h>
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/x86/ml_sse3.h"
+
+#define CALC_OUTPUT_FOR_2ROWS \
+ const int index = weight_idx + (2 * i * tot_num_inputs); \
+ const __m256 weight0 = _mm256_loadu_ps(&weights[index]); \
+ const __m256 weight1 = _mm256_loadu_ps(&weights[index + tot_num_inputs]); \
+ const __m256 mul0 = _mm256_mul_ps(inputs256, weight0); \
+ const __m256 mul1 = _mm256_mul_ps(inputs256, weight1); \
+ hadd[i] = _mm256_hadd_ps(mul0, mul1);
+
+static INLINE void nn_propagate_8to1(
+ const float *const inputs, const float *const weights,
+ const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+ int num_outputs, float *const output_nodes, int is_clip_required) {
+ // Process one output row at a time.
+ for (int out = 0; out < num_outputs; out++) {
+ __m256 in_result = _mm256_setzero_ps();
+ float bias_val = bias[out];
+ for (int in = 0; in < num_inputs_to_process; in += 8) {
+ const __m256 inputs256 = _mm256_loadu_ps(&inputs[in]);
+ const int weight_idx = in + (out * tot_num_inputs);
+ const __m256 weight0 = _mm256_loadu_ps(&weights[weight_idx]);
+ const __m256 mul0 = _mm256_mul_ps(inputs256, weight0);
+ in_result = _mm256_add_ps(in_result, mul0);
+ }
+ const __m128 low_128 = _mm256_castps256_ps128(in_result);
+ const __m128 high_128 = _mm256_extractf128_ps(in_result, 1);
+ const __m128 sum_par_0 = _mm_add_ps(low_128, high_128);
+ const __m128 sum_par_1 = _mm_hadd_ps(sum_par_0, sum_par_0);
+ const __m128 sum_tot =
+ _mm_add_ps(_mm_shuffle_ps(sum_par_1, sum_par_1, 0x99), sum_par_1);
+
+ bias_val += _mm_cvtss_f32(sum_tot);
+ if (is_clip_required) bias_val = AOMMAX(bias_val, 0);
+ output_nodes[out] = bias_val;
+ }
+}
+
+static INLINE void nn_propagate_8to4(
+ const float *const inputs, const float *const weights,
+ const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+ int num_outputs, float *const output_nodes, int is_clip_required) {
+ __m256 hadd[2];
+ for (int out = 0; out < num_outputs; out += 4) {
+ __m128 bias_reg = _mm_loadu_ps(&bias[out]);
+ __m128 in_result = _mm_setzero_ps();
+ for (int in = 0; in < num_inputs_to_process; in += 8) {
+ const __m256 inputs256 = _mm256_loadu_ps(&inputs[in]);
+ const int weight_idx = in + (out * tot_num_inputs);
+      // Process two output rows at a time.
+ for (int i = 0; i < 2; i++) {
+ CALC_OUTPUT_FOR_2ROWS
+ }
+
+ const __m256 sum_par = _mm256_hadd_ps(hadd[0], hadd[1]);
+ const __m128 low_128 = _mm256_castps256_ps128(sum_par);
+ const __m128 high_128 = _mm256_extractf128_ps(sum_par, 1);
+ const __m128 result = _mm_add_ps(low_128, high_128);
+
+ in_result = _mm_add_ps(in_result, result);
+ }
+
+ in_result = _mm_add_ps(in_result, bias_reg);
+ if (is_clip_required) in_result = _mm_max_ps(in_result, _mm_setzero_ps());
+ _mm_storeu_ps(&output_nodes[out], in_result);
+ }
+}
+
+static INLINE void nn_propagate_8to8(
+ const float *const inputs, const float *const weights,
+ const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+ int num_outputs, float *const output_nodes, int is_clip_required) {
+ __m256 hadd[4];
+ for (int out = 0; out < num_outputs; out += 8) {
+ __m256 bias_reg = _mm256_loadu_ps(&bias[out]);
+ __m256 in_result = _mm256_setzero_ps();
+ for (int in = 0; in < num_inputs_to_process; in += 8) {
+ const __m256 inputs256 = _mm256_loadu_ps(&inputs[in]);
+ const int weight_idx = in + (out * tot_num_inputs);
+ // Process two output rows at a time.
+ for (int i = 0; i < 4; i++) {
+ CALC_OUTPUT_FOR_2ROWS
+ }
+ const __m256 hh0 = _mm256_hadd_ps(hadd[0], hadd[1]);
+ const __m256 hh1 = _mm256_hadd_ps(hadd[2], hadd[3]);
+
+ __m256 ht_0 = _mm256_permute2f128_ps(hh0, hh1, 0x20);
+ __m256 ht_1 = _mm256_permute2f128_ps(hh0, hh1, 0x31);
+
+ __m256 result = _mm256_add_ps(ht_0, ht_1);
+ in_result = _mm256_add_ps(in_result, result);
+ }
+ in_result = _mm256_add_ps(in_result, bias_reg);
+ if (is_clip_required)
+ in_result = _mm256_max_ps(in_result, _mm256_setzero_ps());
+ _mm256_storeu_ps(&output_nodes[out], in_result);
+ }
+}
+
+static INLINE void nn_propagate_input_multiple_of_8(
+ const float *const inputs, const float *const weights,
+ const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+ bool is_output_layer, int num_outputs, float *const output_nodes) {
+  // Clip the output (ReLU) only for hidden layers, and only when this call
+  // accumulates all of the layer's inputs; for partial passes the clipping
+  // happens after the remaining inputs are processed.
+ const int is_clip_required =
+ !is_output_layer && num_inputs_to_process == tot_num_inputs;
+ if (num_outputs % 8 == 0) {
+ nn_propagate_8to8(inputs, weights, bias, num_inputs_to_process,
+ tot_num_inputs, num_outputs, output_nodes,
+ is_clip_required);
+ } else if (num_outputs % 4 == 0) {
+ nn_propagate_8to4(inputs, weights, bias, num_inputs_to_process,
+ tot_num_inputs, num_outputs, output_nodes,
+ is_clip_required);
+ } else {
+ nn_propagate_8to1(inputs, weights, bias, num_inputs_to_process,
+ tot_num_inputs, num_outputs, output_nodes,
+ is_clip_required);
+ }
+}
+
+void av1_nn_predict_avx2(const float *input_nodes,
+ const NN_CONFIG *const nn_config, int reduce_prec,
+ float *const output) {
+ float buf[2][NN_MAX_NODES_PER_LAYER];
+ int buf_index = 0;
+ int num_inputs = nn_config->num_inputs;
+ assert(num_inputs > 0 && num_inputs <= NN_MAX_NODES_PER_LAYER);
+
+ for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) {
+ const float *layer_weights = nn_config->weights[layer];
+ const float *layer_bias = nn_config->bias[layer];
+ bool is_output_layer = layer == nn_config->num_hidden_layers;
+ float *const output_nodes = is_output_layer ? output : &buf[buf_index][0];
+ const int num_outputs = is_output_layer
+ ? nn_config->num_outputs
+ : nn_config->num_hidden_nodes[layer];
+ assert(num_outputs > 0 && num_outputs <= NN_MAX_NODES_PER_LAYER);
+
+    // When the number of inputs is a multiple of 8, use AVX2 intrinsics.
+ if (num_inputs % 8 == 0) {
+ nn_propagate_input_multiple_of_8(input_nodes, layer_weights, layer_bias,
+ num_inputs, num_inputs, is_output_layer,
+ num_outputs, output_nodes);
+ } else {
+      // When the number of inputs is not a multiple of 8, use a hybrid
+      // approach of AVX2 and SSE3 as needed.
+ const int in_mul_8 = num_inputs / 8;
+ const int num_inputs_to_process = in_mul_8 * 8;
+ int bias_is_considered = 0;
+ if (in_mul_8) {
+ nn_propagate_input_multiple_of_8(
+ input_nodes, layer_weights, layer_bias, num_inputs_to_process,
+ num_inputs, is_output_layer, num_outputs, output_nodes);
+ bias_is_considered = 1;
+ }
+
+ const float *out_temp = bias_is_considered ? output_nodes : layer_bias;
+ const int input_remaining = num_inputs % 8;
+ if (input_remaining % 4 == 0 && num_outputs % 8 == 0) {
+ for (int out = 0; out < num_outputs; out += 8) {
+ __m128 out_h = _mm_loadu_ps(&out_temp[out + 4]);
+ __m128 out_l = _mm_loadu_ps(&out_temp[out]);
+ for (int in = in_mul_8 * 8; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to8_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &out_h, &out_l, num_inputs);
+ }
+ if (!is_output_layer) {
+ const __m128 zero = _mm_setzero_ps();
+ out_h = _mm_max_ps(out_h, zero);
+ out_l = _mm_max_ps(out_l, zero);
+ }
+ _mm_storeu_ps(&output_nodes[out + 4], out_h);
+ _mm_storeu_ps(&output_nodes[out], out_l);
+ }
+ } else if (input_remaining % 4 == 0 && num_outputs % 4 == 0) {
+ for (int out = 0; out < num_outputs; out += 4) {
+ __m128 outputs = _mm_loadu_ps(&out_temp[out]);
+ for (int in = in_mul_8 * 8; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to4_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &outputs, num_inputs);
+ }
+ if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps());
+ _mm_storeu_ps(&output_nodes[out], outputs);
+ }
+ } else if (input_remaining % 4 == 0) {
+ for (int out = 0; out < num_outputs; out++) {
+ __m128 outputs = _mm_load1_ps(&out_temp[out]);
+ for (int in = in_mul_8 * 8; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to1_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &outputs);
+ }
+ if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps());
+ output_nodes[out] = _mm_cvtss_f32(outputs);
+ }
+ } else {
+ // Use SSE instructions for scalar operations to avoid the latency
+ // of swapping between SIMD and FPU modes.
+ for (int out = 0; out < num_outputs; out++) {
+ __m128 outputs = _mm_load1_ps(&out_temp[out]);
+ for (int in_node = in_mul_8 * 8; in_node < num_inputs; in_node++) {
+ __m128 input = _mm_load1_ps(&input_nodes[in_node]);
+ __m128 weight =
+ _mm_load1_ps(&layer_weights[num_inputs * out + in_node]);
+ outputs = _mm_add_ps(outputs, _mm_mul_ps(input, weight));
+ }
+ if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps());
+ output_nodes[out] = _mm_cvtss_f32(outputs);
+ }
+ }
+ }
+    // Before processing the next layer, treat the output of the current
+    // layer as the input to the next layer.
+ input_nodes = output_nodes;
+ num_inputs = num_outputs;
+ buf_index = 1 - buf_index;
+ }
+ if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
+}
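
The AVX2 kernels in this new file vectorize a fully connected layer: for each output node they accumulate a dot product of the inputs with one weight row, add the bias, and clamp hidden-layer outputs at zero. A scalar reference of one layer, written to match the kernels' intent rather than any libaom function:

    // Scalar reference for one fully connected layer, matching what the
    // nn_propagate_* kernels compute: out[o] = bias[o] + dot(in, W[o]),
    // with a ReLU clamp on hidden layers.
    static void nn_layer_ref(const float *inputs, const float *weights,
                             const float *bias, int num_inputs,
                             int num_outputs, int is_output_layer,
                             float *outputs) {
      for (int o = 0; o < num_outputs; ++o) {
        float sum = bias[o];
        for (int i = 0; i < num_inputs; ++i)
          sum += inputs[i] * weights[o * num_inputs + i];
        outputs[o] = (!is_output_layer && sum < 0.0f) ? 0.0f : sum;
      }
    }
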
diff --git a/av1/encoder/x86/ml_sse3.c b/av1/encoder/x86/ml_sse3.c
index ab69088dc..4748a68d3 100644
--- a/av1/encoder/x86/ml_sse3.c
+++ b/av1/encoder/x86/ml_sse3.c
@@ -11,10 +11,10 @@
#include <stdbool.h>
#include <assert.h>
-#include <pmmintrin.h>
#include "config/av1_rtcd.h"
#include "av1/encoder/ml.h"
+#include "av1/encoder/x86/ml_sse3.h"
// In order to avoid the high-latency of swapping between FPU and SIMD
// operations, we keep the result in a 128-bit register even though we only
@@ -41,9 +41,9 @@ static void nn_propagate_8to1(const float *const inputs,
*output = _mm_add_ps(*output, hadd2);
}
-static void nn_propagate_4to1(const float *const inputs,
- const float *const weights,
- __m128 *const output) {
+void av1_nn_propagate_4to1_sse3(const float *const inputs,
+ const float *const weights,
+ __m128 *const output) {
const __m128 inputs128 = _mm_loadu_ps(inputs);
const __m128 weights128 = _mm_loadu_ps(weights);
@@ -58,9 +58,9 @@ static void nn_propagate_4to1(const float *const inputs,
*output = _mm_add_ps(*output, hadd2);
}
-static void nn_propagate_4to4(const float *const inputs,
- const float *const weights, __m128 *const outputs,
- const int num_inputs) {
+void av1_nn_propagate_4to4_sse3(const float *const inputs,
+ const float *const weights,
+ __m128 *const outputs, const int num_inputs) {
const __m128 inputs128 = _mm_loadu_ps(inputs);
__m128 hadd[2];
@@ -80,9 +80,9 @@ static void nn_propagate_4to4(const float *const inputs,
*outputs = _mm_add_ps(*outputs, hh);
}
-static void nn_propagate_4to8(const float *const inputs,
- const float *const weights, __m128 *const out_h,
- __m128 *const out_l, const int num_inputs) {
+void av1_nn_propagate_4to8_sse3(const float *const inputs,
+ const float *const weights, __m128 *const out_h,
+ __m128 *const out_l, const int num_inputs) {
const __m128 inputs128 = _mm_loadu_ps(inputs);
__m128 hadd[4];
@@ -171,9 +171,9 @@ void av1_nn_predict_sse3(const float *input_nodes,
__m128 out_h = _mm_loadu_ps(&layer_bias[out + 4]);
__m128 out_l = _mm_loadu_ps(&layer_bias[out]);
for (int in = 0; in < num_inputs; in += 4) {
- nn_propagate_4to8(&input_nodes[in],
- &layer_weights[out * num_inputs + in], &out_h,
- &out_l, num_inputs);
+ av1_nn_propagate_4to8_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &out_h, &out_l, num_inputs);
}
if (!output_layer) nn_activate8(&out_h, &out_l);
_mm_storeu_ps(&output_nodes[out + 4], out_h);
@@ -194,9 +194,9 @@ void av1_nn_predict_sse3(const float *input_nodes,
for (int out = 0; out < num_outputs; out += 4) {
__m128 outputs = _mm_loadu_ps(&layer_bias[out]);
for (int in = 0; in < num_inputs; in += 4) {
- nn_propagate_4to4(&input_nodes[in],
- &layer_weights[out * num_inputs + in], &outputs,
- num_inputs);
+ av1_nn_propagate_4to4_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &outputs, num_inputs);
}
if (!output_layer) nn_activate4(&outputs);
_mm_storeu_ps(&output_nodes[out], outputs);
@@ -215,8 +215,8 @@ void av1_nn_predict_sse3(const float *input_nodes,
for (int out = 0; out < num_outputs; out++) {
__m128 total = _mm_load1_ps(&layer_bias[out]);
for (int in = 0; in < num_inputs; in += 4) {
- nn_propagate_4to1(&input_nodes[in],
- &layer_weights[out * num_inputs + in], &total);
+ av1_nn_propagate_4to1_sse3(
+ &input_nodes[in], &layer_weights[out * num_inputs + in], &total);
}
if (!output_layer) nn_activate4(&total);
output_nodes[out] = _mm_cvtss_f32(total);
diff --git a/av1/encoder/x86/ml_sse3.h b/av1/encoder/x86/ml_sse3.h
new file mode 100644
index 000000000..f41a2474a
--- /dev/null
+++ b/av1/encoder/x86/ml_sse3.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_ML_SSE3_H_
+#define AOM_AV1_ENCODER_X86_ML_SSE3_H_
+
+#include <pmmintrin.h>
+
+void av1_nn_propagate_4to1_sse3(const float *const inputs,
+ const float *const weights,
+ __m128 *const output);
+
+void av1_nn_propagate_4to4_sse3(const float *const inputs,
+ const float *const weights,
+ __m128 *const outputs, const int num_inputs);
+
+void av1_nn_propagate_4to8_sse3(const float *const inputs,
+ const float *const weights, __m128 *const out_h,
+ __m128 *const out_l, const int num_inputs);
+
+#endif // AOM_AV1_ENCODER_X86_ML_SSE3_H_
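
With these helpers exported, the AVX2 predictor can hand the non-multiple-of-8 input tail to the SSE3 kernels. A sketch of the 4-wide tail pass for one group of four outputs, seeding the accumulator from either the bias or the partial AVX2 sums as the hunk above does; tail_4to4 itself is illustrative, not a libaom function:

    #include <pmmintrin.h>

    #include "av1/encoder/x86/ml_sse3.h"

    // Tail pass for four output nodes when the input count is not a
    // multiple of 8: accumulate the remaining (multiple-of-4) inputs on top
    // of 'seed', which holds either the bias or the partial sums from the
    // AVX2 pass, mirroring the hybrid path in av1_nn_predict_avx2().
    static void tail_4to4(const float *inputs, const float *weights,
                          const float *seed, int in_start, int num_inputs,
                          int out, int is_output_layer, float *outputs) {
      __m128 acc = _mm_loadu_ps(&seed[out]);
      for (int in = in_start; in < num_inputs; in += 4) {
        av1_nn_propagate_4to4_sse3(&inputs[in],
                                   &weights[out * num_inputs + in], &acc,
                                   num_inputs);
      }
      if (!is_output_layer) acc = _mm_max_ps(acc, _mm_setzero_ps());
      _mm_storeu_ps(&outputs[out], acc);
    }
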
diff --git a/av1/ratectrl_rtc.cc b/av1/ratectrl_rtc.cc
index a3ec6f686..62d6e740a 100644
--- a/av1/ratectrl_rtc.cc
+++ b/av1/ratectrl_rtc.cc
@@ -39,6 +39,8 @@ AV1RateControlRtcConfig::AV1RateControlRtcConfig() {
undershoot_pct = overshoot_pct = 50;
max_intra_bitrate_pct = 50;
max_inter_bitrate_pct = 0;
+ frame_drop_thresh = 0;
+ max_consec_drop = 0;
framerate = 30.0;
ss_number_layers = 1;
ts_number_layers = 1;
@@ -124,7 +126,8 @@ bool AV1RateControlRTC::InitRateControl(const AV1RateControlRtcConfig &rc_cfg) {
oxcf->pass = AOM_RC_ONE_PASS;
oxcf->q_cfg.aq_mode = rc_cfg.aq_mode ? CYCLIC_REFRESH_AQ : NO_AQ;
oxcf->tune_cfg.content = AOM_CONTENT_DEFAULT;
- oxcf->rc_cfg.drop_frames_water_mark = 0;
+ oxcf->rc_cfg.drop_frames_water_mark = rc_cfg.frame_drop_thresh;
+ rc->max_consec_drop = rc_cfg.max_consec_drop;
oxcf->tool_cfg.bit_depth = AOM_BITS_8;
oxcf->tool_cfg.superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC;
oxcf->algo_cfg.loopfilter_control = LOOPFILTER_ALL;
@@ -185,9 +188,15 @@ bool AV1RateControlRTC::UpdateRateControl(
oxcf->rc_cfg.maximum_buffer_size_ms = rc_cfg.buf_sz;
oxcf->rc_cfg.under_shoot_pct = rc_cfg.undershoot_pct;
oxcf->rc_cfg.over_shoot_pct = rc_cfg.overshoot_pct;
+ oxcf->rc_cfg.drop_frames_water_mark = rc_cfg.frame_drop_thresh;
+ rc->max_consec_drop = rc_cfg.max_consec_drop;
oxcf->rc_cfg.max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct;
oxcf->rc_cfg.max_inter_bitrate_pct = rc_cfg.max_inter_bitrate_pct;
cpi_->framerate = rc_cfg.framerate;
+ if (rc_cfg.is_screen) {
+ cpi_->oxcf.tune_cfg.content = AOM_CONTENT_SCREEN;
+ cpi_->is_screen_content_type = 1;
+ }
cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers;
cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers;
set_primary_rc_buffer_sizes(oxcf, cpi_->ppi);
@@ -226,7 +235,8 @@ bool AV1RateControlRTC::UpdateRateControl(
return true;
}
-void AV1RateControlRTC::ComputeQP(const AV1FrameParamsRTC &frame_params) {
+FrameDropDecision AV1RateControlRTC::ComputeQP(
+ const AV1FrameParamsRTC &frame_params) {
AV1_COMMON *const cm = &cpi_->common;
int width, height;
GF_GROUP *const gf_group = &cpi_->ppi->gf_group;
@@ -292,14 +302,25 @@ void AV1RateControlRTC::ComputeQP(const AV1FrameParamsRTC &frame_params) {
}
}
av1_rc_set_frame_target(cpi_, target, cm->width, cm->height);
-
- int bottom_index, top_index;
+  // Always drop for a spatial enhancement layer if its layer bandwidth is 0.
+  // Otherwise check for frame dropping based on the buffer level in
+  // av1_rc_drop_frame().
+ if ((cpi_->svc.spatial_layer_id > 0 &&
+ cpi_->oxcf.rc_cfg.target_bandwidth == 0) ||
+ av1_rc_drop_frame(cpi_)) {
+ cpi_->is_dropped_frame = true;
+ av1_rc_postencode_update_drop_frame(cpi_);
+ cpi_->frame_index_set.show_frame_count++;
+ cpi_->common.current_frame.frame_number++;
+ return FrameDropDecision::kDrop;
+ }
+ int bottom_index = 0, top_index = 0;
cpi_->common.quant_params.base_qindex =
av1_rc_pick_q_and_bounds(cpi_, cm->width, cm->height,
cpi_->gf_frame_index, &bottom_index, &top_index);
-
if (cpi_->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ)
av1_cyclic_refresh_setup(cpi_);
+ return FrameDropDecision::kOk;
}
int AV1RateControlRTC::GetQP() const {
@@ -327,12 +348,17 @@ AV1CdefInfo AV1RateControlRTC::GetCdefInfo() const {
return cdef_level;
}
-signed char *AV1RateControlRTC::GetCyclicRefreshMap() const {
- return cpi_->cyclic_refresh->map;
-}
-
-int *AV1RateControlRTC::GetDeltaQ() const {
- return cpi_->cyclic_refresh->qindex_delta;
+bool AV1RateControlRTC::GetSegmentationData(
+ AV1SegmentationData *segmentation_data) const {
+ if (cpi_->oxcf.q_cfg.aq_mode == 0) {
+ return false;
+ }
+ segmentation_data->segmentation_map = cpi_->enc_seg.map;
+ segmentation_data->segmentation_map_size =
+ cpi_->common.mi_params.mi_rows * cpi_->common.mi_params.mi_cols;
+ segmentation_data->delta_q = cpi_->cyclic_refresh->qindex_delta;
+ segmentation_data->delta_q_size = 3u;
+ return true;
}
void AV1RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) {
diff --git a/av1/ratectrl_rtc.h b/av1/ratectrl_rtc.h
index e96e21008..1894469dd 100644
--- a/av1/ratectrl_rtc.h
+++ b/av1/ratectrl_rtc.h
@@ -33,7 +33,7 @@ struct AV1RateControlRtcConfig {
int width;
int height;
// Flag indicating if the content is screen or not.
- bool is_screen;
+ bool is_screen = false;
// 0-63
int max_quantizer;
int min_quantizer;
@@ -45,6 +45,8 @@ struct AV1RateControlRtcConfig {
int overshoot_pct;
int max_intra_bitrate_pct;
int max_inter_bitrate_pct;
+ int frame_drop_thresh;
+ int max_consec_drop;
double framerate;
int layer_target_bitrate[kAV1MaxLayers];
int ts_rate_decimator[kAV1MaxTemporalLayers];
@@ -77,6 +79,18 @@ struct AV1CdefInfo {
int damping;
};
+struct AV1SegmentationData {
+ const uint8_t *segmentation_map;
+ size_t segmentation_map_size;
+ const int *delta_q;
+ size_t delta_q_size;
+};
+
+enum class FrameDropDecision {
+ kOk, // Frame is encoded.
+ kDrop, // Frame is dropped.
+};
+
class AV1RateControlRTC {
public:
static std::unique_ptr<AV1RateControlRTC> Create(
@@ -90,9 +104,13 @@ class AV1RateControlRTC {
AV1LoopfilterLevel GetLoopfilterLevel() const;
// GetCdefInfo() needs to be called after ComputeQP()
AV1CdefInfo GetCdefInfo() const;
- signed char *GetCyclicRefreshMap() const;
- int *GetDeltaQ() const;
- void ComputeQP(const AV1FrameParamsRTC &frame_params);
+ // Returns the segmentation map used for cyclic refresh, based on 4x4 blocks.
+ bool GetSegmentationData(AV1SegmentationData *segmentation_data) const;
+  // ComputeQP computes the QP if the frame is not dropped (returns kOk);
+  // otherwise it returns kDrop, and GetQP() and PostEncodeUpdate() must not
+  // be called (av1_rc_postencode_update_drop_frame() has already been
+  // invoked by ComputeQP() when the drop is decided).
+ FrameDropDecision ComputeQP(const AV1FrameParamsRTC &frame_params);
// Feedback to rate control with the size of current encoded frame
void PostEncodeUpdate(uint64_t encoded_frame_size);
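
Callers of the RTC rate controller now branch on ComputeQP()'s return value rather than always encoding. A hedged usage sketch under the assumption that the class lives in namespace aom (libaom's convention); the encode step itself is a placeholder:

    #include <cstdint>
    #include <memory>

    #include "av1/ratectrl_rtc.h"

    // Hypothetical encoder entry point; stands in for the caller's encode
    // step and is not part of libaom.
    extern uint64_t EncodeWithQP(int qp);

    // Per-frame driver using the updated API. On kDrop, GetQP() and
    // PostEncodeUpdate() are skipped, since rate control already accounted
    // for the dropped frame inside ComputeQP().
    bool EncodeOneFrame(aom::AV1RateControlRTC *rc,
                        const aom::AV1FrameParamsRTC &frame_params) {
      if (rc->ComputeQP(frame_params) == aom::FrameDropDecision::kDrop)
        return false;  // frame dropped
      const int qp = rc->GetQP();
      rc->PostEncodeUpdate(EncodeWithQP(qp));
      return true;
    }
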
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index 5058022c7..85390d5dc 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -29,9 +29,14 @@ set_aom_detect_var(AOM_ARCH_PPC 0 "Enables PPC architecture.")
set_aom_detect_var(AOM_ARCH_X86 0 "Enables X86 architecture.")
set_aom_detect_var(AOM_ARCH_X86_64 0 "Enables X86_64 architecture.")
-# ARM feature flags.
-set_aom_detect_var(HAVE_NEON 0 "Enables NEON intrinsics optimizations.")
+# Arm/AArch64 feature flags.
+set_aom_detect_var(HAVE_NEON 0 "Enables Neon intrinsics optimizations.")
set_aom_detect_var(HAVE_ARM_CRC32 0 "Enables Arm CRC32 optimizations.")
+set_aom_detect_var(HAVE_NEON_DOTPROD 0
+ "Enables Armv8.2-A Neon dotprod intrinsics optimizations.")
+set_aom_detect_var(HAVE_NEON_I8MM 0
+ "Enables Armv8.2-A Neon i8mm intrinsics optimizations.")
+set_aom_detect_var(HAVE_SVE 0 "Enables Armv8.2-A SVE intrinsics optimizations.")
# PPC feature flags.
set_aom_detect_var(HAVE_VSX 0 "Enables VSX optimizations.")
@@ -189,8 +194,18 @@ set_aom_option_var(ENABLE_TOOLS "Enable applications in tools sub directory."
set_aom_option_var(ENABLE_WERROR "Converts warnings to errors at compile time."
OFF)
-# ARM assembly/intrinsics flags.
-set_aom_option_var(ENABLE_NEON "Enables NEON optimizations on ARM targets." ON)
+# Arm/AArch64 assembly/intrinsics flags.
+set_aom_option_var(ENABLE_NEON
+ "Enables Neon optimizations on Arm/AArch64 targets." ON)
+set_aom_option_var(ENABLE_ARM_CRC32 "Enables Arm CRC32 optimizations." ON)
+set_aom_option_var(
+ ENABLE_NEON_DOTPROD
+ "Enables Armv8.2-A Neon dotprod optimizations on AArch64 targets." ON)
+set_aom_option_var(
+ ENABLE_NEON_I8MM
+ "Enables Armv8.2-A Neon i8mm optimizations on AArch64 targets." ON)
+set_aom_option_var(ENABLE_SVE
+ "Enables Armv8.2-A SVE optimizations on AArch64 targets." ON)
# VSX intrinsics flags.
set_aom_option_var(ENABLE_VSX "Enables VSX optimizations on PowerPC targets."
diff --git a/build/cmake/aom_configure.cmake b/build/cmake/aom_configure.cmake
index aaef2c310..917e7cac5 100644
--- a/build/cmake/aom_configure.cmake
+++ b/build/cmake/aom_configure.cmake
@@ -13,8 +13,6 @@ if(AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_)
endif() # AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_
set(AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_ 1)
-include(FindGit)
-include(FindPerl)
include(FindThreads)
include("${AOM_ROOT}/build/cmake/aom_config_defaults.cmake")
@@ -186,7 +184,9 @@ if(AOM_TARGET_CPU STREQUAL "x86" OR AOM_TARGET_CPU STREQUAL "x86_64")
string(STRIP "${AOM_AS_FLAGS}" AOM_AS_FLAGS)
elseif(AOM_TARGET_CPU MATCHES "arm")
if(AOM_TARGET_SYSTEM STREQUAL "Darwin")
- set(CMAKE_ASM_COMPILER as)
+ if(NOT CMAKE_ASM_COMPILER)
+ set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER})
+ endif()
set(AOM_AS_FLAGS -arch ${AOM_TARGET_CPU} -isysroot ${CMAKE_OSX_SYSROOT})
elseif(AOM_TARGET_SYSTEM STREQUAL "Windows")
if(NOT CMAKE_ASM_COMPILER)
@@ -214,7 +214,6 @@ elseif(AOM_TARGET_CPU MATCHES "arm")
endif()
if(CONFIG_ANALYZER)
- include(FindwxWidgets)
find_package(wxWidgets REQUIRED adv base core)
include(${wxWidgets_USE_FILE})
endif()
@@ -361,7 +360,7 @@ else()
# This combination has more stack overhead, so we account for it by
# providing higher stack limit than usual.
- add_c_flag_if_supported("-Wstack-usage=170000")
+ add_c_flag_if_supported("-Wstack-usage=285000")
add_cxx_flag_if_supported("-Wstack-usage=270000")
elseif(CONFIG_RD_DEBUG) # Another case where higher stack usage is expected.
add_c_flag_if_supported("-Wstack-usage=135000")
diff --git a/build/cmake/aom_install.cmake b/build/cmake/aom_install.cmake
index b02c7b9af..2c263e96b 100644
--- a/build/cmake/aom_install.cmake
+++ b/build/cmake/aom_install.cmake
@@ -46,12 +46,12 @@ macro(setup_aom_install_targets)
-DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}
-DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
-DCMAKE_PROJECT_NAME=${CMAKE_PROJECT_NAME}
+ -DCMAKE_THREAD_LIBS_INIT=${CMAKE_THREAD_LIBS_INIT}
-DCONFIG_MULTITHREAD=${CONFIG_MULTITHREAD}
-DCONFIG_TUNE_VMAF=${CONFIG_TUNE_VMAF}
-DCONFIG_TUNE_BUTTERAUGLI=${CONFIG_TUNE_BUTTERAUGLI}
-DCONFIG_SALIENCY_MAP=${CONFIG_SALIENCY_MAP}
-DCONFIG_TFLITE=${CONFIG_TFLITE}
- -DHAVE_PTHREAD_H=${HAVE_PTHREAD_H}
-P
"${AOM_ROOT}/build/cmake/pkg_config.cmake"
COMMENT "Writing aom.pc"
diff --git a/build/cmake/aom_optimization.cmake b/build/cmake/aom_optimization.cmake
index 6b0c55acd..0f93228ee 100644
--- a/build/cmake/aom_optimization.cmake
+++ b/build/cmake/aom_optimization.cmake
@@ -270,7 +270,7 @@ function(add_rtcd_build_step config output source symbol)
--arch=${AOM_TARGET_CPU}
--sym=${symbol} ${AOM_RTCD_FLAGS}
--config=${AOM_CONFIG_DIR}/config/aom_config.h ${config} > ${output}
- DEPENDS ${config}
+ DEPENDS "${AOM_ROOT}/build/cmake/rtcd.pl" ${config}
COMMENT "Generating ${output}"
WORKING_DIRECTORY ${AOM_CONFIG_DIR}
VERBATIM)
diff --git a/build/cmake/cpu.cmake b/build/cmake/cpu.cmake
index 799a313fa..a9b7a6707 100644
--- a/build/cmake/cpu.cmake
+++ b/build/cmake/cpu.cmake
@@ -9,11 +9,60 @@
# can obtain it at www.aomedia.org/license/patent.
#
-if("${AOM_TARGET_CPU}" MATCHES "^arm")
+if("${AOM_TARGET_CPU}" STREQUAL "arm64")
set(AOM_ARCH_ARM 1)
- if("${AOM_TARGET_CPU}" STREQUAL "arm64")
- set(AOM_ARCH_AARCH64 1)
+ set(AOM_ARCH_AARCH64 1)
+ set(RTCD_ARCH_ARM "yes")
+
+ set(ARM64_FLAVORS "NEON;ARM_CRC32;NEON_DOTPROD;NEON_I8MM;SVE")
+ set(AOM_ARM_CRC32_DEFAULT_FLAG "-march=armv8-a+crc")
+ set(AOM_NEON_DOTPROD_DEFAULT_FLAG "-march=armv8.2-a+dotprod")
+ set(AOM_NEON_I8MM_DEFAULT_FLAG "-march=armv8.2-a+dotprod+i8mm")
+ set(AOM_SVE_DEFAULT_FLAG "-march=armv8.2-a+dotprod+i8mm+sve")
+
+ # Check that the compiler flag to enable each flavor is supported by the
+ # compiler. This may not be the case for new architecture features on old
+ # compiler versions.
+ foreach(flavor ${ARM64_FLAVORS})
+ if(ENABLE_${flavor} AND NOT DEFINED AOM_${flavor}_FLAG)
+ set(AOM_${flavor}_FLAG "${AOM_${flavor}_DEFAULT_FLAG}")
+ unset(FLAG_SUPPORTED)
+ check_c_compiler_flag("${AOM_${flavor}_FLAG}" FLAG_SUPPORTED)
+ if(NOT ${FLAG_SUPPORTED})
+ set(ENABLE_${flavor} 0)
+ endif()
+ endif()
+ endforeach()
+
+ # SVE requires that the Neon-SVE bridge header is also available.
+ if(ENABLE_SVE)
+    set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
+ set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${AOM_SVE_FLAG}")
+ aom_check_source_compiles("arm_neon_sve_bridge_available" "
+#ifndef __ARM_NEON_SVE_BRIDGE
+#error 1
+#endif
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>" HAVE_SVE_HEADERS)
+    set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
+ if(HAVE_SVE_HEADERS EQUAL 0)
+ set(ENABLE_SVE 0)
+ endif()
endif()
+
+ foreach(flavor ${ARM64_FLAVORS})
+ if(ENABLE_${flavor})
+ set(HAVE_${flavor} 1)
+ set(RTCD_HAVE_${flavor} "yes")
+ else()
+ set(HAVE_${flavor} 0)
+ string(TOLOWER ${flavor} flavor)
+ set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-${flavor})
+ endif()
+ endforeach()
+
+elseif("${AOM_TARGET_CPU}" MATCHES "^arm")
+ set(AOM_ARCH_ARM 1)
set(RTCD_ARCH_ARM "yes")
if(ENABLE_NEON)
@@ -24,18 +73,6 @@ if("${AOM_TARGET_CPU}" MATCHES "^arm")
set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-neon)
endif()
- check_c_source_compiles("
- #if !defined(__ARM_FEATURE_CRC32) || __ARM_FEATURE_CRC32 != 1
- #error \"CRC32 is unavailable.\"
- #endif
- int main(void) { return 0; }" HAVE_CRC32)
- if(HAVE_CRC32)
- set(HAVE_ARM_CRC32 1)
- else()
- set(HAVE_ARM_CRC32 0)
- set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-arm_crc32)
- endif()
-
elseif("${AOM_TARGET_CPU}" MATCHES "ppc")
set(AOM_ARCH_PPC 1)
set(RTCD_ARCH_PPC "yes")
diff --git a/build/cmake/pkg_config.cmake b/build/cmake/pkg_config.cmake
index e8fff2e77..c4f94808a 100644
--- a/build/cmake/pkg_config.cmake
+++ b/build/cmake/pkg_config.cmake
@@ -13,7 +13,7 @@ cmake_minimum_required(VERSION 3.5)
set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "CMAKE_INSTALL_PREFIX"
"CMAKE_INSTALL_BINDIR" "CMAKE_INSTALL_INCLUDEDIR"
"CMAKE_INSTALL_LIBDIR" "CMAKE_PROJECT_NAME"
- "CONFIG_MULTITHREAD" "HAVE_PTHREAD_H")
+ "CONFIG_MULTITHREAD")
foreach(arg ${REQUIRED_ARGS})
if("${${arg}}" STREQUAL "")
@@ -60,8 +60,9 @@ if(CONFIG_TUNE_BUTTERAUGLI)
endif()
file(APPEND "${pkgconfig_file}" "\nConflicts:\n")
file(APPEND "${pkgconfig_file}" "Libs: -L\${libdir} -l${pkg_name}\n")
-if(CONFIG_MULTITHREAD AND HAVE_PTHREAD_H)
- file(APPEND "${pkgconfig_file}" "Libs.private: -lm -lpthread\n")
+if(CONFIG_MULTITHREAD AND CMAKE_THREAD_LIBS_INIT)
+ file(APPEND "${pkgconfig_file}"
+ "Libs.private: -lm ${CMAKE_THREAD_LIBS_INIT}\n")
else()
file(APPEND "${pkgconfig_file}" "Libs.private: -lm\n")
endif()
diff --git a/build/cmake/rtcd.pl b/build/cmake/rtcd.pl
index bd3b9d534..1cf52f076 100755
--- a/build/cmake/rtcd.pl
+++ b/build/cmake/rtcd.pl
@@ -392,8 +392,9 @@ if ($opts{arch} eq 'x86') {
@ALL_ARCHS = filter(qw/neon/);
arm;
} elsif ($opts{arch} eq 'arm64' ) {
- @ALL_ARCHS = filter(qw/neon arm_crc32/);
- &require(@ALL_ARCHS);
+ @ALL_ARCHS = filter(qw/neon arm_crc32 neon_dotprod neon_i8mm sve/);
+ @REQUIRES = filter(qw/neon/);
+ &require(@REQUIRES);
arm;
} elsif ($opts{arch} eq 'ppc') {
@ALL_ARCHS = filter(qw/vsx/);
diff --git a/build/cmake/toolchains/android.cmake b/build/cmake/toolchains/android.cmake
index 4d38c9a4c..fb086856a 100644
--- a/build/cmake/toolchains/android.cmake
+++ b/build/cmake/toolchains/android.cmake
@@ -46,8 +46,6 @@ endif()
if(ANDROID_ABI MATCHES "^arm")
set(CMAKE_ASM_COMPILER as)
- # No runtime cpu detect for arm targets.
- set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
elseif(ANDROID_ABI MATCHES "^x86")
set(CMAKE_ASM_NASM_COMPILER yasm)
endif()
diff --git a/build/cmake/toolchains/arm-ios-common.cmake b/build/cmake/toolchains/arm-ios-common.cmake
index 62ca1155e..2c433befd 100644
--- a/build/cmake/toolchains/arm-ios-common.cmake
+++ b/build/cmake/toolchains/arm-ios-common.cmake
@@ -21,7 +21,4 @@ set(CMAKE_CXX_COMPILER clang++)
set(CMAKE_CXX_FLAGS_INIT "-arch ${CMAKE_SYSTEM_PROCESSOR}")
set(CMAKE_EXE_LINKER_FLAGS_INIT "-arch ${CMAKE_SYSTEM_PROCESSOR}")
-# No runtime cpu detect for arm*-ios targets.
-set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
-
# TODO(tomfinegan): Handle bit code embedding.
diff --git a/build/cmake/toolchains/arm64-linux-clang.cmake b/build/cmake/toolchains/arm64-linux-clang.cmake
new file mode 100644
index 000000000..b4645cc09
--- /dev/null
+++ b/build/cmake/toolchains/arm64-linux-clang.cmake
@@ -0,0 +1,30 @@
+#
+# Copyright (c) 2023, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_LINUX_CLANG_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_LINUX_CLANG_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_LINUX_CLANG_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+set(TRIPLE aarch64-linux-gnu)
+
+set(CMAKE_C_COMPILER clang)
+set(CMAKE_C_COMPILER_TARGET ${TRIPLE})
+
+set(CMAKE_CXX_COMPILER clang++)
+set(CMAKE_CXX_COMPILER_TARGET ${TRIPLE})
+
+set(CMAKE_ASM_COMPILER clang)
+set(CMAKE_ASM_COMPILER_TARGET ${TRIPLE})
+
+set(CMAKE_SYSTEM_PROCESSOR "arm64")
diff --git a/build/cmake/toolchains/arm64-linux-gcc.cmake b/build/cmake/toolchains/arm64-linux-gcc.cmake
index 133a96a31..3d0dff025 100644
--- a/build/cmake/toolchains/arm64-linux-gcc.cmake
+++ b/build/cmake/toolchains/arm64-linux-gcc.cmake
@@ -38,6 +38,3 @@ set(CMAKE_SYSTEM_PROCESSOR "arm64")
# No intrinsics flag required for arm64-linux-gcc.
set(AOM_NEON_INTRIN_FLAG "")
-
-# No runtime cpu detect for arm64-linux-gcc.
-set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/build/cmake/toolchains/arm64-mingw-gcc.cmake b/build/cmake/toolchains/arm64-mingw-gcc.cmake
index 740042399..95b26d3ce 100644
--- a/build/cmake/toolchains/arm64-mingw-gcc.cmake
+++ b/build/cmake/toolchains/arm64-mingw-gcc.cmake
@@ -34,6 +34,3 @@ endif()
if(NOT CMAKE_RANLIB)
set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
endif()
-
-# No runtime cpu detect for arm64-mingw-gcc.
-set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/build/cmake/toolchains/armv7-linux-gcc.cmake b/build/cmake/toolchains/armv7-linux-gcc.cmake
index 366e1985a..aa0550574 100644
--- a/build/cmake/toolchains/armv7-linux-gcc.cmake
+++ b/build/cmake/toolchains/armv7-linux-gcc.cmake
@@ -44,6 +44,3 @@ set(AOM_AS_FLAGS --defsym ARCHITECTURE=7 -march=armv7-a -mfpu=neon
set(CMAKE_SYSTEM_PROCESSOR "armv7")
set(AOM_NEON_INTRIN_FLAG "-mfpu=neon ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
-
-# No runtime cpu detect for armv7-linux-gcc.
-set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/build/cmake/toolchains/i686-linux-gcc.cmake b/build/cmake/toolchains/i686-linux-gcc.cmake
new file mode 100644
index 000000000..c4f6ab946
--- /dev/null
+++ b/build/cmake/toolchains/i686-linux-gcc.cmake
@@ -0,0 +1,34 @@
+#
+# Copyright (c) 2023, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_I686_LINUX_GCC_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_I686_LINUX_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_I686_LINUX_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+set(CMAKE_SYSTEM_PROCESSOR "x86")
+
+if("${CROSS}" STREQUAL "")
+
+ # Default the cross compiler prefix to one used by Debian and other package
+ # management systems.
+ set(CROSS i686-linux-gnu-)
+endif()
+
+if(NOT CMAKE_C_COMPILER)
+ set(CMAKE_C_COMPILER ${CROSS}gcc)
+endif()
+if(NOT CMAKE_CXX_COMPILER)
+ set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
+if(NOT CMAKE_ASM_COMPILER)
+ set(CMAKE_ASM_COMPILER ${CROSS}as)
+endif()
diff --git a/common/tools_common.c b/common/tools_common.c
index afe4619c0..4d77a1b42 100644
--- a/common/tools_common.c
+++ b/common/tools_common.c
@@ -65,8 +65,8 @@ void aom_tools_warn(const char *fmt, ...) { LOG_ERROR("Warning"); }
void die_codec(aom_codec_ctx_t *ctx, const char *s) {
const char *detail = aom_codec_error_detail(ctx);
- printf("%s: %s\n", s, aom_codec_error(ctx));
- if (detail) printf(" %s\n", detail);
+ fprintf(stderr, "%s: %s\n", s, aom_codec_error(ctx));
+ if (detail) fprintf(stderr, " %s\n", detail);
exit(EXIT_FAILURE);
}
diff --git a/config/arm/config/aom_config.asm b/config/arm/config/aom_config.asm
index ce46e8b52..5bacc3073 100644
--- a/config/arm/config/aom_config.asm
+++ b/config/arm/config/aom_config.asm
@@ -1,5 +1,5 @@
;
-; Copyright (c) 2023, Alliance for Open Media. All rights reserved
+; Copyright (c) 2024, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -76,6 +76,8 @@ HAVE_AVX2 equ 0
HAVE_FEXCEPT equ 1
HAVE_MMX equ 0
HAVE_NEON equ 1
+HAVE_NEON_DOTPROD equ 0
+HAVE_NEON_I8MM equ 0
HAVE_PTHREAD_H equ 1
HAVE_SSE equ 0
HAVE_SSE2 equ 0
@@ -83,6 +85,7 @@ HAVE_SSE3 equ 0
HAVE_SSE4_1 equ 0
HAVE_SSE4_2 equ 0
HAVE_SSSE3 equ 0
+HAVE_SVE equ 0
HAVE_UNISTD_H equ 1
HAVE_VSX equ 0
HAVE_WXWIDGETS equ 0
diff --git a/config/arm/config/aom_config.c b/config/arm/config/aom_config.c
index affe0d737..03251c98e 100644
--- a/config/arm/config/aom_config.c
+++ b/config/arm/config/aom_config.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -9,5 +9,5 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "aom/aom_codec.h"
-static const char* const cfg = "cmake ../libaom -G \"Unix Makefiles\" -DCMAKE_TOOLCHAIN_FILE=\"../libaom/build/cmake/toolchains/armv7-linux-gcc.cmake\" -DCONFIG_AV1_ENCODER=1 -DCONFIG_AV1_HIGHBITDEPTH=1 -DCONFIG_MAX_DECODE_PROFILE=0 -DCONFIG_NORMAL_TILE_MODE=1 -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=16384 -DDECODE_WIDTH_LIMIT=16384 -DENABLE_SSE4_1=0";
+static const char* const cfg = "cmake ../libaom -G \"Unix Makefiles\" -DCMAKE_TOOLCHAIN_FILE=\"../libaom/build/cmake/toolchains/armv7-linux-gcc.cmake\" -DCONFIG_AV1_ENCODER=1 -DCONFIG_AV1_HIGHBITDEPTH=1 -DCONFIG_RUNTIME_CPU_DETECT=0 -DCONFIG_MAX_DECODE_PROFILE=0 -DCONFIG_NORMAL_TILE_MODE=1 -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=16384 -DDECODE_WIDTH_LIMIT=16384 -DENABLE_SSE4_1=0";
const char *aom_codec_build_config(void) {return cfg;}
diff --git a/config/arm/config/aom_config.h b/config/arm/config/aom_config.h
index 661194483..0d089d50b 100644
--- a/config/arm/config/aom_config.h
+++ b/config/arm/config/aom_config.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -78,6 +78,8 @@
#define HAVE_FEXCEPT 1
#define HAVE_MMX 0
#define HAVE_NEON 1
+#define HAVE_NEON_DOTPROD 0
+#define HAVE_NEON_I8MM 0
#define HAVE_PTHREAD_H 1
#define HAVE_SSE 0
#define HAVE_SSE2 0
@@ -85,6 +87,7 @@
#define HAVE_SSE4_1 0
#define HAVE_SSE4_2 0
#define HAVE_SSSE3 0
+#define HAVE_SVE 0
#define HAVE_UNISTD_H 1
#define HAVE_VSX 0
#define HAVE_WXWIDGETS 0
diff --git a/config/arm/config/aom_dsp_rtcd.h b/config/arm/config/aom_dsp_rtcd.h
index ad77b0455..bae3ed37a 100644
--- a/config/arm/config/aom_dsp_rtcd.h
+++ b/config/arm/config/aom_dsp_rtcd.h
@@ -39,7 +39,8 @@ void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride, const uint8_t *
#define aom_blend_a64_hmask aom_blend_a64_hmask_neon
void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh);
-#define aom_blend_a64_mask aom_blend_a64_mask_c
+void aom_blend_a64_mask_neon(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh);
+#define aom_blend_a64_mask aom_blend_a64_mask_neon
void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
@@ -54,7 +55,8 @@ void aom_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width,
#define aom_comp_mask_pred aom_comp_mask_pred_neon
void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
-#define aom_compute_flow_at_point aom_compute_flow_at_point_c
+void aom_compute_flow_at_point_neon(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+#define aom_compute_flow_at_point aom_compute_flow_at_point_neon
void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define aom_convolve8 aom_convolve8_c
@@ -376,139 +378,184 @@ void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8
#define aom_dc_top_predictor_8x8 aom_dc_top_predictor_8x8_neon
void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_comp_avg_pred aom_dist_wtd_comp_avg_pred_c
+void aom_dist_wtd_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_comp_avg_pred aom_dist_wtd_comp_avg_pred_neon
unsigned int aom_dist_wtd_sad128x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad128x128_avg aom_dist_wtd_sad128x128_avg_c
+unsigned int aom_dist_wtd_sad128x128_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad128x128_avg aom_dist_wtd_sad128x128_avg_neon
unsigned int aom_dist_wtd_sad128x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad128x64_avg aom_dist_wtd_sad128x64_avg_c
+unsigned int aom_dist_wtd_sad128x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad128x64_avg aom_dist_wtd_sad128x64_avg_neon
unsigned int aom_dist_wtd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad16x16_avg aom_dist_wtd_sad16x16_avg_c
+unsigned int aom_dist_wtd_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad16x16_avg aom_dist_wtd_sad16x16_avg_neon
unsigned int aom_dist_wtd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad16x32_avg aom_dist_wtd_sad16x32_avg_c
+unsigned int aom_dist_wtd_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad16x32_avg aom_dist_wtd_sad16x32_avg_neon
unsigned int aom_dist_wtd_sad16x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad16x4_avg aom_dist_wtd_sad16x4_avg_c
+unsigned int aom_dist_wtd_sad16x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad16x4_avg aom_dist_wtd_sad16x4_avg_neon
unsigned int aom_dist_wtd_sad16x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad16x64_avg aom_dist_wtd_sad16x64_avg_c
+unsigned int aom_dist_wtd_sad16x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad16x64_avg aom_dist_wtd_sad16x64_avg_neon
unsigned int aom_dist_wtd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad16x8_avg aom_dist_wtd_sad16x8_avg_c
+unsigned int aom_dist_wtd_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad16x8_avg aom_dist_wtd_sad16x8_avg_neon
unsigned int aom_dist_wtd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad32x16_avg aom_dist_wtd_sad32x16_avg_c
+unsigned int aom_dist_wtd_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad32x16_avg aom_dist_wtd_sad32x16_avg_neon
unsigned int aom_dist_wtd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad32x32_avg aom_dist_wtd_sad32x32_avg_c
+unsigned int aom_dist_wtd_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad32x32_avg aom_dist_wtd_sad32x32_avg_neon
unsigned int aom_dist_wtd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad32x64_avg aom_dist_wtd_sad32x64_avg_c
+unsigned int aom_dist_wtd_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad32x64_avg aom_dist_wtd_sad32x64_avg_neon
unsigned int aom_dist_wtd_sad32x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad32x8_avg aom_dist_wtd_sad32x8_avg_c
+unsigned int aom_dist_wtd_sad32x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad32x8_avg aom_dist_wtd_sad32x8_avg_neon
unsigned int aom_dist_wtd_sad4x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad4x16_avg aom_dist_wtd_sad4x16_avg_c
+unsigned int aom_dist_wtd_sad4x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad4x16_avg aom_dist_wtd_sad4x16_avg_neon
unsigned int aom_dist_wtd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad4x4_avg aom_dist_wtd_sad4x4_avg_c
+unsigned int aom_dist_wtd_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad4x4_avg aom_dist_wtd_sad4x4_avg_neon
unsigned int aom_dist_wtd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad4x8_avg aom_dist_wtd_sad4x8_avg_c
+unsigned int aom_dist_wtd_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad4x8_avg aom_dist_wtd_sad4x8_avg_neon
unsigned int aom_dist_wtd_sad64x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad64x128_avg aom_dist_wtd_sad64x128_avg_c
+unsigned int aom_dist_wtd_sad64x128_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad64x128_avg aom_dist_wtd_sad64x128_avg_neon
unsigned int aom_dist_wtd_sad64x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad64x16_avg aom_dist_wtd_sad64x16_avg_c
+unsigned int aom_dist_wtd_sad64x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad64x16_avg aom_dist_wtd_sad64x16_avg_neon
unsigned int aom_dist_wtd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad64x32_avg aom_dist_wtd_sad64x32_avg_c
+unsigned int aom_dist_wtd_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad64x32_avg aom_dist_wtd_sad64x32_avg_neon
unsigned int aom_dist_wtd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad64x64_avg aom_dist_wtd_sad64x64_avg_c
+unsigned int aom_dist_wtd_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad64x64_avg aom_dist_wtd_sad64x64_avg_neon
unsigned int aom_dist_wtd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad8x16_avg aom_dist_wtd_sad8x16_avg_c
+unsigned int aom_dist_wtd_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad8x16_avg aom_dist_wtd_sad8x16_avg_neon
unsigned int aom_dist_wtd_sad8x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad8x32_avg aom_dist_wtd_sad8x32_avg_c
+unsigned int aom_dist_wtd_sad8x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad8x32_avg aom_dist_wtd_sad8x32_avg_neon
unsigned int aom_dist_wtd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad8x4_avg aom_dist_wtd_sad8x4_avg_c
+unsigned int aom_dist_wtd_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad8x4_avg aom_dist_wtd_sad8x4_avg_neon
unsigned int aom_dist_wtd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad8x8_avg aom_dist_wtd_sad8x8_avg_c
+unsigned int aom_dist_wtd_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad8x8_avg aom_dist_wtd_sad8x8_avg_neon
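Every `_avg` SAD kernel above blends `ref_ptr` with `second_pred` using the distance weights carried in `jcp_param` before summing absolute differences against `src_ptr`. A hedged reference sketch of that contract (the `fwd_offset`/`bck_offset` fields, the 4-bit precision shift, and the helper name `dist_wtd_sad_wxh` are assumptions modelled on libaom's conventions, not code from this diff):

    #include <stdint.h>
    #include <stdlib.h>

    #define DIST_PRECISION_BITS 4
    typedef struct { int fwd_offset, bck_offset; } DIST_WTD_COMP_PARAMS;

    /* Weighted compound prediction followed by SAD, matching the shape of
     * the prototypes above; second_pred is a contiguous w*h block. */
    static unsigned int dist_wtd_sad_wxh(const uint8_t *src, int src_stride,
                                         const uint8_t *ref, int ref_stride,
                                         const uint8_t *second_pred,
                                         const DIST_WTD_COMP_PARAMS *jcp,
                                         int w, int h) {
      unsigned int sad = 0;
      for (int i = 0; i < h; ++i) {
        for (int j = 0; j < w; ++j) {
          /* Distance-weighted average of the two predictors, rounded back
           * to 8 bits before the absolute difference. */
          int p = ref[j] * jcp->fwd_offset + second_pred[j] * jcp->bck_offset;
          p = (p + (1 << (DIST_PRECISION_BITS - 1))) >> DIST_PRECISION_BITS;
          sad += (unsigned int)abs(src[j] - p);
        }
        src += src_stride;
        ref += ref_stride;
        second_pred += w;
      }
      return sad;
    }

The NEON kernels this hunk wires in compute the same result with vector loads and widening absolute-difference accumulation.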
uint32_t aom_dist_wtd_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance128x128 aom_dist_wtd_sub_pixel_avg_variance128x128_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance128x128 aom_dist_wtd_sub_pixel_avg_variance128x128_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance128x64 aom_dist_wtd_sub_pixel_avg_variance128x64_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance128x64 aom_dist_wtd_sub_pixel_avg_variance128x64_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance16x16 aom_dist_wtd_sub_pixel_avg_variance16x16_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance16x16 aom_dist_wtd_sub_pixel_avg_variance16x16_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance16x32 aom_dist_wtd_sub_pixel_avg_variance16x32_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance16x32 aom_dist_wtd_sub_pixel_avg_variance16x32_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance16x4 aom_dist_wtd_sub_pixel_avg_variance16x4_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance16x4 aom_dist_wtd_sub_pixel_avg_variance16x4_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance16x64 aom_dist_wtd_sub_pixel_avg_variance16x64_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance16x64 aom_dist_wtd_sub_pixel_avg_variance16x64_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance16x8 aom_dist_wtd_sub_pixel_avg_variance16x8_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance16x8 aom_dist_wtd_sub_pixel_avg_variance16x8_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance32x16 aom_dist_wtd_sub_pixel_avg_variance32x16_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance32x16 aom_dist_wtd_sub_pixel_avg_variance32x16_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance32x32 aom_dist_wtd_sub_pixel_avg_variance32x32_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance32x32 aom_dist_wtd_sub_pixel_avg_variance32x32_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance32x64 aom_dist_wtd_sub_pixel_avg_variance32x64_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance32x64 aom_dist_wtd_sub_pixel_avg_variance32x64_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance32x8 aom_dist_wtd_sub_pixel_avg_variance32x8_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance32x8 aom_dist_wtd_sub_pixel_avg_variance32x8_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance4x16 aom_dist_wtd_sub_pixel_avg_variance4x16_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance4x16 aom_dist_wtd_sub_pixel_avg_variance4x16_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance4x4 aom_dist_wtd_sub_pixel_avg_variance4x4_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance4x4 aom_dist_wtd_sub_pixel_avg_variance4x4_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance4x8 aom_dist_wtd_sub_pixel_avg_variance4x8_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance4x8 aom_dist_wtd_sub_pixel_avg_variance4x8_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance64x128 aom_dist_wtd_sub_pixel_avg_variance64x128_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance64x128 aom_dist_wtd_sub_pixel_avg_variance64x128_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance64x16 aom_dist_wtd_sub_pixel_avg_variance64x16_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance64x16 aom_dist_wtd_sub_pixel_avg_variance64x16_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance64x32 aom_dist_wtd_sub_pixel_avg_variance64x32_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance64x32 aom_dist_wtd_sub_pixel_avg_variance64x32_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance64x64 aom_dist_wtd_sub_pixel_avg_variance64x64_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance64x64 aom_dist_wtd_sub_pixel_avg_variance64x64_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance8x16 aom_dist_wtd_sub_pixel_avg_variance8x16_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance8x16 aom_dist_wtd_sub_pixel_avg_variance8x16_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance8x32 aom_dist_wtd_sub_pixel_avg_variance8x32_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance8x32 aom_dist_wtd_sub_pixel_avg_variance8x32_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance8x4 aom_dist_wtd_sub_pixel_avg_variance8x4_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance8x4 aom_dist_wtd_sub_pixel_avg_variance8x4_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance8x8 aom_dist_wtd_sub_pixel_avg_variance8x8_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance8x8 aom_dist_wtd_sub_pixel_avg_variance8x8_neon
void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
void aom_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);
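Every hunk in this generated header applies the same rtcd edit: keep the `_c` prototype, add a `_neon` prototype, and re-point the dispatch macro from the C fallback to the NEON kernel. A minimal sketch of the pattern, using a hypothetical `my_kernel` rather than a real libaom symbol:

    #include <stdint.h>

    unsigned int my_kernel_c(const uint8_t *src, int stride);    /* C fallback */
    unsigned int my_kernel_neon(const uint8_t *src, int stride); /* Arm kernel */

    /* Before this change: #define my_kernel my_kernel_c
     * After it, every call to my_kernel() binds to the NEON symbol at
     * compile time, with no run-time check or pointer indirection. */
    #define my_kernel my_kernel_neon

Because NEON is always available on the arm64 configurations this header is generated for, the macro can bind statically instead of routing through an rtcd function pointer.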
@@ -534,10 +581,12 @@ void aom_fft8x8_float_c(const float *input, float *temp, float *output);
#define aom_fft8x8_float aom_fft8x8_float_c
void aom_get_blk_sse_sum_c(const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum);
-#define aom_get_blk_sse_sum aom_get_blk_sse_sum_c
+void aom_get_blk_sse_sum_neon(const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum);
+#define aom_get_blk_sse_sum aom_get_blk_sse_sum_neon
unsigned int aom_get_mb_ss_c(const int16_t *);
-#define aom_get_mb_ss aom_get_mb_ss_c
+unsigned int aom_get_mb_ss_neon(const int16_t *);
+#define aom_get_mb_ss aom_get_mb_ss_neon
void aom_get_var_sse_sum_16x16_dual_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16);
void aom_get_var_sse_sum_16x16_dual_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16);
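`aom_get_blk_sse_sum` returns both the sum and the sum of squares of a `bw` x `bh` block of 16-bit residuals in a single pass; the dual helper above batches two 16x16 variance computations the same way. A sketch of the scalar contract, inferred from the prototype (the helper name `blk_sse_sum_ref` is hypothetical):

    #include <stdint.h>

    static void blk_sse_sum_ref(const int16_t *data, int stride, int bw,
                                int bh, int *x_sum, int64_t *x2_sum) {
      int sum = 0;
      int64_t sse = 0;
      for (int i = 0; i < bh; ++i) {
        for (int j = 0; j < bw; ++j) {
          sum += data[j];                     /* running sum of residuals */
          sse += (int64_t)data[j] * data[j];  /* running sum of squares   */
        }
        data += stride;                       /* advance one row          */
      }
      *x_sum = sum;
      *x2_sum = sse;
    }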
@@ -652,136 +701,180 @@ void aom_hadamard_lp_8x8_dual_neon(const int16_t *src_diff, ptrdiff_t src_stride
#define aom_hadamard_lp_8x8_dual aom_hadamard_lp_8x8_dual_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128 aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128 aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64 aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64 aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16 aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16 aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32 aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32 aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4 aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4 aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64 aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64 aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8 aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8 aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16 aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16 aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32 aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32 aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64 aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64 aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8 aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8 aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16 aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16 aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4 aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4 aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8 aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8 aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128 aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128 aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16 aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16 aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32 aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32 aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64 aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64 aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16 aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16 aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32 aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32 aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4 aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4 aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8 aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8 aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance128x128_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance128x128 aom_highbd_10_masked_sub_pixel_variance128x128_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance128x128_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance128x128 aom_highbd_10_masked_sub_pixel_variance128x128_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance128x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance128x64 aom_highbd_10_masked_sub_pixel_variance128x64_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance128x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance128x64 aom_highbd_10_masked_sub_pixel_variance128x64_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance16x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance16x16 aom_highbd_10_masked_sub_pixel_variance16x16_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance16x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance16x16 aom_highbd_10_masked_sub_pixel_variance16x16_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance16x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance16x32 aom_highbd_10_masked_sub_pixel_variance16x32_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance16x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance16x32 aom_highbd_10_masked_sub_pixel_variance16x32_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance16x4_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance16x4 aom_highbd_10_masked_sub_pixel_variance16x4_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance16x4_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance16x4 aom_highbd_10_masked_sub_pixel_variance16x4_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance16x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance16x64 aom_highbd_10_masked_sub_pixel_variance16x64_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance16x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance16x64 aom_highbd_10_masked_sub_pixel_variance16x64_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance16x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance16x8 aom_highbd_10_masked_sub_pixel_variance16x8_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance16x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance16x8 aom_highbd_10_masked_sub_pixel_variance16x8_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance32x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance32x16 aom_highbd_10_masked_sub_pixel_variance32x16_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance32x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance32x16 aom_highbd_10_masked_sub_pixel_variance32x16_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance32x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance32x32 aom_highbd_10_masked_sub_pixel_variance32x32_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance32x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance32x32 aom_highbd_10_masked_sub_pixel_variance32x32_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance32x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance32x64 aom_highbd_10_masked_sub_pixel_variance32x64_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance32x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance32x64 aom_highbd_10_masked_sub_pixel_variance32x64_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance32x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance32x8 aom_highbd_10_masked_sub_pixel_variance32x8_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance32x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance32x8 aom_highbd_10_masked_sub_pixel_variance32x8_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance4x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance4x16 aom_highbd_10_masked_sub_pixel_variance4x16_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance4x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance4x16 aom_highbd_10_masked_sub_pixel_variance4x16_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance4x4_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance4x4 aom_highbd_10_masked_sub_pixel_variance4x4_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance4x4_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance4x4 aom_highbd_10_masked_sub_pixel_variance4x4_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance4x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance4x8 aom_highbd_10_masked_sub_pixel_variance4x8_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance4x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance4x8 aom_highbd_10_masked_sub_pixel_variance4x8_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance64x128_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance64x128 aom_highbd_10_masked_sub_pixel_variance64x128_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance64x128_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance64x128 aom_highbd_10_masked_sub_pixel_variance64x128_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance64x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance64x16 aom_highbd_10_masked_sub_pixel_variance64x16_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance64x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance64x16 aom_highbd_10_masked_sub_pixel_variance64x16_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance64x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance64x32 aom_highbd_10_masked_sub_pixel_variance64x32_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance64x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance64x32 aom_highbd_10_masked_sub_pixel_variance64x32_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance64x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance64x64 aom_highbd_10_masked_sub_pixel_variance64x64_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance64x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance64x64 aom_highbd_10_masked_sub_pixel_variance64x64_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance8x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance8x16 aom_highbd_10_masked_sub_pixel_variance8x16_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance8x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance8x16 aom_highbd_10_masked_sub_pixel_variance8x16_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance8x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance8x32 aom_highbd_10_masked_sub_pixel_variance8x32_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance8x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance8x32 aom_highbd_10_masked_sub_pixel_variance8x32_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance8x4_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance8x4 aom_highbd_10_masked_sub_pixel_variance8x4_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance8x4_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance8x4 aom_highbd_10_masked_sub_pixel_variance8x4_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance8x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance8x8 aom_highbd_10_masked_sub_pixel_variance8x8_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance8x8 aom_highbd_10_masked_sub_pixel_variance8x8_neon
unsigned int aom_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int aom_highbd_10_mse16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
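Note that the `aom_highbd_10_*` kernels take `uint8_t *` buffers even though 10-bit samples occupy 16 bits each: libaom packs a `uint16_t *` into the byte pointer and the kernel unpacks it. A sketch of that convention (the macro bodies follow aom_dsp_common.h; treat the exact shifts and the `highbd_sse_like` helper as assumptions):

    #include <stdint.h>

    /* Pointer-packing convention used by the highbd kernels in this header. */
    #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
    #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

    static int64_t highbd_sse_like(const uint8_t *src8, int src_stride,
                                   const uint8_t *ref8, int ref_stride,
                                   int w, int h) {
      /* Recover the real 16-bit sample pointers smuggled through uint8_t *. */
      const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
      const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
      int64_t sse = 0;
      for (int i = 0; i < h; ++i) {
        for (int j = 0; j < w; ++j) {
          const int d = src[j] - ref[j];
          sse += (int64_t)d * d;
        }
        src += src_stride;
        ref += ref_stride;
      }
      return sse;
    }

A caller owning a `uint16_t buf[...]` passes `CONVERT_TO_BYTEPTR(buf)`, so the round trip through the byte pointer is lossless.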
@@ -800,497 +893,620 @@ unsigned int aom_highbd_10_mse8x8_neon(const uint8_t *src_ptr, int source_strid
#define aom_highbd_10_mse8x8 aom_highbd_10_mse8x8_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance128x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance128x128 aom_highbd_10_obmc_sub_pixel_variance128x128_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance128x128_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance128x128 aom_highbd_10_obmc_sub_pixel_variance128x128_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance128x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance128x64 aom_highbd_10_obmc_sub_pixel_variance128x64_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance128x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance128x64 aom_highbd_10_obmc_sub_pixel_variance128x64_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance16x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance16x16 aom_highbd_10_obmc_sub_pixel_variance16x16_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance16x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance16x16 aom_highbd_10_obmc_sub_pixel_variance16x16_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance16x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance16x32 aom_highbd_10_obmc_sub_pixel_variance16x32_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance16x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance16x32 aom_highbd_10_obmc_sub_pixel_variance16x32_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance16x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance16x4 aom_highbd_10_obmc_sub_pixel_variance16x4_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance16x4_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance16x4 aom_highbd_10_obmc_sub_pixel_variance16x4_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance16x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance16x64 aom_highbd_10_obmc_sub_pixel_variance16x64_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance16x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance16x64 aom_highbd_10_obmc_sub_pixel_variance16x64_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance16x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance16x8 aom_highbd_10_obmc_sub_pixel_variance16x8_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance16x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance16x8 aom_highbd_10_obmc_sub_pixel_variance16x8_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance32x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance32x16 aom_highbd_10_obmc_sub_pixel_variance32x16_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance32x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance32x16 aom_highbd_10_obmc_sub_pixel_variance32x16_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance32x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance32x32 aom_highbd_10_obmc_sub_pixel_variance32x32_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance32x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance32x32 aom_highbd_10_obmc_sub_pixel_variance32x32_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance32x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance32x64 aom_highbd_10_obmc_sub_pixel_variance32x64_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance32x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance32x64 aom_highbd_10_obmc_sub_pixel_variance32x64_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance32x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance32x8 aom_highbd_10_obmc_sub_pixel_variance32x8_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance32x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance32x8 aom_highbd_10_obmc_sub_pixel_variance32x8_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance4x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance4x16 aom_highbd_10_obmc_sub_pixel_variance4x16_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance4x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance4x16 aom_highbd_10_obmc_sub_pixel_variance4x16_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance4x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance4x4 aom_highbd_10_obmc_sub_pixel_variance4x4_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance4x4_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance4x4 aom_highbd_10_obmc_sub_pixel_variance4x4_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance4x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance4x8 aom_highbd_10_obmc_sub_pixel_variance4x8_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance4x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance4x8 aom_highbd_10_obmc_sub_pixel_variance4x8_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance64x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance64x128 aom_highbd_10_obmc_sub_pixel_variance64x128_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance64x128_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance64x128 aom_highbd_10_obmc_sub_pixel_variance64x128_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance64x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance64x16 aom_highbd_10_obmc_sub_pixel_variance64x16_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance64x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance64x16 aom_highbd_10_obmc_sub_pixel_variance64x16_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance64x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance64x32 aom_highbd_10_obmc_sub_pixel_variance64x32_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance64x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance64x32 aom_highbd_10_obmc_sub_pixel_variance64x32_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance64x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance64x64 aom_highbd_10_obmc_sub_pixel_variance64x64_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance64x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance64x64 aom_highbd_10_obmc_sub_pixel_variance64x64_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance8x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance8x16 aom_highbd_10_obmc_sub_pixel_variance8x16_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance8x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance8x16 aom_highbd_10_obmc_sub_pixel_variance8x16_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance8x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance8x32 aom_highbd_10_obmc_sub_pixel_variance8x32_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance8x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance8x32 aom_highbd_10_obmc_sub_pixel_variance8x32_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance8x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance8x4 aom_highbd_10_obmc_sub_pixel_variance8x4_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance8x4_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance8x4 aom_highbd_10_obmc_sub_pixel_variance8x4_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance8x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance8x8 aom_highbd_10_obmc_sub_pixel_variance8x8_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance8x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance8x8 aom_highbd_10_obmc_sub_pixel_variance8x8_neon
unsigned int aom_highbd_10_obmc_variance128x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance128x128 aom_highbd_10_obmc_variance128x128_c
+unsigned int aom_highbd_10_obmc_variance128x128_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance128x128 aom_highbd_10_obmc_variance128x128_neon
unsigned int aom_highbd_10_obmc_variance128x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance128x64 aom_highbd_10_obmc_variance128x64_c
+unsigned int aom_highbd_10_obmc_variance128x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance128x64 aom_highbd_10_obmc_variance128x64_neon
unsigned int aom_highbd_10_obmc_variance16x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance16x16 aom_highbd_10_obmc_variance16x16_c
+unsigned int aom_highbd_10_obmc_variance16x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance16x16 aom_highbd_10_obmc_variance16x16_neon
unsigned int aom_highbd_10_obmc_variance16x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance16x32 aom_highbd_10_obmc_variance16x32_c
+unsigned int aom_highbd_10_obmc_variance16x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance16x32 aom_highbd_10_obmc_variance16x32_neon
unsigned int aom_highbd_10_obmc_variance16x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance16x4 aom_highbd_10_obmc_variance16x4_c
+unsigned int aom_highbd_10_obmc_variance16x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance16x4 aom_highbd_10_obmc_variance16x4_neon
unsigned int aom_highbd_10_obmc_variance16x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance16x64 aom_highbd_10_obmc_variance16x64_c
+unsigned int aom_highbd_10_obmc_variance16x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance16x64 aom_highbd_10_obmc_variance16x64_neon
unsigned int aom_highbd_10_obmc_variance16x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance16x8 aom_highbd_10_obmc_variance16x8_c
+unsigned int aom_highbd_10_obmc_variance16x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance16x8 aom_highbd_10_obmc_variance16x8_neon
unsigned int aom_highbd_10_obmc_variance32x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance32x16 aom_highbd_10_obmc_variance32x16_c
+unsigned int aom_highbd_10_obmc_variance32x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance32x16 aom_highbd_10_obmc_variance32x16_neon
unsigned int aom_highbd_10_obmc_variance32x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance32x32 aom_highbd_10_obmc_variance32x32_c
+unsigned int aom_highbd_10_obmc_variance32x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance32x32 aom_highbd_10_obmc_variance32x32_neon
unsigned int aom_highbd_10_obmc_variance32x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance32x64 aom_highbd_10_obmc_variance32x64_c
+unsigned int aom_highbd_10_obmc_variance32x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance32x64 aom_highbd_10_obmc_variance32x64_neon
unsigned int aom_highbd_10_obmc_variance32x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance32x8 aom_highbd_10_obmc_variance32x8_c
+unsigned int aom_highbd_10_obmc_variance32x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance32x8 aom_highbd_10_obmc_variance32x8_neon
unsigned int aom_highbd_10_obmc_variance4x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance4x16 aom_highbd_10_obmc_variance4x16_c
+unsigned int aom_highbd_10_obmc_variance4x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance4x16 aom_highbd_10_obmc_variance4x16_neon
unsigned int aom_highbd_10_obmc_variance4x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance4x4 aom_highbd_10_obmc_variance4x4_c
+unsigned int aom_highbd_10_obmc_variance4x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance4x4 aom_highbd_10_obmc_variance4x4_neon
unsigned int aom_highbd_10_obmc_variance4x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance4x8 aom_highbd_10_obmc_variance4x8_c
+unsigned int aom_highbd_10_obmc_variance4x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance4x8 aom_highbd_10_obmc_variance4x8_neon
unsigned int aom_highbd_10_obmc_variance64x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance64x128 aom_highbd_10_obmc_variance64x128_c
+unsigned int aom_highbd_10_obmc_variance64x128_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance64x128 aom_highbd_10_obmc_variance64x128_neon
unsigned int aom_highbd_10_obmc_variance64x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance64x16 aom_highbd_10_obmc_variance64x16_c
+unsigned int aom_highbd_10_obmc_variance64x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance64x16 aom_highbd_10_obmc_variance64x16_neon
unsigned int aom_highbd_10_obmc_variance64x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance64x32 aom_highbd_10_obmc_variance64x32_c
+unsigned int aom_highbd_10_obmc_variance64x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance64x32 aom_highbd_10_obmc_variance64x32_neon
unsigned int aom_highbd_10_obmc_variance64x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance64x64 aom_highbd_10_obmc_variance64x64_c
+unsigned int aom_highbd_10_obmc_variance64x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance64x64 aom_highbd_10_obmc_variance64x64_neon
unsigned int aom_highbd_10_obmc_variance8x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance8x16 aom_highbd_10_obmc_variance8x16_c
+unsigned int aom_highbd_10_obmc_variance8x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance8x16 aom_highbd_10_obmc_variance8x16_neon
unsigned int aom_highbd_10_obmc_variance8x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance8x32 aom_highbd_10_obmc_variance8x32_c
+unsigned int aom_highbd_10_obmc_variance8x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance8x32 aom_highbd_10_obmc_variance8x32_neon
unsigned int aom_highbd_10_obmc_variance8x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance8x4 aom_highbd_10_obmc_variance8x4_c
+unsigned int aom_highbd_10_obmc_variance8x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance8x4 aom_highbd_10_obmc_variance8x4_neon
unsigned int aom_highbd_10_obmc_variance8x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance8x8 aom_highbd_10_obmc_variance8x8_c
+unsigned int aom_highbd_10_obmc_variance8x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance8x8 aom_highbd_10_obmc_variance8x8_neon
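
Every hunk above follows the same RTCD pattern: the generated header keeps the _c prototype, adds a matching _neon prototype, and repoints the generic macro at the NEON kernel, so existing call sites pick up the Arm implementation at compile time with no source changes. A minimal caller sketch, assuming the generated header above is in scope (the wrapper name below is hypothetical, not part of libaom):

    #include <stdint.h>

    /* Sketch only: the generic name dispatches through the #define above,
     * which now expands to aom_highbd_10_obmc_variance8x8_neon. */
    static unsigned int highbd10_obmc_var_8x8(const uint8_t *pre, int pre_stride,
                                              const int32_t *wsrc,
                                              const int32_t *mask) {
      unsigned int sse;
      return aom_highbd_10_obmc_variance8x8(pre, pre_stride, wsrc, mask, &sse);
    }
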
uint32_t aom_highbd_10_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance128x128 aom_highbd_10_sub_pixel_avg_variance128x128_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance128x128 aom_highbd_10_sub_pixel_avg_variance128x128_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance128x64 aom_highbd_10_sub_pixel_avg_variance128x64_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance128x64 aom_highbd_10_sub_pixel_avg_variance128x64_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance16x16 aom_highbd_10_sub_pixel_avg_variance16x16_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance16x16 aom_highbd_10_sub_pixel_avg_variance16x16_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance16x32 aom_highbd_10_sub_pixel_avg_variance16x32_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance16x32 aom_highbd_10_sub_pixel_avg_variance16x32_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance16x4 aom_highbd_10_sub_pixel_avg_variance16x4_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance16x4 aom_highbd_10_sub_pixel_avg_variance16x4_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance16x64 aom_highbd_10_sub_pixel_avg_variance16x64_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance16x64 aom_highbd_10_sub_pixel_avg_variance16x64_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance16x8 aom_highbd_10_sub_pixel_avg_variance16x8_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance16x8 aom_highbd_10_sub_pixel_avg_variance16x8_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance32x16 aom_highbd_10_sub_pixel_avg_variance32x16_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance32x16 aom_highbd_10_sub_pixel_avg_variance32x16_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance32x32 aom_highbd_10_sub_pixel_avg_variance32x32_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance32x32 aom_highbd_10_sub_pixel_avg_variance32x32_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance32x64 aom_highbd_10_sub_pixel_avg_variance32x64_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance32x64 aom_highbd_10_sub_pixel_avg_variance32x64_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance32x8 aom_highbd_10_sub_pixel_avg_variance32x8_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance32x8 aom_highbd_10_sub_pixel_avg_variance32x8_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance4x16 aom_highbd_10_sub_pixel_avg_variance4x16_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance4x16 aom_highbd_10_sub_pixel_avg_variance4x16_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance4x4 aom_highbd_10_sub_pixel_avg_variance4x4_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance4x4 aom_highbd_10_sub_pixel_avg_variance4x4_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance4x8 aom_highbd_10_sub_pixel_avg_variance4x8_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance4x8 aom_highbd_10_sub_pixel_avg_variance4x8_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance64x128 aom_highbd_10_sub_pixel_avg_variance64x128_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance64x128 aom_highbd_10_sub_pixel_avg_variance64x128_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance64x16 aom_highbd_10_sub_pixel_avg_variance64x16_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance64x16 aom_highbd_10_sub_pixel_avg_variance64x16_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance64x32 aom_highbd_10_sub_pixel_avg_variance64x32_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance64x32 aom_highbd_10_sub_pixel_avg_variance64x32_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance64x64 aom_highbd_10_sub_pixel_avg_variance64x64_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance64x64 aom_highbd_10_sub_pixel_avg_variance64x64_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance8x16 aom_highbd_10_sub_pixel_avg_variance8x16_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance8x16 aom_highbd_10_sub_pixel_avg_variance8x16_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance8x32 aom_highbd_10_sub_pixel_avg_variance8x32_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance8x32 aom_highbd_10_sub_pixel_avg_variance8x32_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance8x4 aom_highbd_10_sub_pixel_avg_variance8x4_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance8x4 aom_highbd_10_sub_pixel_avg_variance8x4_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance8x8 aom_highbd_10_sub_pixel_avg_variance8x8_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance8x8 aom_highbd_10_sub_pixel_avg_variance8x8_neon
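
The _avg_ forms above take an extra second_pred block: the sub-pixel prediction (xoffset/yoffset select the interpolation phase in eighth-pel units, 0..7) is averaged with second_pred before the variance is computed, which is how compound predictions are scored. A hypothetical caller under that reading, using the half-pel phase (4, 4) as an example; buffer setup is elided and the wrapper name is illustrative:

    #include <stdint.h>

    /* Sketch: second_pred holds the other half of a compound prediction;
     * the kernel averages it with the interpolated reference internally. */
    static uint32_t highbd10_avg_var_8x8(const uint8_t *src, int src_stride,
                                         const uint8_t *ref, int ref_stride,
                                         const uint8_t *second_pred) {
      uint32_t sse;
      return aom_highbd_10_sub_pixel_avg_variance8x8(
          src, src_stride, /*xoffset=*/4, /*yoffset=*/4, ref, ref_stride, &sse,
          second_pred);
    }
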
uint32_t aom_highbd_10_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance128x128 aom_highbd_10_sub_pixel_variance128x128_c
+uint32_t aom_highbd_10_sub_pixel_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance128x128 aom_highbd_10_sub_pixel_variance128x128_neon
uint32_t aom_highbd_10_sub_pixel_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance128x64 aom_highbd_10_sub_pixel_variance128x64_c
+uint32_t aom_highbd_10_sub_pixel_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance128x64 aom_highbd_10_sub_pixel_variance128x64_neon
uint32_t aom_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance16x16 aom_highbd_10_sub_pixel_variance16x16_c
+uint32_t aom_highbd_10_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance16x16 aom_highbd_10_sub_pixel_variance16x16_neon
uint32_t aom_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance16x32 aom_highbd_10_sub_pixel_variance16x32_c
+uint32_t aom_highbd_10_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance16x32 aom_highbd_10_sub_pixel_variance16x32_neon
uint32_t aom_highbd_10_sub_pixel_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance16x4 aom_highbd_10_sub_pixel_variance16x4_c
+uint32_t aom_highbd_10_sub_pixel_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance16x4 aom_highbd_10_sub_pixel_variance16x4_neon
uint32_t aom_highbd_10_sub_pixel_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance16x64 aom_highbd_10_sub_pixel_variance16x64_c
+uint32_t aom_highbd_10_sub_pixel_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance16x64 aom_highbd_10_sub_pixel_variance16x64_neon
uint32_t aom_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance16x8 aom_highbd_10_sub_pixel_variance16x8_c
+uint32_t aom_highbd_10_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance16x8 aom_highbd_10_sub_pixel_variance16x8_neon
uint32_t aom_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance32x16 aom_highbd_10_sub_pixel_variance32x16_c
+uint32_t aom_highbd_10_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance32x16 aom_highbd_10_sub_pixel_variance32x16_neon
uint32_t aom_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance32x32 aom_highbd_10_sub_pixel_variance32x32_c
+uint32_t aom_highbd_10_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance32x32 aom_highbd_10_sub_pixel_variance32x32_neon
uint32_t aom_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance32x64 aom_highbd_10_sub_pixel_variance32x64_c
+uint32_t aom_highbd_10_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance32x64 aom_highbd_10_sub_pixel_variance32x64_neon
uint32_t aom_highbd_10_sub_pixel_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance32x8 aom_highbd_10_sub_pixel_variance32x8_c
+uint32_t aom_highbd_10_sub_pixel_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance32x8 aom_highbd_10_sub_pixel_variance32x8_neon
uint32_t aom_highbd_10_sub_pixel_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance4x16 aom_highbd_10_sub_pixel_variance4x16_c
+uint32_t aom_highbd_10_sub_pixel_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance4x16 aom_highbd_10_sub_pixel_variance4x16_neon
uint32_t aom_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance4x4 aom_highbd_10_sub_pixel_variance4x4_c
+uint32_t aom_highbd_10_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance4x4 aom_highbd_10_sub_pixel_variance4x4_neon
uint32_t aom_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance4x8 aom_highbd_10_sub_pixel_variance4x8_c
+uint32_t aom_highbd_10_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance4x8 aom_highbd_10_sub_pixel_variance4x8_neon
uint32_t aom_highbd_10_sub_pixel_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance64x128 aom_highbd_10_sub_pixel_variance64x128_c
+uint32_t aom_highbd_10_sub_pixel_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance64x128 aom_highbd_10_sub_pixel_variance64x128_neon
uint32_t aom_highbd_10_sub_pixel_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance64x16 aom_highbd_10_sub_pixel_variance64x16_c
+uint32_t aom_highbd_10_sub_pixel_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance64x16 aom_highbd_10_sub_pixel_variance64x16_neon
uint32_t aom_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance64x32 aom_highbd_10_sub_pixel_variance64x32_c
+uint32_t aom_highbd_10_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance64x32 aom_highbd_10_sub_pixel_variance64x32_neon
uint32_t aom_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance64x64 aom_highbd_10_sub_pixel_variance64x64_c
+uint32_t aom_highbd_10_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance64x64 aom_highbd_10_sub_pixel_variance64x64_neon
uint32_t aom_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance8x16 aom_highbd_10_sub_pixel_variance8x16_c
+uint32_t aom_highbd_10_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance8x16 aom_highbd_10_sub_pixel_variance8x16_neon
uint32_t aom_highbd_10_sub_pixel_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance8x32 aom_highbd_10_sub_pixel_variance8x32_c
+uint32_t aom_highbd_10_sub_pixel_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance8x32 aom_highbd_10_sub_pixel_variance8x32_neon
uint32_t aom_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance8x4 aom_highbd_10_sub_pixel_variance8x4_c
+uint32_t aom_highbd_10_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance8x4 aom_highbd_10_sub_pixel_variance8x4_neon
uint32_t aom_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance8x8 aom_highbd_10_sub_pixel_variance8x8_c
+uint32_t aom_highbd_10_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance8x8 aom_highbd_10_sub_pixel_variance8x8_neon
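
With zero offsets the sub-pixel path degenerates to the full-pel case, so a (0, 0) call through the family above should agree with the plain variance kernels that follow on identical buffers. A small consistency sketch under that assumption (the wrapper name is illustrative):

    #include <stdint.h>

    /* Sketch: (xoffset, yoffset) == (0, 0) skips interpolation, so this
     * should match aom_highbd_10_variance8x8 on the same inputs. */
    static uint32_t highbd10_fullpel_via_subpel(const uint8_t *src,
                                                int src_stride,
                                                const uint8_t *ref,
                                                int ref_stride) {
      uint32_t sse;
      return aom_highbd_10_sub_pixel_variance8x8(src, src_stride, /*xoffset=*/0,
                                                 /*yoffset=*/0, ref, ref_stride,
                                                 &sse);
    }
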
-unsigned int aom_highbd_10_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance128x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance128x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance128x128 aom_highbd_10_variance128x128_neon
-unsigned int aom_highbd_10_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance128x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance128x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance128x64 aom_highbd_10_variance128x64_neon
-unsigned int aom_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x16 aom_highbd_10_variance16x16_neon
-unsigned int aom_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x32 aom_highbd_10_variance16x32_neon
-unsigned int aom_highbd_10_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance16x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance16x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x4 aom_highbd_10_variance16x4_neon
-unsigned int aom_highbd_10_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance16x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance16x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x64 aom_highbd_10_variance16x64_neon
-unsigned int aom_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x8 aom_highbd_10_variance16x8_neon
-unsigned int aom_highbd_10_variance2x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_10_variance2x2 aom_highbd_10_variance2x2_c
-
-unsigned int aom_highbd_10_variance2x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_10_variance2x4 aom_highbd_10_variance2x4_c
-
-unsigned int aom_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance32x16 aom_highbd_10_variance32x16_neon
-unsigned int aom_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance32x32 aom_highbd_10_variance32x32_neon
-unsigned int aom_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance32x64 aom_highbd_10_variance32x64_neon
-unsigned int aom_highbd_10_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance32x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance32x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance32x8 aom_highbd_10_variance32x8_neon
-unsigned int aom_highbd_10_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance4x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance4x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance4x16 aom_highbd_10_variance4x16_neon
-unsigned int aom_highbd_10_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_10_variance4x2 aom_highbd_10_variance4x2_c
-
-unsigned int aom_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance4x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance4x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance4x4 aom_highbd_10_variance4x4_neon
-unsigned int aom_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance4x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance4x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance4x8 aom_highbd_10_variance4x8_neon
-unsigned int aom_highbd_10_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance64x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance64x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance64x128 aom_highbd_10_variance64x128_neon
-unsigned int aom_highbd_10_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance64x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance64x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance64x16 aom_highbd_10_variance64x16_neon
-unsigned int aom_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance64x32 aom_highbd_10_variance64x32_neon
-unsigned int aom_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance64x64 aom_highbd_10_variance64x64_neon
-unsigned int aom_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance8x16 aom_highbd_10_variance8x16_neon
-unsigned int aom_highbd_10_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance8x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance8x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance8x32 aom_highbd_10_variance8x32_neon
-unsigned int aom_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance8x4 aom_highbd_10_variance8x4_neon
-unsigned int aom_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance8x8 aom_highbd_10_variance8x8_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128 aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128 aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64 aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64 aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16 aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16 aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32 aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32 aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4 aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4 aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64 aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64 aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8 aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8 aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16 aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16 aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32 aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32 aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64 aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64 aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8 aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8 aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16 aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16 aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4 aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4 aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8 aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8 aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128 aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128 aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16 aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16 aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32 aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32 aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64 aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64 aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16 aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16 aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32 aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32 aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4 aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4 aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8 aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8 aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance128x128_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance128x128 aom_highbd_12_masked_sub_pixel_variance128x128_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance128x128_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance128x128 aom_highbd_12_masked_sub_pixel_variance128x128_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance128x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance128x64 aom_highbd_12_masked_sub_pixel_variance128x64_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance128x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance128x64 aom_highbd_12_masked_sub_pixel_variance128x64_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance16x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance16x16 aom_highbd_12_masked_sub_pixel_variance16x16_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance16x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance16x16 aom_highbd_12_masked_sub_pixel_variance16x16_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance16x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance16x32 aom_highbd_12_masked_sub_pixel_variance16x32_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance16x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance16x32 aom_highbd_12_masked_sub_pixel_variance16x32_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance16x4_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance16x4 aom_highbd_12_masked_sub_pixel_variance16x4_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance16x4_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance16x4 aom_highbd_12_masked_sub_pixel_variance16x4_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance16x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance16x64 aom_highbd_12_masked_sub_pixel_variance16x64_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance16x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance16x64 aom_highbd_12_masked_sub_pixel_variance16x64_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance16x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance16x8 aom_highbd_12_masked_sub_pixel_variance16x8_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance16x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance16x8 aom_highbd_12_masked_sub_pixel_variance16x8_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance32x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance32x16 aom_highbd_12_masked_sub_pixel_variance32x16_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance32x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance32x16 aom_highbd_12_masked_sub_pixel_variance32x16_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance32x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance32x32 aom_highbd_12_masked_sub_pixel_variance32x32_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance32x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance32x32 aom_highbd_12_masked_sub_pixel_variance32x32_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance32x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance32x64 aom_highbd_12_masked_sub_pixel_variance32x64_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance32x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance32x64 aom_highbd_12_masked_sub_pixel_variance32x64_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance32x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance32x8 aom_highbd_12_masked_sub_pixel_variance32x8_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance32x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance32x8 aom_highbd_12_masked_sub_pixel_variance32x8_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance4x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance4x16 aom_highbd_12_masked_sub_pixel_variance4x16_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance4x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance4x16 aom_highbd_12_masked_sub_pixel_variance4x16_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance4x4_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance4x4 aom_highbd_12_masked_sub_pixel_variance4x4_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance4x4_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance4x4 aom_highbd_12_masked_sub_pixel_variance4x4_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance4x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance4x8 aom_highbd_12_masked_sub_pixel_variance4x8_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance4x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance4x8 aom_highbd_12_masked_sub_pixel_variance4x8_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance64x128_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance64x128 aom_highbd_12_masked_sub_pixel_variance64x128_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance64x128_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance64x128 aom_highbd_12_masked_sub_pixel_variance64x128_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance64x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance64x16 aom_highbd_12_masked_sub_pixel_variance64x16_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance64x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance64x16 aom_highbd_12_masked_sub_pixel_variance64x16_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance64x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance64x32 aom_highbd_12_masked_sub_pixel_variance64x32_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance64x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance64x32 aom_highbd_12_masked_sub_pixel_variance64x32_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance64x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance64x64 aom_highbd_12_masked_sub_pixel_variance64x64_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance64x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance64x64 aom_highbd_12_masked_sub_pixel_variance64x64_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance8x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance8x16 aom_highbd_12_masked_sub_pixel_variance8x16_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance8x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance8x16 aom_highbd_12_masked_sub_pixel_variance8x16_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance8x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance8x32 aom_highbd_12_masked_sub_pixel_variance8x32_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance8x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance8x32 aom_highbd_12_masked_sub_pixel_variance8x32_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance8x4_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance8x4 aom_highbd_12_masked_sub_pixel_variance8x4_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance8x4_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance8x4 aom_highbd_12_masked_sub_pixel_variance8x4_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance8x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance8x8 aom_highbd_12_masked_sub_pixel_variance8x8_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance8x8 aom_highbd_12_masked_sub_pixel_variance8x8_neon
unsigned int aom_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int aom_highbd_12_mse16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
@@ -1309,497 +1525,620 @@ unsigned int aom_highbd_12_mse8x8_neon(const uint8_t *src_ptr, int source_strid
#define aom_highbd_12_mse8x8 aom_highbd_12_mse8x8_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance128x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance128x128 aom_highbd_12_obmc_sub_pixel_variance128x128_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance128x128_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance128x128 aom_highbd_12_obmc_sub_pixel_variance128x128_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance128x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance128x64 aom_highbd_12_obmc_sub_pixel_variance128x64_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance128x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance128x64 aom_highbd_12_obmc_sub_pixel_variance128x64_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance16x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance16x16 aom_highbd_12_obmc_sub_pixel_variance16x16_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance16x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance16x16 aom_highbd_12_obmc_sub_pixel_variance16x16_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance16x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance16x32 aom_highbd_12_obmc_sub_pixel_variance16x32_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance16x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance16x32 aom_highbd_12_obmc_sub_pixel_variance16x32_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance16x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance16x4 aom_highbd_12_obmc_sub_pixel_variance16x4_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance16x4_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance16x4 aom_highbd_12_obmc_sub_pixel_variance16x4_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance16x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance16x64 aom_highbd_12_obmc_sub_pixel_variance16x64_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance16x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance16x64 aom_highbd_12_obmc_sub_pixel_variance16x64_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance16x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance16x8 aom_highbd_12_obmc_sub_pixel_variance16x8_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance16x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance16x8 aom_highbd_12_obmc_sub_pixel_variance16x8_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance32x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance32x16 aom_highbd_12_obmc_sub_pixel_variance32x16_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance32x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance32x16 aom_highbd_12_obmc_sub_pixel_variance32x16_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance32x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance32x32 aom_highbd_12_obmc_sub_pixel_variance32x32_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance32x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance32x32 aom_highbd_12_obmc_sub_pixel_variance32x32_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance32x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance32x64 aom_highbd_12_obmc_sub_pixel_variance32x64_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance32x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance32x64 aom_highbd_12_obmc_sub_pixel_variance32x64_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance32x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance32x8 aom_highbd_12_obmc_sub_pixel_variance32x8_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance32x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance32x8 aom_highbd_12_obmc_sub_pixel_variance32x8_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance4x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance4x16 aom_highbd_12_obmc_sub_pixel_variance4x16_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance4x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance4x16 aom_highbd_12_obmc_sub_pixel_variance4x16_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance4x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance4x4 aom_highbd_12_obmc_sub_pixel_variance4x4_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance4x4_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance4x4 aom_highbd_12_obmc_sub_pixel_variance4x4_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance4x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance4x8 aom_highbd_12_obmc_sub_pixel_variance4x8_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance4x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance4x8 aom_highbd_12_obmc_sub_pixel_variance4x8_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance64x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance64x128 aom_highbd_12_obmc_sub_pixel_variance64x128_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance64x128_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance64x128 aom_highbd_12_obmc_sub_pixel_variance64x128_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance64x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance64x16 aom_highbd_12_obmc_sub_pixel_variance64x16_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance64x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance64x16 aom_highbd_12_obmc_sub_pixel_variance64x16_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance64x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance64x32 aom_highbd_12_obmc_sub_pixel_variance64x32_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance64x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance64x32 aom_highbd_12_obmc_sub_pixel_variance64x32_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance64x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance64x64 aom_highbd_12_obmc_sub_pixel_variance64x64_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance64x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance64x64 aom_highbd_12_obmc_sub_pixel_variance64x64_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance8x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance8x16 aom_highbd_12_obmc_sub_pixel_variance8x16_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance8x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance8x16 aom_highbd_12_obmc_sub_pixel_variance8x16_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance8x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance8x32 aom_highbd_12_obmc_sub_pixel_variance8x32_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance8x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance8x32 aom_highbd_12_obmc_sub_pixel_variance8x32_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance8x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance8x4 aom_highbd_12_obmc_sub_pixel_variance8x4_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance8x4_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance8x4 aom_highbd_12_obmc_sub_pixel_variance8x4_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance8x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance8x8 aom_highbd_12_obmc_sub_pixel_variance8x8_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance8x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance8x8 aom_highbd_12_obmc_sub_pixel_variance8x8_neon
unsigned int aom_highbd_12_obmc_variance128x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance128x128 aom_highbd_12_obmc_variance128x128_c
+unsigned int aom_highbd_12_obmc_variance128x128_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance128x128 aom_highbd_12_obmc_variance128x128_neon
unsigned int aom_highbd_12_obmc_variance128x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance128x64 aom_highbd_12_obmc_variance128x64_c
+unsigned int aom_highbd_12_obmc_variance128x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance128x64 aom_highbd_12_obmc_variance128x64_neon
unsigned int aom_highbd_12_obmc_variance16x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance16x16 aom_highbd_12_obmc_variance16x16_c
+unsigned int aom_highbd_12_obmc_variance16x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance16x16 aom_highbd_12_obmc_variance16x16_neon
unsigned int aom_highbd_12_obmc_variance16x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance16x32 aom_highbd_12_obmc_variance16x32_c
+unsigned int aom_highbd_12_obmc_variance16x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance16x32 aom_highbd_12_obmc_variance16x32_neon
unsigned int aom_highbd_12_obmc_variance16x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance16x4 aom_highbd_12_obmc_variance16x4_c
+unsigned int aom_highbd_12_obmc_variance16x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance16x4 aom_highbd_12_obmc_variance16x4_neon
unsigned int aom_highbd_12_obmc_variance16x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance16x64 aom_highbd_12_obmc_variance16x64_c
+unsigned int aom_highbd_12_obmc_variance16x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance16x64 aom_highbd_12_obmc_variance16x64_neon
unsigned int aom_highbd_12_obmc_variance16x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance16x8 aom_highbd_12_obmc_variance16x8_c
+unsigned int aom_highbd_12_obmc_variance16x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance16x8 aom_highbd_12_obmc_variance16x8_neon
unsigned int aom_highbd_12_obmc_variance32x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance32x16 aom_highbd_12_obmc_variance32x16_c
+unsigned int aom_highbd_12_obmc_variance32x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance32x16 aom_highbd_12_obmc_variance32x16_neon
unsigned int aom_highbd_12_obmc_variance32x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance32x32 aom_highbd_12_obmc_variance32x32_c
+unsigned int aom_highbd_12_obmc_variance32x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance32x32 aom_highbd_12_obmc_variance32x32_neon
unsigned int aom_highbd_12_obmc_variance32x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance32x64 aom_highbd_12_obmc_variance32x64_c
+unsigned int aom_highbd_12_obmc_variance32x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance32x64 aom_highbd_12_obmc_variance32x64_neon
unsigned int aom_highbd_12_obmc_variance32x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance32x8 aom_highbd_12_obmc_variance32x8_c
+unsigned int aom_highbd_12_obmc_variance32x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance32x8 aom_highbd_12_obmc_variance32x8_neon
unsigned int aom_highbd_12_obmc_variance4x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance4x16 aom_highbd_12_obmc_variance4x16_c
+unsigned int aom_highbd_12_obmc_variance4x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance4x16 aom_highbd_12_obmc_variance4x16_neon
unsigned int aom_highbd_12_obmc_variance4x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance4x4 aom_highbd_12_obmc_variance4x4_c
+unsigned int aom_highbd_12_obmc_variance4x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance4x4 aom_highbd_12_obmc_variance4x4_neon
unsigned int aom_highbd_12_obmc_variance4x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance4x8 aom_highbd_12_obmc_variance4x8_c
+unsigned int aom_highbd_12_obmc_variance4x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance4x8 aom_highbd_12_obmc_variance4x8_neon
unsigned int aom_highbd_12_obmc_variance64x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance64x128 aom_highbd_12_obmc_variance64x128_c
+unsigned int aom_highbd_12_obmc_variance64x128_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance64x128 aom_highbd_12_obmc_variance64x128_neon
unsigned int aom_highbd_12_obmc_variance64x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance64x16 aom_highbd_12_obmc_variance64x16_c
+unsigned int aom_highbd_12_obmc_variance64x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance64x16 aom_highbd_12_obmc_variance64x16_neon
unsigned int aom_highbd_12_obmc_variance64x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance64x32 aom_highbd_12_obmc_variance64x32_c
+unsigned int aom_highbd_12_obmc_variance64x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance64x32 aom_highbd_12_obmc_variance64x32_neon
unsigned int aom_highbd_12_obmc_variance64x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance64x64 aom_highbd_12_obmc_variance64x64_c
+unsigned int aom_highbd_12_obmc_variance64x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance64x64 aom_highbd_12_obmc_variance64x64_neon
unsigned int aom_highbd_12_obmc_variance8x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance8x16 aom_highbd_12_obmc_variance8x16_c
+unsigned int aom_highbd_12_obmc_variance8x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance8x16 aom_highbd_12_obmc_variance8x16_neon
unsigned int aom_highbd_12_obmc_variance8x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance8x32 aom_highbd_12_obmc_variance8x32_c
+unsigned int aom_highbd_12_obmc_variance8x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance8x32 aom_highbd_12_obmc_variance8x32_neon
unsigned int aom_highbd_12_obmc_variance8x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance8x4 aom_highbd_12_obmc_variance8x4_c
+unsigned int aom_highbd_12_obmc_variance8x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance8x4 aom_highbd_12_obmc_variance8x4_neon
unsigned int aom_highbd_12_obmc_variance8x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance8x8 aom_highbd_12_obmc_variance8x8_c
+unsigned int aom_highbd_12_obmc_variance8x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance8x8 aom_highbd_12_obmc_variance8x8_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance128x128 aom_highbd_12_sub_pixel_avg_variance128x128_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance128x128 aom_highbd_12_sub_pixel_avg_variance128x128_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance128x64 aom_highbd_12_sub_pixel_avg_variance128x64_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance128x64 aom_highbd_12_sub_pixel_avg_variance128x64_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance16x16 aom_highbd_12_sub_pixel_avg_variance16x16_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance16x16 aom_highbd_12_sub_pixel_avg_variance16x16_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance16x32 aom_highbd_12_sub_pixel_avg_variance16x32_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance16x32 aom_highbd_12_sub_pixel_avg_variance16x32_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance16x4 aom_highbd_12_sub_pixel_avg_variance16x4_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance16x4 aom_highbd_12_sub_pixel_avg_variance16x4_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance16x64 aom_highbd_12_sub_pixel_avg_variance16x64_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance16x64 aom_highbd_12_sub_pixel_avg_variance16x64_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance16x8 aom_highbd_12_sub_pixel_avg_variance16x8_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance16x8 aom_highbd_12_sub_pixel_avg_variance16x8_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance32x16 aom_highbd_12_sub_pixel_avg_variance32x16_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance32x16 aom_highbd_12_sub_pixel_avg_variance32x16_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance32x32 aom_highbd_12_sub_pixel_avg_variance32x32_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance32x32 aom_highbd_12_sub_pixel_avg_variance32x32_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance32x64 aom_highbd_12_sub_pixel_avg_variance32x64_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance32x64 aom_highbd_12_sub_pixel_avg_variance32x64_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance32x8 aom_highbd_12_sub_pixel_avg_variance32x8_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance32x8 aom_highbd_12_sub_pixel_avg_variance32x8_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance4x16 aom_highbd_12_sub_pixel_avg_variance4x16_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance4x16 aom_highbd_12_sub_pixel_avg_variance4x16_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance4x4 aom_highbd_12_sub_pixel_avg_variance4x4_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance4x4 aom_highbd_12_sub_pixel_avg_variance4x4_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance4x8 aom_highbd_12_sub_pixel_avg_variance4x8_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance4x8 aom_highbd_12_sub_pixel_avg_variance4x8_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance64x128 aom_highbd_12_sub_pixel_avg_variance64x128_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance64x128 aom_highbd_12_sub_pixel_avg_variance64x128_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance64x16 aom_highbd_12_sub_pixel_avg_variance64x16_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance64x16 aom_highbd_12_sub_pixel_avg_variance64x16_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance64x32 aom_highbd_12_sub_pixel_avg_variance64x32_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance64x32 aom_highbd_12_sub_pixel_avg_variance64x32_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance64x64 aom_highbd_12_sub_pixel_avg_variance64x64_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance64x64 aom_highbd_12_sub_pixel_avg_variance64x64_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance8x16 aom_highbd_12_sub_pixel_avg_variance8x16_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance8x16 aom_highbd_12_sub_pixel_avg_variance8x16_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance8x32 aom_highbd_12_sub_pixel_avg_variance8x32_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance8x32 aom_highbd_12_sub_pixel_avg_variance8x32_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance8x4 aom_highbd_12_sub_pixel_avg_variance8x4_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance8x4 aom_highbd_12_sub_pixel_avg_variance8x4_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance8x8 aom_highbd_12_sub_pixel_avg_variance8x8_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance8x8 aom_highbd_12_sub_pixel_avg_variance8x8_neon
uint32_t aom_highbd_12_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance128x128 aom_highbd_12_sub_pixel_variance128x128_c
+uint32_t aom_highbd_12_sub_pixel_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance128x128 aom_highbd_12_sub_pixel_variance128x128_neon
uint32_t aom_highbd_12_sub_pixel_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance128x64 aom_highbd_12_sub_pixel_variance128x64_c
+uint32_t aom_highbd_12_sub_pixel_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance128x64 aom_highbd_12_sub_pixel_variance128x64_neon
uint32_t aom_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance16x16 aom_highbd_12_sub_pixel_variance16x16_c
+uint32_t aom_highbd_12_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance16x16 aom_highbd_12_sub_pixel_variance16x16_neon
uint32_t aom_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance16x32 aom_highbd_12_sub_pixel_variance16x32_c
+uint32_t aom_highbd_12_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance16x32 aom_highbd_12_sub_pixel_variance16x32_neon
uint32_t aom_highbd_12_sub_pixel_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance16x4 aom_highbd_12_sub_pixel_variance16x4_c
+uint32_t aom_highbd_12_sub_pixel_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance16x4 aom_highbd_12_sub_pixel_variance16x4_neon
uint32_t aom_highbd_12_sub_pixel_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance16x64 aom_highbd_12_sub_pixel_variance16x64_c
+uint32_t aom_highbd_12_sub_pixel_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance16x64 aom_highbd_12_sub_pixel_variance16x64_neon
uint32_t aom_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance16x8 aom_highbd_12_sub_pixel_variance16x8_c
+uint32_t aom_highbd_12_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance16x8 aom_highbd_12_sub_pixel_variance16x8_neon
uint32_t aom_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance32x16 aom_highbd_12_sub_pixel_variance32x16_c
+uint32_t aom_highbd_12_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance32x16 aom_highbd_12_sub_pixel_variance32x16_neon
uint32_t aom_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance32x32 aom_highbd_12_sub_pixel_variance32x32_c
+uint32_t aom_highbd_12_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance32x32 aom_highbd_12_sub_pixel_variance32x32_neon
uint32_t aom_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance32x64 aom_highbd_12_sub_pixel_variance32x64_c
+uint32_t aom_highbd_12_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance32x64 aom_highbd_12_sub_pixel_variance32x64_neon
uint32_t aom_highbd_12_sub_pixel_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance32x8 aom_highbd_12_sub_pixel_variance32x8_c
+uint32_t aom_highbd_12_sub_pixel_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance32x8 aom_highbd_12_sub_pixel_variance32x8_neon
uint32_t aom_highbd_12_sub_pixel_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance4x16 aom_highbd_12_sub_pixel_variance4x16_c
+uint32_t aom_highbd_12_sub_pixel_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance4x16 aom_highbd_12_sub_pixel_variance4x16_neon
uint32_t aom_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance4x4 aom_highbd_12_sub_pixel_variance4x4_c
+uint32_t aom_highbd_12_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance4x4 aom_highbd_12_sub_pixel_variance4x4_neon
uint32_t aom_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance4x8 aom_highbd_12_sub_pixel_variance4x8_c
+uint32_t aom_highbd_12_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance4x8 aom_highbd_12_sub_pixel_variance4x8_neon
uint32_t aom_highbd_12_sub_pixel_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance64x128 aom_highbd_12_sub_pixel_variance64x128_c
+uint32_t aom_highbd_12_sub_pixel_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance64x128 aom_highbd_12_sub_pixel_variance64x128_neon
uint32_t aom_highbd_12_sub_pixel_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance64x16 aom_highbd_12_sub_pixel_variance64x16_c
+uint32_t aom_highbd_12_sub_pixel_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance64x16 aom_highbd_12_sub_pixel_variance64x16_neon
uint32_t aom_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance64x32 aom_highbd_12_sub_pixel_variance64x32_c
+uint32_t aom_highbd_12_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance64x32 aom_highbd_12_sub_pixel_variance64x32_neon
uint32_t aom_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance64x64 aom_highbd_12_sub_pixel_variance64x64_c
+uint32_t aom_highbd_12_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance64x64 aom_highbd_12_sub_pixel_variance64x64_neon
uint32_t aom_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance8x16 aom_highbd_12_sub_pixel_variance8x16_c
+uint32_t aom_highbd_12_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance8x16 aom_highbd_12_sub_pixel_variance8x16_neon
uint32_t aom_highbd_12_sub_pixel_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance8x32 aom_highbd_12_sub_pixel_variance8x32_c
+uint32_t aom_highbd_12_sub_pixel_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance8x32 aom_highbd_12_sub_pixel_variance8x32_neon
uint32_t aom_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance8x4 aom_highbd_12_sub_pixel_variance8x4_c
+uint32_t aom_highbd_12_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance8x4 aom_highbd_12_sub_pixel_variance8x4_neon
uint32_t aom_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance8x8 aom_highbd_12_sub_pixel_variance8x8_c
+uint32_t aom_highbd_12_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance8x8 aom_highbd_12_sub_pixel_variance8x8_neon
-unsigned int aom_highbd_12_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance128x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance128x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance128x128 aom_highbd_12_variance128x128_neon
-unsigned int aom_highbd_12_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance128x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance128x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance128x64 aom_highbd_12_variance128x64_neon
-unsigned int aom_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x16 aom_highbd_12_variance16x16_neon
-unsigned int aom_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x32 aom_highbd_12_variance16x32_neon
-unsigned int aom_highbd_12_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance16x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance16x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x4 aom_highbd_12_variance16x4_neon
-unsigned int aom_highbd_12_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance16x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance16x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x64 aom_highbd_12_variance16x64_neon
-unsigned int aom_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x8 aom_highbd_12_variance16x8_neon
-unsigned int aom_highbd_12_variance2x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_12_variance2x2 aom_highbd_12_variance2x2_c
-
-unsigned int aom_highbd_12_variance2x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_12_variance2x4 aom_highbd_12_variance2x4_c
-
-unsigned int aom_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance32x16 aom_highbd_12_variance32x16_neon
-unsigned int aom_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance32x32 aom_highbd_12_variance32x32_neon
-unsigned int aom_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance32x64 aom_highbd_12_variance32x64_neon
-unsigned int aom_highbd_12_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance32x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance32x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance32x8 aom_highbd_12_variance32x8_neon
-unsigned int aom_highbd_12_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance4x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance4x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance4x16 aom_highbd_12_variance4x16_neon
-unsigned int aom_highbd_12_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_12_variance4x2 aom_highbd_12_variance4x2_c
-
-unsigned int aom_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance4x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance4x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance4x4 aom_highbd_12_variance4x4_neon
-unsigned int aom_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance4x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance4x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance4x8 aom_highbd_12_variance4x8_neon
-unsigned int aom_highbd_12_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance64x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance64x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance64x128 aom_highbd_12_variance64x128_neon
-unsigned int aom_highbd_12_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance64x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance64x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance64x16 aom_highbd_12_variance64x16_neon
-unsigned int aom_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance64x32 aom_highbd_12_variance64x32_neon
-unsigned int aom_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance64x64 aom_highbd_12_variance64x64_neon
-unsigned int aom_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance8x16 aom_highbd_12_variance8x16_neon
-unsigned int aom_highbd_12_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance8x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance8x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance8x32 aom_highbd_12_variance8x32_neon
-unsigned int aom_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance8x4 aom_highbd_12_variance8x4_neon
-unsigned int aom_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance8x8 aom_highbd_12_variance8x8_neon
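Every kernel in this generated header follows the same dispatch convention: a _c reference implementation is always declared, an architecture-specific _neon variant is declared where one exists, and the unsuffixed public name is bound by #define to the preferred variant at configure time. The self-contained sketch below illustrates that convention with a hypothetical my_sse8x8 kernel — it is not a libaom symbol, just a minimal stand-in — and guards the NEON path so the file still builds and runs on non-Arm hosts.

/*
 * Minimal sketch of the _c / _neon / #define binding pattern used in
 * this header. my_sse8x8 is hypothetical; the real libaom kernels are
 * selected the same way by the generated RTCD header.
 */
#include <stdint.h>
#include <stdio.h>

/* Reference implementation: plain C, always available. */
static unsigned int my_sse8x8_c(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride) {
  unsigned int sse = 0;
  for (int r = 0; r < 8; ++r) {
    for (int c = 0; c < 8; ++c) {
      const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
      sse += (unsigned int)(d * d);
    }
  }
  return sse;
}

#if defined(__aarch64__)
#include <arm_neon.h>
/* NEON specialization: identical contract to the _c version. */
static unsigned int my_sse8x8_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride) {
  uint32x4_t acc = vdupq_n_u32(0);
  for (int r = 0; r < 8; ++r) {
    const uint8x8_t s = vld1_u8(src + r * src_stride);
    const uint8x8_t f = vld1_u8(ref + r * ref_stride);
    const uint8x8_t d = vabd_u8(s, f);     /* |src - ref| per lane */
    const uint16x8_t sq = vmull_u8(d, d);  /* widening square */
    acc = vpadalq_u16(acc, sq);            /* pairwise-add into u32 acc */
  }
  return vaddvq_u32(acc);                  /* horizontal sum (AArch64) */
}
/* Public name resolves to the best available variant, as in the
 * #define lines above. */
#define my_sse8x8 my_sse8x8_neon
#else
#define my_sse8x8 my_sse8x8_c
#endif

int main(void) {
  uint8_t a[64], b[64];
  for (int i = 0; i < 64; ++i) { a[i] = (uint8_t)i; b[i] = (uint8_t)(i + 1); }
  /* Calls the _neon variant on AArch64 builds, the _c variant elsewhere;
   * callers never name a suffixed symbol directly. */
  printf("sse = %u\n", my_sse8x8(a, 8, b, 8));
  return 0;
}

Because callers only ever use the unsuffixed macro name, a diff like this one can swap a block's binding from _c to _neon (or retype a parameter, as with the uint32_t *sse changes below) without touching any call sites.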
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128 aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128 aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64 aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64 aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16 aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16 aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32 aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32 aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4 aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4 aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64 aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64 aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8 aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8 aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16 aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16 aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32 aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32 aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64 aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64 aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8 aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8 aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16 aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16 aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4 aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4 aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8 aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8 aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128 aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128 aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16 aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16 aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32 aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32 aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64 aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64 aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16 aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16 aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32 aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32 aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4 aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4 aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8 aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8 aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance128x128_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance128x128 aom_highbd_8_masked_sub_pixel_variance128x128_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance128x128_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance128x128 aom_highbd_8_masked_sub_pixel_variance128x128_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance128x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance128x64 aom_highbd_8_masked_sub_pixel_variance128x64_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance128x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance128x64 aom_highbd_8_masked_sub_pixel_variance128x64_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance16x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance16x16 aom_highbd_8_masked_sub_pixel_variance16x16_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance16x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance16x16 aom_highbd_8_masked_sub_pixel_variance16x16_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance16x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance16x32 aom_highbd_8_masked_sub_pixel_variance16x32_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance16x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance16x32 aom_highbd_8_masked_sub_pixel_variance16x32_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance16x4_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance16x4 aom_highbd_8_masked_sub_pixel_variance16x4_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance16x4_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance16x4 aom_highbd_8_masked_sub_pixel_variance16x4_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance16x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance16x64 aom_highbd_8_masked_sub_pixel_variance16x64_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance16x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance16x64 aom_highbd_8_masked_sub_pixel_variance16x64_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance16x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance16x8 aom_highbd_8_masked_sub_pixel_variance16x8_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance16x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance16x8 aom_highbd_8_masked_sub_pixel_variance16x8_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance32x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance32x16 aom_highbd_8_masked_sub_pixel_variance32x16_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance32x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance32x16 aom_highbd_8_masked_sub_pixel_variance32x16_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance32x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance32x32 aom_highbd_8_masked_sub_pixel_variance32x32_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance32x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance32x32 aom_highbd_8_masked_sub_pixel_variance32x32_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance32x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance32x64 aom_highbd_8_masked_sub_pixel_variance32x64_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance32x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance32x64 aom_highbd_8_masked_sub_pixel_variance32x64_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance32x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance32x8 aom_highbd_8_masked_sub_pixel_variance32x8_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance32x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance32x8 aom_highbd_8_masked_sub_pixel_variance32x8_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance4x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance4x16 aom_highbd_8_masked_sub_pixel_variance4x16_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance4x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance4x16 aom_highbd_8_masked_sub_pixel_variance4x16_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance4x4_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance4x4 aom_highbd_8_masked_sub_pixel_variance4x4_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance4x4_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance4x4 aom_highbd_8_masked_sub_pixel_variance4x4_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance4x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance4x8 aom_highbd_8_masked_sub_pixel_variance4x8_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance4x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance4x8 aom_highbd_8_masked_sub_pixel_variance4x8_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance64x128_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance64x128 aom_highbd_8_masked_sub_pixel_variance64x128_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance64x128_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance64x128 aom_highbd_8_masked_sub_pixel_variance64x128_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance64x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance64x16 aom_highbd_8_masked_sub_pixel_variance64x16_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance64x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance64x16 aom_highbd_8_masked_sub_pixel_variance64x16_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance64x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance64x32 aom_highbd_8_masked_sub_pixel_variance64x32_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance64x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance64x32 aom_highbd_8_masked_sub_pixel_variance64x32_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance64x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance64x64 aom_highbd_8_masked_sub_pixel_variance64x64_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance64x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance64x64 aom_highbd_8_masked_sub_pixel_variance64x64_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance8x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance8x16 aom_highbd_8_masked_sub_pixel_variance8x16_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance8x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance8x16 aom_highbd_8_masked_sub_pixel_variance8x16_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance8x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance8x32 aom_highbd_8_masked_sub_pixel_variance8x32_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance8x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance8x32 aom_highbd_8_masked_sub_pixel_variance8x32_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance8x4_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance8x4 aom_highbd_8_masked_sub_pixel_variance8x4_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance8x4_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance8x4 aom_highbd_8_masked_sub_pixel_variance8x4_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance8x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance8x8 aom_highbd_8_masked_sub_pixel_variance8x8_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance8x8 aom_highbd_8_masked_sub_pixel_variance8x8_neon
unsigned int aom_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int aom_highbd_8_mse16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
@@ -1817,233 +2156,444 @@ unsigned int aom_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, c
unsigned int aom_highbd_8_mse8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
#define aom_highbd_8_mse8x8 aom_highbd_8_mse8x8_neon
+unsigned int aom_highbd_8_obmc_sub_pixel_variance128x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance128x128_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance128x128 aom_highbd_8_obmc_sub_pixel_variance128x128_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance128x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance128x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance128x64 aom_highbd_8_obmc_sub_pixel_variance128x64_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x16 aom_highbd_8_obmc_sub_pixel_variance16x16_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x32 aom_highbd_8_obmc_sub_pixel_variance16x32_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x4_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x4 aom_highbd_8_obmc_sub_pixel_variance16x4_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x64 aom_highbd_8_obmc_sub_pixel_variance16x64_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x8 aom_highbd_8_obmc_sub_pixel_variance16x8_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance32x16 aom_highbd_8_obmc_sub_pixel_variance32x16_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance32x32 aom_highbd_8_obmc_sub_pixel_variance32x32_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance32x64 aom_highbd_8_obmc_sub_pixel_variance32x64_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance32x8 aom_highbd_8_obmc_sub_pixel_variance32x8_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance4x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance4x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance4x16 aom_highbd_8_obmc_sub_pixel_variance4x16_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance4x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance4x4_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance4x4 aom_highbd_8_obmc_sub_pixel_variance4x4_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance4x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance4x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance4x8 aom_highbd_8_obmc_sub_pixel_variance4x8_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x128_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance64x128 aom_highbd_8_obmc_sub_pixel_variance64x128_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance64x16 aom_highbd_8_obmc_sub_pixel_variance64x16_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance64x32 aom_highbd_8_obmc_sub_pixel_variance64x32_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance64x64 aom_highbd_8_obmc_sub_pixel_variance64x64_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance8x16 aom_highbd_8_obmc_sub_pixel_variance8x16_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance8x32 aom_highbd_8_obmc_sub_pixel_variance8x32_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x4_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance8x4 aom_highbd_8_obmc_sub_pixel_variance8x4_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance8x8 aom_highbd_8_obmc_sub_pixel_variance8x8_neon
+
+unsigned int aom_highbd_8_obmc_variance128x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance128x128_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance128x128 aom_highbd_8_obmc_variance128x128_neon
+
+unsigned int aom_highbd_8_obmc_variance128x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance128x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance128x64 aom_highbd_8_obmc_variance128x64_neon
+
+unsigned int aom_highbd_8_obmc_variance16x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance16x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x16 aom_highbd_8_obmc_variance16x16_neon
+
+unsigned int aom_highbd_8_obmc_variance16x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance16x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x32 aom_highbd_8_obmc_variance16x32_neon
+
+unsigned int aom_highbd_8_obmc_variance16x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance16x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x4 aom_highbd_8_obmc_variance16x4_neon
+
+unsigned int aom_highbd_8_obmc_variance16x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance16x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x64 aom_highbd_8_obmc_variance16x64_neon
+
+unsigned int aom_highbd_8_obmc_variance16x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance16x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x8 aom_highbd_8_obmc_variance16x8_neon
+
+unsigned int aom_highbd_8_obmc_variance32x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance32x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance32x16 aom_highbd_8_obmc_variance32x16_neon
+
+unsigned int aom_highbd_8_obmc_variance32x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance32x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance32x32 aom_highbd_8_obmc_variance32x32_neon
+
+unsigned int aom_highbd_8_obmc_variance32x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance32x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance32x64 aom_highbd_8_obmc_variance32x64_neon
+
+unsigned int aom_highbd_8_obmc_variance32x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance32x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance32x8 aom_highbd_8_obmc_variance32x8_neon
+
+unsigned int aom_highbd_8_obmc_variance4x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance4x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance4x16 aom_highbd_8_obmc_variance4x16_neon
+
+unsigned int aom_highbd_8_obmc_variance4x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance4x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance4x4 aom_highbd_8_obmc_variance4x4_neon
+
+unsigned int aom_highbd_8_obmc_variance4x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance4x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance4x8 aom_highbd_8_obmc_variance4x8_neon
+
+unsigned int aom_highbd_8_obmc_variance64x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance64x128_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance64x128 aom_highbd_8_obmc_variance64x128_neon
+
+unsigned int aom_highbd_8_obmc_variance64x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance64x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance64x16 aom_highbd_8_obmc_variance64x16_neon
+
+unsigned int aom_highbd_8_obmc_variance64x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance64x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance64x32 aom_highbd_8_obmc_variance64x32_neon
+
+unsigned int aom_highbd_8_obmc_variance64x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance64x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance64x64 aom_highbd_8_obmc_variance64x64_neon
+
+unsigned int aom_highbd_8_obmc_variance8x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance8x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance8x16 aom_highbd_8_obmc_variance8x16_neon
+
+unsigned int aom_highbd_8_obmc_variance8x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance8x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance8x32 aom_highbd_8_obmc_variance8x32_neon
+
+unsigned int aom_highbd_8_obmc_variance8x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance8x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance8x4 aom_highbd_8_obmc_variance8x4_neon
+
+unsigned int aom_highbd_8_obmc_variance8x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance8x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance8x8 aom_highbd_8_obmc_variance8x8_neon
+
uint32_t aom_highbd_8_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance128x128 aom_highbd_8_sub_pixel_avg_variance128x128_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance128x128 aom_highbd_8_sub_pixel_avg_variance128x128_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance128x64 aom_highbd_8_sub_pixel_avg_variance128x64_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance128x64 aom_highbd_8_sub_pixel_avg_variance128x64_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance16x16 aom_highbd_8_sub_pixel_avg_variance16x16_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance16x16 aom_highbd_8_sub_pixel_avg_variance16x16_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance16x32 aom_highbd_8_sub_pixel_avg_variance16x32_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance16x32 aom_highbd_8_sub_pixel_avg_variance16x32_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance16x4 aom_highbd_8_sub_pixel_avg_variance16x4_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance16x4 aom_highbd_8_sub_pixel_avg_variance16x4_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance16x64 aom_highbd_8_sub_pixel_avg_variance16x64_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance16x64 aom_highbd_8_sub_pixel_avg_variance16x64_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance16x8 aom_highbd_8_sub_pixel_avg_variance16x8_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance16x8 aom_highbd_8_sub_pixel_avg_variance16x8_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance32x16 aom_highbd_8_sub_pixel_avg_variance32x16_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance32x16 aom_highbd_8_sub_pixel_avg_variance32x16_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance32x32 aom_highbd_8_sub_pixel_avg_variance32x32_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance32x32 aom_highbd_8_sub_pixel_avg_variance32x32_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance32x64 aom_highbd_8_sub_pixel_avg_variance32x64_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance32x64 aom_highbd_8_sub_pixel_avg_variance32x64_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance32x8 aom_highbd_8_sub_pixel_avg_variance32x8_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance32x8 aom_highbd_8_sub_pixel_avg_variance32x8_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance4x16 aom_highbd_8_sub_pixel_avg_variance4x16_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance4x16 aom_highbd_8_sub_pixel_avg_variance4x16_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance4x4 aom_highbd_8_sub_pixel_avg_variance4x4_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance4x4 aom_highbd_8_sub_pixel_avg_variance4x4_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance4x8 aom_highbd_8_sub_pixel_avg_variance4x8_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance4x8 aom_highbd_8_sub_pixel_avg_variance4x8_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance64x128 aom_highbd_8_sub_pixel_avg_variance64x128_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance64x128 aom_highbd_8_sub_pixel_avg_variance64x128_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance64x16 aom_highbd_8_sub_pixel_avg_variance64x16_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance64x16 aom_highbd_8_sub_pixel_avg_variance64x16_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance64x32 aom_highbd_8_sub_pixel_avg_variance64x32_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance64x32 aom_highbd_8_sub_pixel_avg_variance64x32_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance64x64 aom_highbd_8_sub_pixel_avg_variance64x64_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance64x64 aom_highbd_8_sub_pixel_avg_variance64x64_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance8x16 aom_highbd_8_sub_pixel_avg_variance8x16_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance8x16 aom_highbd_8_sub_pixel_avg_variance8x16_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance8x32 aom_highbd_8_sub_pixel_avg_variance8x32_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance8x32 aom_highbd_8_sub_pixel_avg_variance8x32_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance8x4 aom_highbd_8_sub_pixel_avg_variance8x4_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance8x4 aom_highbd_8_sub_pixel_avg_variance8x4_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance8x8 aom_highbd_8_sub_pixel_avg_variance8x8_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance8x8 aom_highbd_8_sub_pixel_avg_variance8x8_neon
uint32_t aom_highbd_8_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance128x128 aom_highbd_8_sub_pixel_variance128x128_c
+uint32_t aom_highbd_8_sub_pixel_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance128x128 aom_highbd_8_sub_pixel_variance128x128_neon
uint32_t aom_highbd_8_sub_pixel_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance128x64 aom_highbd_8_sub_pixel_variance128x64_c
+uint32_t aom_highbd_8_sub_pixel_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance128x64 aom_highbd_8_sub_pixel_variance128x64_neon
uint32_t aom_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance16x16 aom_highbd_8_sub_pixel_variance16x16_c
+uint32_t aom_highbd_8_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance16x16 aom_highbd_8_sub_pixel_variance16x16_neon
uint32_t aom_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance16x32 aom_highbd_8_sub_pixel_variance16x32_c
+uint32_t aom_highbd_8_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance16x32 aom_highbd_8_sub_pixel_variance16x32_neon
uint32_t aom_highbd_8_sub_pixel_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance16x4 aom_highbd_8_sub_pixel_variance16x4_c
+uint32_t aom_highbd_8_sub_pixel_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance16x4 aom_highbd_8_sub_pixel_variance16x4_neon
uint32_t aom_highbd_8_sub_pixel_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance16x64 aom_highbd_8_sub_pixel_variance16x64_c
+uint32_t aom_highbd_8_sub_pixel_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance16x64 aom_highbd_8_sub_pixel_variance16x64_neon
uint32_t aom_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance16x8 aom_highbd_8_sub_pixel_variance16x8_c
+uint32_t aom_highbd_8_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance16x8 aom_highbd_8_sub_pixel_variance16x8_neon
uint32_t aom_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance32x16 aom_highbd_8_sub_pixel_variance32x16_c
+uint32_t aom_highbd_8_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance32x16 aom_highbd_8_sub_pixel_variance32x16_neon
uint32_t aom_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance32x32 aom_highbd_8_sub_pixel_variance32x32_c
+uint32_t aom_highbd_8_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance32x32 aom_highbd_8_sub_pixel_variance32x32_neon
uint32_t aom_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance32x64 aom_highbd_8_sub_pixel_variance32x64_c
+uint32_t aom_highbd_8_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance32x64 aom_highbd_8_sub_pixel_variance32x64_neon
uint32_t aom_highbd_8_sub_pixel_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance32x8 aom_highbd_8_sub_pixel_variance32x8_c
+uint32_t aom_highbd_8_sub_pixel_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance32x8 aom_highbd_8_sub_pixel_variance32x8_neon
uint32_t aom_highbd_8_sub_pixel_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance4x16 aom_highbd_8_sub_pixel_variance4x16_c
+uint32_t aom_highbd_8_sub_pixel_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance4x16 aom_highbd_8_sub_pixel_variance4x16_neon
uint32_t aom_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance4x4 aom_highbd_8_sub_pixel_variance4x4_c
+uint32_t aom_highbd_8_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance4x4 aom_highbd_8_sub_pixel_variance4x4_neon
uint32_t aom_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance4x8 aom_highbd_8_sub_pixel_variance4x8_c
+uint32_t aom_highbd_8_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance4x8 aom_highbd_8_sub_pixel_variance4x8_neon
uint32_t aom_highbd_8_sub_pixel_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance64x128 aom_highbd_8_sub_pixel_variance64x128_c
+uint32_t aom_highbd_8_sub_pixel_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance64x128 aom_highbd_8_sub_pixel_variance64x128_neon
uint32_t aom_highbd_8_sub_pixel_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance64x16 aom_highbd_8_sub_pixel_variance64x16_c
+uint32_t aom_highbd_8_sub_pixel_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance64x16 aom_highbd_8_sub_pixel_variance64x16_neon
uint32_t aom_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance64x32 aom_highbd_8_sub_pixel_variance64x32_c
+uint32_t aom_highbd_8_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance64x32 aom_highbd_8_sub_pixel_variance64x32_neon
uint32_t aom_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance64x64 aom_highbd_8_sub_pixel_variance64x64_c
+uint32_t aom_highbd_8_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance64x64 aom_highbd_8_sub_pixel_variance64x64_neon
uint32_t aom_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance8x16 aom_highbd_8_sub_pixel_variance8x16_c
+uint32_t aom_highbd_8_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance8x16 aom_highbd_8_sub_pixel_variance8x16_neon
uint32_t aom_highbd_8_sub_pixel_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance8x32 aom_highbd_8_sub_pixel_variance8x32_c
+uint32_t aom_highbd_8_sub_pixel_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance8x32 aom_highbd_8_sub_pixel_variance8x32_neon
uint32_t aom_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance8x4 aom_highbd_8_sub_pixel_variance8x4_c
+uint32_t aom_highbd_8_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance8x4 aom_highbd_8_sub_pixel_variance8x4_neon
uint32_t aom_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance8x8 aom_highbd_8_sub_pixel_variance8x8_c
+uint32_t aom_highbd_8_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance8x8 aom_highbd_8_sub_pixel_variance8x8_neon
-unsigned int aom_highbd_8_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance128x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance128x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance128x128 aom_highbd_8_variance128x128_neon
-unsigned int aom_highbd_8_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance128x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance128x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance128x64 aom_highbd_8_variance128x64_neon
-unsigned int aom_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x16 aom_highbd_8_variance16x16_neon
-unsigned int aom_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x32 aom_highbd_8_variance16x32_neon
-unsigned int aom_highbd_8_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance16x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance16x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x4 aom_highbd_8_variance16x4_neon
-unsigned int aom_highbd_8_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance16x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance16x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x64 aom_highbd_8_variance16x64_neon
-unsigned int aom_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x8 aom_highbd_8_variance16x8_neon
-unsigned int aom_highbd_8_variance2x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_8_variance2x2 aom_highbd_8_variance2x2_c
-
-unsigned int aom_highbd_8_variance2x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_8_variance2x4 aom_highbd_8_variance2x4_c
-
-unsigned int aom_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance32x16 aom_highbd_8_variance32x16_neon
-unsigned int aom_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance32x32 aom_highbd_8_variance32x32_neon
-unsigned int aom_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance32x64 aom_highbd_8_variance32x64_neon
-unsigned int aom_highbd_8_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance32x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance32x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance32x8 aom_highbd_8_variance32x8_neon
-unsigned int aom_highbd_8_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance4x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance4x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance4x16 aom_highbd_8_variance4x16_neon
-unsigned int aom_highbd_8_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_8_variance4x2 aom_highbd_8_variance4x2_c
-
-unsigned int aom_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance4x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance4x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance4x4 aom_highbd_8_variance4x4_neon
-unsigned int aom_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance4x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance4x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance4x8 aom_highbd_8_variance4x8_neon
-unsigned int aom_highbd_8_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance64x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance64x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance64x128 aom_highbd_8_variance64x128_neon
-unsigned int aom_highbd_8_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance64x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance64x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance64x16 aom_highbd_8_variance64x16_neon
-unsigned int aom_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance64x32 aom_highbd_8_variance64x32_neon
-unsigned int aom_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance64x64 aom_highbd_8_variance64x64_neon
-unsigned int aom_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance8x16 aom_highbd_8_variance8x16_neon
-unsigned int aom_highbd_8_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance8x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance8x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance8x32 aom_highbd_8_variance8x32_neon
-unsigned int aom_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance8x4 aom_highbd_8_variance8x4_neon
-unsigned int aom_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance8x8 aom_highbd_8_variance8x8_neon
unsigned int aom_highbd_avg_4x4_c(const uint8_t *, int p);
@@ -2055,31 +2605,40 @@ unsigned int aom_highbd_avg_8x8_neon(const uint8_t *, int p);
#define aom_highbd_avg_8x8 aom_highbd_avg_8x8_neon
void aom_highbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd);
-#define aom_highbd_blend_a64_d16_mask aom_highbd_blend_a64_d16_mask_c
+void aom_highbd_blend_a64_d16_mask_neon(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd);
+#define aom_highbd_blend_a64_d16_mask aom_highbd_blend_a64_d16_mask_neon
void aom_highbd_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
-#define aom_highbd_blend_a64_hmask aom_highbd_blend_a64_hmask_c
+void aom_highbd_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+#define aom_highbd_blend_a64_hmask aom_highbd_blend_a64_hmask_neon
void aom_highbd_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, int bd);
-#define aom_highbd_blend_a64_mask aom_highbd_blend_a64_mask_c
+void aom_highbd_blend_a64_mask_neon(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, int bd);
+#define aom_highbd_blend_a64_mask aom_highbd_blend_a64_mask_neon
void aom_highbd_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
-#define aom_highbd_blend_a64_vmask aom_highbd_blend_a64_vmask_c
+void aom_highbd_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+#define aom_highbd_blend_a64_vmask aom_highbd_blend_a64_vmask_neon
void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
-#define aom_highbd_comp_avg_pred aom_highbd_comp_avg_pred_c
+void aom_highbd_comp_avg_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
+#define aom_highbd_comp_avg_pred aom_highbd_comp_avg_pred_neon
void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
-#define aom_highbd_comp_mask_pred aom_highbd_comp_mask_pred_c
+void aom_highbd_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
+#define aom_highbd_comp_mask_pred aom_highbd_comp_mask_pred_neon
void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd);
-#define aom_highbd_convolve8_horiz aom_highbd_convolve8_horiz_c
+void aom_highbd_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd);
+#define aom_highbd_convolve8_horiz aom_highbd_convolve8_horiz_neon
void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd);
-#define aom_highbd_convolve8_vert aom_highbd_convolve8_vert_c
+void aom_highbd_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd);
+#define aom_highbd_convolve8_vert aom_highbd_convolve8_vert_neon
void aom_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h);
-#define aom_highbd_convolve_copy aom_highbd_convolve_copy_c
+void aom_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h);
+#define aom_highbd_convolve_copy aom_highbd_convolve_copy_neon
void aom_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
void aom_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
@@ -2386,7 +2945,8 @@ void aom_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, con
#define aom_highbd_dc_top_predictor_8x8 aom_highbd_dc_top_predictor_8x8_neon
void aom_highbd_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_highbd_dist_wtd_comp_avg_pred aom_highbd_dist_wtd_comp_avg_pred_c
+void aom_highbd_dist_wtd_comp_avg_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_highbd_dist_wtd_comp_avg_pred aom_highbd_dist_wtd_comp_avg_pred_neon
unsigned int aom_highbd_dist_wtd_sad128x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
#define aom_highbd_dist_wtd_sad128x128_avg aom_highbd_dist_wtd_sad128x128_avg_c
@@ -2607,272 +3167,184 @@ void aom_highbd_lpf_vertical_8_dual_neon(uint16_t *s, int pitch, const uint8_t *
#define aom_highbd_lpf_vertical_8_dual aom_highbd_lpf_vertical_8_dual_neon
unsigned int aom_highbd_masked_sad128x128_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad128x128 aom_highbd_masked_sad128x128_c
+unsigned int aom_highbd_masked_sad128x128_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad128x128 aom_highbd_masked_sad128x128_neon
unsigned int aom_highbd_masked_sad128x64_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad128x64 aom_highbd_masked_sad128x64_c
+unsigned int aom_highbd_masked_sad128x64_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad128x64 aom_highbd_masked_sad128x64_neon
unsigned int aom_highbd_masked_sad16x16_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad16x16 aom_highbd_masked_sad16x16_c
+unsigned int aom_highbd_masked_sad16x16_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad16x16 aom_highbd_masked_sad16x16_neon
unsigned int aom_highbd_masked_sad16x32_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad16x32 aom_highbd_masked_sad16x32_c
+unsigned int aom_highbd_masked_sad16x32_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad16x32 aom_highbd_masked_sad16x32_neon
unsigned int aom_highbd_masked_sad16x4_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad16x4 aom_highbd_masked_sad16x4_c
+unsigned int aom_highbd_masked_sad16x4_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad16x4 aom_highbd_masked_sad16x4_neon
unsigned int aom_highbd_masked_sad16x64_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad16x64 aom_highbd_masked_sad16x64_c
+unsigned int aom_highbd_masked_sad16x64_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad16x64 aom_highbd_masked_sad16x64_neon
unsigned int aom_highbd_masked_sad16x8_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad16x8 aom_highbd_masked_sad16x8_c
+unsigned int aom_highbd_masked_sad16x8_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad16x8 aom_highbd_masked_sad16x8_neon
unsigned int aom_highbd_masked_sad32x16_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad32x16 aom_highbd_masked_sad32x16_c
+unsigned int aom_highbd_masked_sad32x16_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad32x16 aom_highbd_masked_sad32x16_neon
unsigned int aom_highbd_masked_sad32x32_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad32x32 aom_highbd_masked_sad32x32_c
+unsigned int aom_highbd_masked_sad32x32_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad32x32 aom_highbd_masked_sad32x32_neon
unsigned int aom_highbd_masked_sad32x64_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad32x64 aom_highbd_masked_sad32x64_c
+unsigned int aom_highbd_masked_sad32x64_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad32x64 aom_highbd_masked_sad32x64_neon
unsigned int aom_highbd_masked_sad32x8_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad32x8 aom_highbd_masked_sad32x8_c
+unsigned int aom_highbd_masked_sad32x8_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad32x8 aom_highbd_masked_sad32x8_neon
unsigned int aom_highbd_masked_sad4x16_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad4x16 aom_highbd_masked_sad4x16_c
+unsigned int aom_highbd_masked_sad4x16_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad4x16 aom_highbd_masked_sad4x16_neon
unsigned int aom_highbd_masked_sad4x4_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad4x4 aom_highbd_masked_sad4x4_c
+unsigned int aom_highbd_masked_sad4x4_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad4x4 aom_highbd_masked_sad4x4_neon
unsigned int aom_highbd_masked_sad4x8_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad4x8 aom_highbd_masked_sad4x8_c
+unsigned int aom_highbd_masked_sad4x8_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad4x8 aom_highbd_masked_sad4x8_neon
unsigned int aom_highbd_masked_sad64x128_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad64x128 aom_highbd_masked_sad64x128_c
+unsigned int aom_highbd_masked_sad64x128_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad64x128 aom_highbd_masked_sad64x128_neon
unsigned int aom_highbd_masked_sad64x16_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad64x16 aom_highbd_masked_sad64x16_c
+unsigned int aom_highbd_masked_sad64x16_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad64x16 aom_highbd_masked_sad64x16_neon
unsigned int aom_highbd_masked_sad64x32_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad64x32 aom_highbd_masked_sad64x32_c
+unsigned int aom_highbd_masked_sad64x32_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad64x32 aom_highbd_masked_sad64x32_neon
unsigned int aom_highbd_masked_sad64x64_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad64x64 aom_highbd_masked_sad64x64_c
+unsigned int aom_highbd_masked_sad64x64_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad64x64 aom_highbd_masked_sad64x64_neon
unsigned int aom_highbd_masked_sad8x16_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad8x16 aom_highbd_masked_sad8x16_c
+unsigned int aom_highbd_masked_sad8x16_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad8x16 aom_highbd_masked_sad8x16_neon
unsigned int aom_highbd_masked_sad8x32_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad8x32 aom_highbd_masked_sad8x32_c
+unsigned int aom_highbd_masked_sad8x32_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad8x32 aom_highbd_masked_sad8x32_neon
unsigned int aom_highbd_masked_sad8x4_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad8x4 aom_highbd_masked_sad8x4_c
+unsigned int aom_highbd_masked_sad8x4_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad8x4 aom_highbd_masked_sad8x4_neon
unsigned int aom_highbd_masked_sad8x8_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad8x8 aom_highbd_masked_sad8x8_c
+unsigned int aom_highbd_masked_sad8x8_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad8x8 aom_highbd_masked_sad8x8_neon
void aom_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
void aom_highbd_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
#define aom_highbd_minmax_8x8 aom_highbd_minmax_8x8_neon
unsigned int aom_highbd_obmc_sad128x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad128x128 aom_highbd_obmc_sad128x128_c
+unsigned int aom_highbd_obmc_sad128x128_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad128x128 aom_highbd_obmc_sad128x128_neon
unsigned int aom_highbd_obmc_sad128x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad128x64 aom_highbd_obmc_sad128x64_c
+unsigned int aom_highbd_obmc_sad128x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad128x64 aom_highbd_obmc_sad128x64_neon
unsigned int aom_highbd_obmc_sad16x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad16x16 aom_highbd_obmc_sad16x16_c
+unsigned int aom_highbd_obmc_sad16x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad16x16 aom_highbd_obmc_sad16x16_neon
unsigned int aom_highbd_obmc_sad16x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad16x32 aom_highbd_obmc_sad16x32_c
+unsigned int aom_highbd_obmc_sad16x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad16x32 aom_highbd_obmc_sad16x32_neon
unsigned int aom_highbd_obmc_sad16x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad16x4 aom_highbd_obmc_sad16x4_c
+unsigned int aom_highbd_obmc_sad16x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad16x4 aom_highbd_obmc_sad16x4_neon
unsigned int aom_highbd_obmc_sad16x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad16x64 aom_highbd_obmc_sad16x64_c
+unsigned int aom_highbd_obmc_sad16x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad16x64 aom_highbd_obmc_sad16x64_neon
unsigned int aom_highbd_obmc_sad16x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad16x8 aom_highbd_obmc_sad16x8_c
+unsigned int aom_highbd_obmc_sad16x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad16x8 aom_highbd_obmc_sad16x8_neon
unsigned int aom_highbd_obmc_sad32x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad32x16 aom_highbd_obmc_sad32x16_c
+unsigned int aom_highbd_obmc_sad32x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad32x16 aom_highbd_obmc_sad32x16_neon
unsigned int aom_highbd_obmc_sad32x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad32x32 aom_highbd_obmc_sad32x32_c
+unsigned int aom_highbd_obmc_sad32x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad32x32 aom_highbd_obmc_sad32x32_neon
unsigned int aom_highbd_obmc_sad32x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad32x64 aom_highbd_obmc_sad32x64_c
+unsigned int aom_highbd_obmc_sad32x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad32x64 aom_highbd_obmc_sad32x64_neon
unsigned int aom_highbd_obmc_sad32x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad32x8 aom_highbd_obmc_sad32x8_c
+unsigned int aom_highbd_obmc_sad32x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad32x8 aom_highbd_obmc_sad32x8_neon
unsigned int aom_highbd_obmc_sad4x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad4x16 aom_highbd_obmc_sad4x16_c
+unsigned int aom_highbd_obmc_sad4x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad4x16 aom_highbd_obmc_sad4x16_neon
unsigned int aom_highbd_obmc_sad4x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad4x4 aom_highbd_obmc_sad4x4_c
+unsigned int aom_highbd_obmc_sad4x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad4x4 aom_highbd_obmc_sad4x4_neon
unsigned int aom_highbd_obmc_sad4x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad4x8 aom_highbd_obmc_sad4x8_c
+unsigned int aom_highbd_obmc_sad4x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad4x8 aom_highbd_obmc_sad4x8_neon
unsigned int aom_highbd_obmc_sad64x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad64x128 aom_highbd_obmc_sad64x128_c
+unsigned int aom_highbd_obmc_sad64x128_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad64x128 aom_highbd_obmc_sad64x128_neon
unsigned int aom_highbd_obmc_sad64x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad64x16 aom_highbd_obmc_sad64x16_c
+unsigned int aom_highbd_obmc_sad64x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad64x16 aom_highbd_obmc_sad64x16_neon
unsigned int aom_highbd_obmc_sad64x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad64x32 aom_highbd_obmc_sad64x32_c
+unsigned int aom_highbd_obmc_sad64x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad64x32 aom_highbd_obmc_sad64x32_neon
unsigned int aom_highbd_obmc_sad64x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad64x64 aom_highbd_obmc_sad64x64_c
+unsigned int aom_highbd_obmc_sad64x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad64x64 aom_highbd_obmc_sad64x64_neon
unsigned int aom_highbd_obmc_sad8x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad8x16 aom_highbd_obmc_sad8x16_c
+unsigned int aom_highbd_obmc_sad8x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad8x16 aom_highbd_obmc_sad8x16_neon
unsigned int aom_highbd_obmc_sad8x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad8x32 aom_highbd_obmc_sad8x32_c
+unsigned int aom_highbd_obmc_sad8x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad8x32 aom_highbd_obmc_sad8x32_neon
unsigned int aom_highbd_obmc_sad8x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad8x4 aom_highbd_obmc_sad8x4_c
+unsigned int aom_highbd_obmc_sad8x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad8x4 aom_highbd_obmc_sad8x4_neon
unsigned int aom_highbd_obmc_sad8x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad8x8 aom_highbd_obmc_sad8x8_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance128x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance128x128 aom_highbd_obmc_sub_pixel_variance128x128_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance128x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance128x64 aom_highbd_obmc_sub_pixel_variance128x64_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x16 aom_highbd_obmc_sub_pixel_variance16x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x32 aom_highbd_obmc_sub_pixel_variance16x32_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x4 aom_highbd_obmc_sub_pixel_variance16x4_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x64 aom_highbd_obmc_sub_pixel_variance16x64_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x8 aom_highbd_obmc_sub_pixel_variance16x8_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance32x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance32x16 aom_highbd_obmc_sub_pixel_variance32x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance32x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance32x32 aom_highbd_obmc_sub_pixel_variance32x32_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance32x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance32x64 aom_highbd_obmc_sub_pixel_variance32x64_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance32x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance32x8 aom_highbd_obmc_sub_pixel_variance32x8_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance4x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance4x16 aom_highbd_obmc_sub_pixel_variance4x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance4x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance4x4 aom_highbd_obmc_sub_pixel_variance4x4_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance4x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance4x8 aom_highbd_obmc_sub_pixel_variance4x8_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance64x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance64x128 aom_highbd_obmc_sub_pixel_variance64x128_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance64x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance64x16 aom_highbd_obmc_sub_pixel_variance64x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance64x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance64x32 aom_highbd_obmc_sub_pixel_variance64x32_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance64x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance64x64 aom_highbd_obmc_sub_pixel_variance64x64_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance8x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance8x16 aom_highbd_obmc_sub_pixel_variance8x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance8x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance8x32 aom_highbd_obmc_sub_pixel_variance8x32_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance8x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance8x4 aom_highbd_obmc_sub_pixel_variance8x4_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance8x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance8x8 aom_highbd_obmc_sub_pixel_variance8x8_c
-
-unsigned int aom_highbd_obmc_variance128x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance128x128 aom_highbd_obmc_variance128x128_c
-
-unsigned int aom_highbd_obmc_variance128x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance128x64 aom_highbd_obmc_variance128x64_c
-
-unsigned int aom_highbd_obmc_variance16x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x16 aom_highbd_obmc_variance16x16_c
-
-unsigned int aom_highbd_obmc_variance16x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x32 aom_highbd_obmc_variance16x32_c
-
-unsigned int aom_highbd_obmc_variance16x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x4 aom_highbd_obmc_variance16x4_c
-
-unsigned int aom_highbd_obmc_variance16x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x64 aom_highbd_obmc_variance16x64_c
-
-unsigned int aom_highbd_obmc_variance16x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x8 aom_highbd_obmc_variance16x8_c
-
-unsigned int aom_highbd_obmc_variance32x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance32x16 aom_highbd_obmc_variance32x16_c
-
-unsigned int aom_highbd_obmc_variance32x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance32x32 aom_highbd_obmc_variance32x32_c
-
-unsigned int aom_highbd_obmc_variance32x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance32x64 aom_highbd_obmc_variance32x64_c
-
-unsigned int aom_highbd_obmc_variance32x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance32x8 aom_highbd_obmc_variance32x8_c
-
-unsigned int aom_highbd_obmc_variance4x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance4x16 aom_highbd_obmc_variance4x16_c
-
-unsigned int aom_highbd_obmc_variance4x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance4x4 aom_highbd_obmc_variance4x4_c
-
-unsigned int aom_highbd_obmc_variance4x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance4x8 aom_highbd_obmc_variance4x8_c
-
-unsigned int aom_highbd_obmc_variance64x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance64x128 aom_highbd_obmc_variance64x128_c
-
-unsigned int aom_highbd_obmc_variance64x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance64x16 aom_highbd_obmc_variance64x16_c
-
-unsigned int aom_highbd_obmc_variance64x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance64x32 aom_highbd_obmc_variance64x32_c
-
-unsigned int aom_highbd_obmc_variance64x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance64x64 aom_highbd_obmc_variance64x64_c
-
-unsigned int aom_highbd_obmc_variance8x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance8x16 aom_highbd_obmc_variance8x16_c
-
-unsigned int aom_highbd_obmc_variance8x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance8x32 aom_highbd_obmc_variance8x32_c
-
-unsigned int aom_highbd_obmc_variance8x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance8x4 aom_highbd_obmc_variance8x4_c
-
-unsigned int aom_highbd_obmc_variance8x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance8x8 aom_highbd_obmc_variance8x8_c
+unsigned int aom_highbd_obmc_sad8x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad8x8 aom_highbd_obmc_sad8x8_neon
void aom_highbd_paeth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
void aom_highbd_paeth_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
@@ -2979,13 +3451,15 @@ unsigned int aom_highbd_sad128x128_neon(const uint8_t *src_ptr, int src_stride,
#define aom_highbd_sad128x128 aom_highbd_sad128x128_neon
unsigned int aom_highbd_sad128x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad128x128_avg aom_highbd_sad128x128_avg_c
+unsigned int aom_highbd_sad128x128_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad128x128_avg aom_highbd_sad128x128_avg_neon
-void aom_highbd_sad128x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad128x128x3d aom_highbd_sad128x128x3d_c
+void aom_highbd_sad128x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad128x128x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad128x128x3d aom_highbd_sad128x128x3d_neon
-void aom_highbd_sad128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad128x128x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad128x128x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad128x128x4d aom_highbd_sad128x128x4d_neon
unsigned int aom_highbd_sad128x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2993,13 +3467,15 @@ unsigned int aom_highbd_sad128x64_neon(const uint8_t *src_ptr, int src_stride, c
#define aom_highbd_sad128x64 aom_highbd_sad128x64_neon
unsigned int aom_highbd_sad128x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad128x64_avg aom_highbd_sad128x64_avg_c
+unsigned int aom_highbd_sad128x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad128x64_avg aom_highbd_sad128x64_avg_neon
-void aom_highbd_sad128x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad128x64x3d aom_highbd_sad128x64x3d_c
+void aom_highbd_sad128x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad128x64x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad128x64x3d aom_highbd_sad128x64x3d_neon
-void aom_highbd_sad128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad128x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad128x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad128x64x4d aom_highbd_sad128x64x4d_neon
unsigned int aom_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3007,13 +3483,15 @@ unsigned int aom_highbd_sad16x16_neon(const uint8_t *src_ptr, int src_stride, co
#define aom_highbd_sad16x16 aom_highbd_sad16x16_neon
unsigned int aom_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad16x16_avg aom_highbd_sad16x16_avg_c
+unsigned int aom_highbd_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad16x16_avg aom_highbd_sad16x16_avg_neon
-void aom_highbd_sad16x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad16x16x3d aom_highbd_sad16x16x3d_c
+void aom_highbd_sad16x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x16x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad16x16x3d aom_highbd_sad16x16x3d_neon
-void aom_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x16x4d aom_highbd_sad16x16x4d_neon
unsigned int aom_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3021,13 +3499,15 @@ unsigned int aom_highbd_sad16x32_neon(const uint8_t *src_ptr, int src_stride, co
#define aom_highbd_sad16x32 aom_highbd_sad16x32_neon
unsigned int aom_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad16x32_avg aom_highbd_sad16x32_avg_c
+unsigned int aom_highbd_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad16x32_avg aom_highbd_sad16x32_avg_neon
-void aom_highbd_sad16x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad16x32x3d aom_highbd_sad16x32x3d_c
+void aom_highbd_sad16x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x32x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad16x32x3d aom_highbd_sad16x32x3d_neon
-void aom_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x32x4d aom_highbd_sad16x32x4d_neon
unsigned int aom_highbd_sad16x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3035,13 +3515,15 @@ unsigned int aom_highbd_sad16x4_neon(const uint8_t *src_ptr, int src_stride, con
#define aom_highbd_sad16x4 aom_highbd_sad16x4_neon
unsigned int aom_highbd_sad16x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad16x4_avg aom_highbd_sad16x4_avg_c
+unsigned int aom_highbd_sad16x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad16x4_avg aom_highbd_sad16x4_avg_neon
-void aom_highbd_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad16x4x3d aom_highbd_sad16x4x3d_c
+void aom_highbd_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x4x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad16x4x3d aom_highbd_sad16x4x3d_neon
-void aom_highbd_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad16x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x4x4d aom_highbd_sad16x4x4d_neon
unsigned int aom_highbd_sad16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3049,13 +3531,15 @@ unsigned int aom_highbd_sad16x64_neon(const uint8_t *src_ptr, int src_stride, co
#define aom_highbd_sad16x64 aom_highbd_sad16x64_neon
unsigned int aom_highbd_sad16x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad16x64_avg aom_highbd_sad16x64_avg_c
+unsigned int aom_highbd_sad16x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad16x64_avg aom_highbd_sad16x64_avg_neon
-void aom_highbd_sad16x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad16x64x3d aom_highbd_sad16x64x3d_c
+void aom_highbd_sad16x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x64x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad16x64x3d aom_highbd_sad16x64x3d_neon
-void aom_highbd_sad16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad16x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x64x4d aom_highbd_sad16x64x4d_neon
unsigned int aom_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3063,13 +3547,15 @@ unsigned int aom_highbd_sad16x8_neon(const uint8_t *src_ptr, int src_stride, con
#define aom_highbd_sad16x8 aom_highbd_sad16x8_neon
unsigned int aom_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad16x8_avg aom_highbd_sad16x8_avg_c
+unsigned int aom_highbd_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad16x8_avg aom_highbd_sad16x8_avg_neon
-void aom_highbd_sad16x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad16x8x3d aom_highbd_sad16x8x3d_c
+void aom_highbd_sad16x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x8x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad16x8x3d aom_highbd_sad16x8x3d_neon
-void aom_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x8x4d aom_highbd_sad16x8x4d_neon
unsigned int aom_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3077,13 +3563,15 @@ unsigned int aom_highbd_sad32x16_neon(const uint8_t *src_ptr, int src_stride, co
#define aom_highbd_sad32x16 aom_highbd_sad32x16_neon
unsigned int aom_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad32x16_avg aom_highbd_sad32x16_avg_c
+unsigned int aom_highbd_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad32x16_avg aom_highbd_sad32x16_avg_neon
-void aom_highbd_sad32x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad32x16x3d aom_highbd_sad32x16x3d_c
+void aom_highbd_sad32x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x16x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad32x16x3d aom_highbd_sad32x16x3d_neon
-void aom_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x16x4d aom_highbd_sad32x16x4d_neon
unsigned int aom_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3091,13 +3579,15 @@ unsigned int aom_highbd_sad32x32_neon(const uint8_t *src_ptr, int src_stride, co
#define aom_highbd_sad32x32 aom_highbd_sad32x32_neon
unsigned int aom_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad32x32_avg aom_highbd_sad32x32_avg_c
+unsigned int aom_highbd_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad32x32_avg aom_highbd_sad32x32_avg_neon
-void aom_highbd_sad32x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad32x32x3d aom_highbd_sad32x32x3d_c
+void aom_highbd_sad32x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x32x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad32x32x3d aom_highbd_sad32x32x3d_neon
-void aom_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x32x4d aom_highbd_sad32x32x4d_neon
unsigned int aom_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3105,13 +3595,15 @@ unsigned int aom_highbd_sad32x64_neon(const uint8_t *src_ptr, int src_stride, co
#define aom_highbd_sad32x64 aom_highbd_sad32x64_neon
unsigned int aom_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad32x64_avg aom_highbd_sad32x64_avg_c
+unsigned int aom_highbd_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad32x64_avg aom_highbd_sad32x64_avg_neon
-void aom_highbd_sad32x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad32x64x3d aom_highbd_sad32x64x3d_c
+void aom_highbd_sad32x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x64x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad32x64x3d aom_highbd_sad32x64x3d_neon
-void aom_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x64x4d aom_highbd_sad32x64x4d_neon
unsigned int aom_highbd_sad32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3119,13 +3611,15 @@ unsigned int aom_highbd_sad32x8_neon(const uint8_t *src_ptr, int src_stride, con
#define aom_highbd_sad32x8 aom_highbd_sad32x8_neon
unsigned int aom_highbd_sad32x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad32x8_avg aom_highbd_sad32x8_avg_c
+unsigned int aom_highbd_sad32x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad32x8_avg aom_highbd_sad32x8_avg_neon
-void aom_highbd_sad32x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad32x8x3d aom_highbd_sad32x8x3d_c
+void aom_highbd_sad32x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x8x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad32x8x3d aom_highbd_sad32x8x3d_neon
-void aom_highbd_sad32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad32x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x8x4d aom_highbd_sad32x8x4d_neon
unsigned int aom_highbd_sad4x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3133,13 +3627,15 @@ unsigned int aom_highbd_sad4x16_neon(const uint8_t *src_ptr, int src_stride, con
#define aom_highbd_sad4x16 aom_highbd_sad4x16_neon
unsigned int aom_highbd_sad4x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad4x16_avg aom_highbd_sad4x16_avg_c
+unsigned int aom_highbd_sad4x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad4x16_avg aom_highbd_sad4x16_avg_neon
-void aom_highbd_sad4x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad4x16x3d aom_highbd_sad4x16x3d_c
+void aom_highbd_sad4x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad4x16x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad4x16x3d aom_highbd_sad4x16x3d_neon
-void aom_highbd_sad4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad4x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad4x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x16x4d aom_highbd_sad4x16x4d_neon
unsigned int aom_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3147,13 +3643,15 @@ unsigned int aom_highbd_sad4x4_neon(const uint8_t *src_ptr, int src_stride, cons
#define aom_highbd_sad4x4 aom_highbd_sad4x4_neon
unsigned int aom_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad4x4_avg aom_highbd_sad4x4_avg_c
+unsigned int aom_highbd_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad4x4_avg aom_highbd_sad4x4_avg_neon
-void aom_highbd_sad4x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad4x4x3d aom_highbd_sad4x4x3d_c
+void aom_highbd_sad4x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad4x4x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad4x4x3d aom_highbd_sad4x4x3d_neon
-void aom_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x4x4d aom_highbd_sad4x4x4d_neon
unsigned int aom_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3161,13 +3659,15 @@ unsigned int aom_highbd_sad4x8_neon(const uint8_t *src_ptr, int src_stride, cons
#define aom_highbd_sad4x8 aom_highbd_sad4x8_neon
unsigned int aom_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad4x8_avg aom_highbd_sad4x8_avg_c
+unsigned int aom_highbd_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad4x8_avg aom_highbd_sad4x8_avg_neon
-void aom_highbd_sad4x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad4x8x3d aom_highbd_sad4x8x3d_c
+void aom_highbd_sad4x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad4x8x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad4x8x3d aom_highbd_sad4x8x3d_neon
-void aom_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x8x4d aom_highbd_sad4x8x4d_neon
unsigned int aom_highbd_sad64x128_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3175,13 +3675,15 @@ unsigned int aom_highbd_sad64x128_neon(const uint8_t *src_ptr, int src_stride, c
#define aom_highbd_sad64x128 aom_highbd_sad64x128_neon
unsigned int aom_highbd_sad64x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad64x128_avg aom_highbd_sad64x128_avg_c
+unsigned int aom_highbd_sad64x128_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad64x128_avg aom_highbd_sad64x128_avg_neon
-void aom_highbd_sad64x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad64x128x3d aom_highbd_sad64x128x3d_c
+void aom_highbd_sad64x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x128x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad64x128x3d aom_highbd_sad64x128x3d_neon
-void aom_highbd_sad64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad64x128x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x128x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x128x4d aom_highbd_sad64x128x4d_neon
unsigned int aom_highbd_sad64x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3189,13 +3691,15 @@ unsigned int aom_highbd_sad64x16_neon(const uint8_t *src_ptr, int src_stride, co
#define aom_highbd_sad64x16 aom_highbd_sad64x16_neon
unsigned int aom_highbd_sad64x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad64x16_avg aom_highbd_sad64x16_avg_c
+unsigned int aom_highbd_sad64x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad64x16_avg aom_highbd_sad64x16_avg_neon
-void aom_highbd_sad64x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad64x16x3d aom_highbd_sad64x16x3d_c
+void aom_highbd_sad64x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x16x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad64x16x3d aom_highbd_sad64x16x3d_neon
-void aom_highbd_sad64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad64x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x16x4d aom_highbd_sad64x16x4d_neon
unsigned int aom_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3203,13 +3707,15 @@ unsigned int aom_highbd_sad64x32_neon(const uint8_t *src_ptr, int src_stride, co
#define aom_highbd_sad64x32 aom_highbd_sad64x32_neon
unsigned int aom_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad64x32_avg aom_highbd_sad64x32_avg_c
+unsigned int aom_highbd_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad64x32_avg aom_highbd_sad64x32_avg_neon
-void aom_highbd_sad64x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad64x32x3d aom_highbd_sad64x32x3d_c
+void aom_highbd_sad64x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x32x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad64x32x3d aom_highbd_sad64x32x3d_neon
-void aom_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x32x4d aom_highbd_sad64x32x4d_neon
unsigned int aom_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3217,13 +3723,15 @@ unsigned int aom_highbd_sad64x64_neon(const uint8_t *src_ptr, int src_stride, co
#define aom_highbd_sad64x64 aom_highbd_sad64x64_neon
unsigned int aom_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad64x64_avg aom_highbd_sad64x64_avg_c
+unsigned int aom_highbd_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad64x64_avg aom_highbd_sad64x64_avg_neon
-void aom_highbd_sad64x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad64x64x3d aom_highbd_sad64x64x3d_c
+void aom_highbd_sad64x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x64x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad64x64x3d aom_highbd_sad64x64x3d_neon
-void aom_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x64x4d aom_highbd_sad64x64x4d_neon
unsigned int aom_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3231,13 +3739,15 @@ unsigned int aom_highbd_sad8x16_neon(const uint8_t *src_ptr, int src_stride, con
#define aom_highbd_sad8x16 aom_highbd_sad8x16_neon
unsigned int aom_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad8x16_avg aom_highbd_sad8x16_avg_c
+unsigned int aom_highbd_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad8x16_avg aom_highbd_sad8x16_avg_neon
-void aom_highbd_sad8x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad8x16x3d aom_highbd_sad8x16x3d_c
+void aom_highbd_sad8x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x16x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad8x16x3d aom_highbd_sad8x16x3d_neon
-void aom_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x16x4d aom_highbd_sad8x16x4d_neon
unsigned int aom_highbd_sad8x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3245,13 +3755,15 @@ unsigned int aom_highbd_sad8x32_neon(const uint8_t *src_ptr, int src_stride, con
#define aom_highbd_sad8x32 aom_highbd_sad8x32_neon
unsigned int aom_highbd_sad8x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad8x32_avg aom_highbd_sad8x32_avg_c
+unsigned int aom_highbd_sad8x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad8x32_avg aom_highbd_sad8x32_avg_neon
-void aom_highbd_sad8x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad8x32x3d aom_highbd_sad8x32x3d_c
+void aom_highbd_sad8x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x32x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad8x32x3d aom_highbd_sad8x32x3d_neon
-void aom_highbd_sad8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad8x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x32x4d aom_highbd_sad8x32x4d_neon
unsigned int aom_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3259,13 +3771,15 @@ unsigned int aom_highbd_sad8x4_neon(const uint8_t *src_ptr, int src_stride, cons
#define aom_highbd_sad8x4 aom_highbd_sad8x4_neon
unsigned int aom_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad8x4_avg aom_highbd_sad8x4_avg_c
+unsigned int aom_highbd_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad8x4_avg aom_highbd_sad8x4_avg_neon
-void aom_highbd_sad8x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad8x4x3d aom_highbd_sad8x4x3d_c
+void aom_highbd_sad8x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x4x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad8x4x3d aom_highbd_sad8x4x3d_neon
-void aom_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x4x4d aom_highbd_sad8x4x4d_neon
unsigned int aom_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3273,189 +3787,191 @@ unsigned int aom_highbd_sad8x8_neon(const uint8_t *src_ptr, int src_stride, cons
#define aom_highbd_sad8x8 aom_highbd_sad8x8_neon
unsigned int aom_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad8x8_avg aom_highbd_sad8x8_avg_c
+unsigned int aom_highbd_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad8x8_avg aom_highbd_sad8x8_avg_neon
-void aom_highbd_sad8x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad8x8x3d aom_highbd_sad8x8x3d_c
+void aom_highbd_sad8x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x8x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad8x8x3d aom_highbd_sad8x8x3d_neon
-void aom_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x8x4d aom_highbd_sad8x8x4d_neon
unsigned int aom_highbd_sad_skip_128x128_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_128x128_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_128x128 aom_highbd_sad_skip_128x128_neon
-void aom_highbd_sad_skip_128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_128x128x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_128x128x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_128x128x4d aom_highbd_sad_skip_128x128x4d_neon
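
The sad_skip kernels cheapen full-pel motion search by subsampling rows: SAD is accumulated over every other row and then doubled so the result stays on the same scale as a full SAD. A sketch of that convention, assuming it mirrors the _c reference:

    static unsigned int highbd_sad_skip_sketch(const uint16_t *src, int src_stride,
                                               const uint16_t *ref, int ref_stride,
                                               int w, int h) {
      unsigned int sad = 0;
      for (int y = 0; y < h; y += 2) {  // even rows only
        for (int x = 0; x < w; ++x) {
          const int d = src[x] - ref[x];
          sad += (unsigned int)(d < 0 ? -d : d);
        }
        src += 2 * src_stride;
        ref += 2 * ref_stride;
      }
      return 2 * sad;  // rescale to full-height magnitude
    }
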
unsigned int aom_highbd_sad_skip_128x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_128x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_128x64 aom_highbd_sad_skip_128x64_neon
-void aom_highbd_sad_skip_128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_128x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_128x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_128x64x4d aom_highbd_sad_skip_128x64x4d_neon
unsigned int aom_highbd_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x16 aom_highbd_sad_skip_16x16_neon
-void aom_highbd_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x16x4d aom_highbd_sad_skip_16x16x4d_neon
unsigned int aom_highbd_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x32 aom_highbd_sad_skip_16x32_neon
-void aom_highbd_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x32x4d aom_highbd_sad_skip_16x32x4d_neon
unsigned int aom_highbd_sad_skip_16x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_16x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x4 aom_highbd_sad_skip_16x4_neon
-void aom_highbd_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_16x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_16x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x4x4d aom_highbd_sad_skip_16x4x4d_neon
unsigned int aom_highbd_sad_skip_16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_16x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x64 aom_highbd_sad_skip_16x64_neon
-void aom_highbd_sad_skip_16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_16x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_16x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x64x4d aom_highbd_sad_skip_16x64x4d_neon
unsigned int aom_highbd_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x8 aom_highbd_sad_skip_16x8_neon
-void aom_highbd_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x8x4d aom_highbd_sad_skip_16x8x4d_neon
unsigned int aom_highbd_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_32x16 aom_highbd_sad_skip_32x16_neon
-void aom_highbd_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_32x16x4d aom_highbd_sad_skip_32x16x4d_neon
unsigned int aom_highbd_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_32x32 aom_highbd_sad_skip_32x32_neon
-void aom_highbd_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_32x32x4d aom_highbd_sad_skip_32x32x4d_neon
unsigned int aom_highbd_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_32x64 aom_highbd_sad_skip_32x64_neon
-void aom_highbd_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_32x64x4d aom_highbd_sad_skip_32x64x4d_neon
unsigned int aom_highbd_sad_skip_32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_32x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_32x8 aom_highbd_sad_skip_32x8_neon
-void aom_highbd_sad_skip_32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_32x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_32x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_32x8x4d aom_highbd_sad_skip_32x8x4d_neon
unsigned int aom_highbd_sad_skip_4x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_4x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_4x16 aom_highbd_sad_skip_4x16_neon
-void aom_highbd_sad_skip_4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_4x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_4x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_4x16x4d aom_highbd_sad_skip_4x16x4d_neon
unsigned int aom_highbd_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_4x4 aom_highbd_sad_skip_4x4_neon
-void aom_highbd_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_4x4x4d aom_highbd_sad_skip_4x4x4d_neon
unsigned int aom_highbd_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_4x8 aom_highbd_sad_skip_4x8_neon
-void aom_highbd_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_4x8x4d aom_highbd_sad_skip_4x8x4d_neon
unsigned int aom_highbd_sad_skip_64x128_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_64x128_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_64x128 aom_highbd_sad_skip_64x128_neon
-void aom_highbd_sad_skip_64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_64x128x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_64x128x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_64x128x4d aom_highbd_sad_skip_64x128x4d_neon
unsigned int aom_highbd_sad_skip_64x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_64x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_64x16 aom_highbd_sad_skip_64x16_neon
-void aom_highbd_sad_skip_64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_64x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_64x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_64x16x4d aom_highbd_sad_skip_64x16x4d_neon
unsigned int aom_highbd_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_64x32 aom_highbd_sad_skip_64x32_neon
-void aom_highbd_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_64x32x4d aom_highbd_sad_skip_64x32x4d_neon
unsigned int aom_highbd_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_64x64 aom_highbd_sad_skip_64x64_neon
-void aom_highbd_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_64x64x4d aom_highbd_sad_skip_64x64x4d_neon
unsigned int aom_highbd_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_8x16 aom_highbd_sad_skip_8x16_neon
-void aom_highbd_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_8x16x4d aom_highbd_sad_skip_8x16x4d_neon
unsigned int aom_highbd_sad_skip_8x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_8x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_8x32 aom_highbd_sad_skip_8x32_neon
-void aom_highbd_sad_skip_8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_8x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_8x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_8x32x4d aom_highbd_sad_skip_8x32x4d_neon
unsigned int aom_highbd_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_8x4 aom_highbd_sad_skip_8x4_neon
-void aom_highbd_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_8x4x4d aom_highbd_sad_skip_8x4x4d_neon
unsigned int aom_highbd_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_8x8 aom_highbd_sad_skip_8x8_neon
-void aom_highbd_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_8x8x4d aom_highbd_sad_skip_8x8x4d_neon
void aom_highbd_smooth_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
@@ -4181,14 +4697,16 @@ unsigned int aom_mse8x8_neon(const uint8_t *src_ptr, int source_stride, const u
#define aom_mse8x8 aom_mse8x8_neon
uint64_t aom_mse_16xh_16bit_c(uint8_t *dst, int dstride,uint16_t *src, int w, int h);
-#define aom_mse_16xh_16bit aom_mse_16xh_16bit_c
+uint64_t aom_mse_16xh_16bit_neon(uint8_t *dst, int dstride,uint16_t *src, int w, int h);
+#define aom_mse_16xh_16bit aom_mse_16xh_16bit_neon
uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride,uint16_t *src, int sstride, int w, int h);
uint64_t aom_mse_wxh_16bit_neon(uint8_t *dst, int dstride,uint16_t *src, int sstride, int w, int h);
#define aom_mse_wxh_16bit aom_mse_wxh_16bit_neon
uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride,uint16_t *src, int sstride, int w, int h);
-#define aom_mse_wxh_16bit_highbd aom_mse_wxh_16bit_highbd_c
+uint64_t aom_mse_wxh_16bit_highbd_neon(uint16_t *dst, int dstride,uint16_t *src, int sstride, int w, int h);
+#define aom_mse_wxh_16bit_highbd aom_mse_wxh_16bit_highbd_neon
unsigned int aom_obmc_sad128x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
unsigned int aom_obmc_sad128x128_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
@@ -5630,12 +6148,6 @@ unsigned int aom_variance16x8_c(const uint8_t *src_ptr, int source_stride, const
unsigned int aom_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance16x8 aom_variance16x8_neon
-unsigned int aom_variance2x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance2x2 aom_variance2x2_c
-
-unsigned int aom_variance2x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance2x4 aom_variance2x4_c
-
unsigned int aom_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int aom_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance32x16 aom_variance32x16_neon
@@ -5656,9 +6168,6 @@ unsigned int aom_variance4x16_c(const uint8_t *src_ptr, int source_stride, const
unsigned int aom_variance4x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance4x16 aom_variance4x16_neon
-unsigned int aom_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance4x2 aom_variance4x2_c
-
unsigned int aom_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int aom_variance4x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance4x4 aom_variance4x4_neon
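The x4d hunks above change const uint8_t * const ref_ptr[] and uint32_t *sad_array to fixed-length ref_ptr[4] / sad_array[4], making explicit that each call scores four reference candidates at once; the same pass retires the non-AV1 2x2/2x4/4x2 variance sizes and promotes aom_mse_16xh_16bit and aom_mse_wxh_16bit_highbd from the C fallback to NEON. A minimal C sketch of what an x4d SAD computes (a reference illustration only, not libaom's implementation; sad_WxHx4d_ref is a hypothetical name):

#include <stdint.h>
#include <stdlib.h>

/* One pass over the source block yields four SADs, one per candidate
   reference pointer -- hence the ref[4] / sad[4] signatures above. */
static void sad_WxHx4d_ref(const uint8_t *src, int src_stride,
                           const uint8_t *const ref[4], int ref_stride,
                           uint32_t sad[4], int w, int h) {
  for (int k = 0; k < 4; ++k) {
    uint32_t acc = 0;
    for (int r = 0; r < h; ++r)
      for (int c = 0; c < w; ++c)
        acc += (uint32_t)abs(src[r * src_stride + c] -
                             ref[k][r * ref_stride + c]);
    sad[k] = acc;
  }
}

The sad_skip variants are the speed-oriented flavour; by libaom convention they subsample rows rather than score every line, though treat that as background rather than something this diff states.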
diff --git a/config/arm/config/av1_rtcd.h b/config/arm/config/av1_rtcd.h
index 1a3fa19ca..338b4a087 100644
--- a/config/arm/config/av1_rtcd.h
+++ b/config/arm/config/av1_rtcd.h
@@ -90,23 +90,37 @@ void aom_dist_wtd_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const struct AV1Com
const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search);
-#define aom_dist_wtd_comp_avg_upsampled_pred aom_dist_wtd_comp_avg_upsampled_pred_c
+void aom_dist_wtd_comp_avg_upsampled_pred_neon(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search);
+#define aom_dist_wtd_comp_avg_upsampled_pred aom_dist_wtd_comp_avg_upsampled_pred_neon
void aom_highbd_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search);
-#define aom_highbd_comp_avg_upsampled_pred aom_highbd_comp_avg_upsampled_pred_c
+void aom_highbd_comp_avg_upsampled_pred_neon(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search);
+#define aom_highbd_comp_avg_upsampled_pred aom_highbd_comp_avg_upsampled_pred_neon
void aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search);
-#define aom_highbd_dist_wtd_comp_avg_upsampled_pred aom_highbd_dist_wtd_comp_avg_upsampled_pred_c
+void aom_highbd_dist_wtd_comp_avg_upsampled_pred_neon(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search);
+#define aom_highbd_dist_wtd_comp_avg_upsampled_pred aom_highbd_dist_wtd_comp_avg_upsampled_pred_neon
void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3,
int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search);
-#define aom_highbd_upsampled_pred aom_highbd_upsampled_pred_c
+void aom_highbd_upsampled_pred_neon(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search);
+#define aom_highbd_upsampled_pred aom_highbd_upsampled_pred_neon
void aom_quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale);
void aom_quantize_b_helper_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale);
@@ -120,8 +134,8 @@ void aom_upsampled_pred_neon(MACROBLOCKD *xd, const struct AV1Common *const cm,
int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search);
#define aom_upsampled_pred aom_upsampled_pred_neon
-void av1_apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
-void av1_apply_selfguided_restoration_neon(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+int av1_apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+int av1_apply_selfguided_restoration_neon(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
#define av1_apply_selfguided_restoration av1_apply_selfguided_restoration_neon
void av1_apply_temporal_filter_c(const struct yv12_buffer_config *frame_to_filter, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count);
@@ -137,17 +151,16 @@ int64_t av1_block_error_lp_neon(const int16_t *coeff, const int16_t *dqcoeff, in
#define av1_block_error_lp av1_block_error_lp_neon
void av1_build_compound_diffwtd_mask_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
-#define av1_build_compound_diffwtd_mask av1_build_compound_diffwtd_mask_c
+void av1_build_compound_diffwtd_mask_neon(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+#define av1_build_compound_diffwtd_mask av1_build_compound_diffwtd_mask_neon
void av1_build_compound_diffwtd_mask_d16_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
void av1_build_compound_diffwtd_mask_d16_neon(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
#define av1_build_compound_diffwtd_mask_d16 av1_build_compound_diffwtd_mask_d16_neon
void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
-#define av1_build_compound_diffwtd_mask_highbd av1_build_compound_diffwtd_mask_highbd_c
-
-int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride);
-#define av1_calc_frame_error av1_calc_frame_error_c
+void av1_build_compound_diffwtd_mask_highbd_neon(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+#define av1_build_compound_diffwtd_mask_highbd av1_build_compound_diffwtd_mask_highbd_neon
void av1_calc_indices_dim1_c(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
void av1_calc_indices_dim1_neon(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
@@ -157,35 +170,39 @@ void av1_calc_indices_dim2_c(const int16_t *data, const int16_t *centroids, uint
void av1_calc_indices_dim2_neon(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
#define av1_calc_indices_dim2 av1_calc_indices_dim2_neon
-void av1_calc_proj_params_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
-#define av1_calc_proj_params av1_calc_proj_params_c
+void av1_calc_proj_params_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
+void av1_calc_proj_params_neon(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
+#define av1_calc_proj_params av1_calc_proj_params_neon
-void av1_calc_proj_params_high_bd_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
-#define av1_calc_proj_params_high_bd av1_calc_proj_params_high_bd_c
+void av1_calc_proj_params_high_bd_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
+void av1_calc_proj_params_high_bd_neon(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
+#define av1_calc_proj_params_high_bd av1_calc_proj_params_high_bd_neon
-void av1_cnn_activate_c( float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation);
+void av1_cnn_activate_c(float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation);
#define av1_cnn_activate av1_cnn_activate_c
-void av1_cnn_add_c( float **input, int channels, int width, int height, int stride, const float **add);
+void av1_cnn_add_c(float **input, int channels, int width, int height, int stride, const float **add);
#define av1_cnn_add av1_cnn_add_c
void av1_cnn_batchnorm_c(float **image, int channels, int width, int height, int stride, const float *gamma, const float *beta, const float *mean, const float *std);
#define av1_cnn_batchnorm av1_cnn_batchnorm_c
-void av1_cnn_convolve_no_maxpool_padding_valid_c( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int cstep, int channel_step);
+void av1_cnn_convolve_no_maxpool_padding_valid_c(const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int cstep, int channel_step);
#define av1_cnn_convolve_no_maxpool_padding_valid av1_cnn_convolve_no_maxpool_padding_valid_c
-void av1_cnn_deconvolve_c( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride);
+void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride);
#define av1_cnn_deconvolve av1_cnn_deconvolve_c
-bool av1_cnn_predict_c( const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct);
+bool av1_cnn_predict_c(const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct);
#define av1_cnn_predict av1_cnn_predict_c
void av1_compute_stats_c(int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats);
-#define av1_compute_stats av1_compute_stats_c
+void av1_compute_stats_neon(int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats);
+#define av1_compute_stats av1_compute_stats_neon
void av1_compute_stats_highbd_c(int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth);
-#define av1_compute_stats_highbd av1_compute_stats_highbd_c
+void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth);
+#define av1_compute_stats_highbd av1_compute_stats_highbd_neon
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params);
#define av1_convolve_2d_scale av1_convolve_2d_scale_c
@@ -194,6 +211,10 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int
void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
#define av1_convolve_2d_sr av1_convolve_2d_sr_neon
+void av1_convolve_2d_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_intrabc_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+#define av1_convolve_2d_sr_intrabc av1_convolve_2d_sr_intrabc_neon
+
void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
#define av1_convolve_horiz_rs av1_convolve_horiz_rs_c
@@ -201,10 +222,18 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int d
void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params);
#define av1_convolve_x_sr av1_convolve_x_sr_neon
+void av1_convolve_x_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params);
+void av1_convolve_x_sr_intrabc_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params);
+#define av1_convolve_x_sr_intrabc av1_convolve_x_sr_intrabc_neon
+
void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
#define av1_convolve_y_sr av1_convolve_y_sr_neon
+void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
+void av1_convolve_y_sr_intrabc_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
+#define av1_convolve_y_sr_intrabc av1_convolve_y_sr_intrabc_neon
+
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
#define av1_dist_wtd_convolve_2d av1_dist_wtd_convolve_2d_neon
@@ -234,13 +263,12 @@ void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh, c
#define av1_dr_prediction_z3 av1_dr_prediction_z3_neon
double av1_estimate_noise_from_single_plane_c(const uint8_t *src, int height, int width, int stride, int edge_thresh);
-#define av1_estimate_noise_from_single_plane av1_estimate_noise_from_single_plane_c
+double av1_estimate_noise_from_single_plane_neon(const uint8_t *src, int height, int width, int stride, int edge_thresh);
+#define av1_estimate_noise_from_single_plane av1_estimate_noise_from_single_plane_neon
void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
-#define av1_filter_intra_edge av1_filter_intra_edge_c
-
-void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
-#define av1_filter_intra_edge_high av1_filter_intra_edge_high_c
+void av1_filter_intra_edge_neon(uint8_t *p, int sz, int strength);
+#define av1_filter_intra_edge av1_filter_intra_edge_neon
void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
void av1_filter_intra_predictor_neon(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
@@ -329,8 +357,8 @@ void av1_fwht4x4_neon(const int16_t *input, tran_low_t *output, int stride);
uint32_t av1_get_crc32c_value_c(void *crc_calculator, uint8_t *p, size_t length);
#define av1_get_crc32c_value av1_get_crc32c_value_c
-void av1_get_horver_correlation_full_c( const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr);
-void av1_get_horver_correlation_full_neon( const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr);
+void av1_get_horver_correlation_full_c(const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr);
+void av1_get_horver_correlation_full_neon(const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr);
#define av1_get_horver_correlation_full av1_get_horver_correlation_full_neon
void av1_get_nz_map_contexts_c(const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts);
@@ -338,10 +366,12 @@ void av1_get_nz_map_contexts_neon(const uint8_t *const levels, const int16_t *co
#define av1_get_nz_map_contexts av1_get_nz_map_contexts_neon
void av1_highbd_apply_temporal_filter_c(const struct yv12_buffer_config *frame_to_filter, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count);
-#define av1_highbd_apply_temporal_filter av1_highbd_apply_temporal_filter_c
+void av1_highbd_apply_temporal_filter_neon(const struct yv12_buffer_config *frame_to_filter, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count);
+#define av1_highbd_apply_temporal_filter av1_highbd_apply_temporal_filter_neon
int64_t av1_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
-#define av1_highbd_block_error av1_highbd_block_error_c
+int64_t av1_highbd_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define av1_highbd_block_error av1_highbd_block_error_neon
void av1_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define av1_highbd_convolve8 av1_highbd_convolve8_c
@@ -360,6 +390,10 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *
void av1_highbd_convolve_2d_sr_neon(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
#define av1_highbd_convolve_2d_sr av1_highbd_convolve_2d_sr_neon
+void av1_highbd_convolve_2d_sr_intrabc_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_intrabc_neon(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_2d_sr_intrabc av1_highbd_convolve_2d_sr_intrabc_neon
+
void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
@@ -374,10 +408,18 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *d
void av1_highbd_convolve_x_sr_neon(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd);
#define av1_highbd_convolve_x_sr av1_highbd_convolve_x_sr_neon
+void av1_highbd_convolve_x_sr_intrabc_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_intrabc_neon(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_x_sr_intrabc av1_highbd_convolve_x_sr_intrabc_neon
+
void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd);
void av1_highbd_convolve_y_sr_neon(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd);
#define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_neon
+void av1_highbd_convolve_y_sr_intrabc_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd);
+void av1_highbd_convolve_y_sr_intrabc_neon(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd);
+#define av1_highbd_convolve_y_sr_intrabc av1_highbd_convolve_y_sr_intrabc_neon
+
void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
void av1_highbd_dist_wtd_convolve_2d_neon(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
#define av1_highbd_dist_wtd_convolve_2d av1_highbd_dist_wtd_convolve_2d_neon
@@ -404,7 +446,12 @@ void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int
#define av1_highbd_dr_prediction_z3 av1_highbd_dr_prediction_z3_c
double av1_highbd_estimate_noise_from_single_plane_c(const uint16_t *src, int height, int width, int stride, int bit_depth, int edge_thresh);
-#define av1_highbd_estimate_noise_from_single_plane av1_highbd_estimate_noise_from_single_plane_c
+double av1_highbd_estimate_noise_from_single_plane_neon(const uint16_t *src, int height, int width, int stride, int bit_depth, int edge_thresh);
+#define av1_highbd_estimate_noise_from_single_plane av1_highbd_estimate_noise_from_single_plane_neon
+
+void av1_highbd_filter_intra_edge_c(uint16_t *p, int sz, int strength);
+void av1_highbd_filter_intra_edge_neon(uint16_t *p, int sz, int strength);
+#define av1_highbd_filter_intra_edge av1_highbd_filter_intra_edge_neon
void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
void av1_highbd_inv_txfm_add_neon(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
@@ -484,18 +531,24 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
-int64_t av1_highbd_pixel_proj_error_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
+int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
#define av1_highbd_pixel_proj_error av1_highbd_pixel_proj_error_c
void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
void av1_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
#define av1_highbd_quantize_fp av1_highbd_quantize_fp_neon
+void av1_highbd_upsample_intra_edge_c(uint16_t *p, int sz, int bd);
+void av1_highbd_upsample_intra_edge_neon(uint16_t *p, int sz, int bd);
+#define av1_highbd_upsample_intra_edge av1_highbd_upsample_intra_edge_neon
+
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
-#define av1_highbd_warp_affine av1_highbd_warp_affine_c
+void av1_highbd_warp_affine_neon(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+#define av1_highbd_warp_affine av1_highbd_warp_affine_neon
-void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd);
-#define av1_highbd_wiener_convolve_add_src av1_highbd_wiener_convolve_add_src_c
+void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params, int bd);
+void av1_highbd_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params, int bd);
+#define av1_highbd_wiener_convolve_add_src av1_highbd_wiener_convolve_add_src_neon
void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
#define av1_inv_txfm2d_add_16x16 av1_inv_txfm2d_add_16x16_c
@@ -580,15 +633,15 @@ void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff, int diff_s
void av1_lowbd_fwd_txfm_neon(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
#define av1_lowbd_fwd_txfm av1_lowbd_fwd_txfm_neon
-int64_t av1_lowbd_pixel_proj_error_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
-int64_t av1_lowbd_pixel_proj_error_neon( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
+int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
+int64_t av1_lowbd_pixel_proj_error_neon(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
#define av1_lowbd_pixel_proj_error av1_lowbd_pixel_proj_error_neon
-void av1_nn_fast_softmax_16_c( const float *input_nodes, float *output);
+void av1_nn_fast_softmax_16_c(const float *input_nodes, float *output);
#define av1_nn_fast_softmax_16 av1_nn_fast_softmax_16_c
-void av1_nn_predict_c( const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output);
-void av1_nn_predict_neon( const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output);
+void av1_nn_predict_c(const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output);
+void av1_nn_predict_neon(const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output);
#define av1_nn_predict av1_nn_predict_neon
void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale);
@@ -631,27 +684,27 @@ void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width, co
#define av1_txb_init_levels av1_txb_init_levels_neon
void av1_upsample_intra_edge_c(uint8_t *p, int sz);
-#define av1_upsample_intra_edge av1_upsample_intra_edge_c
-
-void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
-#define av1_upsample_intra_edge_high av1_upsample_intra_edge_high_c
+void av1_upsample_intra_edge_neon(uint8_t *p, int sz);
+#define av1_upsample_intra_edge av1_upsample_intra_edge_neon
void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
#define av1_warp_affine av1_warp_affine_neon
void av1_wedge_compute_delta_squares_c(int16_t *d, const int16_t *a, const int16_t *b, int N);
-#define av1_wedge_compute_delta_squares av1_wedge_compute_delta_squares_c
+void av1_wedge_compute_delta_squares_neon(int16_t *d, const int16_t *a, const int16_t *b, int N);
+#define av1_wedge_compute_delta_squares av1_wedge_compute_delta_squares_neon
int8_t av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m, int N, int64_t limit);
-#define av1_wedge_sign_from_residuals av1_wedge_sign_from_residuals_c
+int8_t av1_wedge_sign_from_residuals_neon(const int16_t *ds, const uint8_t *m, int N, int64_t limit);
+#define av1_wedge_sign_from_residuals av1_wedge_sign_from_residuals_neon
uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);
uint64_t av1_wedge_sse_from_residuals_neon(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);
#define av1_wedge_sse_from_residuals av1_wedge_sse_from_residuals_neon
-void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
-void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params);
#define av1_wiener_convolve_add_src av1_wiener_convolve_add_src_neon
void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
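Every hunk in av1_rtcd.h above follows the same shape: a _neon prototype is declared beside the existing _c one and the generic macro is repointed, so no call site changes. Riding along are signature corrections visible in the diff itself: av1_apply_selfguided_restoration now returns int instead of void, the Wiener convolve hooks take a WienerConvolveParams instead of a ConvolveParams, and the _high intra-edge helpers move to the av1_highbd_ prefix. A compilable fragment isolating the dispatch pattern (symbols copied from the hunks above):

#include <stdint.h>

double av1_estimate_noise_from_single_plane_c(const uint8_t *src, int height, int width, int stride, int edge_thresh);
double av1_estimate_noise_from_single_plane_neon(const uint8_t *src, int height, int width, int stride, int edge_thresh);
#define av1_estimate_noise_from_single_plane av1_estimate_noise_from_single_plane_neon

/* A caller writes the generic name and the preprocessor binds the NEON body:
     double sigma = av1_estimate_noise_from_single_plane(src, h, w, stride, 16);
   Swapping implementations is a one-line header change, never a call-site edit. */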
diff --git a/config/arm64/config/aom_config.asm b/config/arm64/config/aom_config.asm
index 9214692b0..8d1117eda 100644
--- a/config/arm64/config/aom_config.asm
+++ b/config/arm64/config/aom_config.asm
@@ -1,5 +1,5 @@
;
-; Copyright (c) 2023, Alliance for Open Media. All rights reserved
+; Copyright (c) 2024, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -76,6 +76,8 @@ HAVE_AVX2 equ 0
HAVE_FEXCEPT equ 1
HAVE_MMX equ 0
HAVE_NEON equ 1
+HAVE_NEON_DOTPROD equ 0
+HAVE_NEON_I8MM equ 0
HAVE_PTHREAD_H equ 1
HAVE_SSE equ 0
HAVE_SSE2 equ 0
@@ -83,6 +85,7 @@ HAVE_SSE3 equ 0
HAVE_SSE4_1 equ 0
HAVE_SSE4_2 equ 0
HAVE_SSSE3 equ 0
+HAVE_SVE equ 0
HAVE_UNISTD_H equ 1
HAVE_VSX equ 0
HAVE_WXWIDGETS equ 0
diff --git a/config/arm64/config/aom_config.c b/config/arm64/config/aom_config.c
index 0a757092e..c600ad2d5 100644
--- a/config/arm64/config/aom_config.c
+++ b/config/arm64/config/aom_config.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -9,5 +9,5 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "aom/aom_codec.h"
-static const char* const cfg = "cmake ../libaom -G \"Unix Makefiles\" -DCMAKE_TOOLCHAIN_FILE=\"../libaom/build/cmake/toolchains/arm64-linux-gcc.cmake\" -DCONFIG_AV1_ENCODER=1 -DCONFIG_AV1_HIGHBITDEPTH=1 -DCONFIG_MAX_DECODE_PROFILE=0 -DCONFIG_NORMAL_TILE_MODE=1 -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=16384 -DDECODE_WIDTH_LIMIT=16384 -DENABLE_SSE4_1=0";
+static const char* const cfg = "cmake ../libaom -G \"Unix Makefiles\" -DCMAKE_TOOLCHAIN_FILE=\"../libaom/build/cmake/toolchains/arm64-linux-gcc.cmake\" -DCONFIG_AV1_ENCODER=1 -DCONFIG_AV1_HIGHBITDEPTH=1 -DCONFIG_RUNTIME_CPU_DETECT=0 -DCONFIG_MAX_DECODE_PROFILE=0 -DCONFIG_NORMAL_TILE_MODE=1 -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=16384 -DDECODE_WIDTH_LIMIT=16384 -DENABLE_ARM_CRC32=0 -DENABLE_NEON_DOTPROD=0 -DENABLE_NEON_I8MM=0 -DENABLE_SSE4_1=0";
const char *aom_codec_build_config(void) {return cfg;}
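The regenerated cfg string adds -DCONFIG_RUNTIME_CPU_DETECT=0 and explicitly disables the newer Arm extensions (-DENABLE_ARM_CRC32=0 -DENABLE_NEON_DOTPROD=0 -DENABLE_NEON_I8MM=0), which is why all dispatch in these headers is a compile-time #define rather than a probed function pointer. A sketch of the two shapes (hedged: the runtime branch below is illustrative; libaom's generated headers spell it via an RTCD_EXTERN pointer assigned after CPU probing):

#include <stdint.h>

uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src, int sstride, int w, int h);
uint64_t aom_mse_wxh_16bit_neon(uint8_t *dst, int dstride, uint16_t *src, int sstride, int w, int h);

#if CONFIG_RUNTIME_CPU_DETECT
/* runtime case: a pointer filled in during rtcd setup after probing the CPU */
extern uint64_t (*aom_mse_wxh_16bit)(uint8_t *dst, int dstride, uint16_t *src, int sstride, int w, int h);
#else
/* this build: the alias is fixed when the header is generated */
#define aom_mse_wxh_16bit aom_mse_wxh_16bit_neon
#endif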
diff --git a/config/arm64/config/aom_config.h b/config/arm64/config/aom_config.h
index 239527ce4..4782c8984 100644
--- a/config/arm64/config/aom_config.h
+++ b/config/arm64/config/aom_config.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -78,6 +78,8 @@
#define HAVE_FEXCEPT 1
#define HAVE_MMX 0
#define HAVE_NEON 1
+#define HAVE_NEON_DOTPROD 0
+#define HAVE_NEON_I8MM 0
#define HAVE_PTHREAD_H 1
#define HAVE_SSE 0
#define HAVE_SSE2 0
@@ -85,6 +87,7 @@
#define HAVE_SSE4_1 0
#define HAVE_SSE4_2 0
#define HAVE_SSSE3 0
+#define HAVE_SVE 0
#define HAVE_UNISTD_H 1
#define HAVE_VSX 0
#define HAVE_WXWIDGETS 0
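HAVE_NEON_DOTPROD, HAVE_NEON_I8MM and HAVE_SVE are new feature gates here, mirroring the .asm config above and all 0 in this build; they name the Armv8.2+ dot-product, Int8 matrix-multiply and Scalable Vector extensions respectively. Specialised kernels are expected to sit behind the corresponding guard, roughly as follows (hypothetical kernel name, shown only to illustrate the gating):

#include <stdint.h>
#include "config/aom_config.h"

#if HAVE_NEON_DOTPROD
/* compiled only when the build enables the dot-product extension */
void aom_example_kernel_neon_dotprod(const uint8_t *src, int stride);
#endif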
diff --git a/config/arm64/config/aom_dsp_rtcd.h b/config/arm64/config/aom_dsp_rtcd.h
index ad77b0455..bae3ed37a 100644
--- a/config/arm64/config/aom_dsp_rtcd.h
+++ b/config/arm64/config/aom_dsp_rtcd.h
@@ -39,7 +39,8 @@ void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride, const uint8_t *
#define aom_blend_a64_hmask aom_blend_a64_hmask_neon
void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh);
-#define aom_blend_a64_mask aom_blend_a64_mask_c
+void aom_blend_a64_mask_neon(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh);
+#define aom_blend_a64_mask aom_blend_a64_mask_neon
void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
@@ -54,7 +55,8 @@ void aom_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width,
#define aom_comp_mask_pred aom_comp_mask_pred_neon
void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
-#define aom_compute_flow_at_point aom_compute_flow_at_point_c
+void aom_compute_flow_at_point_neon(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+#define aom_compute_flow_at_point aom_compute_flow_at_point_neon
void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define aom_convolve8 aom_convolve8_c
@@ -376,139 +378,184 @@ void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8
#define aom_dc_top_predictor_8x8 aom_dc_top_predictor_8x8_neon
void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_comp_avg_pred aom_dist_wtd_comp_avg_pred_c
+void aom_dist_wtd_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_comp_avg_pred aom_dist_wtd_comp_avg_pred_neon
unsigned int aom_dist_wtd_sad128x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad128x128_avg aom_dist_wtd_sad128x128_avg_c
+unsigned int aom_dist_wtd_sad128x128_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad128x128_avg aom_dist_wtd_sad128x128_avg_neon
unsigned int aom_dist_wtd_sad128x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad128x64_avg aom_dist_wtd_sad128x64_avg_c
+unsigned int aom_dist_wtd_sad128x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad128x64_avg aom_dist_wtd_sad128x64_avg_neon
unsigned int aom_dist_wtd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad16x16_avg aom_dist_wtd_sad16x16_avg_c
+unsigned int aom_dist_wtd_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad16x16_avg aom_dist_wtd_sad16x16_avg_neon
unsigned int aom_dist_wtd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad16x32_avg aom_dist_wtd_sad16x32_avg_c
+unsigned int aom_dist_wtd_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad16x32_avg aom_dist_wtd_sad16x32_avg_neon
unsigned int aom_dist_wtd_sad16x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad16x4_avg aom_dist_wtd_sad16x4_avg_c
+unsigned int aom_dist_wtd_sad16x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad16x4_avg aom_dist_wtd_sad16x4_avg_neon
unsigned int aom_dist_wtd_sad16x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad16x64_avg aom_dist_wtd_sad16x64_avg_c
+unsigned int aom_dist_wtd_sad16x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad16x64_avg aom_dist_wtd_sad16x64_avg_neon
unsigned int aom_dist_wtd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad16x8_avg aom_dist_wtd_sad16x8_avg_c
+unsigned int aom_dist_wtd_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad16x8_avg aom_dist_wtd_sad16x8_avg_neon
unsigned int aom_dist_wtd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad32x16_avg aom_dist_wtd_sad32x16_avg_c
+unsigned int aom_dist_wtd_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad32x16_avg aom_dist_wtd_sad32x16_avg_neon
unsigned int aom_dist_wtd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad32x32_avg aom_dist_wtd_sad32x32_avg_c
+unsigned int aom_dist_wtd_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad32x32_avg aom_dist_wtd_sad32x32_avg_neon
unsigned int aom_dist_wtd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad32x64_avg aom_dist_wtd_sad32x64_avg_c
+unsigned int aom_dist_wtd_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad32x64_avg aom_dist_wtd_sad32x64_avg_neon
unsigned int aom_dist_wtd_sad32x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad32x8_avg aom_dist_wtd_sad32x8_avg_c
+unsigned int aom_dist_wtd_sad32x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad32x8_avg aom_dist_wtd_sad32x8_avg_neon
unsigned int aom_dist_wtd_sad4x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad4x16_avg aom_dist_wtd_sad4x16_avg_c
+unsigned int aom_dist_wtd_sad4x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad4x16_avg aom_dist_wtd_sad4x16_avg_neon
unsigned int aom_dist_wtd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad4x4_avg aom_dist_wtd_sad4x4_avg_c
+unsigned int aom_dist_wtd_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad4x4_avg aom_dist_wtd_sad4x4_avg_neon
unsigned int aom_dist_wtd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad4x8_avg aom_dist_wtd_sad4x8_avg_c
+unsigned int aom_dist_wtd_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad4x8_avg aom_dist_wtd_sad4x8_avg_neon
unsigned int aom_dist_wtd_sad64x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad64x128_avg aom_dist_wtd_sad64x128_avg_c
+unsigned int aom_dist_wtd_sad64x128_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad64x128_avg aom_dist_wtd_sad64x128_avg_neon
unsigned int aom_dist_wtd_sad64x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad64x16_avg aom_dist_wtd_sad64x16_avg_c
+unsigned int aom_dist_wtd_sad64x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad64x16_avg aom_dist_wtd_sad64x16_avg_neon
unsigned int aom_dist_wtd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad64x32_avg aom_dist_wtd_sad64x32_avg_c
+unsigned int aom_dist_wtd_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad64x32_avg aom_dist_wtd_sad64x32_avg_neon
unsigned int aom_dist_wtd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad64x64_avg aom_dist_wtd_sad64x64_avg_c
+unsigned int aom_dist_wtd_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad64x64_avg aom_dist_wtd_sad64x64_avg_neon
unsigned int aom_dist_wtd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad8x16_avg aom_dist_wtd_sad8x16_avg_c
+unsigned int aom_dist_wtd_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad8x16_avg aom_dist_wtd_sad8x16_avg_neon
unsigned int aom_dist_wtd_sad8x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad8x32_avg aom_dist_wtd_sad8x32_avg_c
+unsigned int aom_dist_wtd_sad8x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad8x32_avg aom_dist_wtd_sad8x32_avg_neon
unsigned int aom_dist_wtd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad8x4_avg aom_dist_wtd_sad8x4_avg_c
+unsigned int aom_dist_wtd_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad8x4_avg aom_dist_wtd_sad8x4_avg_neon
unsigned int aom_dist_wtd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sad8x8_avg aom_dist_wtd_sad8x8_avg_c
+unsigned int aom_dist_wtd_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sad8x8_avg aom_dist_wtd_sad8x8_avg_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance128x128 aom_dist_wtd_sub_pixel_avg_variance128x128_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance128x128 aom_dist_wtd_sub_pixel_avg_variance128x128_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance128x64 aom_dist_wtd_sub_pixel_avg_variance128x64_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance128x64 aom_dist_wtd_sub_pixel_avg_variance128x64_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance16x16 aom_dist_wtd_sub_pixel_avg_variance16x16_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance16x16 aom_dist_wtd_sub_pixel_avg_variance16x16_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance16x32 aom_dist_wtd_sub_pixel_avg_variance16x32_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance16x32 aom_dist_wtd_sub_pixel_avg_variance16x32_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance16x4 aom_dist_wtd_sub_pixel_avg_variance16x4_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance16x4 aom_dist_wtd_sub_pixel_avg_variance16x4_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance16x64 aom_dist_wtd_sub_pixel_avg_variance16x64_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance16x64 aom_dist_wtd_sub_pixel_avg_variance16x64_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance16x8 aom_dist_wtd_sub_pixel_avg_variance16x8_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance16x8 aom_dist_wtd_sub_pixel_avg_variance16x8_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance32x16 aom_dist_wtd_sub_pixel_avg_variance32x16_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance32x16 aom_dist_wtd_sub_pixel_avg_variance32x16_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance32x32 aom_dist_wtd_sub_pixel_avg_variance32x32_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance32x32 aom_dist_wtd_sub_pixel_avg_variance32x32_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance32x64 aom_dist_wtd_sub_pixel_avg_variance32x64_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance32x64 aom_dist_wtd_sub_pixel_avg_variance32x64_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance32x8 aom_dist_wtd_sub_pixel_avg_variance32x8_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance32x8 aom_dist_wtd_sub_pixel_avg_variance32x8_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance4x16 aom_dist_wtd_sub_pixel_avg_variance4x16_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance4x16 aom_dist_wtd_sub_pixel_avg_variance4x16_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance4x4 aom_dist_wtd_sub_pixel_avg_variance4x4_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance4x4 aom_dist_wtd_sub_pixel_avg_variance4x4_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance4x8 aom_dist_wtd_sub_pixel_avg_variance4x8_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance4x8 aom_dist_wtd_sub_pixel_avg_variance4x8_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance64x128 aom_dist_wtd_sub_pixel_avg_variance64x128_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance64x128 aom_dist_wtd_sub_pixel_avg_variance64x128_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance64x16 aom_dist_wtd_sub_pixel_avg_variance64x16_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance64x16 aom_dist_wtd_sub_pixel_avg_variance64x16_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance64x32 aom_dist_wtd_sub_pixel_avg_variance64x32_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance64x32 aom_dist_wtd_sub_pixel_avg_variance64x32_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance64x64 aom_dist_wtd_sub_pixel_avg_variance64x64_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance64x64 aom_dist_wtd_sub_pixel_avg_variance64x64_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance8x16 aom_dist_wtd_sub_pixel_avg_variance8x16_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance8x16 aom_dist_wtd_sub_pixel_avg_variance8x16_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance8x32 aom_dist_wtd_sub_pixel_avg_variance8x32_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance8x32 aom_dist_wtd_sub_pixel_avg_variance8x32_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance8x4 aom_dist_wtd_sub_pixel_avg_variance8x4_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance8x4 aom_dist_wtd_sub_pixel_avg_variance8x4_neon
uint32_t aom_dist_wtd_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_dist_wtd_sub_pixel_avg_variance8x8 aom_dist_wtd_sub_pixel_avg_variance8x8_c
+uint32_t aom_dist_wtd_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_dist_wtd_sub_pixel_avg_variance8x8 aom_dist_wtd_sub_pixel_avg_variance8x8_neon
void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
void aom_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);
@@ -534,10 +581,12 @@ void aom_fft8x8_float_c(const float *input, float *temp, float *output);
#define aom_fft8x8_float aom_fft8x8_float_c
void aom_get_blk_sse_sum_c(const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum);
-#define aom_get_blk_sse_sum aom_get_blk_sse_sum_c
+void aom_get_blk_sse_sum_neon(const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum);
+#define aom_get_blk_sse_sum aom_get_blk_sse_sum_neon
unsigned int aom_get_mb_ss_c(const int16_t *);
-#define aom_get_mb_ss aom_get_mb_ss_c
+unsigned int aom_get_mb_ss_neon(const int16_t *);
+#define aom_get_mb_ss aom_get_mb_ss_neon
void aom_get_var_sse_sum_16x16_dual_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16);
void aom_get_var_sse_sum_16x16_dual_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16);
@@ -652,136 +701,180 @@ void aom_hadamard_lp_8x8_dual_neon(const int16_t *src_diff, ptrdiff_t src_stride
#define aom_hadamard_lp_8x8_dual aom_hadamard_lp_8x8_dual_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128 aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128 aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64 aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64 aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16 aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16 aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32 aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32 aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4 aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4 aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64 aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64 aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8 aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8 aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16 aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16 aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32 aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32 aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64 aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64 aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8 aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8 aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16 aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16 aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4 aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4 aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8 aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8 aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128 aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128 aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16 aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16 aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32 aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32 aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64 aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64 aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16 aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16 aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32 aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32 aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4 aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4 aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4_neon
uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8 aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8_c
+uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8 aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance128x128_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance128x128 aom_highbd_10_masked_sub_pixel_variance128x128_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance128x128_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance128x128 aom_highbd_10_masked_sub_pixel_variance128x128_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance128x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance128x64 aom_highbd_10_masked_sub_pixel_variance128x64_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance128x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance128x64 aom_highbd_10_masked_sub_pixel_variance128x64_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance16x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance16x16 aom_highbd_10_masked_sub_pixel_variance16x16_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance16x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance16x16 aom_highbd_10_masked_sub_pixel_variance16x16_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance16x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance16x32 aom_highbd_10_masked_sub_pixel_variance16x32_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance16x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance16x32 aom_highbd_10_masked_sub_pixel_variance16x32_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance16x4_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance16x4 aom_highbd_10_masked_sub_pixel_variance16x4_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance16x4_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance16x4 aom_highbd_10_masked_sub_pixel_variance16x4_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance16x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance16x64 aom_highbd_10_masked_sub_pixel_variance16x64_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance16x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance16x64 aom_highbd_10_masked_sub_pixel_variance16x64_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance16x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance16x8 aom_highbd_10_masked_sub_pixel_variance16x8_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance16x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance16x8 aom_highbd_10_masked_sub_pixel_variance16x8_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance32x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance32x16 aom_highbd_10_masked_sub_pixel_variance32x16_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance32x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance32x16 aom_highbd_10_masked_sub_pixel_variance32x16_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance32x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance32x32 aom_highbd_10_masked_sub_pixel_variance32x32_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance32x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance32x32 aom_highbd_10_masked_sub_pixel_variance32x32_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance32x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance32x64 aom_highbd_10_masked_sub_pixel_variance32x64_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance32x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance32x64 aom_highbd_10_masked_sub_pixel_variance32x64_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance32x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance32x8 aom_highbd_10_masked_sub_pixel_variance32x8_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance32x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance32x8 aom_highbd_10_masked_sub_pixel_variance32x8_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance4x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance4x16 aom_highbd_10_masked_sub_pixel_variance4x16_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance4x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance4x16 aom_highbd_10_masked_sub_pixel_variance4x16_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance4x4_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance4x4 aom_highbd_10_masked_sub_pixel_variance4x4_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance4x4_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance4x4 aom_highbd_10_masked_sub_pixel_variance4x4_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance4x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance4x8 aom_highbd_10_masked_sub_pixel_variance4x8_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance4x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance4x8 aom_highbd_10_masked_sub_pixel_variance4x8_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance64x128_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance64x128 aom_highbd_10_masked_sub_pixel_variance64x128_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance64x128_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance64x128 aom_highbd_10_masked_sub_pixel_variance64x128_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance64x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance64x16 aom_highbd_10_masked_sub_pixel_variance64x16_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance64x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance64x16 aom_highbd_10_masked_sub_pixel_variance64x16_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance64x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance64x32 aom_highbd_10_masked_sub_pixel_variance64x32_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance64x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance64x32 aom_highbd_10_masked_sub_pixel_variance64x32_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance64x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance64x64 aom_highbd_10_masked_sub_pixel_variance64x64_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance64x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance64x64 aom_highbd_10_masked_sub_pixel_variance64x64_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance8x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance8x16 aom_highbd_10_masked_sub_pixel_variance8x16_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance8x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance8x16 aom_highbd_10_masked_sub_pixel_variance8x16_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance8x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance8x32 aom_highbd_10_masked_sub_pixel_variance8x32_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance8x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance8x32 aom_highbd_10_masked_sub_pixel_variance8x32_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance8x4_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance8x4 aom_highbd_10_masked_sub_pixel_variance8x4_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance8x4_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance8x4 aom_highbd_10_masked_sub_pixel_variance8x4_neon
unsigned int aom_highbd_10_masked_sub_pixel_variance8x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_10_masked_sub_pixel_variance8x8 aom_highbd_10_masked_sub_pixel_variance8x8_c
+unsigned int aom_highbd_10_masked_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_10_masked_sub_pixel_variance8x8 aom_highbd_10_masked_sub_pixel_variance8x8_neon
unsigned int aom_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int aom_highbd_10_mse16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
@@ -800,497 +893,620 @@ unsigned int aom_highbd_10_mse8x8_neon(const uint8_t *src_ptr, int source_strid
#define aom_highbd_10_mse8x8 aom_highbd_10_mse8x8_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance128x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance128x128 aom_highbd_10_obmc_sub_pixel_variance128x128_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance128x128_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance128x128 aom_highbd_10_obmc_sub_pixel_variance128x128_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance128x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance128x64 aom_highbd_10_obmc_sub_pixel_variance128x64_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance128x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance128x64 aom_highbd_10_obmc_sub_pixel_variance128x64_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance16x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance16x16 aom_highbd_10_obmc_sub_pixel_variance16x16_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance16x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance16x16 aom_highbd_10_obmc_sub_pixel_variance16x16_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance16x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance16x32 aom_highbd_10_obmc_sub_pixel_variance16x32_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance16x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance16x32 aom_highbd_10_obmc_sub_pixel_variance16x32_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance16x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance16x4 aom_highbd_10_obmc_sub_pixel_variance16x4_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance16x4_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance16x4 aom_highbd_10_obmc_sub_pixel_variance16x4_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance16x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance16x64 aom_highbd_10_obmc_sub_pixel_variance16x64_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance16x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance16x64 aom_highbd_10_obmc_sub_pixel_variance16x64_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance16x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance16x8 aom_highbd_10_obmc_sub_pixel_variance16x8_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance16x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance16x8 aom_highbd_10_obmc_sub_pixel_variance16x8_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance32x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance32x16 aom_highbd_10_obmc_sub_pixel_variance32x16_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance32x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance32x16 aom_highbd_10_obmc_sub_pixel_variance32x16_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance32x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance32x32 aom_highbd_10_obmc_sub_pixel_variance32x32_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance32x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance32x32 aom_highbd_10_obmc_sub_pixel_variance32x32_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance32x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance32x64 aom_highbd_10_obmc_sub_pixel_variance32x64_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance32x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance32x64 aom_highbd_10_obmc_sub_pixel_variance32x64_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance32x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance32x8 aom_highbd_10_obmc_sub_pixel_variance32x8_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance32x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance32x8 aom_highbd_10_obmc_sub_pixel_variance32x8_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance4x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance4x16 aom_highbd_10_obmc_sub_pixel_variance4x16_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance4x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance4x16 aom_highbd_10_obmc_sub_pixel_variance4x16_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance4x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance4x4 aom_highbd_10_obmc_sub_pixel_variance4x4_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance4x4_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance4x4 aom_highbd_10_obmc_sub_pixel_variance4x4_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance4x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance4x8 aom_highbd_10_obmc_sub_pixel_variance4x8_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance4x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance4x8 aom_highbd_10_obmc_sub_pixel_variance4x8_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance64x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance64x128 aom_highbd_10_obmc_sub_pixel_variance64x128_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance64x128_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance64x128 aom_highbd_10_obmc_sub_pixel_variance64x128_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance64x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance64x16 aom_highbd_10_obmc_sub_pixel_variance64x16_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance64x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance64x16 aom_highbd_10_obmc_sub_pixel_variance64x16_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance64x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance64x32 aom_highbd_10_obmc_sub_pixel_variance64x32_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance64x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance64x32 aom_highbd_10_obmc_sub_pixel_variance64x32_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance64x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance64x64 aom_highbd_10_obmc_sub_pixel_variance64x64_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance64x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance64x64 aom_highbd_10_obmc_sub_pixel_variance64x64_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance8x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance8x16 aom_highbd_10_obmc_sub_pixel_variance8x16_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance8x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance8x16 aom_highbd_10_obmc_sub_pixel_variance8x16_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance8x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance8x32 aom_highbd_10_obmc_sub_pixel_variance8x32_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance8x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance8x32 aom_highbd_10_obmc_sub_pixel_variance8x32_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance8x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance8x4 aom_highbd_10_obmc_sub_pixel_variance8x4_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance8x4_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance8x4 aom_highbd_10_obmc_sub_pixel_variance8x4_neon
unsigned int aom_highbd_10_obmc_sub_pixel_variance8x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_sub_pixel_variance8x8 aom_highbd_10_obmc_sub_pixel_variance8x8_c
+unsigned int aom_highbd_10_obmc_sub_pixel_variance8x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_sub_pixel_variance8x8 aom_highbd_10_obmc_sub_pixel_variance8x8_neon
unsigned int aom_highbd_10_obmc_variance128x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance128x128 aom_highbd_10_obmc_variance128x128_c
+unsigned int aom_highbd_10_obmc_variance128x128_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance128x128 aom_highbd_10_obmc_variance128x128_neon
unsigned int aom_highbd_10_obmc_variance128x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance128x64 aom_highbd_10_obmc_variance128x64_c
+unsigned int aom_highbd_10_obmc_variance128x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance128x64 aom_highbd_10_obmc_variance128x64_neon
unsigned int aom_highbd_10_obmc_variance16x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance16x16 aom_highbd_10_obmc_variance16x16_c
+unsigned int aom_highbd_10_obmc_variance16x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance16x16 aom_highbd_10_obmc_variance16x16_neon
unsigned int aom_highbd_10_obmc_variance16x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance16x32 aom_highbd_10_obmc_variance16x32_c
+unsigned int aom_highbd_10_obmc_variance16x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance16x32 aom_highbd_10_obmc_variance16x32_neon
unsigned int aom_highbd_10_obmc_variance16x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance16x4 aom_highbd_10_obmc_variance16x4_c
+unsigned int aom_highbd_10_obmc_variance16x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance16x4 aom_highbd_10_obmc_variance16x4_neon
unsigned int aom_highbd_10_obmc_variance16x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance16x64 aom_highbd_10_obmc_variance16x64_c
+unsigned int aom_highbd_10_obmc_variance16x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance16x64 aom_highbd_10_obmc_variance16x64_neon
unsigned int aom_highbd_10_obmc_variance16x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance16x8 aom_highbd_10_obmc_variance16x8_c
+unsigned int aom_highbd_10_obmc_variance16x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance16x8 aom_highbd_10_obmc_variance16x8_neon
unsigned int aom_highbd_10_obmc_variance32x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance32x16 aom_highbd_10_obmc_variance32x16_c
+unsigned int aom_highbd_10_obmc_variance32x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance32x16 aom_highbd_10_obmc_variance32x16_neon
unsigned int aom_highbd_10_obmc_variance32x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance32x32 aom_highbd_10_obmc_variance32x32_c
+unsigned int aom_highbd_10_obmc_variance32x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance32x32 aom_highbd_10_obmc_variance32x32_neon
unsigned int aom_highbd_10_obmc_variance32x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance32x64 aom_highbd_10_obmc_variance32x64_c
+unsigned int aom_highbd_10_obmc_variance32x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance32x64 aom_highbd_10_obmc_variance32x64_neon
unsigned int aom_highbd_10_obmc_variance32x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance32x8 aom_highbd_10_obmc_variance32x8_c
+unsigned int aom_highbd_10_obmc_variance32x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance32x8 aom_highbd_10_obmc_variance32x8_neon
unsigned int aom_highbd_10_obmc_variance4x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance4x16 aom_highbd_10_obmc_variance4x16_c
+unsigned int aom_highbd_10_obmc_variance4x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance4x16 aom_highbd_10_obmc_variance4x16_neon
unsigned int aom_highbd_10_obmc_variance4x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance4x4 aom_highbd_10_obmc_variance4x4_c
+unsigned int aom_highbd_10_obmc_variance4x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance4x4 aom_highbd_10_obmc_variance4x4_neon
unsigned int aom_highbd_10_obmc_variance4x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance4x8 aom_highbd_10_obmc_variance4x8_c
+unsigned int aom_highbd_10_obmc_variance4x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance4x8 aom_highbd_10_obmc_variance4x8_neon
unsigned int aom_highbd_10_obmc_variance64x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance64x128 aom_highbd_10_obmc_variance64x128_c
+unsigned int aom_highbd_10_obmc_variance64x128_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance64x128 aom_highbd_10_obmc_variance64x128_neon
unsigned int aom_highbd_10_obmc_variance64x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance64x16 aom_highbd_10_obmc_variance64x16_c
+unsigned int aom_highbd_10_obmc_variance64x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance64x16 aom_highbd_10_obmc_variance64x16_neon
unsigned int aom_highbd_10_obmc_variance64x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance64x32 aom_highbd_10_obmc_variance64x32_c
+unsigned int aom_highbd_10_obmc_variance64x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance64x32 aom_highbd_10_obmc_variance64x32_neon
unsigned int aom_highbd_10_obmc_variance64x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance64x64 aom_highbd_10_obmc_variance64x64_c
+unsigned int aom_highbd_10_obmc_variance64x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance64x64 aom_highbd_10_obmc_variance64x64_neon
unsigned int aom_highbd_10_obmc_variance8x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance8x16 aom_highbd_10_obmc_variance8x16_c
+unsigned int aom_highbd_10_obmc_variance8x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance8x16 aom_highbd_10_obmc_variance8x16_neon
unsigned int aom_highbd_10_obmc_variance8x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance8x32 aom_highbd_10_obmc_variance8x32_c
+unsigned int aom_highbd_10_obmc_variance8x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance8x32 aom_highbd_10_obmc_variance8x32_neon
unsigned int aom_highbd_10_obmc_variance8x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance8x4 aom_highbd_10_obmc_variance8x4_c
+unsigned int aom_highbd_10_obmc_variance8x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance8x4 aom_highbd_10_obmc_variance8x4_neon
unsigned int aom_highbd_10_obmc_variance8x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_10_obmc_variance8x8 aom_highbd_10_obmc_variance8x8_c
+unsigned int aom_highbd_10_obmc_variance8x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_10_obmc_variance8x8 aom_highbd_10_obmc_variance8x8_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance128x128 aom_highbd_10_sub_pixel_avg_variance128x128_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance128x128 aom_highbd_10_sub_pixel_avg_variance128x128_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance128x64 aom_highbd_10_sub_pixel_avg_variance128x64_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance128x64 aom_highbd_10_sub_pixel_avg_variance128x64_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance16x16 aom_highbd_10_sub_pixel_avg_variance16x16_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance16x16 aom_highbd_10_sub_pixel_avg_variance16x16_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance16x32 aom_highbd_10_sub_pixel_avg_variance16x32_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance16x32 aom_highbd_10_sub_pixel_avg_variance16x32_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance16x4 aom_highbd_10_sub_pixel_avg_variance16x4_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance16x4 aom_highbd_10_sub_pixel_avg_variance16x4_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance16x64 aom_highbd_10_sub_pixel_avg_variance16x64_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance16x64 aom_highbd_10_sub_pixel_avg_variance16x64_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance16x8 aom_highbd_10_sub_pixel_avg_variance16x8_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance16x8 aom_highbd_10_sub_pixel_avg_variance16x8_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance32x16 aom_highbd_10_sub_pixel_avg_variance32x16_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance32x16 aom_highbd_10_sub_pixel_avg_variance32x16_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance32x32 aom_highbd_10_sub_pixel_avg_variance32x32_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance32x32 aom_highbd_10_sub_pixel_avg_variance32x32_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance32x64 aom_highbd_10_sub_pixel_avg_variance32x64_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance32x64 aom_highbd_10_sub_pixel_avg_variance32x64_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance32x8 aom_highbd_10_sub_pixel_avg_variance32x8_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance32x8 aom_highbd_10_sub_pixel_avg_variance32x8_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance4x16 aom_highbd_10_sub_pixel_avg_variance4x16_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance4x16 aom_highbd_10_sub_pixel_avg_variance4x16_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance4x4 aom_highbd_10_sub_pixel_avg_variance4x4_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance4x4 aom_highbd_10_sub_pixel_avg_variance4x4_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance4x8 aom_highbd_10_sub_pixel_avg_variance4x8_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance4x8 aom_highbd_10_sub_pixel_avg_variance4x8_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance64x128 aom_highbd_10_sub_pixel_avg_variance64x128_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance64x128 aom_highbd_10_sub_pixel_avg_variance64x128_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance64x16 aom_highbd_10_sub_pixel_avg_variance64x16_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance64x16 aom_highbd_10_sub_pixel_avg_variance64x16_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance64x32 aom_highbd_10_sub_pixel_avg_variance64x32_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance64x32 aom_highbd_10_sub_pixel_avg_variance64x32_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance64x64 aom_highbd_10_sub_pixel_avg_variance64x64_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance64x64 aom_highbd_10_sub_pixel_avg_variance64x64_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance8x16 aom_highbd_10_sub_pixel_avg_variance8x16_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance8x16 aom_highbd_10_sub_pixel_avg_variance8x16_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance8x32 aom_highbd_10_sub_pixel_avg_variance8x32_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance8x32 aom_highbd_10_sub_pixel_avg_variance8x32_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance8x4 aom_highbd_10_sub_pixel_avg_variance8x4_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance8x4 aom_highbd_10_sub_pixel_avg_variance8x4_neon
uint32_t aom_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_10_sub_pixel_avg_variance8x8 aom_highbd_10_sub_pixel_avg_variance8x8_c
+uint32_t aom_highbd_10_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_10_sub_pixel_avg_variance8x8 aom_highbd_10_sub_pixel_avg_variance8x8_neon
uint32_t aom_highbd_10_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance128x128 aom_highbd_10_sub_pixel_variance128x128_c
+uint32_t aom_highbd_10_sub_pixel_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance128x128 aom_highbd_10_sub_pixel_variance128x128_neon
uint32_t aom_highbd_10_sub_pixel_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance128x64 aom_highbd_10_sub_pixel_variance128x64_c
+uint32_t aom_highbd_10_sub_pixel_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance128x64 aom_highbd_10_sub_pixel_variance128x64_neon
uint32_t aom_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance16x16 aom_highbd_10_sub_pixel_variance16x16_c
+uint32_t aom_highbd_10_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance16x16 aom_highbd_10_sub_pixel_variance16x16_neon
uint32_t aom_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance16x32 aom_highbd_10_sub_pixel_variance16x32_c
+uint32_t aom_highbd_10_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance16x32 aom_highbd_10_sub_pixel_variance16x32_neon
uint32_t aom_highbd_10_sub_pixel_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance16x4 aom_highbd_10_sub_pixel_variance16x4_c
+uint32_t aom_highbd_10_sub_pixel_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance16x4 aom_highbd_10_sub_pixel_variance16x4_neon
uint32_t aom_highbd_10_sub_pixel_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance16x64 aom_highbd_10_sub_pixel_variance16x64_c
+uint32_t aom_highbd_10_sub_pixel_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance16x64 aom_highbd_10_sub_pixel_variance16x64_neon
uint32_t aom_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance16x8 aom_highbd_10_sub_pixel_variance16x8_c
+uint32_t aom_highbd_10_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance16x8 aom_highbd_10_sub_pixel_variance16x8_neon
uint32_t aom_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance32x16 aom_highbd_10_sub_pixel_variance32x16_c
+uint32_t aom_highbd_10_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance32x16 aom_highbd_10_sub_pixel_variance32x16_neon
uint32_t aom_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance32x32 aom_highbd_10_sub_pixel_variance32x32_c
+uint32_t aom_highbd_10_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance32x32 aom_highbd_10_sub_pixel_variance32x32_neon
uint32_t aom_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance32x64 aom_highbd_10_sub_pixel_variance32x64_c
+uint32_t aom_highbd_10_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance32x64 aom_highbd_10_sub_pixel_variance32x64_neon
uint32_t aom_highbd_10_sub_pixel_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance32x8 aom_highbd_10_sub_pixel_variance32x8_c
+uint32_t aom_highbd_10_sub_pixel_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance32x8 aom_highbd_10_sub_pixel_variance32x8_neon
uint32_t aom_highbd_10_sub_pixel_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance4x16 aom_highbd_10_sub_pixel_variance4x16_c
+uint32_t aom_highbd_10_sub_pixel_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance4x16 aom_highbd_10_sub_pixel_variance4x16_neon
uint32_t aom_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance4x4 aom_highbd_10_sub_pixel_variance4x4_c
+uint32_t aom_highbd_10_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance4x4 aom_highbd_10_sub_pixel_variance4x4_neon
uint32_t aom_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance4x8 aom_highbd_10_sub_pixel_variance4x8_c
+uint32_t aom_highbd_10_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance4x8 aom_highbd_10_sub_pixel_variance4x8_neon
uint32_t aom_highbd_10_sub_pixel_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance64x128 aom_highbd_10_sub_pixel_variance64x128_c
+uint32_t aom_highbd_10_sub_pixel_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance64x128 aom_highbd_10_sub_pixel_variance64x128_neon
uint32_t aom_highbd_10_sub_pixel_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance64x16 aom_highbd_10_sub_pixel_variance64x16_c
+uint32_t aom_highbd_10_sub_pixel_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance64x16 aom_highbd_10_sub_pixel_variance64x16_neon
uint32_t aom_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance64x32 aom_highbd_10_sub_pixel_variance64x32_c
+uint32_t aom_highbd_10_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance64x32 aom_highbd_10_sub_pixel_variance64x32_neon
uint32_t aom_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance64x64 aom_highbd_10_sub_pixel_variance64x64_c
+uint32_t aom_highbd_10_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance64x64 aom_highbd_10_sub_pixel_variance64x64_neon
uint32_t aom_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance8x16 aom_highbd_10_sub_pixel_variance8x16_c
+uint32_t aom_highbd_10_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance8x16 aom_highbd_10_sub_pixel_variance8x16_neon
uint32_t aom_highbd_10_sub_pixel_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance8x32 aom_highbd_10_sub_pixel_variance8x32_c
+uint32_t aom_highbd_10_sub_pixel_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance8x32 aom_highbd_10_sub_pixel_variance8x32_neon
uint32_t aom_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance8x4 aom_highbd_10_sub_pixel_variance8x4_c
+uint32_t aom_highbd_10_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance8x4 aom_highbd_10_sub_pixel_variance8x4_neon
uint32_t aom_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_10_sub_pixel_variance8x8 aom_highbd_10_sub_pixel_variance8x8_c
+uint32_t aom_highbd_10_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_10_sub_pixel_variance8x8 aom_highbd_10_sub_pixel_variance8x8_neon
-unsigned int aom_highbd_10_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance128x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance128x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance128x128 aom_highbd_10_variance128x128_neon
-unsigned int aom_highbd_10_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance128x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance128x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance128x64 aom_highbd_10_variance128x64_neon
-unsigned int aom_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x16 aom_highbd_10_variance16x16_neon
-unsigned int aom_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x32 aom_highbd_10_variance16x32_neon
-unsigned int aom_highbd_10_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance16x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance16x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x4 aom_highbd_10_variance16x4_neon
-unsigned int aom_highbd_10_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance16x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance16x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x64 aom_highbd_10_variance16x64_neon
-unsigned int aom_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x8 aom_highbd_10_variance16x8_neon
-unsigned int aom_highbd_10_variance2x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_10_variance2x2 aom_highbd_10_variance2x2_c
-
-unsigned int aom_highbd_10_variance2x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_10_variance2x4 aom_highbd_10_variance2x4_c
-
-unsigned int aom_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance32x16 aom_highbd_10_variance32x16_neon
-unsigned int aom_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance32x32 aom_highbd_10_variance32x32_neon
-unsigned int aom_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance32x64 aom_highbd_10_variance32x64_neon
-unsigned int aom_highbd_10_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance32x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance32x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance32x8 aom_highbd_10_variance32x8_neon
-unsigned int aom_highbd_10_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance4x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance4x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance4x16 aom_highbd_10_variance4x16_neon
-unsigned int aom_highbd_10_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_10_variance4x2 aom_highbd_10_variance4x2_c
-
-unsigned int aom_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance4x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance4x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance4x4 aom_highbd_10_variance4x4_neon
-unsigned int aom_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance4x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance4x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance4x8 aom_highbd_10_variance4x8_neon
-unsigned int aom_highbd_10_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance64x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance64x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance64x128 aom_highbd_10_variance64x128_neon
-unsigned int aom_highbd_10_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance64x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance64x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance64x16 aom_highbd_10_variance64x16_neon
-unsigned int aom_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance64x32 aom_highbd_10_variance64x32_neon
-unsigned int aom_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance64x64 aom_highbd_10_variance64x64_neon
-unsigned int aom_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance8x16 aom_highbd_10_variance8x16_neon
-unsigned int aom_highbd_10_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance8x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance8x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance8x32 aom_highbd_10_variance8x32_neon
-unsigned int aom_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance8x4 aom_highbd_10_variance8x4_neon
-unsigned int aom_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance8x8 aom_highbd_10_variance8x8_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128 aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128 aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64 aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64 aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16 aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16 aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32 aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32 aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4 aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4 aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64 aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64 aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8 aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8 aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16 aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16 aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32 aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32 aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64 aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64 aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8 aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8 aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16 aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16 aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4 aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4 aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8 aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8 aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128 aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128 aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16 aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16 aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32 aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32 aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64 aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64 aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16 aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16 aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32 aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32 aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4 aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4 aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4_neon
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8 aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8_c
+uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8 aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance128x128_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance128x128 aom_highbd_12_masked_sub_pixel_variance128x128_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance128x128_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance128x128 aom_highbd_12_masked_sub_pixel_variance128x128_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance128x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance128x64 aom_highbd_12_masked_sub_pixel_variance128x64_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance128x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance128x64 aom_highbd_12_masked_sub_pixel_variance128x64_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance16x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance16x16 aom_highbd_12_masked_sub_pixel_variance16x16_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance16x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance16x16 aom_highbd_12_masked_sub_pixel_variance16x16_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance16x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance16x32 aom_highbd_12_masked_sub_pixel_variance16x32_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance16x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance16x32 aom_highbd_12_masked_sub_pixel_variance16x32_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance16x4_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance16x4 aom_highbd_12_masked_sub_pixel_variance16x4_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance16x4_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance16x4 aom_highbd_12_masked_sub_pixel_variance16x4_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance16x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance16x64 aom_highbd_12_masked_sub_pixel_variance16x64_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance16x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance16x64 aom_highbd_12_masked_sub_pixel_variance16x64_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance16x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance16x8 aom_highbd_12_masked_sub_pixel_variance16x8_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance16x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance16x8 aom_highbd_12_masked_sub_pixel_variance16x8_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance32x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance32x16 aom_highbd_12_masked_sub_pixel_variance32x16_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance32x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance32x16 aom_highbd_12_masked_sub_pixel_variance32x16_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance32x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance32x32 aom_highbd_12_masked_sub_pixel_variance32x32_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance32x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance32x32 aom_highbd_12_masked_sub_pixel_variance32x32_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance32x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance32x64 aom_highbd_12_masked_sub_pixel_variance32x64_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance32x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance32x64 aom_highbd_12_masked_sub_pixel_variance32x64_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance32x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance32x8 aom_highbd_12_masked_sub_pixel_variance32x8_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance32x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance32x8 aom_highbd_12_masked_sub_pixel_variance32x8_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance4x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance4x16 aom_highbd_12_masked_sub_pixel_variance4x16_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance4x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance4x16 aom_highbd_12_masked_sub_pixel_variance4x16_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance4x4_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance4x4 aom_highbd_12_masked_sub_pixel_variance4x4_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance4x4_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance4x4 aom_highbd_12_masked_sub_pixel_variance4x4_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance4x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance4x8 aom_highbd_12_masked_sub_pixel_variance4x8_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance4x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance4x8 aom_highbd_12_masked_sub_pixel_variance4x8_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance64x128_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance64x128 aom_highbd_12_masked_sub_pixel_variance64x128_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance64x128_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance64x128 aom_highbd_12_masked_sub_pixel_variance64x128_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance64x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance64x16 aom_highbd_12_masked_sub_pixel_variance64x16_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance64x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance64x16 aom_highbd_12_masked_sub_pixel_variance64x16_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance64x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance64x32 aom_highbd_12_masked_sub_pixel_variance64x32_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance64x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance64x32 aom_highbd_12_masked_sub_pixel_variance64x32_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance64x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance64x64 aom_highbd_12_masked_sub_pixel_variance64x64_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance64x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance64x64 aom_highbd_12_masked_sub_pixel_variance64x64_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance8x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance8x16 aom_highbd_12_masked_sub_pixel_variance8x16_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance8x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance8x16 aom_highbd_12_masked_sub_pixel_variance8x16_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance8x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance8x32 aom_highbd_12_masked_sub_pixel_variance8x32_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance8x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance8x32 aom_highbd_12_masked_sub_pixel_variance8x32_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance8x4_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance8x4 aom_highbd_12_masked_sub_pixel_variance8x4_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance8x4_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance8x4 aom_highbd_12_masked_sub_pixel_variance8x4_neon
unsigned int aom_highbd_12_masked_sub_pixel_variance8x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_12_masked_sub_pixel_variance8x8 aom_highbd_12_masked_sub_pixel_variance8x8_c
+unsigned int aom_highbd_12_masked_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_12_masked_sub_pixel_variance8x8 aom_highbd_12_masked_sub_pixel_variance8x8_neon
unsigned int aom_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int aom_highbd_12_mse16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
@@ -1309,497 +1525,620 @@ unsigned int aom_highbd_12_mse8x8_neon(const uint8_t *src_ptr, int source_strid
#define aom_highbd_12_mse8x8 aom_highbd_12_mse8x8_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance128x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance128x128 aom_highbd_12_obmc_sub_pixel_variance128x128_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance128x128_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance128x128 aom_highbd_12_obmc_sub_pixel_variance128x128_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance128x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance128x64 aom_highbd_12_obmc_sub_pixel_variance128x64_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance128x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance128x64 aom_highbd_12_obmc_sub_pixel_variance128x64_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance16x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance16x16 aom_highbd_12_obmc_sub_pixel_variance16x16_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance16x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance16x16 aom_highbd_12_obmc_sub_pixel_variance16x16_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance16x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance16x32 aom_highbd_12_obmc_sub_pixel_variance16x32_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance16x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance16x32 aom_highbd_12_obmc_sub_pixel_variance16x32_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance16x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance16x4 aom_highbd_12_obmc_sub_pixel_variance16x4_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance16x4_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance16x4 aom_highbd_12_obmc_sub_pixel_variance16x4_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance16x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance16x64 aom_highbd_12_obmc_sub_pixel_variance16x64_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance16x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance16x64 aom_highbd_12_obmc_sub_pixel_variance16x64_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance16x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance16x8 aom_highbd_12_obmc_sub_pixel_variance16x8_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance16x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance16x8 aom_highbd_12_obmc_sub_pixel_variance16x8_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance32x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance32x16 aom_highbd_12_obmc_sub_pixel_variance32x16_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance32x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance32x16 aom_highbd_12_obmc_sub_pixel_variance32x16_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance32x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance32x32 aom_highbd_12_obmc_sub_pixel_variance32x32_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance32x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance32x32 aom_highbd_12_obmc_sub_pixel_variance32x32_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance32x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance32x64 aom_highbd_12_obmc_sub_pixel_variance32x64_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance32x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance32x64 aom_highbd_12_obmc_sub_pixel_variance32x64_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance32x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance32x8 aom_highbd_12_obmc_sub_pixel_variance32x8_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance32x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance32x8 aom_highbd_12_obmc_sub_pixel_variance32x8_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance4x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance4x16 aom_highbd_12_obmc_sub_pixel_variance4x16_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance4x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance4x16 aom_highbd_12_obmc_sub_pixel_variance4x16_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance4x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance4x4 aom_highbd_12_obmc_sub_pixel_variance4x4_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance4x4_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance4x4 aom_highbd_12_obmc_sub_pixel_variance4x4_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance4x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance4x8 aom_highbd_12_obmc_sub_pixel_variance4x8_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance4x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance4x8 aom_highbd_12_obmc_sub_pixel_variance4x8_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance64x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance64x128 aom_highbd_12_obmc_sub_pixel_variance64x128_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance64x128_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance64x128 aom_highbd_12_obmc_sub_pixel_variance64x128_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance64x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance64x16 aom_highbd_12_obmc_sub_pixel_variance64x16_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance64x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance64x16 aom_highbd_12_obmc_sub_pixel_variance64x16_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance64x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance64x32 aom_highbd_12_obmc_sub_pixel_variance64x32_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance64x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance64x32 aom_highbd_12_obmc_sub_pixel_variance64x32_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance64x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance64x64 aom_highbd_12_obmc_sub_pixel_variance64x64_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance64x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance64x64 aom_highbd_12_obmc_sub_pixel_variance64x64_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance8x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance8x16 aom_highbd_12_obmc_sub_pixel_variance8x16_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance8x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance8x16 aom_highbd_12_obmc_sub_pixel_variance8x16_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance8x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance8x32 aom_highbd_12_obmc_sub_pixel_variance8x32_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance8x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance8x32 aom_highbd_12_obmc_sub_pixel_variance8x32_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance8x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance8x4 aom_highbd_12_obmc_sub_pixel_variance8x4_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance8x4_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance8x4 aom_highbd_12_obmc_sub_pixel_variance8x4_neon
unsigned int aom_highbd_12_obmc_sub_pixel_variance8x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_sub_pixel_variance8x8 aom_highbd_12_obmc_sub_pixel_variance8x8_c
+unsigned int aom_highbd_12_obmc_sub_pixel_variance8x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_sub_pixel_variance8x8 aom_highbd_12_obmc_sub_pixel_variance8x8_neon
unsigned int aom_highbd_12_obmc_variance128x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance128x128 aom_highbd_12_obmc_variance128x128_c
+unsigned int aom_highbd_12_obmc_variance128x128_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance128x128 aom_highbd_12_obmc_variance128x128_neon
unsigned int aom_highbd_12_obmc_variance128x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance128x64 aom_highbd_12_obmc_variance128x64_c
+unsigned int aom_highbd_12_obmc_variance128x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance128x64 aom_highbd_12_obmc_variance128x64_neon
unsigned int aom_highbd_12_obmc_variance16x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance16x16 aom_highbd_12_obmc_variance16x16_c
+unsigned int aom_highbd_12_obmc_variance16x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance16x16 aom_highbd_12_obmc_variance16x16_neon
unsigned int aom_highbd_12_obmc_variance16x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance16x32 aom_highbd_12_obmc_variance16x32_c
+unsigned int aom_highbd_12_obmc_variance16x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance16x32 aom_highbd_12_obmc_variance16x32_neon
unsigned int aom_highbd_12_obmc_variance16x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance16x4 aom_highbd_12_obmc_variance16x4_c
+unsigned int aom_highbd_12_obmc_variance16x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance16x4 aom_highbd_12_obmc_variance16x4_neon
unsigned int aom_highbd_12_obmc_variance16x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance16x64 aom_highbd_12_obmc_variance16x64_c
+unsigned int aom_highbd_12_obmc_variance16x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance16x64 aom_highbd_12_obmc_variance16x64_neon
unsigned int aom_highbd_12_obmc_variance16x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance16x8 aom_highbd_12_obmc_variance16x8_c
+unsigned int aom_highbd_12_obmc_variance16x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance16x8 aom_highbd_12_obmc_variance16x8_neon
unsigned int aom_highbd_12_obmc_variance32x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance32x16 aom_highbd_12_obmc_variance32x16_c
+unsigned int aom_highbd_12_obmc_variance32x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance32x16 aom_highbd_12_obmc_variance32x16_neon
unsigned int aom_highbd_12_obmc_variance32x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance32x32 aom_highbd_12_obmc_variance32x32_c
+unsigned int aom_highbd_12_obmc_variance32x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance32x32 aom_highbd_12_obmc_variance32x32_neon
unsigned int aom_highbd_12_obmc_variance32x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance32x64 aom_highbd_12_obmc_variance32x64_c
+unsigned int aom_highbd_12_obmc_variance32x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance32x64 aom_highbd_12_obmc_variance32x64_neon
unsigned int aom_highbd_12_obmc_variance32x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance32x8 aom_highbd_12_obmc_variance32x8_c
+unsigned int aom_highbd_12_obmc_variance32x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance32x8 aom_highbd_12_obmc_variance32x8_neon
unsigned int aom_highbd_12_obmc_variance4x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance4x16 aom_highbd_12_obmc_variance4x16_c
+unsigned int aom_highbd_12_obmc_variance4x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance4x16 aom_highbd_12_obmc_variance4x16_neon
unsigned int aom_highbd_12_obmc_variance4x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance4x4 aom_highbd_12_obmc_variance4x4_c
+unsigned int aom_highbd_12_obmc_variance4x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance4x4 aom_highbd_12_obmc_variance4x4_neon
unsigned int aom_highbd_12_obmc_variance4x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance4x8 aom_highbd_12_obmc_variance4x8_c
+unsigned int aom_highbd_12_obmc_variance4x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance4x8 aom_highbd_12_obmc_variance4x8_neon
unsigned int aom_highbd_12_obmc_variance64x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance64x128 aom_highbd_12_obmc_variance64x128_c
+unsigned int aom_highbd_12_obmc_variance64x128_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance64x128 aom_highbd_12_obmc_variance64x128_neon
unsigned int aom_highbd_12_obmc_variance64x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance64x16 aom_highbd_12_obmc_variance64x16_c
+unsigned int aom_highbd_12_obmc_variance64x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance64x16 aom_highbd_12_obmc_variance64x16_neon
unsigned int aom_highbd_12_obmc_variance64x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance64x32 aom_highbd_12_obmc_variance64x32_c
+unsigned int aom_highbd_12_obmc_variance64x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance64x32 aom_highbd_12_obmc_variance64x32_neon
unsigned int aom_highbd_12_obmc_variance64x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance64x64 aom_highbd_12_obmc_variance64x64_c
+unsigned int aom_highbd_12_obmc_variance64x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance64x64 aom_highbd_12_obmc_variance64x64_neon
unsigned int aom_highbd_12_obmc_variance8x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance8x16 aom_highbd_12_obmc_variance8x16_c
+unsigned int aom_highbd_12_obmc_variance8x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance8x16 aom_highbd_12_obmc_variance8x16_neon
unsigned int aom_highbd_12_obmc_variance8x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance8x32 aom_highbd_12_obmc_variance8x32_c
+unsigned int aom_highbd_12_obmc_variance8x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance8x32 aom_highbd_12_obmc_variance8x32_neon
unsigned int aom_highbd_12_obmc_variance8x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance8x4 aom_highbd_12_obmc_variance8x4_c
+unsigned int aom_highbd_12_obmc_variance8x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance8x4 aom_highbd_12_obmc_variance8x4_neon
unsigned int aom_highbd_12_obmc_variance8x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_12_obmc_variance8x8 aom_highbd_12_obmc_variance8x8_c
+unsigned int aom_highbd_12_obmc_variance8x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_12_obmc_variance8x8 aom_highbd_12_obmc_variance8x8_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance128x128 aom_highbd_12_sub_pixel_avg_variance128x128_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance128x128 aom_highbd_12_sub_pixel_avg_variance128x128_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance128x64 aom_highbd_12_sub_pixel_avg_variance128x64_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance128x64 aom_highbd_12_sub_pixel_avg_variance128x64_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance16x16 aom_highbd_12_sub_pixel_avg_variance16x16_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance16x16 aom_highbd_12_sub_pixel_avg_variance16x16_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance16x32 aom_highbd_12_sub_pixel_avg_variance16x32_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance16x32 aom_highbd_12_sub_pixel_avg_variance16x32_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance16x4 aom_highbd_12_sub_pixel_avg_variance16x4_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance16x4 aom_highbd_12_sub_pixel_avg_variance16x4_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance16x64 aom_highbd_12_sub_pixel_avg_variance16x64_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance16x64 aom_highbd_12_sub_pixel_avg_variance16x64_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance16x8 aom_highbd_12_sub_pixel_avg_variance16x8_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance16x8 aom_highbd_12_sub_pixel_avg_variance16x8_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance32x16 aom_highbd_12_sub_pixel_avg_variance32x16_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance32x16 aom_highbd_12_sub_pixel_avg_variance32x16_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance32x32 aom_highbd_12_sub_pixel_avg_variance32x32_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance32x32 aom_highbd_12_sub_pixel_avg_variance32x32_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance32x64 aom_highbd_12_sub_pixel_avg_variance32x64_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance32x64 aom_highbd_12_sub_pixel_avg_variance32x64_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance32x8 aom_highbd_12_sub_pixel_avg_variance32x8_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance32x8 aom_highbd_12_sub_pixel_avg_variance32x8_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance4x16 aom_highbd_12_sub_pixel_avg_variance4x16_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance4x16 aom_highbd_12_sub_pixel_avg_variance4x16_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance4x4 aom_highbd_12_sub_pixel_avg_variance4x4_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance4x4 aom_highbd_12_sub_pixel_avg_variance4x4_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance4x8 aom_highbd_12_sub_pixel_avg_variance4x8_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance4x8 aom_highbd_12_sub_pixel_avg_variance4x8_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance64x128 aom_highbd_12_sub_pixel_avg_variance64x128_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance64x128 aom_highbd_12_sub_pixel_avg_variance64x128_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance64x16 aom_highbd_12_sub_pixel_avg_variance64x16_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance64x16 aom_highbd_12_sub_pixel_avg_variance64x16_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance64x32 aom_highbd_12_sub_pixel_avg_variance64x32_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance64x32 aom_highbd_12_sub_pixel_avg_variance64x32_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance64x64 aom_highbd_12_sub_pixel_avg_variance64x64_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance64x64 aom_highbd_12_sub_pixel_avg_variance64x64_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance8x16 aom_highbd_12_sub_pixel_avg_variance8x16_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance8x16 aom_highbd_12_sub_pixel_avg_variance8x16_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance8x32 aom_highbd_12_sub_pixel_avg_variance8x32_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance8x32 aom_highbd_12_sub_pixel_avg_variance8x32_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance8x4 aom_highbd_12_sub_pixel_avg_variance8x4_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance8x4 aom_highbd_12_sub_pixel_avg_variance8x4_neon
uint32_t aom_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_12_sub_pixel_avg_variance8x8 aom_highbd_12_sub_pixel_avg_variance8x8_c
+uint32_t aom_highbd_12_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_12_sub_pixel_avg_variance8x8 aom_highbd_12_sub_pixel_avg_variance8x8_neon
uint32_t aom_highbd_12_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance128x128 aom_highbd_12_sub_pixel_variance128x128_c
+uint32_t aom_highbd_12_sub_pixel_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance128x128 aom_highbd_12_sub_pixel_variance128x128_neon
uint32_t aom_highbd_12_sub_pixel_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance128x64 aom_highbd_12_sub_pixel_variance128x64_c
+uint32_t aom_highbd_12_sub_pixel_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance128x64 aom_highbd_12_sub_pixel_variance128x64_neon
uint32_t aom_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance16x16 aom_highbd_12_sub_pixel_variance16x16_c
+uint32_t aom_highbd_12_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance16x16 aom_highbd_12_sub_pixel_variance16x16_neon
uint32_t aom_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance16x32 aom_highbd_12_sub_pixel_variance16x32_c
+uint32_t aom_highbd_12_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance16x32 aom_highbd_12_sub_pixel_variance16x32_neon
uint32_t aom_highbd_12_sub_pixel_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance16x4 aom_highbd_12_sub_pixel_variance16x4_c
+uint32_t aom_highbd_12_sub_pixel_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance16x4 aom_highbd_12_sub_pixel_variance16x4_neon
uint32_t aom_highbd_12_sub_pixel_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance16x64 aom_highbd_12_sub_pixel_variance16x64_c
+uint32_t aom_highbd_12_sub_pixel_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance16x64 aom_highbd_12_sub_pixel_variance16x64_neon
uint32_t aom_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance16x8 aom_highbd_12_sub_pixel_variance16x8_c
+uint32_t aom_highbd_12_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance16x8 aom_highbd_12_sub_pixel_variance16x8_neon
uint32_t aom_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance32x16 aom_highbd_12_sub_pixel_variance32x16_c
+uint32_t aom_highbd_12_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance32x16 aom_highbd_12_sub_pixel_variance32x16_neon
uint32_t aom_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance32x32 aom_highbd_12_sub_pixel_variance32x32_c
+uint32_t aom_highbd_12_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance32x32 aom_highbd_12_sub_pixel_variance32x32_neon
uint32_t aom_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance32x64 aom_highbd_12_sub_pixel_variance32x64_c
+uint32_t aom_highbd_12_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance32x64 aom_highbd_12_sub_pixel_variance32x64_neon
uint32_t aom_highbd_12_sub_pixel_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance32x8 aom_highbd_12_sub_pixel_variance32x8_c
+uint32_t aom_highbd_12_sub_pixel_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance32x8 aom_highbd_12_sub_pixel_variance32x8_neon
uint32_t aom_highbd_12_sub_pixel_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance4x16 aom_highbd_12_sub_pixel_variance4x16_c
+uint32_t aom_highbd_12_sub_pixel_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance4x16 aom_highbd_12_sub_pixel_variance4x16_neon
uint32_t aom_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance4x4 aom_highbd_12_sub_pixel_variance4x4_c
+uint32_t aom_highbd_12_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance4x4 aom_highbd_12_sub_pixel_variance4x4_neon
uint32_t aom_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance4x8 aom_highbd_12_sub_pixel_variance4x8_c
+uint32_t aom_highbd_12_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance4x8 aom_highbd_12_sub_pixel_variance4x8_neon
uint32_t aom_highbd_12_sub_pixel_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance64x128 aom_highbd_12_sub_pixel_variance64x128_c
+uint32_t aom_highbd_12_sub_pixel_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance64x128 aom_highbd_12_sub_pixel_variance64x128_neon
uint32_t aom_highbd_12_sub_pixel_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance64x16 aom_highbd_12_sub_pixel_variance64x16_c
+uint32_t aom_highbd_12_sub_pixel_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance64x16 aom_highbd_12_sub_pixel_variance64x16_neon
uint32_t aom_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance64x32 aom_highbd_12_sub_pixel_variance64x32_c
+uint32_t aom_highbd_12_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance64x32 aom_highbd_12_sub_pixel_variance64x32_neon
uint32_t aom_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance64x64 aom_highbd_12_sub_pixel_variance64x64_c
+uint32_t aom_highbd_12_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance64x64 aom_highbd_12_sub_pixel_variance64x64_neon
uint32_t aom_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance8x16 aom_highbd_12_sub_pixel_variance8x16_c
+uint32_t aom_highbd_12_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance8x16 aom_highbd_12_sub_pixel_variance8x16_neon
uint32_t aom_highbd_12_sub_pixel_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance8x32 aom_highbd_12_sub_pixel_variance8x32_c
+uint32_t aom_highbd_12_sub_pixel_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance8x32 aom_highbd_12_sub_pixel_variance8x32_neon
uint32_t aom_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance8x4 aom_highbd_12_sub_pixel_variance8x4_c
+uint32_t aom_highbd_12_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance8x4 aom_highbd_12_sub_pixel_variance8x4_neon
uint32_t aom_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_12_sub_pixel_variance8x8 aom_highbd_12_sub_pixel_variance8x8_c
+uint32_t aom_highbd_12_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_12_sub_pixel_variance8x8 aom_highbd_12_sub_pixel_variance8x8_neon
-unsigned int aom_highbd_12_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance128x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance128x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance128x128 aom_highbd_12_variance128x128_neon
-unsigned int aom_highbd_12_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance128x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance128x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance128x64 aom_highbd_12_variance128x64_neon
-unsigned int aom_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x16 aom_highbd_12_variance16x16_neon
-unsigned int aom_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x32 aom_highbd_12_variance16x32_neon
-unsigned int aom_highbd_12_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance16x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance16x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x4 aom_highbd_12_variance16x4_neon
-unsigned int aom_highbd_12_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance16x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance16x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x64 aom_highbd_12_variance16x64_neon
-unsigned int aom_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x8 aom_highbd_12_variance16x8_neon
-unsigned int aom_highbd_12_variance2x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_12_variance2x2 aom_highbd_12_variance2x2_c
-
-unsigned int aom_highbd_12_variance2x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_12_variance2x4 aom_highbd_12_variance2x4_c
-
-unsigned int aom_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance32x16 aom_highbd_12_variance32x16_neon
-unsigned int aom_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance32x32 aom_highbd_12_variance32x32_neon
-unsigned int aom_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance32x64 aom_highbd_12_variance32x64_neon
-unsigned int aom_highbd_12_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance32x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance32x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance32x8 aom_highbd_12_variance32x8_neon
-unsigned int aom_highbd_12_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance4x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance4x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance4x16 aom_highbd_12_variance4x16_neon
-unsigned int aom_highbd_12_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_12_variance4x2 aom_highbd_12_variance4x2_c
-
-unsigned int aom_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance4x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance4x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance4x4 aom_highbd_12_variance4x4_neon
-unsigned int aom_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance4x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance4x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance4x8 aom_highbd_12_variance4x8_neon
-unsigned int aom_highbd_12_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance64x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance64x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance64x128 aom_highbd_12_variance64x128_neon
-unsigned int aom_highbd_12_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance64x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance64x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance64x16 aom_highbd_12_variance64x16_neon
-unsigned int aom_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance64x32 aom_highbd_12_variance64x32_neon
-unsigned int aom_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance64x64 aom_highbd_12_variance64x64_neon
-unsigned int aom_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance8x16 aom_highbd_12_variance8x16_neon
-unsigned int aom_highbd_12_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance8x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance8x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance8x32 aom_highbd_12_variance8x32_neon
-unsigned int aom_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance8x4 aom_highbd_12_variance8x4_neon
-unsigned int aom_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance8x8 aom_highbd_12_variance8x8_neon
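/*
 * Illustrative sketch only, not part of the generated header above: the
 * repeated triads in this diff follow libaom's RTCD static-dispatch pattern.
 * Each kernel is declared with a _c reference implementation and a _neon
 * specialization, and a #define binds the generic name to the best variant
 * selected at configure time, so callers never name a variant directly.
 * The self-contained toy below models that pattern with hypothetical names
 * (toy_variance_c, toy_variance_neon, TOY_HAVE_NEON); it is a minimal sketch
 * of the mechanism, not libaom code.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Reference C implementation: always available, mirrors the _c declarations
 * above in shape (returns the sum of squared differences, also via *sse). */
static uint32_t toy_variance_c(const uint8_t *src, const uint8_t *ref, int n,
                               uint32_t *sse) {
  uint32_t s = 0;
  for (int i = 0; i < n; ++i) {
    const int d = src[i] - ref[i];
    s += (uint32_t)(d * d);
  }
  *sse = s;
  return s;
}

/* Stand-in for a NEON specialization; here it simply forwards to the C
 * version so the sketch stays portable. */
static uint32_t toy_variance_neon(const uint8_t *src, const uint8_t *ref,
                                  int n, uint32_t *sse) {
  return toy_variance_c(src, ref, n, sse);
}

/* Configure-time binding, as the #define lines in the header do: the generic
 * name resolves to the chosen variant with zero runtime cost. */
#define TOY_HAVE_NEON 1
#if TOY_HAVE_NEON
#define toy_variance toy_variance_neon
#else
#define toy_variance toy_variance_c
#endif

int main(void) {
  const uint8_t a[4] = { 10, 20, 30, 40 };
  const uint8_t b[4] = { 11, 18, 30, 44 };
  uint32_t sse;
  /* Callers use the generic name; the macro picks the variant. */
  const uint32_t v = toy_variance(a, b, 4, &sse);
  printf("sse=%" PRIu32 " v=%" PRIu32 "\n", sse, v);
  return 0;
}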
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128 aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128 aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64 aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64 aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16 aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16 aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32 aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32 aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4 aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4 aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64 aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64 aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8 aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8 aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16 aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16 aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32 aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32 aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64 aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64 aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8 aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8 aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16 aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16 aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4 aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4 aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8 aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8 aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128 aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128 aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16 aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16 aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32 aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32 aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64 aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64 aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16 aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16 aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32 aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32 aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4 aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4 aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4_neon
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
-#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8 aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8_c
+uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
+#define aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8 aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance128x128_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance128x128 aom_highbd_8_masked_sub_pixel_variance128x128_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance128x128_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance128x128 aom_highbd_8_masked_sub_pixel_variance128x128_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance128x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance128x64 aom_highbd_8_masked_sub_pixel_variance128x64_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance128x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance128x64 aom_highbd_8_masked_sub_pixel_variance128x64_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance16x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance16x16 aom_highbd_8_masked_sub_pixel_variance16x16_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance16x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance16x16 aom_highbd_8_masked_sub_pixel_variance16x16_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance16x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance16x32 aom_highbd_8_masked_sub_pixel_variance16x32_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance16x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance16x32 aom_highbd_8_masked_sub_pixel_variance16x32_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance16x4_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance16x4 aom_highbd_8_masked_sub_pixel_variance16x4_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance16x4_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance16x4 aom_highbd_8_masked_sub_pixel_variance16x4_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance16x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance16x64 aom_highbd_8_masked_sub_pixel_variance16x64_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance16x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance16x64 aom_highbd_8_masked_sub_pixel_variance16x64_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance16x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance16x8 aom_highbd_8_masked_sub_pixel_variance16x8_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance16x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance16x8 aom_highbd_8_masked_sub_pixel_variance16x8_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance32x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance32x16 aom_highbd_8_masked_sub_pixel_variance32x16_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance32x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance32x16 aom_highbd_8_masked_sub_pixel_variance32x16_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance32x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance32x32 aom_highbd_8_masked_sub_pixel_variance32x32_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance32x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance32x32 aom_highbd_8_masked_sub_pixel_variance32x32_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance32x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance32x64 aom_highbd_8_masked_sub_pixel_variance32x64_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance32x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance32x64 aom_highbd_8_masked_sub_pixel_variance32x64_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance32x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance32x8 aom_highbd_8_masked_sub_pixel_variance32x8_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance32x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance32x8 aom_highbd_8_masked_sub_pixel_variance32x8_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance4x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance4x16 aom_highbd_8_masked_sub_pixel_variance4x16_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance4x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance4x16 aom_highbd_8_masked_sub_pixel_variance4x16_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance4x4_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance4x4 aom_highbd_8_masked_sub_pixel_variance4x4_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance4x4_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance4x4 aom_highbd_8_masked_sub_pixel_variance4x4_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance4x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance4x8 aom_highbd_8_masked_sub_pixel_variance4x8_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance4x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance4x8 aom_highbd_8_masked_sub_pixel_variance4x8_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance64x128_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance64x128 aom_highbd_8_masked_sub_pixel_variance64x128_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance64x128_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance64x128 aom_highbd_8_masked_sub_pixel_variance64x128_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance64x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance64x16 aom_highbd_8_masked_sub_pixel_variance64x16_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance64x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance64x16 aom_highbd_8_masked_sub_pixel_variance64x16_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance64x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance64x32 aom_highbd_8_masked_sub_pixel_variance64x32_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance64x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance64x32 aom_highbd_8_masked_sub_pixel_variance64x32_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance64x64_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance64x64 aom_highbd_8_masked_sub_pixel_variance64x64_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance64x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance64x64 aom_highbd_8_masked_sub_pixel_variance64x64_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance8x16_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance8x16 aom_highbd_8_masked_sub_pixel_variance8x16_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance8x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance8x16 aom_highbd_8_masked_sub_pixel_variance8x16_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance8x32_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance8x32 aom_highbd_8_masked_sub_pixel_variance8x32_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance8x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance8x32 aom_highbd_8_masked_sub_pixel_variance8x32_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance8x4_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance8x4 aom_highbd_8_masked_sub_pixel_variance8x4_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance8x4_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance8x4 aom_highbd_8_masked_sub_pixel_variance8x4_neon
unsigned int aom_highbd_8_masked_sub_pixel_variance8x8_c(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#define aom_highbd_8_masked_sub_pixel_variance8x8 aom_highbd_8_masked_sub_pixel_variance8x8_c
+unsigned int aom_highbd_8_masked_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+#define aom_highbd_8_masked_sub_pixel_variance8x8 aom_highbd_8_masked_sub_pixel_variance8x8_neon
unsigned int aom_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int aom_highbd_8_mse16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
@@ -1817,233 +2156,444 @@ unsigned int aom_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, c
unsigned int aom_highbd_8_mse8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
#define aom_highbd_8_mse8x8 aom_highbd_8_mse8x8_neon
+unsigned int aom_highbd_8_obmc_sub_pixel_variance128x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance128x128_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance128x128 aom_highbd_8_obmc_sub_pixel_variance128x128_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance128x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance128x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance128x64 aom_highbd_8_obmc_sub_pixel_variance128x64_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x16 aom_highbd_8_obmc_sub_pixel_variance16x16_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x32 aom_highbd_8_obmc_sub_pixel_variance16x32_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x4_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x4 aom_highbd_8_obmc_sub_pixel_variance16x4_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x64 aom_highbd_8_obmc_sub_pixel_variance16x64_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x8 aom_highbd_8_obmc_sub_pixel_variance16x8_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance32x16 aom_highbd_8_obmc_sub_pixel_variance32x16_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance32x32 aom_highbd_8_obmc_sub_pixel_variance32x32_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance32x64 aom_highbd_8_obmc_sub_pixel_variance32x64_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance32x8 aom_highbd_8_obmc_sub_pixel_variance32x8_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance4x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance4x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance4x16 aom_highbd_8_obmc_sub_pixel_variance4x16_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance4x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance4x4_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance4x4 aom_highbd_8_obmc_sub_pixel_variance4x4_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance4x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance4x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance4x8 aom_highbd_8_obmc_sub_pixel_variance4x8_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x128_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance64x128 aom_highbd_8_obmc_sub_pixel_variance64x128_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance64x16 aom_highbd_8_obmc_sub_pixel_variance64x16_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance64x32 aom_highbd_8_obmc_sub_pixel_variance64x32_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x64_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance64x64 aom_highbd_8_obmc_sub_pixel_variance64x64_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x16_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance8x16 aom_highbd_8_obmc_sub_pixel_variance8x16_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x32_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance8x32 aom_highbd_8_obmc_sub_pixel_variance8x32_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x4_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance8x4 aom_highbd_8_obmc_sub_pixel_variance8x4_neon
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x8_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance8x8 aom_highbd_8_obmc_sub_pixel_variance8x8_neon
+
+unsigned int aom_highbd_8_obmc_variance128x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance128x128_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance128x128 aom_highbd_8_obmc_variance128x128_neon
+
+unsigned int aom_highbd_8_obmc_variance128x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance128x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance128x64 aom_highbd_8_obmc_variance128x64_neon
+
+unsigned int aom_highbd_8_obmc_variance16x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance16x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x16 aom_highbd_8_obmc_variance16x16_neon
+
+unsigned int aom_highbd_8_obmc_variance16x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance16x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x32 aom_highbd_8_obmc_variance16x32_neon
+
+unsigned int aom_highbd_8_obmc_variance16x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance16x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x4 aom_highbd_8_obmc_variance16x4_neon
+
+unsigned int aom_highbd_8_obmc_variance16x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance16x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x64 aom_highbd_8_obmc_variance16x64_neon
+
+unsigned int aom_highbd_8_obmc_variance16x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance16x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x8 aom_highbd_8_obmc_variance16x8_neon
+
+unsigned int aom_highbd_8_obmc_variance32x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance32x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance32x16 aom_highbd_8_obmc_variance32x16_neon
+
+unsigned int aom_highbd_8_obmc_variance32x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance32x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance32x32 aom_highbd_8_obmc_variance32x32_neon
+
+unsigned int aom_highbd_8_obmc_variance32x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance32x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance32x64 aom_highbd_8_obmc_variance32x64_neon
+
+unsigned int aom_highbd_8_obmc_variance32x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance32x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance32x8 aom_highbd_8_obmc_variance32x8_neon
+
+unsigned int aom_highbd_8_obmc_variance4x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance4x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance4x16 aom_highbd_8_obmc_variance4x16_neon
+
+unsigned int aom_highbd_8_obmc_variance4x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance4x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance4x4 aom_highbd_8_obmc_variance4x4_neon
+
+unsigned int aom_highbd_8_obmc_variance4x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance4x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance4x8 aom_highbd_8_obmc_variance4x8_neon
+
+unsigned int aom_highbd_8_obmc_variance64x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance64x128_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance64x128 aom_highbd_8_obmc_variance64x128_neon
+
+unsigned int aom_highbd_8_obmc_variance64x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance64x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance64x16 aom_highbd_8_obmc_variance64x16_neon
+
+unsigned int aom_highbd_8_obmc_variance64x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance64x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance64x32 aom_highbd_8_obmc_variance64x32_neon
+
+unsigned int aom_highbd_8_obmc_variance64x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance64x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance64x64 aom_highbd_8_obmc_variance64x64_neon
+
+unsigned int aom_highbd_8_obmc_variance8x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance8x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance8x16 aom_highbd_8_obmc_variance8x16_neon
+
+unsigned int aom_highbd_8_obmc_variance8x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance8x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance8x32 aom_highbd_8_obmc_variance8x32_neon
+
+unsigned int aom_highbd_8_obmc_variance8x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance8x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance8x4 aom_highbd_8_obmc_variance8x4_neon
+
+unsigned int aom_highbd_8_obmc_variance8x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+unsigned int aom_highbd_8_obmc_variance8x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance8x8 aom_highbd_8_obmc_variance8x8_neon
+
uint32_t aom_highbd_8_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance128x128 aom_highbd_8_sub_pixel_avg_variance128x128_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance128x128 aom_highbd_8_sub_pixel_avg_variance128x128_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance128x64 aom_highbd_8_sub_pixel_avg_variance128x64_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance128x64 aom_highbd_8_sub_pixel_avg_variance128x64_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance16x16 aom_highbd_8_sub_pixel_avg_variance16x16_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance16x16 aom_highbd_8_sub_pixel_avg_variance16x16_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance16x32 aom_highbd_8_sub_pixel_avg_variance16x32_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance16x32 aom_highbd_8_sub_pixel_avg_variance16x32_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance16x4 aom_highbd_8_sub_pixel_avg_variance16x4_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance16x4 aom_highbd_8_sub_pixel_avg_variance16x4_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance16x64 aom_highbd_8_sub_pixel_avg_variance16x64_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance16x64 aom_highbd_8_sub_pixel_avg_variance16x64_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance16x8 aom_highbd_8_sub_pixel_avg_variance16x8_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance16x8 aom_highbd_8_sub_pixel_avg_variance16x8_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance32x16 aom_highbd_8_sub_pixel_avg_variance32x16_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance32x16 aom_highbd_8_sub_pixel_avg_variance32x16_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance32x32 aom_highbd_8_sub_pixel_avg_variance32x32_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance32x32 aom_highbd_8_sub_pixel_avg_variance32x32_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance32x64 aom_highbd_8_sub_pixel_avg_variance32x64_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance32x64 aom_highbd_8_sub_pixel_avg_variance32x64_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance32x8 aom_highbd_8_sub_pixel_avg_variance32x8_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance32x8 aom_highbd_8_sub_pixel_avg_variance32x8_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance4x16 aom_highbd_8_sub_pixel_avg_variance4x16_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance4x16 aom_highbd_8_sub_pixel_avg_variance4x16_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance4x4 aom_highbd_8_sub_pixel_avg_variance4x4_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance4x4 aom_highbd_8_sub_pixel_avg_variance4x4_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance4x8 aom_highbd_8_sub_pixel_avg_variance4x8_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance4x8 aom_highbd_8_sub_pixel_avg_variance4x8_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance64x128 aom_highbd_8_sub_pixel_avg_variance64x128_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance64x128 aom_highbd_8_sub_pixel_avg_variance64x128_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance64x16 aom_highbd_8_sub_pixel_avg_variance64x16_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance64x16 aom_highbd_8_sub_pixel_avg_variance64x16_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance64x32 aom_highbd_8_sub_pixel_avg_variance64x32_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance64x32 aom_highbd_8_sub_pixel_avg_variance64x32_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance64x64 aom_highbd_8_sub_pixel_avg_variance64x64_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance64x64 aom_highbd_8_sub_pixel_avg_variance64x64_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance8x16 aom_highbd_8_sub_pixel_avg_variance8x16_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance8x16 aom_highbd_8_sub_pixel_avg_variance8x16_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance8x32 aom_highbd_8_sub_pixel_avg_variance8x32_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance8x32 aom_highbd_8_sub_pixel_avg_variance8x32_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance8x4 aom_highbd_8_sub_pixel_avg_variance8x4_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance8x4 aom_highbd_8_sub_pixel_avg_variance8x4_neon
uint32_t aom_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define aom_highbd_8_sub_pixel_avg_variance8x8 aom_highbd_8_sub_pixel_avg_variance8x8_c
+uint32_t aom_highbd_8_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define aom_highbd_8_sub_pixel_avg_variance8x8 aom_highbd_8_sub_pixel_avg_variance8x8_neon
uint32_t aom_highbd_8_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance128x128 aom_highbd_8_sub_pixel_variance128x128_c
+uint32_t aom_highbd_8_sub_pixel_variance128x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance128x128 aom_highbd_8_sub_pixel_variance128x128_neon
uint32_t aom_highbd_8_sub_pixel_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance128x64 aom_highbd_8_sub_pixel_variance128x64_c
+uint32_t aom_highbd_8_sub_pixel_variance128x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance128x64 aom_highbd_8_sub_pixel_variance128x64_neon
uint32_t aom_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance16x16 aom_highbd_8_sub_pixel_variance16x16_c
+uint32_t aom_highbd_8_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance16x16 aom_highbd_8_sub_pixel_variance16x16_neon
uint32_t aom_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance16x32 aom_highbd_8_sub_pixel_variance16x32_c
+uint32_t aom_highbd_8_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance16x32 aom_highbd_8_sub_pixel_variance16x32_neon
uint32_t aom_highbd_8_sub_pixel_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance16x4 aom_highbd_8_sub_pixel_variance16x4_c
+uint32_t aom_highbd_8_sub_pixel_variance16x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance16x4 aom_highbd_8_sub_pixel_variance16x4_neon
uint32_t aom_highbd_8_sub_pixel_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance16x64 aom_highbd_8_sub_pixel_variance16x64_c
+uint32_t aom_highbd_8_sub_pixel_variance16x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance16x64 aom_highbd_8_sub_pixel_variance16x64_neon
uint32_t aom_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance16x8 aom_highbd_8_sub_pixel_variance16x8_c
+uint32_t aom_highbd_8_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance16x8 aom_highbd_8_sub_pixel_variance16x8_neon
uint32_t aom_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance32x16 aom_highbd_8_sub_pixel_variance32x16_c
+uint32_t aom_highbd_8_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance32x16 aom_highbd_8_sub_pixel_variance32x16_neon
uint32_t aom_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance32x32 aom_highbd_8_sub_pixel_variance32x32_c
+uint32_t aom_highbd_8_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance32x32 aom_highbd_8_sub_pixel_variance32x32_neon
uint32_t aom_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance32x64 aom_highbd_8_sub_pixel_variance32x64_c
+uint32_t aom_highbd_8_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance32x64 aom_highbd_8_sub_pixel_variance32x64_neon
uint32_t aom_highbd_8_sub_pixel_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance32x8 aom_highbd_8_sub_pixel_variance32x8_c
+uint32_t aom_highbd_8_sub_pixel_variance32x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance32x8 aom_highbd_8_sub_pixel_variance32x8_neon
uint32_t aom_highbd_8_sub_pixel_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance4x16 aom_highbd_8_sub_pixel_variance4x16_c
+uint32_t aom_highbd_8_sub_pixel_variance4x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance4x16 aom_highbd_8_sub_pixel_variance4x16_neon
uint32_t aom_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance4x4 aom_highbd_8_sub_pixel_variance4x4_c
+uint32_t aom_highbd_8_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance4x4 aom_highbd_8_sub_pixel_variance4x4_neon
uint32_t aom_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance4x8 aom_highbd_8_sub_pixel_variance4x8_c
+uint32_t aom_highbd_8_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance4x8 aom_highbd_8_sub_pixel_variance4x8_neon
uint32_t aom_highbd_8_sub_pixel_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance64x128 aom_highbd_8_sub_pixel_variance64x128_c
+uint32_t aom_highbd_8_sub_pixel_variance64x128_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance64x128 aom_highbd_8_sub_pixel_variance64x128_neon
uint32_t aom_highbd_8_sub_pixel_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance64x16 aom_highbd_8_sub_pixel_variance64x16_c
+uint32_t aom_highbd_8_sub_pixel_variance64x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance64x16 aom_highbd_8_sub_pixel_variance64x16_neon
uint32_t aom_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance64x32 aom_highbd_8_sub_pixel_variance64x32_c
+uint32_t aom_highbd_8_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance64x32 aom_highbd_8_sub_pixel_variance64x32_neon
uint32_t aom_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance64x64 aom_highbd_8_sub_pixel_variance64x64_c
+uint32_t aom_highbd_8_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance64x64 aom_highbd_8_sub_pixel_variance64x64_neon
uint32_t aom_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance8x16 aom_highbd_8_sub_pixel_variance8x16_c
+uint32_t aom_highbd_8_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance8x16 aom_highbd_8_sub_pixel_variance8x16_neon
uint32_t aom_highbd_8_sub_pixel_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance8x32 aom_highbd_8_sub_pixel_variance8x32_c
+uint32_t aom_highbd_8_sub_pixel_variance8x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance8x32 aom_highbd_8_sub_pixel_variance8x32_neon
uint32_t aom_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance8x4 aom_highbd_8_sub_pixel_variance8x4_c
+uint32_t aom_highbd_8_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance8x4 aom_highbd_8_sub_pixel_variance8x4_neon
uint32_t aom_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define aom_highbd_8_sub_pixel_variance8x8 aom_highbd_8_sub_pixel_variance8x8_c
+uint32_t aom_highbd_8_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define aom_highbd_8_sub_pixel_variance8x8 aom_highbd_8_sub_pixel_variance8x8_neon
-unsigned int aom_highbd_8_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance128x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance128x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance128x128 aom_highbd_8_variance128x128_neon
-unsigned int aom_highbd_8_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance128x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance128x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance128x64 aom_highbd_8_variance128x64_neon
-unsigned int aom_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x16 aom_highbd_8_variance16x16_neon
-unsigned int aom_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x32 aom_highbd_8_variance16x32_neon
-unsigned int aom_highbd_8_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance16x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance16x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x4 aom_highbd_8_variance16x4_neon
-unsigned int aom_highbd_8_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance16x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance16x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x64 aom_highbd_8_variance16x64_neon
-unsigned int aom_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x8 aom_highbd_8_variance16x8_neon
-unsigned int aom_highbd_8_variance2x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_8_variance2x2 aom_highbd_8_variance2x2_c
-
-unsigned int aom_highbd_8_variance2x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_8_variance2x4 aom_highbd_8_variance2x4_c
-
-unsigned int aom_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance32x16 aom_highbd_8_variance32x16_neon
-unsigned int aom_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance32x32 aom_highbd_8_variance32x32_neon
-unsigned int aom_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance32x64 aom_highbd_8_variance32x64_neon
-unsigned int aom_highbd_8_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance32x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance32x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance32x8 aom_highbd_8_variance32x8_neon
-unsigned int aom_highbd_8_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance4x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance4x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance4x16 aom_highbd_8_variance4x16_neon
-unsigned int aom_highbd_8_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_8_variance4x2 aom_highbd_8_variance4x2_c
-
-unsigned int aom_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance4x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance4x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance4x4 aom_highbd_8_variance4x4_neon
-unsigned int aom_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance4x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance4x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance4x8 aom_highbd_8_variance4x8_neon
-unsigned int aom_highbd_8_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance64x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance64x128_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance64x128 aom_highbd_8_variance64x128_neon
-unsigned int aom_highbd_8_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance64x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance64x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance64x16 aom_highbd_8_variance64x16_neon
-unsigned int aom_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance64x32 aom_highbd_8_variance64x32_neon
-unsigned int aom_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance64x64 aom_highbd_8_variance64x64_neon
-unsigned int aom_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance8x16 aom_highbd_8_variance8x16_neon
-unsigned int aom_highbd_8_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance8x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance8x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance8x32 aom_highbd_8_variance8x32_neon
-unsigned int aom_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance8x4 aom_highbd_8_variance8x4_neon
-unsigned int aom_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance8x8 aom_highbd_8_variance8x8_neon
unsigned int aom_highbd_avg_4x4_c(const uint8_t *, int p);
@@ -2055,31 +2605,40 @@ unsigned int aom_highbd_avg_8x8_neon(const uint8_t *, int p);
#define aom_highbd_avg_8x8 aom_highbd_avg_8x8_neon
void aom_highbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd);
-#define aom_highbd_blend_a64_d16_mask aom_highbd_blend_a64_d16_mask_c
+void aom_highbd_blend_a64_d16_mask_neon(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd);
+#define aom_highbd_blend_a64_d16_mask aom_highbd_blend_a64_d16_mask_neon
void aom_highbd_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
-#define aom_highbd_blend_a64_hmask aom_highbd_blend_a64_hmask_c
+void aom_highbd_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+#define aom_highbd_blend_a64_hmask aom_highbd_blend_a64_hmask_neon
void aom_highbd_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, int bd);
-#define aom_highbd_blend_a64_mask aom_highbd_blend_a64_mask_c
+void aom_highbd_blend_a64_mask_neon(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, int bd);
+#define aom_highbd_blend_a64_mask aom_highbd_blend_a64_mask_neon
void aom_highbd_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
-#define aom_highbd_blend_a64_vmask aom_highbd_blend_a64_vmask_c
+void aom_highbd_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+#define aom_highbd_blend_a64_vmask aom_highbd_blend_a64_vmask_neon
void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
-#define aom_highbd_comp_avg_pred aom_highbd_comp_avg_pred_c
+void aom_highbd_comp_avg_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
+#define aom_highbd_comp_avg_pred aom_highbd_comp_avg_pred_neon
void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
-#define aom_highbd_comp_mask_pred aom_highbd_comp_mask_pred_c
+void aom_highbd_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
+#define aom_highbd_comp_mask_pred aom_highbd_comp_mask_pred_neon
void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd);
-#define aom_highbd_convolve8_horiz aom_highbd_convolve8_horiz_c
+void aom_highbd_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd);
+#define aom_highbd_convolve8_horiz aom_highbd_convolve8_horiz_neon
void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd);
-#define aom_highbd_convolve8_vert aom_highbd_convolve8_vert_c
+void aom_highbd_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd);
+#define aom_highbd_convolve8_vert aom_highbd_convolve8_vert_neon
void aom_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h);
-#define aom_highbd_convolve_copy aom_highbd_convolve_copy_c
+void aom_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h);
+#define aom_highbd_convolve_copy aom_highbd_convolve_copy_neon
void aom_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
void aom_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
@@ -2386,7 +2945,8 @@ void aom_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, con
#define aom_highbd_dc_top_predictor_8x8 aom_highbd_dc_top_predictor_8x8_neon
void aom_highbd_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param);
-#define aom_highbd_dist_wtd_comp_avg_pred aom_highbd_dist_wtd_comp_avg_pred_c
+void aom_highbd_dist_wtd_comp_avg_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param);
+#define aom_highbd_dist_wtd_comp_avg_pred aom_highbd_dist_wtd_comp_avg_pred_neon
unsigned int aom_highbd_dist_wtd_sad128x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
#define aom_highbd_dist_wtd_sad128x128_avg aom_highbd_dist_wtd_sad128x128_avg_c
@@ -2607,272 +3167,184 @@ void aom_highbd_lpf_vertical_8_dual_neon(uint16_t *s, int pitch, const uint8_t *
#define aom_highbd_lpf_vertical_8_dual aom_highbd_lpf_vertical_8_dual_neon
unsigned int aom_highbd_masked_sad128x128_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad128x128 aom_highbd_masked_sad128x128_c
+unsigned int aom_highbd_masked_sad128x128_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad128x128 aom_highbd_masked_sad128x128_neon
unsigned int aom_highbd_masked_sad128x64_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad128x64 aom_highbd_masked_sad128x64_c
+unsigned int aom_highbd_masked_sad128x64_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad128x64 aom_highbd_masked_sad128x64_neon
unsigned int aom_highbd_masked_sad16x16_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad16x16 aom_highbd_masked_sad16x16_c
+unsigned int aom_highbd_masked_sad16x16_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad16x16 aom_highbd_masked_sad16x16_neon
unsigned int aom_highbd_masked_sad16x32_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad16x32 aom_highbd_masked_sad16x32_c
+unsigned int aom_highbd_masked_sad16x32_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad16x32 aom_highbd_masked_sad16x32_neon
unsigned int aom_highbd_masked_sad16x4_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad16x4 aom_highbd_masked_sad16x4_c
+unsigned int aom_highbd_masked_sad16x4_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad16x4 aom_highbd_masked_sad16x4_neon
unsigned int aom_highbd_masked_sad16x64_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad16x64 aom_highbd_masked_sad16x64_c
+unsigned int aom_highbd_masked_sad16x64_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad16x64 aom_highbd_masked_sad16x64_neon
unsigned int aom_highbd_masked_sad16x8_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad16x8 aom_highbd_masked_sad16x8_c
+unsigned int aom_highbd_masked_sad16x8_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad16x8 aom_highbd_masked_sad16x8_neon
unsigned int aom_highbd_masked_sad32x16_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad32x16 aom_highbd_masked_sad32x16_c
+unsigned int aom_highbd_masked_sad32x16_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad32x16 aom_highbd_masked_sad32x16_neon
unsigned int aom_highbd_masked_sad32x32_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad32x32 aom_highbd_masked_sad32x32_c
+unsigned int aom_highbd_masked_sad32x32_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad32x32 aom_highbd_masked_sad32x32_neon
unsigned int aom_highbd_masked_sad32x64_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad32x64 aom_highbd_masked_sad32x64_c
+unsigned int aom_highbd_masked_sad32x64_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad32x64 aom_highbd_masked_sad32x64_neon
unsigned int aom_highbd_masked_sad32x8_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad32x8 aom_highbd_masked_sad32x8_c
+unsigned int aom_highbd_masked_sad32x8_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad32x8 aom_highbd_masked_sad32x8_neon
unsigned int aom_highbd_masked_sad4x16_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad4x16 aom_highbd_masked_sad4x16_c
+unsigned int aom_highbd_masked_sad4x16_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad4x16 aom_highbd_masked_sad4x16_neon
unsigned int aom_highbd_masked_sad4x4_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad4x4 aom_highbd_masked_sad4x4_c
+unsigned int aom_highbd_masked_sad4x4_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad4x4 aom_highbd_masked_sad4x4_neon
unsigned int aom_highbd_masked_sad4x8_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad4x8 aom_highbd_masked_sad4x8_c
+unsigned int aom_highbd_masked_sad4x8_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad4x8 aom_highbd_masked_sad4x8_neon
unsigned int aom_highbd_masked_sad64x128_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad64x128 aom_highbd_masked_sad64x128_c
+unsigned int aom_highbd_masked_sad64x128_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad64x128 aom_highbd_masked_sad64x128_neon
unsigned int aom_highbd_masked_sad64x16_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad64x16 aom_highbd_masked_sad64x16_c
+unsigned int aom_highbd_masked_sad64x16_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad64x16 aom_highbd_masked_sad64x16_neon
unsigned int aom_highbd_masked_sad64x32_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad64x32 aom_highbd_masked_sad64x32_c
+unsigned int aom_highbd_masked_sad64x32_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad64x32 aom_highbd_masked_sad64x32_neon
unsigned int aom_highbd_masked_sad64x64_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad64x64 aom_highbd_masked_sad64x64_c
+unsigned int aom_highbd_masked_sad64x64_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad64x64 aom_highbd_masked_sad64x64_neon
unsigned int aom_highbd_masked_sad8x16_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad8x16 aom_highbd_masked_sad8x16_c
+unsigned int aom_highbd_masked_sad8x16_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad8x16 aom_highbd_masked_sad8x16_neon
unsigned int aom_highbd_masked_sad8x32_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad8x32 aom_highbd_masked_sad8x32_c
+unsigned int aom_highbd_masked_sad8x32_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad8x32 aom_highbd_masked_sad8x32_neon
unsigned int aom_highbd_masked_sad8x4_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad8x4 aom_highbd_masked_sad8x4_c
+unsigned int aom_highbd_masked_sad8x4_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad8x4 aom_highbd_masked_sad8x4_neon
unsigned int aom_highbd_masked_sad8x8_c(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
-#define aom_highbd_masked_sad8x8 aom_highbd_masked_sad8x8_c
+unsigned int aom_highbd_masked_sad8x8_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask);
+#define aom_highbd_masked_sad8x8 aom_highbd_masked_sad8x8_neon
void aom_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
void aom_highbd_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
#define aom_highbd_minmax_8x8 aom_highbd_minmax_8x8_neon
unsigned int aom_highbd_obmc_sad128x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad128x128 aom_highbd_obmc_sad128x128_c
+unsigned int aom_highbd_obmc_sad128x128_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad128x128 aom_highbd_obmc_sad128x128_neon
unsigned int aom_highbd_obmc_sad128x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad128x64 aom_highbd_obmc_sad128x64_c
+unsigned int aom_highbd_obmc_sad128x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad128x64 aom_highbd_obmc_sad128x64_neon
unsigned int aom_highbd_obmc_sad16x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad16x16 aom_highbd_obmc_sad16x16_c
+unsigned int aom_highbd_obmc_sad16x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad16x16 aom_highbd_obmc_sad16x16_neon
unsigned int aom_highbd_obmc_sad16x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad16x32 aom_highbd_obmc_sad16x32_c
+unsigned int aom_highbd_obmc_sad16x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad16x32 aom_highbd_obmc_sad16x32_neon
unsigned int aom_highbd_obmc_sad16x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad16x4 aom_highbd_obmc_sad16x4_c
+unsigned int aom_highbd_obmc_sad16x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad16x4 aom_highbd_obmc_sad16x4_neon
unsigned int aom_highbd_obmc_sad16x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad16x64 aom_highbd_obmc_sad16x64_c
+unsigned int aom_highbd_obmc_sad16x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad16x64 aom_highbd_obmc_sad16x64_neon
unsigned int aom_highbd_obmc_sad16x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad16x8 aom_highbd_obmc_sad16x8_c
+unsigned int aom_highbd_obmc_sad16x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad16x8 aom_highbd_obmc_sad16x8_neon
unsigned int aom_highbd_obmc_sad32x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad32x16 aom_highbd_obmc_sad32x16_c
+unsigned int aom_highbd_obmc_sad32x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad32x16 aom_highbd_obmc_sad32x16_neon
unsigned int aom_highbd_obmc_sad32x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad32x32 aom_highbd_obmc_sad32x32_c
+unsigned int aom_highbd_obmc_sad32x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad32x32 aom_highbd_obmc_sad32x32_neon
unsigned int aom_highbd_obmc_sad32x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad32x64 aom_highbd_obmc_sad32x64_c
+unsigned int aom_highbd_obmc_sad32x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad32x64 aom_highbd_obmc_sad32x64_neon
unsigned int aom_highbd_obmc_sad32x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad32x8 aom_highbd_obmc_sad32x8_c
+unsigned int aom_highbd_obmc_sad32x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad32x8 aom_highbd_obmc_sad32x8_neon
unsigned int aom_highbd_obmc_sad4x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad4x16 aom_highbd_obmc_sad4x16_c
+unsigned int aom_highbd_obmc_sad4x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad4x16 aom_highbd_obmc_sad4x16_neon
unsigned int aom_highbd_obmc_sad4x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad4x4 aom_highbd_obmc_sad4x4_c
+unsigned int aom_highbd_obmc_sad4x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad4x4 aom_highbd_obmc_sad4x4_neon
unsigned int aom_highbd_obmc_sad4x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad4x8 aom_highbd_obmc_sad4x8_c
+unsigned int aom_highbd_obmc_sad4x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad4x8 aom_highbd_obmc_sad4x8_neon
unsigned int aom_highbd_obmc_sad64x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad64x128 aom_highbd_obmc_sad64x128_c
+unsigned int aom_highbd_obmc_sad64x128_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad64x128 aom_highbd_obmc_sad64x128_neon
unsigned int aom_highbd_obmc_sad64x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad64x16 aom_highbd_obmc_sad64x16_c
+unsigned int aom_highbd_obmc_sad64x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad64x16 aom_highbd_obmc_sad64x16_neon
unsigned int aom_highbd_obmc_sad64x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad64x32 aom_highbd_obmc_sad64x32_c
+unsigned int aom_highbd_obmc_sad64x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad64x32 aom_highbd_obmc_sad64x32_neon
unsigned int aom_highbd_obmc_sad64x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad64x64 aom_highbd_obmc_sad64x64_c
+unsigned int aom_highbd_obmc_sad64x64_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad64x64 aom_highbd_obmc_sad64x64_neon
unsigned int aom_highbd_obmc_sad8x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad8x16 aom_highbd_obmc_sad8x16_c
+unsigned int aom_highbd_obmc_sad8x16_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad8x16 aom_highbd_obmc_sad8x16_neon
unsigned int aom_highbd_obmc_sad8x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad8x32 aom_highbd_obmc_sad8x32_c
+unsigned int aom_highbd_obmc_sad8x32_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad8x32 aom_highbd_obmc_sad8x32_neon
unsigned int aom_highbd_obmc_sad8x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad8x4 aom_highbd_obmc_sad8x4_c
+unsigned int aom_highbd_obmc_sad8x4_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad8x4 aom_highbd_obmc_sad8x4_neon
unsigned int aom_highbd_obmc_sad8x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
-#define aom_highbd_obmc_sad8x8 aom_highbd_obmc_sad8x8_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance128x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance128x128 aom_highbd_obmc_sub_pixel_variance128x128_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance128x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance128x64 aom_highbd_obmc_sub_pixel_variance128x64_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x16 aom_highbd_obmc_sub_pixel_variance16x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x32 aom_highbd_obmc_sub_pixel_variance16x32_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x4 aom_highbd_obmc_sub_pixel_variance16x4_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x64 aom_highbd_obmc_sub_pixel_variance16x64_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x8 aom_highbd_obmc_sub_pixel_variance16x8_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance32x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance32x16 aom_highbd_obmc_sub_pixel_variance32x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance32x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance32x32 aom_highbd_obmc_sub_pixel_variance32x32_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance32x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance32x64 aom_highbd_obmc_sub_pixel_variance32x64_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance32x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance32x8 aom_highbd_obmc_sub_pixel_variance32x8_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance4x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance4x16 aom_highbd_obmc_sub_pixel_variance4x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance4x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance4x4 aom_highbd_obmc_sub_pixel_variance4x4_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance4x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance4x8 aom_highbd_obmc_sub_pixel_variance4x8_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance64x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance64x128 aom_highbd_obmc_sub_pixel_variance64x128_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance64x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance64x16 aom_highbd_obmc_sub_pixel_variance64x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance64x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance64x32 aom_highbd_obmc_sub_pixel_variance64x32_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance64x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance64x64 aom_highbd_obmc_sub_pixel_variance64x64_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance8x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance8x16 aom_highbd_obmc_sub_pixel_variance8x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance8x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance8x32 aom_highbd_obmc_sub_pixel_variance8x32_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance8x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance8x4 aom_highbd_obmc_sub_pixel_variance8x4_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance8x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance8x8 aom_highbd_obmc_sub_pixel_variance8x8_c
-
-unsigned int aom_highbd_obmc_variance128x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance128x128 aom_highbd_obmc_variance128x128_c
-
-unsigned int aom_highbd_obmc_variance128x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance128x64 aom_highbd_obmc_variance128x64_c
-
-unsigned int aom_highbd_obmc_variance16x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x16 aom_highbd_obmc_variance16x16_c
-
-unsigned int aom_highbd_obmc_variance16x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x32 aom_highbd_obmc_variance16x32_c
-
-unsigned int aom_highbd_obmc_variance16x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x4 aom_highbd_obmc_variance16x4_c
-
-unsigned int aom_highbd_obmc_variance16x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x64 aom_highbd_obmc_variance16x64_c
-
-unsigned int aom_highbd_obmc_variance16x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x8 aom_highbd_obmc_variance16x8_c
-
-unsigned int aom_highbd_obmc_variance32x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance32x16 aom_highbd_obmc_variance32x16_c
-
-unsigned int aom_highbd_obmc_variance32x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance32x32 aom_highbd_obmc_variance32x32_c
-
-unsigned int aom_highbd_obmc_variance32x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance32x64 aom_highbd_obmc_variance32x64_c
-
-unsigned int aom_highbd_obmc_variance32x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance32x8 aom_highbd_obmc_variance32x8_c
-
-unsigned int aom_highbd_obmc_variance4x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance4x16 aom_highbd_obmc_variance4x16_c
-
-unsigned int aom_highbd_obmc_variance4x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance4x4 aom_highbd_obmc_variance4x4_c
-
-unsigned int aom_highbd_obmc_variance4x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance4x8 aom_highbd_obmc_variance4x8_c
-
-unsigned int aom_highbd_obmc_variance64x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance64x128 aom_highbd_obmc_variance64x128_c
-
-unsigned int aom_highbd_obmc_variance64x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance64x16 aom_highbd_obmc_variance64x16_c
-
-unsigned int aom_highbd_obmc_variance64x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance64x32 aom_highbd_obmc_variance64x32_c
-
-unsigned int aom_highbd_obmc_variance64x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance64x64 aom_highbd_obmc_variance64x64_c
-
-unsigned int aom_highbd_obmc_variance8x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance8x16 aom_highbd_obmc_variance8x16_c
-
-unsigned int aom_highbd_obmc_variance8x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance8x32 aom_highbd_obmc_variance8x32_c
-
-unsigned int aom_highbd_obmc_variance8x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance8x4 aom_highbd_obmc_variance8x4_c
-
-unsigned int aom_highbd_obmc_variance8x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance8x8 aom_highbd_obmc_variance8x8_c
+unsigned int aom_highbd_obmc_sad8x8_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
+#define aom_highbd_obmc_sad8x8 aom_highbd_obmc_sad8x8_neon
void aom_highbd_paeth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
void aom_highbd_paeth_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
@@ -2979,13 +3451,15 @@ unsigned int aom_highbd_sad128x128_neon(const uint8_t *src_ptr, int src_stride,
#define aom_highbd_sad128x128 aom_highbd_sad128x128_neon
unsigned int aom_highbd_sad128x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad128x128_avg aom_highbd_sad128x128_avg_c
+unsigned int aom_highbd_sad128x128_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad128x128_avg aom_highbd_sad128x128_avg_neon
-void aom_highbd_sad128x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad128x128x3d aom_highbd_sad128x128x3d_c
+void aom_highbd_sad128x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad128x128x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad128x128x3d aom_highbd_sad128x128x3d_neon
-void aom_highbd_sad128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad128x128x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad128x128x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad128x128x4d aom_highbd_sad128x128x4d_neon
unsigned int aom_highbd_sad128x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2993,13 +3467,15 @@ unsigned int aom_highbd_sad128x64_neon(const uint8_t *src_ptr, int src_stride, c
#define aom_highbd_sad128x64 aom_highbd_sad128x64_neon
unsigned int aom_highbd_sad128x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad128x64_avg aom_highbd_sad128x64_avg_c
+unsigned int aom_highbd_sad128x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad128x64_avg aom_highbd_sad128x64_avg_neon
-void aom_highbd_sad128x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad128x64x3d aom_highbd_sad128x64x3d_c
+void aom_highbd_sad128x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad128x64x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad128x64x3d aom_highbd_sad128x64x3d_neon
-void aom_highbd_sad128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad128x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad128x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad128x64x4d aom_highbd_sad128x64x4d_neon
unsigned int aom_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3007,13 +3483,15 @@ unsigned int aom_highbd_sad16x16_neon(const uint8_t *src_ptr, int src_stride, co
#define aom_highbd_sad16x16 aom_highbd_sad16x16_neon
unsigned int aom_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad16x16_avg aom_highbd_sad16x16_avg_c
+unsigned int aom_highbd_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad16x16_avg aom_highbd_sad16x16_avg_neon
-void aom_highbd_sad16x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad16x16x3d aom_highbd_sad16x16x3d_c
+void aom_highbd_sad16x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x16x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad16x16x3d aom_highbd_sad16x16x3d_neon
-void aom_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x16x4d aom_highbd_sad16x16x4d_neon
unsigned int aom_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3021,13 +3499,15 @@ unsigned int aom_highbd_sad16x32_neon(const uint8_t *src_ptr, int src_stride, co
#define aom_highbd_sad16x32 aom_highbd_sad16x32_neon
unsigned int aom_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad16x32_avg aom_highbd_sad16x32_avg_c
+unsigned int aom_highbd_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad16x32_avg aom_highbd_sad16x32_avg_neon
-void aom_highbd_sad16x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad16x32x3d aom_highbd_sad16x32x3d_c
+void aom_highbd_sad16x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x32x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad16x32x3d aom_highbd_sad16x32x3d_neon
-void aom_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x32x4d aom_highbd_sad16x32x4d_neon
unsigned int aom_highbd_sad16x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3035,13 +3515,15 @@ unsigned int aom_highbd_sad16x4_neon(const uint8_t *src_ptr, int src_stride, con
#define aom_highbd_sad16x4 aom_highbd_sad16x4_neon
unsigned int aom_highbd_sad16x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad16x4_avg aom_highbd_sad16x4_avg_c
+unsigned int aom_highbd_sad16x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad16x4_avg aom_highbd_sad16x4_avg_neon
-void aom_highbd_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad16x4x3d aom_highbd_sad16x4x3d_c
+void aom_highbd_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x4x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad16x4x3d aom_highbd_sad16x4x3d_neon
-void aom_highbd_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad16x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x4x4d aom_highbd_sad16x4x4d_neon
unsigned int aom_highbd_sad16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3049,13 +3531,15 @@ unsigned int aom_highbd_sad16x64_neon(const uint8_t *src_ptr, int src_stride, co
#define aom_highbd_sad16x64 aom_highbd_sad16x64_neon
unsigned int aom_highbd_sad16x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad16x64_avg aom_highbd_sad16x64_avg_c
+unsigned int aom_highbd_sad16x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad16x64_avg aom_highbd_sad16x64_avg_neon
-void aom_highbd_sad16x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad16x64x3d aom_highbd_sad16x64x3d_c
+void aom_highbd_sad16x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x64x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad16x64x3d aom_highbd_sad16x64x3d_neon
-void aom_highbd_sad16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad16x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x64x4d aom_highbd_sad16x64x4d_neon
unsigned int aom_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3063,13 +3547,15 @@ unsigned int aom_highbd_sad16x8_neon(const uint8_t *src_ptr, int src_stride, con
#define aom_highbd_sad16x8 aom_highbd_sad16x8_neon
unsigned int aom_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad16x8_avg aom_highbd_sad16x8_avg_c
+unsigned int aom_highbd_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad16x8_avg aom_highbd_sad16x8_avg_neon
-void aom_highbd_sad16x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad16x8x3d aom_highbd_sad16x8x3d_c
+void aom_highbd_sad16x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x8x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad16x8x3d aom_highbd_sad16x8x3d_neon
-void aom_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x8x4d aom_highbd_sad16x8x4d_neon
unsigned int aom_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3077,13 +3563,15 @@ unsigned int aom_highbd_sad32x16_neon(const uint8_t *src_ptr, int src_stride, co
#define aom_highbd_sad32x16 aom_highbd_sad32x16_neon
unsigned int aom_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad32x16_avg aom_highbd_sad32x16_avg_c
+unsigned int aom_highbd_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad32x16_avg aom_highbd_sad32x16_avg_neon
-void aom_highbd_sad32x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad32x16x3d aom_highbd_sad32x16x3d_c
+void aom_highbd_sad32x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x16x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad32x16x3d aom_highbd_sad32x16x3d_neon
-void aom_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x16x4d aom_highbd_sad32x16x4d_neon
unsigned int aom_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3091,13 +3579,15 @@ unsigned int aom_highbd_sad32x32_neon(const uint8_t *src_ptr, int src_stride, co
#define aom_highbd_sad32x32 aom_highbd_sad32x32_neon
unsigned int aom_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad32x32_avg aom_highbd_sad32x32_avg_c
+unsigned int aom_highbd_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad32x32_avg aom_highbd_sad32x32_avg_neon
-void aom_highbd_sad32x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad32x32x3d aom_highbd_sad32x32x3d_c
+void aom_highbd_sad32x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x32x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad32x32x3d aom_highbd_sad32x32x3d_neon
-void aom_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x32x4d aom_highbd_sad32x32x4d_neon
unsigned int aom_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3105,13 +3595,15 @@ unsigned int aom_highbd_sad32x64_neon(const uint8_t *src_ptr, int src_stride, co
#define aom_highbd_sad32x64 aom_highbd_sad32x64_neon
unsigned int aom_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad32x64_avg aom_highbd_sad32x64_avg_c
+unsigned int aom_highbd_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad32x64_avg aom_highbd_sad32x64_avg_neon
-void aom_highbd_sad32x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad32x64x3d aom_highbd_sad32x64x3d_c
+void aom_highbd_sad32x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x64x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad32x64x3d aom_highbd_sad32x64x3d_neon
-void aom_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x64x4d aom_highbd_sad32x64x4d_neon
unsigned int aom_highbd_sad32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3119,13 +3611,15 @@ unsigned int aom_highbd_sad32x8_neon(const uint8_t *src_ptr, int src_stride, con
#define aom_highbd_sad32x8 aom_highbd_sad32x8_neon
unsigned int aom_highbd_sad32x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad32x8_avg aom_highbd_sad32x8_avg_c
+unsigned int aom_highbd_sad32x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad32x8_avg aom_highbd_sad32x8_avg_neon
-void aom_highbd_sad32x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad32x8x3d aom_highbd_sad32x8x3d_c
+void aom_highbd_sad32x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x8x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad32x8x3d aom_highbd_sad32x8x3d_neon
-void aom_highbd_sad32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad32x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x8x4d aom_highbd_sad32x8x4d_neon
unsigned int aom_highbd_sad4x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3133,13 +3627,15 @@ unsigned int aom_highbd_sad4x16_neon(const uint8_t *src_ptr, int src_stride, con
#define aom_highbd_sad4x16 aom_highbd_sad4x16_neon
unsigned int aom_highbd_sad4x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad4x16_avg aom_highbd_sad4x16_avg_c
+unsigned int aom_highbd_sad4x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad4x16_avg aom_highbd_sad4x16_avg_neon
-void aom_highbd_sad4x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad4x16x3d aom_highbd_sad4x16x3d_c
+void aom_highbd_sad4x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad4x16x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad4x16x3d aom_highbd_sad4x16x3d_neon
-void aom_highbd_sad4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad4x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad4x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x16x4d aom_highbd_sad4x16x4d_neon
unsigned int aom_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3147,13 +3643,15 @@ unsigned int aom_highbd_sad4x4_neon(const uint8_t *src_ptr, int src_stride, cons
#define aom_highbd_sad4x4 aom_highbd_sad4x4_neon
unsigned int aom_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad4x4_avg aom_highbd_sad4x4_avg_c
+unsigned int aom_highbd_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad4x4_avg aom_highbd_sad4x4_avg_neon
-void aom_highbd_sad4x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad4x4x3d aom_highbd_sad4x4x3d_c
+void aom_highbd_sad4x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad4x4x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad4x4x3d aom_highbd_sad4x4x3d_neon
-void aom_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x4x4d aom_highbd_sad4x4x4d_neon
unsigned int aom_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3161,13 +3659,15 @@ unsigned int aom_highbd_sad4x8_neon(const uint8_t *src_ptr, int src_stride, cons
#define aom_highbd_sad4x8 aom_highbd_sad4x8_neon
unsigned int aom_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad4x8_avg aom_highbd_sad4x8_avg_c
+unsigned int aom_highbd_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad4x8_avg aom_highbd_sad4x8_avg_neon
-void aom_highbd_sad4x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad4x8x3d aom_highbd_sad4x8x3d_c
+void aom_highbd_sad4x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad4x8x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad4x8x3d aom_highbd_sad4x8x3d_neon
-void aom_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x8x4d aom_highbd_sad4x8x4d_neon
unsigned int aom_highbd_sad64x128_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3175,13 +3675,15 @@ unsigned int aom_highbd_sad64x128_neon(const uint8_t *src_ptr, int src_stride, c
#define aom_highbd_sad64x128 aom_highbd_sad64x128_neon
unsigned int aom_highbd_sad64x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad64x128_avg aom_highbd_sad64x128_avg_c
+unsigned int aom_highbd_sad64x128_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad64x128_avg aom_highbd_sad64x128_avg_neon
-void aom_highbd_sad64x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad64x128x3d aom_highbd_sad64x128x3d_c
+void aom_highbd_sad64x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x128x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad64x128x3d aom_highbd_sad64x128x3d_neon
-void aom_highbd_sad64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad64x128x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x128x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x128x4d aom_highbd_sad64x128x4d_neon
unsigned int aom_highbd_sad64x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3189,13 +3691,15 @@ unsigned int aom_highbd_sad64x16_neon(const uint8_t *src_ptr, int src_stride, co
#define aom_highbd_sad64x16 aom_highbd_sad64x16_neon
unsigned int aom_highbd_sad64x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad64x16_avg aom_highbd_sad64x16_avg_c
+unsigned int aom_highbd_sad64x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad64x16_avg aom_highbd_sad64x16_avg_neon
-void aom_highbd_sad64x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad64x16x3d aom_highbd_sad64x16x3d_c
+void aom_highbd_sad64x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x16x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad64x16x3d aom_highbd_sad64x16x3d_neon
-void aom_highbd_sad64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad64x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x16x4d aom_highbd_sad64x16x4d_neon
unsigned int aom_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3203,13 +3707,15 @@ unsigned int aom_highbd_sad64x32_neon(const uint8_t *src_ptr, int src_stride, co
#define aom_highbd_sad64x32 aom_highbd_sad64x32_neon
unsigned int aom_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad64x32_avg aom_highbd_sad64x32_avg_c
+unsigned int aom_highbd_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad64x32_avg aom_highbd_sad64x32_avg_neon
-void aom_highbd_sad64x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad64x32x3d aom_highbd_sad64x32x3d_c
+void aom_highbd_sad64x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x32x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad64x32x3d aom_highbd_sad64x32x3d_neon
-void aom_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x32x4d aom_highbd_sad64x32x4d_neon
unsigned int aom_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3217,13 +3723,15 @@ unsigned int aom_highbd_sad64x64_neon(const uint8_t *src_ptr, int src_stride, co
#define aom_highbd_sad64x64 aom_highbd_sad64x64_neon
unsigned int aom_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad64x64_avg aom_highbd_sad64x64_avg_c
+unsigned int aom_highbd_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad64x64_avg aom_highbd_sad64x64_avg_neon
-void aom_highbd_sad64x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad64x64x3d aom_highbd_sad64x64x3d_c
+void aom_highbd_sad64x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x64x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad64x64x3d aom_highbd_sad64x64x3d_neon
-void aom_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x64x4d aom_highbd_sad64x64x4d_neon
unsigned int aom_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3231,13 +3739,15 @@ unsigned int aom_highbd_sad8x16_neon(const uint8_t *src_ptr, int src_stride, con
#define aom_highbd_sad8x16 aom_highbd_sad8x16_neon
unsigned int aom_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad8x16_avg aom_highbd_sad8x16_avg_c
+unsigned int aom_highbd_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad8x16_avg aom_highbd_sad8x16_avg_neon
-void aom_highbd_sad8x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad8x16x3d aom_highbd_sad8x16x3d_c
+void aom_highbd_sad8x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x16x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad8x16x3d aom_highbd_sad8x16x3d_neon
-void aom_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x16x4d aom_highbd_sad8x16x4d_neon
unsigned int aom_highbd_sad8x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3245,13 +3755,15 @@ unsigned int aom_highbd_sad8x32_neon(const uint8_t *src_ptr, int src_stride, con
#define aom_highbd_sad8x32 aom_highbd_sad8x32_neon
unsigned int aom_highbd_sad8x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad8x32_avg aom_highbd_sad8x32_avg_c
+unsigned int aom_highbd_sad8x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad8x32_avg aom_highbd_sad8x32_avg_neon
-void aom_highbd_sad8x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad8x32x3d aom_highbd_sad8x32x3d_c
+void aom_highbd_sad8x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x32x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad8x32x3d aom_highbd_sad8x32x3d_neon
-void aom_highbd_sad8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad8x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x32x4d aom_highbd_sad8x32x4d_neon
unsigned int aom_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3259,13 +3771,15 @@ unsigned int aom_highbd_sad8x4_neon(const uint8_t *src_ptr, int src_stride, cons
#define aom_highbd_sad8x4 aom_highbd_sad8x4_neon
unsigned int aom_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad8x4_avg aom_highbd_sad8x4_avg_c
+unsigned int aom_highbd_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad8x4_avg aom_highbd_sad8x4_avg_neon
-void aom_highbd_sad8x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad8x4x3d aom_highbd_sad8x4x3d_c
+void aom_highbd_sad8x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x4x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad8x4x3d aom_highbd_sad8x4x3d_neon
-void aom_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x4x4d aom_highbd_sad8x4x4d_neon
unsigned int aom_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3273,189 +3787,191 @@ unsigned int aom_highbd_sad8x8_neon(const uint8_t *src_ptr, int src_stride, cons
#define aom_highbd_sad8x8 aom_highbd_sad8x8_neon
unsigned int aom_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define aom_highbd_sad8x8_avg aom_highbd_sad8x8_avg_c
+unsigned int aom_highbd_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define aom_highbd_sad8x8_avg aom_highbd_sad8x8_avg_neon
-void aom_highbd_sad8x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define aom_highbd_sad8x8x3d aom_highbd_sad8x8x3d_c
+void aom_highbd_sad8x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x8x3d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+#define aom_highbd_sad8x8x3d aom_highbd_sad8x8x3d_neon
-void aom_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x8x4d aom_highbd_sad8x8x4d_neon
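The x3d/x4d prototypes in this hunk series replace the old open array parameters (ref_ptr[], uint32_t *sad_array) with explicit four-element arrays, documenting that each call scores one source block against several reference candidates at once (the x3d variants share the same array shape but fill only three results). A minimal plain-C sketch of that contract follows; the name is invented and raw uint16_t pixels stand in for libaom's high-bitdepth pointer convention, so this is illustrative only, not libaom's implementation.

#include <stdint.h>
#include <stdlib.h>

/* Illustrative only: one call compares a single WxH source block against
 * four reference candidates and writes four SAD totals, mirroring the
 * x4d signature shape above. */
static void sad_wxh_x4d_sketch(const uint16_t *src, int src_stride,
                               const uint16_t *const ref[4], int ref_stride,
                               uint32_t sad_array[4], int w, int h) {
  for (int i = 0; i < 4; ++i) {
    uint32_t acc = 0;
    for (int y = 0; y < h; ++y) {
      for (int x = 0; x < w; ++x) {
        acc += (uint32_t)abs(src[y * src_stride + x] -
                             ref[i][y * ref_stride + x]);
      }
    }
    sad_array[i] = acc;
  }
}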
unsigned int aom_highbd_sad_skip_128x128_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_128x128_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_128x128 aom_highbd_sad_skip_128x128_neon
-void aom_highbd_sad_skip_128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_128x128x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_128x128x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_128x128x4d aom_highbd_sad_skip_128x128x4d_neon
unsigned int aom_highbd_sad_skip_128x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_128x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_128x64 aom_highbd_sad_skip_128x64_neon
-void aom_highbd_sad_skip_128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_128x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_128x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_128x64x4d aom_highbd_sad_skip_128x64x4d_neon
unsigned int aom_highbd_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x16 aom_highbd_sad_skip_16x16_neon
-void aom_highbd_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x16x4d aom_highbd_sad_skip_16x16x4d_neon
unsigned int aom_highbd_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x32 aom_highbd_sad_skip_16x32_neon
-void aom_highbd_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x32x4d aom_highbd_sad_skip_16x32x4d_neon
unsigned int aom_highbd_sad_skip_16x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_16x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x4 aom_highbd_sad_skip_16x4_neon
-void aom_highbd_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_16x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_16x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x4x4d aom_highbd_sad_skip_16x4x4d_neon
unsigned int aom_highbd_sad_skip_16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_16x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x64 aom_highbd_sad_skip_16x64_neon
-void aom_highbd_sad_skip_16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_16x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_16x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x64x4d aom_highbd_sad_skip_16x64x4d_neon
unsigned int aom_highbd_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x8 aom_highbd_sad_skip_16x8_neon
-void aom_highbd_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x8x4d aom_highbd_sad_skip_16x8x4d_neon
unsigned int aom_highbd_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_32x16 aom_highbd_sad_skip_32x16_neon
-void aom_highbd_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_32x16x4d aom_highbd_sad_skip_32x16x4d_neon
unsigned int aom_highbd_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_32x32 aom_highbd_sad_skip_32x32_neon
-void aom_highbd_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_32x32x4d aom_highbd_sad_skip_32x32x4d_neon
unsigned int aom_highbd_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_32x64 aom_highbd_sad_skip_32x64_neon
-void aom_highbd_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_32x64x4d aom_highbd_sad_skip_32x64x4d_neon
unsigned int aom_highbd_sad_skip_32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_32x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_32x8 aom_highbd_sad_skip_32x8_neon
-void aom_highbd_sad_skip_32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_32x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_32x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_32x8x4d aom_highbd_sad_skip_32x8x4d_neon
unsigned int aom_highbd_sad_skip_4x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_4x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_4x16 aom_highbd_sad_skip_4x16_neon
-void aom_highbd_sad_skip_4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_4x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_4x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_4x16x4d aom_highbd_sad_skip_4x16x4d_neon
unsigned int aom_highbd_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_4x4 aom_highbd_sad_skip_4x4_neon
-void aom_highbd_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_4x4x4d aom_highbd_sad_skip_4x4x4d_neon
unsigned int aom_highbd_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_4x8 aom_highbd_sad_skip_4x8_neon
-void aom_highbd_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_4x8x4d aom_highbd_sad_skip_4x8x4d_neon
unsigned int aom_highbd_sad_skip_64x128_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_64x128_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_64x128 aom_highbd_sad_skip_64x128_neon
-void aom_highbd_sad_skip_64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_64x128x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_64x128x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_64x128x4d aom_highbd_sad_skip_64x128x4d_neon
unsigned int aom_highbd_sad_skip_64x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_64x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_64x16 aom_highbd_sad_skip_64x16_neon
-void aom_highbd_sad_skip_64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_64x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_64x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_64x16x4d aom_highbd_sad_skip_64x16x4d_neon
unsigned int aom_highbd_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_64x32 aom_highbd_sad_skip_64x32_neon
-void aom_highbd_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_64x32x4d aom_highbd_sad_skip_64x32x4d_neon
unsigned int aom_highbd_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_64x64 aom_highbd_sad_skip_64x64_neon
-void aom_highbd_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_64x64x4d aom_highbd_sad_skip_64x64x4d_neon
unsigned int aom_highbd_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_8x16 aom_highbd_sad_skip_8x16_neon
-void aom_highbd_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_8x16x4d aom_highbd_sad_skip_8x16x4d_neon
unsigned int aom_highbd_sad_skip_8x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_8x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_8x32 aom_highbd_sad_skip_8x32_neon
-void aom_highbd_sad_skip_8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_8x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_8x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_8x32x4d aom_highbd_sad_skip_8x32x4d_neon
unsigned int aom_highbd_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_8x4 aom_highbd_sad_skip_8x4_neon
-void aom_highbd_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_8x4x4d aom_highbd_sad_skip_8x4x4d_neon
unsigned int aom_highbd_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_8x8 aom_highbd_sad_skip_8x8_neon
-void aom_highbd_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_8x8x4d aom_highbd_sad_skip_8x8x4d_neon
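The sad_skip_* entries bound above differ from plain SAD by vertical subsampling: the kernel accumulates absolute differences over every other row and doubles the total, a cheaper approximation used during coarse motion search. A hedged stand-alone sketch (invented name, not libaom's code):

#include <stdint.h>
#include <stdlib.h>

/* Illustrative only: SAD over even rows, scaled back to full-height units. */
static uint32_t sad_skip_wxh_sketch(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    int w, int h) {
  uint32_t acc = 0;
  for (int y = 0; y < h; y += 2) {
    for (int x = 0; x < w; ++x) {
      acc += (uint32_t)abs(src[y * src_stride + x] - ref[y * ref_stride + x]);
    }
  }
  return 2 * acc;
}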
void aom_highbd_smooth_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
@@ -4181,14 +4697,16 @@ unsigned int aom_mse8x8_neon(const uint8_t *src_ptr, int source_stride, const u
#define aom_mse8x8 aom_mse8x8_neon
uint64_t aom_mse_16xh_16bit_c(uint8_t *dst, int dstride,uint16_t *src, int w, int h);
-#define aom_mse_16xh_16bit aom_mse_16xh_16bit_c
+uint64_t aom_mse_16xh_16bit_neon(uint8_t *dst, int dstride,uint16_t *src, int w, int h);
+#define aom_mse_16xh_16bit aom_mse_16xh_16bit_neon
uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride,uint16_t *src, int sstride, int w, int h);
uint64_t aom_mse_wxh_16bit_neon(uint8_t *dst, int dstride,uint16_t *src, int sstride, int w, int h);
#define aom_mse_wxh_16bit aom_mse_wxh_16bit_neon
uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride,uint16_t *src, int sstride, int w, int h);
-#define aom_mse_wxh_16bit_highbd aom_mse_wxh_16bit_highbd_c
+uint64_t aom_mse_wxh_16bit_highbd_neon(uint16_t *dst, int dstride,uint16_t *src, int sstride, int w, int h);
+#define aom_mse_wxh_16bit_highbd aom_mse_wxh_16bit_highbd_neon
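Each hunk in this header follows one rebinding pattern: where a symbol previously fell back to its _c version, a _neon prototype is declared and the generic name is re-pointed at it. In a static arm64 config like this one the choice is made by the preprocessor at build time, so callers pay no function-pointer indirection. A minimal sketch of the pattern, with invented names:

#include <stdio.h>

/* Placeholder bodies: a portable fallback and a SIMD-flavored stand-in. */
static unsigned int my_kernel_c(int x) { return (unsigned int)x * 2u; }
static unsigned int my_kernel_neon(int x) { return (unsigned int)x * 2u; }

/* Static config: bind the generic name directly to the chosen variant. */
#define my_kernel my_kernel_neon

int main(void) {
  /* Both resolve at compile time; my_kernel() is a direct call. */
  printf("%u %u\n", my_kernel_c(21), my_kernel(21));
  return 0;
}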
unsigned int aom_obmc_sad128x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
unsigned int aom_obmc_sad128x128_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
@@ -5630,12 +6148,6 @@ unsigned int aom_variance16x8_c(const uint8_t *src_ptr, int source_stride, const
unsigned int aom_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance16x8 aom_variance16x8_neon
-unsigned int aom_variance2x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance2x2 aom_variance2x2_c
-
-unsigned int aom_variance2x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance2x4 aom_variance2x4_c
-
unsigned int aom_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int aom_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance32x16 aom_variance32x16_neon
@@ -5656,9 +6168,6 @@ unsigned int aom_variance4x16_c(const uint8_t *src_ptr, int source_stride, const
unsigned int aom_variance4x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance4x16 aom_variance4x16_neon
-unsigned int aom_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance4x2 aom_variance4x2_c
-
unsigned int aom_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int aom_variance4x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance4x4 aom_variance4x4_neon
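The pattern repeated throughout this generated header: every kernel gets a portable `_c` reference implementation, optionally a `_neon` implementation, and a `#define` that binds the public name to the best variant available in this prebuilt arm64 config. A minimal sketch of the convention in C, using a hypothetical kernel named `foo` (illustrative, not from the tree):

/* Each RTCD entry declares all variants, then binds the public name.
 * "foo" is hypothetical; real entries (e.g. aom_mse_wxh_16bit above)
 * follow exactly this shape. */
#include <stdint.h>

uint64_t foo_c(const uint8_t *src, int stride);    /* portable reference */
uint64_t foo_neon(const uint8_t *src, int stride); /* Arm NEON variant   */

/* arm64 config: NEON is always present, so bind statically. */
#define foo foo_neon

/* Callers always use the public name, so retargeting _c -> _neon in a
 * snapshot like this one requires no caller changes:
 *   uint64_t e = foo(buf, stride);
 */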
diff --git a/config/arm64/config/av1_rtcd.h b/config/arm64/config/av1_rtcd.h
index 1a3fa19ca..338b4a087 100644
--- a/config/arm64/config/av1_rtcd.h
+++ b/config/arm64/config/av1_rtcd.h
@@ -90,23 +90,37 @@ void aom_dist_wtd_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const struct AV1Com
const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search);
-#define aom_dist_wtd_comp_avg_upsampled_pred aom_dist_wtd_comp_avg_upsampled_pred_c
+void aom_dist_wtd_comp_avg_upsampled_pred_neon(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search);
+#define aom_dist_wtd_comp_avg_upsampled_pred aom_dist_wtd_comp_avg_upsampled_pred_neon
void aom_highbd_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search);
-#define aom_highbd_comp_avg_upsampled_pred aom_highbd_comp_avg_upsampled_pred_c
+void aom_highbd_comp_avg_upsampled_pred_neon(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search);
+#define aom_highbd_comp_avg_upsampled_pred aom_highbd_comp_avg_upsampled_pred_neon
void aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search);
-#define aom_highbd_dist_wtd_comp_avg_upsampled_pred aom_highbd_dist_wtd_comp_avg_upsampled_pred_c
+void aom_highbd_dist_wtd_comp_avg_upsampled_pred_neon(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search);
+#define aom_highbd_dist_wtd_comp_avg_upsampled_pred aom_highbd_dist_wtd_comp_avg_upsampled_pred_neon
void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3,
int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search);
-#define aom_highbd_upsampled_pred aom_highbd_upsampled_pred_c
+void aom_highbd_upsampled_pred_neon(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search);
+#define aom_highbd_upsampled_pred aom_highbd_upsampled_pred_neon
void aom_quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale);
void aom_quantize_b_helper_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale);
@@ -120,8 +134,8 @@ void aom_upsampled_pred_neon(MACROBLOCKD *xd, const struct AV1Common *const cm,
int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search);
#define aom_upsampled_pred aom_upsampled_pred_neon
-void av1_apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
-void av1_apply_selfguided_restoration_neon(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+int av1_apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+int av1_apply_selfguided_restoration_neon(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
#define av1_apply_selfguided_restoration av1_apply_selfguided_restoration_neon
void av1_apply_temporal_filter_c(const struct yv12_buffer_config *frame_to_filter, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count);
@@ -137,17 +151,16 @@ int64_t av1_block_error_lp_neon(const int16_t *coeff, const int16_t *dqcoeff, in
#define av1_block_error_lp av1_block_error_lp_neon
void av1_build_compound_diffwtd_mask_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
-#define av1_build_compound_diffwtd_mask av1_build_compound_diffwtd_mask_c
+void av1_build_compound_diffwtd_mask_neon(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+#define av1_build_compound_diffwtd_mask av1_build_compound_diffwtd_mask_neon
void av1_build_compound_diffwtd_mask_d16_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
void av1_build_compound_diffwtd_mask_d16_neon(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
#define av1_build_compound_diffwtd_mask_d16 av1_build_compound_diffwtd_mask_d16_neon
void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
-#define av1_build_compound_diffwtd_mask_highbd av1_build_compound_diffwtd_mask_highbd_c
-
-int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride);
-#define av1_calc_frame_error av1_calc_frame_error_c
+void av1_build_compound_diffwtd_mask_highbd_neon(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+#define av1_build_compound_diffwtd_mask_highbd av1_build_compound_diffwtd_mask_highbd_neon
void av1_calc_indices_dim1_c(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
void av1_calc_indices_dim1_neon(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
@@ -157,35 +170,39 @@ void av1_calc_indices_dim2_c(const int16_t *data, const int16_t *centroids, uint
void av1_calc_indices_dim2_neon(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
#define av1_calc_indices_dim2 av1_calc_indices_dim2_neon
-void av1_calc_proj_params_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
-#define av1_calc_proj_params av1_calc_proj_params_c
+void av1_calc_proj_params_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
+void av1_calc_proj_params_neon(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
+#define av1_calc_proj_params av1_calc_proj_params_neon
-void av1_calc_proj_params_high_bd_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
-#define av1_calc_proj_params_high_bd av1_calc_proj_params_high_bd_c
+void av1_calc_proj_params_high_bd_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
+void av1_calc_proj_params_high_bd_neon(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
+#define av1_calc_proj_params_high_bd av1_calc_proj_params_high_bd_neon
-void av1_cnn_activate_c( float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation);
+void av1_cnn_activate_c(float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation);
#define av1_cnn_activate av1_cnn_activate_c
-void av1_cnn_add_c( float **input, int channels, int width, int height, int stride, const float **add);
+void av1_cnn_add_c(float **input, int channels, int width, int height, int stride, const float **add);
#define av1_cnn_add av1_cnn_add_c
void av1_cnn_batchnorm_c(float **image, int channels, int width, int height, int stride, const float *gamma, const float *beta, const float *mean, const float *std);
#define av1_cnn_batchnorm av1_cnn_batchnorm_c
-void av1_cnn_convolve_no_maxpool_padding_valid_c( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int cstep, int channel_step);
+void av1_cnn_convolve_no_maxpool_padding_valid_c(const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int cstep, int channel_step);
#define av1_cnn_convolve_no_maxpool_padding_valid av1_cnn_convolve_no_maxpool_padding_valid_c
-void av1_cnn_deconvolve_c( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride);
+void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride);
#define av1_cnn_deconvolve av1_cnn_deconvolve_c
-bool av1_cnn_predict_c( const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct);
+bool av1_cnn_predict_c(const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct);
#define av1_cnn_predict av1_cnn_predict_c
void av1_compute_stats_c(int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats);
-#define av1_compute_stats av1_compute_stats_c
+void av1_compute_stats_neon(int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats);
+#define av1_compute_stats av1_compute_stats_neon
void av1_compute_stats_highbd_c(int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth);
-#define av1_compute_stats_highbd av1_compute_stats_highbd_c
+void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth);
+#define av1_compute_stats_highbd av1_compute_stats_highbd_neon
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params);
#define av1_convolve_2d_scale av1_convolve_2d_scale_c
@@ -194,6 +211,10 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int
void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
#define av1_convolve_2d_sr av1_convolve_2d_sr_neon
+void av1_convolve_2d_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_intrabc_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+#define av1_convolve_2d_sr_intrabc av1_convolve_2d_sr_intrabc_neon
+
void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
#define av1_convolve_horiz_rs av1_convolve_horiz_rs_c
@@ -201,10 +222,18 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int d
void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params);
#define av1_convolve_x_sr av1_convolve_x_sr_neon
+void av1_convolve_x_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params);
+void av1_convolve_x_sr_intrabc_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params);
+#define av1_convolve_x_sr_intrabc av1_convolve_x_sr_intrabc_neon
+
void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
#define av1_convolve_y_sr av1_convolve_y_sr_neon
+void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
+void av1_convolve_y_sr_intrabc_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
+#define av1_convolve_y_sr_intrabc av1_convolve_y_sr_intrabc_neon
+
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
#define av1_dist_wtd_convolve_2d av1_dist_wtd_convolve_2d_neon
@@ -234,13 +263,12 @@ void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh, c
#define av1_dr_prediction_z3 av1_dr_prediction_z3_neon
double av1_estimate_noise_from_single_plane_c(const uint8_t *src, int height, int width, int stride, int edge_thresh);
-#define av1_estimate_noise_from_single_plane av1_estimate_noise_from_single_plane_c
+double av1_estimate_noise_from_single_plane_neon(const uint8_t *src, int height, int width, int stride, int edge_thresh);
+#define av1_estimate_noise_from_single_plane av1_estimate_noise_from_single_plane_neon
void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
-#define av1_filter_intra_edge av1_filter_intra_edge_c
-
-void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
-#define av1_filter_intra_edge_high av1_filter_intra_edge_high_c
+void av1_filter_intra_edge_neon(uint8_t *p, int sz, int strength);
+#define av1_filter_intra_edge av1_filter_intra_edge_neon
void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
void av1_filter_intra_predictor_neon(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
@@ -329,8 +357,8 @@ void av1_fwht4x4_neon(const int16_t *input, tran_low_t *output, int stride);
uint32_t av1_get_crc32c_value_c(void *crc_calculator, uint8_t *p, size_t length);
#define av1_get_crc32c_value av1_get_crc32c_value_c
-void av1_get_horver_correlation_full_c( const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr);
-void av1_get_horver_correlation_full_neon( const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr);
+void av1_get_horver_correlation_full_c(const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr);
+void av1_get_horver_correlation_full_neon(const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr);
#define av1_get_horver_correlation_full av1_get_horver_correlation_full_neon
void av1_get_nz_map_contexts_c(const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts);
@@ -338,10 +366,12 @@ void av1_get_nz_map_contexts_neon(const uint8_t *const levels, const int16_t *co
#define av1_get_nz_map_contexts av1_get_nz_map_contexts_neon
void av1_highbd_apply_temporal_filter_c(const struct yv12_buffer_config *frame_to_filter, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count);
-#define av1_highbd_apply_temporal_filter av1_highbd_apply_temporal_filter_c
+void av1_highbd_apply_temporal_filter_neon(const struct yv12_buffer_config *frame_to_filter, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count);
+#define av1_highbd_apply_temporal_filter av1_highbd_apply_temporal_filter_neon
int64_t av1_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
-#define av1_highbd_block_error av1_highbd_block_error_c
+int64_t av1_highbd_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define av1_highbd_block_error av1_highbd_block_error_neon
void av1_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define av1_highbd_convolve8 av1_highbd_convolve8_c
@@ -360,6 +390,10 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *
void av1_highbd_convolve_2d_sr_neon(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
#define av1_highbd_convolve_2d_sr av1_highbd_convolve_2d_sr_neon
+void av1_highbd_convolve_2d_sr_intrabc_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_intrabc_neon(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_2d_sr_intrabc av1_highbd_convolve_2d_sr_intrabc_neon
+
void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
@@ -374,10 +408,18 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *d
void av1_highbd_convolve_x_sr_neon(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd);
#define av1_highbd_convolve_x_sr av1_highbd_convolve_x_sr_neon
+void av1_highbd_convolve_x_sr_intrabc_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_intrabc_neon(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_x_sr_intrabc av1_highbd_convolve_x_sr_intrabc_neon
+
void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd);
void av1_highbd_convolve_y_sr_neon(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd);
#define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_neon
+void av1_highbd_convolve_y_sr_intrabc_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd);
+void av1_highbd_convolve_y_sr_intrabc_neon(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd);
+#define av1_highbd_convolve_y_sr_intrabc av1_highbd_convolve_y_sr_intrabc_neon
+
void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
void av1_highbd_dist_wtd_convolve_2d_neon(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
#define av1_highbd_dist_wtd_convolve_2d av1_highbd_dist_wtd_convolve_2d_neon
@@ -404,7 +446,12 @@ void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int
#define av1_highbd_dr_prediction_z3 av1_highbd_dr_prediction_z3_c
double av1_highbd_estimate_noise_from_single_plane_c(const uint16_t *src, int height, int width, int stride, int bit_depth, int edge_thresh);
-#define av1_highbd_estimate_noise_from_single_plane av1_highbd_estimate_noise_from_single_plane_c
+double av1_highbd_estimate_noise_from_single_plane_neon(const uint16_t *src, int height, int width, int stride, int bit_depth, int edge_thresh);
+#define av1_highbd_estimate_noise_from_single_plane av1_highbd_estimate_noise_from_single_plane_neon
+
+void av1_highbd_filter_intra_edge_c(uint16_t *p, int sz, int strength);
+void av1_highbd_filter_intra_edge_neon(uint16_t *p, int sz, int strength);
+#define av1_highbd_filter_intra_edge av1_highbd_filter_intra_edge_neon
void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
void av1_highbd_inv_txfm_add_neon(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
@@ -484,18 +531,24 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
-int64_t av1_highbd_pixel_proj_error_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
+int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
#define av1_highbd_pixel_proj_error av1_highbd_pixel_proj_error_c
void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
void av1_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
#define av1_highbd_quantize_fp av1_highbd_quantize_fp_neon
+void av1_highbd_upsample_intra_edge_c(uint16_t *p, int sz, int bd);
+void av1_highbd_upsample_intra_edge_neon(uint16_t *p, int sz, int bd);
+#define av1_highbd_upsample_intra_edge av1_highbd_upsample_intra_edge_neon
+
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
-#define av1_highbd_warp_affine av1_highbd_warp_affine_c
+void av1_highbd_warp_affine_neon(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+#define av1_highbd_warp_affine av1_highbd_warp_affine_neon
-void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd);
-#define av1_highbd_wiener_convolve_add_src av1_highbd_wiener_convolve_add_src_c
+void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params, int bd);
+void av1_highbd_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params, int bd);
+#define av1_highbd_wiener_convolve_add_src av1_highbd_wiener_convolve_add_src_neon
void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
#define av1_inv_txfm2d_add_16x16 av1_inv_txfm2d_add_16x16_c
@@ -580,15 +633,15 @@ void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff, int diff_s
void av1_lowbd_fwd_txfm_neon(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
#define av1_lowbd_fwd_txfm av1_lowbd_fwd_txfm_neon
-int64_t av1_lowbd_pixel_proj_error_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
-int64_t av1_lowbd_pixel_proj_error_neon( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
+int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
+int64_t av1_lowbd_pixel_proj_error_neon(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
#define av1_lowbd_pixel_proj_error av1_lowbd_pixel_proj_error_neon
-void av1_nn_fast_softmax_16_c( const float *input_nodes, float *output);
+void av1_nn_fast_softmax_16_c(const float *input_nodes, float *output);
#define av1_nn_fast_softmax_16 av1_nn_fast_softmax_16_c
-void av1_nn_predict_c( const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output);
-void av1_nn_predict_neon( const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output);
+void av1_nn_predict_c(const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output);
+void av1_nn_predict_neon(const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output);
#define av1_nn_predict av1_nn_predict_neon
void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale);
@@ -631,27 +684,27 @@ void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width, co
#define av1_txb_init_levels av1_txb_init_levels_neon
void av1_upsample_intra_edge_c(uint8_t *p, int sz);
-#define av1_upsample_intra_edge av1_upsample_intra_edge_c
-
-void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
-#define av1_upsample_intra_edge_high av1_upsample_intra_edge_high_c
+void av1_upsample_intra_edge_neon(uint8_t *p, int sz);
+#define av1_upsample_intra_edge av1_upsample_intra_edge_neon
void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
#define av1_warp_affine av1_warp_affine_neon
void av1_wedge_compute_delta_squares_c(int16_t *d, const int16_t *a, const int16_t *b, int N);
-#define av1_wedge_compute_delta_squares av1_wedge_compute_delta_squares_c
+void av1_wedge_compute_delta_squares_neon(int16_t *d, const int16_t *a, const int16_t *b, int N);
+#define av1_wedge_compute_delta_squares av1_wedge_compute_delta_squares_neon
int8_t av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m, int N, int64_t limit);
-#define av1_wedge_sign_from_residuals av1_wedge_sign_from_residuals_c
+int8_t av1_wedge_sign_from_residuals_neon(const int16_t *ds, const uint8_t *m, int N, int64_t limit);
+#define av1_wedge_sign_from_residuals av1_wedge_sign_from_residuals_neon
uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);
uint64_t av1_wedge_sse_from_residuals_neon(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);
#define av1_wedge_sse_from_residuals av1_wedge_sse_from_residuals_neon
-void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
-void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params);
#define av1_wiener_convolve_add_src av1_wiener_convolve_add_src_neon
void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
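One behavioral change in the hunks above: `av1_apply_selfguided_restoration` now returns `int` rather than `void`, so a status code is visible through the dispatch macro. A hedged sketch of a caller, assuming the usual 0-on-success convention (the convention is an assumption; this diff only shows the signature change):

#include <stdint.h>
/* av1_apply_selfguided_restoration comes from the av1_rtcd.h above. */

/* Propagate failure from the self-guided restoration kernel instead of
 * assuming it always succeeds. Argument order matches the prototype in
 * the updated header. */
static int restore_unit(const uint8_t *dat, int w, int h, int stride,
                        int eps, const int *xqd, uint8_t *dst,
                        int dst_stride, int32_t *tmpbuf, int bit_depth,
                        int highbd) {
  if (av1_apply_selfguided_restoration(dat, w, h, stride, eps, xqd, dst,
                                       dst_stride, tmpbuf, bit_depth,
                                       highbd) != 0)
    return -1; /* surface the error to the caller */
  return 0;
}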
diff --git a/config/config/aom_version.h b/config/config/aom_version.h
index c3705db8b..51e763c88 100644
--- a/config/config/aom_version.h
+++ b/config/config/aom_version.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -10,10 +10,10 @@
*/
#define VERSION_MAJOR 3
-#define VERSION_MINOR 7
-#define VERSION_PATCH 0
-#define VERSION_EXTRA "273-g722272fc9"
+#define VERSION_MINOR 8
+#define VERSION_PATCH 1
+#define VERSION_EXTRA "324-gd0d0651f9b"
#define VERSION_PACKED \
((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))
-#define VERSION_STRING_NOSP "3.7.0-273-g722272fc9"
-#define VERSION_STRING " 3.7.0-273-g722272fc9"
+#define VERSION_STRING_NOSP "3.8.1-324-gd0d0651f9b"
+#define VERSION_STRING " 3.8.1-324-gd0d0651f9b"
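The packed version is plain shift-and-OR arithmetic, so the new 3.8.1 snapshot packs to 0x030801. A small worked example (the unpacking masks are illustrative, not part of the header):

#include <stdio.h>

#define VERSION_MAJOR 3
#define VERSION_MINOR 8
#define VERSION_PATCH 1
#define VERSION_PACKED \
  ((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))

int main(void) {
  /* (3 << 16) | (8 << 8) | 1 == 0x030801 == 198657 */
  printf("packed: %d (0x%06x)\n", VERSION_PACKED, VERSION_PACKED);
  /* Unpacking is the inverse shift-and-mask (illustrative): */
  printf("major=%d minor=%d patch=%d\n", (VERSION_PACKED >> 16) & 0xff,
         (VERSION_PACKED >> 8) & 0xff, VERSION_PACKED & 0xff);
  return 0;
}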
diff --git a/config/riscv64/config/aom_config.asm b/config/riscv64/config/aom_config.asm
index 02ff408e5..594d55d36 100644
--- a/config/riscv64/config/aom_config.asm
+++ b/config/riscv64/config/aom_config.asm
@@ -1,5 +1,5 @@
;
-; Copyright (c) 2023, Alliance for Open Media. All rights reserved
+; Copyright (c) 2024, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -76,6 +76,8 @@ HAVE_AVX2 equ 0
HAVE_FEXCEPT equ 1
HAVE_MMX equ 0
HAVE_NEON equ 0
+HAVE_NEON_DOTPROD equ 0
+HAVE_NEON_I8MM equ 0
HAVE_PTHREAD_H equ 1
HAVE_SSE equ 0
HAVE_SSE2 equ 0
@@ -83,6 +85,7 @@ HAVE_SSE3 equ 0
HAVE_SSE4_1 equ 0
HAVE_SSE4_2 equ 0
HAVE_SSSE3 equ 0
+HAVE_SVE equ 0
HAVE_UNISTD_H equ 1
HAVE_VSX equ 0
HAVE_WXWIDGETS equ 0
diff --git a/config/riscv64/config/aom_config.c b/config/riscv64/config/aom_config.c
index 07609ac79..c78534672 100644
--- a/config/riscv64/config/aom_config.c
+++ b/config/riscv64/config/aom_config.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/config/riscv64/config/aom_config.h b/config/riscv64/config/aom_config.h
index 91b6249d2..9c378999b 100644
--- a/config/riscv64/config/aom_config.h
+++ b/config/riscv64/config/aom_config.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -78,6 +78,8 @@
#define HAVE_FEXCEPT 1
#define HAVE_MMX 0
#define HAVE_NEON 0
+#define HAVE_NEON_DOTPROD 0
+#define HAVE_NEON_I8MM 0
#define HAVE_PTHREAD_H 1
#define HAVE_SSE 0
#define HAVE_SSE2 0
@@ -85,6 +87,7 @@
#define HAVE_SSE4_1 0
#define HAVE_SSE4_2 0
#define HAVE_SSSE3 0
+#define HAVE_SVE 0
#define HAVE_UNISTD_H 1
#define HAVE_VSX 0
#define HAVE_WXWIDGETS 0
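The new HAVE_NEON_DOTPROD, HAVE_NEON_I8MM and HAVE_SVE symbols extend the existing HAVE_* convention: always-defined 0/1 macros tested with `#if`, not `#ifdef`. A minimal sketch of the typical gating, with a hypothetical kernel (function and branch bodies are illustrative):

#include "config/aom_config.h" /* defines the HAVE_* feature macros */

void fast_kernel(void) {
#if HAVE_NEON_DOTPROD
  /* Arm dot-product (SDOT/UDOT) path would go here. */
#elif HAVE_NEON
  /* Baseline NEON path. */
#else
  /* Portable C fallback; the riscv64 config above takes this branch,
   * since all of its SIMD HAVE_* flags are 0. */
#endif
}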
diff --git a/config/riscv64/config/aom_dsp_rtcd.h b/config/riscv64/config/aom_dsp_rtcd.h
index e724d0d17..0418b3568 100644
--- a/config/riscv64/config/aom_dsp_rtcd.h
+++ b/config/riscv64/config/aom_dsp_rtcd.h
@@ -943,79 +943,70 @@ uint32_t aom_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int sourc
uint32_t aom_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_sub_pixel_variance8x8 aom_highbd_10_sub_pixel_variance8x8_c
-unsigned int aom_highbd_10_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance128x128 aom_highbd_10_variance128x128_c
-unsigned int aom_highbd_10_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance128x64 aom_highbd_10_variance128x64_c
-unsigned int aom_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x16 aom_highbd_10_variance16x16_c
-unsigned int aom_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x32 aom_highbd_10_variance16x32_c
-unsigned int aom_highbd_10_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x4 aom_highbd_10_variance16x4_c
-unsigned int aom_highbd_10_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x64 aom_highbd_10_variance16x64_c
-unsigned int aom_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x8 aom_highbd_10_variance16x8_c
-unsigned int aom_highbd_10_variance2x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_10_variance2x2 aom_highbd_10_variance2x2_c
-
-unsigned int aom_highbd_10_variance2x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_10_variance2x4 aom_highbd_10_variance2x4_c
-
-unsigned int aom_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance32x16 aom_highbd_10_variance32x16_c
-unsigned int aom_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance32x32 aom_highbd_10_variance32x32_c
-unsigned int aom_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance32x64 aom_highbd_10_variance32x64_c
-unsigned int aom_highbd_10_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance32x8 aom_highbd_10_variance32x8_c
-unsigned int aom_highbd_10_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance4x16 aom_highbd_10_variance4x16_c
-unsigned int aom_highbd_10_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_10_variance4x2 aom_highbd_10_variance4x2_c
-
-unsigned int aom_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance4x4 aom_highbd_10_variance4x4_c
-unsigned int aom_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance4x8 aom_highbd_10_variance4x8_c
-unsigned int aom_highbd_10_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance64x128 aom_highbd_10_variance64x128_c
-unsigned int aom_highbd_10_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance64x16 aom_highbd_10_variance64x16_c
-unsigned int aom_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance64x32 aom_highbd_10_variance64x32_c
-unsigned int aom_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance64x64 aom_highbd_10_variance64x64_c
-unsigned int aom_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance8x16 aom_highbd_10_variance8x16_c
-unsigned int aom_highbd_10_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance8x32 aom_highbd_10_variance8x32_c
-unsigned int aom_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance8x4 aom_highbd_10_variance8x4_c
-unsigned int aom_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance8x8 aom_highbd_10_variance8x8_c
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
@@ -1426,79 +1417,70 @@ uint32_t aom_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int sourc
uint32_t aom_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_sub_pixel_variance8x8 aom_highbd_12_sub_pixel_variance8x8_c
-unsigned int aom_highbd_12_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance128x128 aom_highbd_12_variance128x128_c
-unsigned int aom_highbd_12_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance128x64 aom_highbd_12_variance128x64_c
-unsigned int aom_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x16 aom_highbd_12_variance16x16_c
-unsigned int aom_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x32 aom_highbd_12_variance16x32_c
-unsigned int aom_highbd_12_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x4 aom_highbd_12_variance16x4_c
-unsigned int aom_highbd_12_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x64 aom_highbd_12_variance16x64_c
-unsigned int aom_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x8 aom_highbd_12_variance16x8_c
-unsigned int aom_highbd_12_variance2x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_12_variance2x2 aom_highbd_12_variance2x2_c
-
-unsigned int aom_highbd_12_variance2x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_12_variance2x4 aom_highbd_12_variance2x4_c
-
-unsigned int aom_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance32x16 aom_highbd_12_variance32x16_c
-unsigned int aom_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance32x32 aom_highbd_12_variance32x32_c
-unsigned int aom_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance32x64 aom_highbd_12_variance32x64_c
-unsigned int aom_highbd_12_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance32x8 aom_highbd_12_variance32x8_c
-unsigned int aom_highbd_12_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance4x16 aom_highbd_12_variance4x16_c
-unsigned int aom_highbd_12_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_12_variance4x2 aom_highbd_12_variance4x2_c
-
-unsigned int aom_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance4x4 aom_highbd_12_variance4x4_c
-unsigned int aom_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance4x8 aom_highbd_12_variance4x8_c
-unsigned int aom_highbd_12_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance64x128 aom_highbd_12_variance64x128_c
-unsigned int aom_highbd_12_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance64x16 aom_highbd_12_variance64x16_c
-unsigned int aom_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance64x32 aom_highbd_12_variance64x32_c
-unsigned int aom_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance64x64 aom_highbd_12_variance64x64_c
-unsigned int aom_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance8x16 aom_highbd_12_variance8x16_c
-unsigned int aom_highbd_12_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance8x32 aom_highbd_12_variance8x32_c
-unsigned int aom_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance8x4 aom_highbd_12_variance8x4_c
-unsigned int aom_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance8x8 aom_highbd_12_variance8x8_c
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
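[Editor's note on the hunk above: on every platform libaom targets, `unsigned int` and `uint32_t` are the same 32-bit type, so the regenerated prototypes change spelling, not ABI; the sub-4x4 variants (`4x2`, etc.) simply disappear from the generated header. All of these variance kernels share one core computation. Below is a minimal illustrative sketch — not the libaom source — assuming the usual highbd convention that the `uint8_t` pointers alias 16-bit sample buffers via `CONVERT_TO_BYTEPTR()`, and omitting the extra rounding shifts the 10- and 12-bit variants apply to keep intermediate sums in range. The function name is hypothetical.

#include <stdint.h>

/* Illustrative only: one pass accumulating sum and sum-of-squares,
 * then variance = SSE - sum^2 / N. SSE is also returned via *sse. */
static uint32_t highbd_variance_sketch(const uint16_t *src, int src_stride,
                                       const uint16_t *ref, int ref_stride,
                                       int w, int h, uint32_t *sse) {
  int64_t sum = 0;
  uint64_t sse64 = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int diff = src[j] - ref[j];
      sum += diff;
      sse64 += (uint64_t)((int64_t)diff * diff);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = (uint32_t)sse64;
  /* sse64 * N >= sum^2 by Cauchy-Schwarz, so this never underflows. */
  return (uint32_t)(sse64 - (uint64_t)((sum * sum) / (w * h)));
}
]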
@@ -1645,6 +1627,138 @@ unsigned int aom_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride,
unsigned int aom_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
#define aom_highbd_8_mse8x8 aom_highbd_8_mse8x8_c
+unsigned int aom_highbd_8_obmc_sub_pixel_variance128x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance128x128 aom_highbd_8_obmc_sub_pixel_variance128x128_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance128x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance128x64 aom_highbd_8_obmc_sub_pixel_variance128x64_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x16 aom_highbd_8_obmc_sub_pixel_variance16x16_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x32 aom_highbd_8_obmc_sub_pixel_variance16x32_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x4 aom_highbd_8_obmc_sub_pixel_variance16x4_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x64 aom_highbd_8_obmc_sub_pixel_variance16x64_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x8 aom_highbd_8_obmc_sub_pixel_variance16x8_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance32x16 aom_highbd_8_obmc_sub_pixel_variance32x16_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance32x32 aom_highbd_8_obmc_sub_pixel_variance32x32_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance32x64 aom_highbd_8_obmc_sub_pixel_variance32x64_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance32x8 aom_highbd_8_obmc_sub_pixel_variance32x8_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance4x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance4x16 aom_highbd_8_obmc_sub_pixel_variance4x16_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance4x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance4x4 aom_highbd_8_obmc_sub_pixel_variance4x4_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance4x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance4x8 aom_highbd_8_obmc_sub_pixel_variance4x8_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance64x128 aom_highbd_8_obmc_sub_pixel_variance64x128_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance64x16 aom_highbd_8_obmc_sub_pixel_variance64x16_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance64x32 aom_highbd_8_obmc_sub_pixel_variance64x32_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance64x64 aom_highbd_8_obmc_sub_pixel_variance64x64_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance8x16 aom_highbd_8_obmc_sub_pixel_variance8x16_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance8x32 aom_highbd_8_obmc_sub_pixel_variance8x32_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance8x4 aom_highbd_8_obmc_sub_pixel_variance8x4_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance8x8 aom_highbd_8_obmc_sub_pixel_variance8x8_c
+
+unsigned int aom_highbd_8_obmc_variance128x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance128x128 aom_highbd_8_obmc_variance128x128_c
+
+unsigned int aom_highbd_8_obmc_variance128x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance128x64 aom_highbd_8_obmc_variance128x64_c
+
+unsigned int aom_highbd_8_obmc_variance16x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x16 aom_highbd_8_obmc_variance16x16_c
+
+unsigned int aom_highbd_8_obmc_variance16x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x32 aom_highbd_8_obmc_variance16x32_c
+
+unsigned int aom_highbd_8_obmc_variance16x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x4 aom_highbd_8_obmc_variance16x4_c
+
+unsigned int aom_highbd_8_obmc_variance16x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x64 aom_highbd_8_obmc_variance16x64_c
+
+unsigned int aom_highbd_8_obmc_variance16x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x8 aom_highbd_8_obmc_variance16x8_c
+
+unsigned int aom_highbd_8_obmc_variance32x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance32x16 aom_highbd_8_obmc_variance32x16_c
+
+unsigned int aom_highbd_8_obmc_variance32x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance32x32 aom_highbd_8_obmc_variance32x32_c
+
+unsigned int aom_highbd_8_obmc_variance32x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance32x64 aom_highbd_8_obmc_variance32x64_c
+
+unsigned int aom_highbd_8_obmc_variance32x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance32x8 aom_highbd_8_obmc_variance32x8_c
+
+unsigned int aom_highbd_8_obmc_variance4x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance4x16 aom_highbd_8_obmc_variance4x16_c
+
+unsigned int aom_highbd_8_obmc_variance4x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance4x4 aom_highbd_8_obmc_variance4x4_c
+
+unsigned int aom_highbd_8_obmc_variance4x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance4x8 aom_highbd_8_obmc_variance4x8_c
+
+unsigned int aom_highbd_8_obmc_variance64x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance64x128 aom_highbd_8_obmc_variance64x128_c
+
+unsigned int aom_highbd_8_obmc_variance64x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance64x16 aom_highbd_8_obmc_variance64x16_c
+
+unsigned int aom_highbd_8_obmc_variance64x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance64x32 aom_highbd_8_obmc_variance64x32_c
+
+unsigned int aom_highbd_8_obmc_variance64x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance64x64 aom_highbd_8_obmc_variance64x64_c
+
+unsigned int aom_highbd_8_obmc_variance8x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance8x16 aom_highbd_8_obmc_variance8x16_c
+
+unsigned int aom_highbd_8_obmc_variance8x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance8x32 aom_highbd_8_obmc_variance8x32_c
+
+unsigned int aom_highbd_8_obmc_variance8x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance8x4 aom_highbd_8_obmc_variance8x4_c
+
+unsigned int aom_highbd_8_obmc_variance8x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance8x8 aom_highbd_8_obmc_variance8x8_c
+
uint32_t aom_highbd_8_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
#define aom_highbd_8_sub_pixel_avg_variance128x128 aom_highbd_8_sub_pixel_avg_variance128x128_c
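[Editor's note on the added `aom_highbd_8_obmc_*` prototypes above: instead of a plain reference block they take a weighted source (`wsrc`) and a per-pixel weight array (`mask`). Under my reading of the libaom OBMC convention — weights carry 12 fractional bits, `wsrc[i]` approximates the overlapped prediction prescaled by the complementary weights, and `wsrc`/`mask` rows are packed `w` elements apart — the difference term looks like the sketch below. This is an assumption-labeled sketch, not the library source; both helper names are hypothetical.

#include <stdint.h>

/* Signed rounding right-shift by 12 (round half away from zero),
 * matching the fixed-point scale assumed for wsrc and mask. */
static int round_signed_12(int32_t v) {
  return v >= 0 ? (v + (1 << 11)) >> 12 : -((-v + (1 << 11)) >> 12);
}

static uint32_t obmc_variance_sketch(const uint16_t *pre, int pre_stride,
                                     const int32_t *wsrc, const int32_t *mask,
                                     int w, int h, uint32_t *sse) {
  int64_t sum = 0;
  uint64_t sse64 = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      /* Error between the weighted source and the weighted prediction. */
      const int diff = round_signed_12(wsrc[j] - mask[j] * (int32_t)pre[j]);
      sum += diff;
      sse64 += (uint64_t)((int64_t)diff * diff);
    }
    pre += pre_stride;
    wsrc += w; /* assumed: wsrc and mask are tightly packed, w per row */
    mask += w;
  }
  *sse = (uint32_t)sse64;
  return (uint32_t)(sse64 - (uint64_t)((sum * sum) / (w * h)));
}
]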
@@ -1777,79 +1891,70 @@ uint32_t aom_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source
uint32_t aom_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_sub_pixel_variance8x8 aom_highbd_8_sub_pixel_variance8x8_c
-unsigned int aom_highbd_8_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance128x128 aom_highbd_8_variance128x128_c
-unsigned int aom_highbd_8_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance128x64 aom_highbd_8_variance128x64_c
-unsigned int aom_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x16 aom_highbd_8_variance16x16_c
-unsigned int aom_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x32 aom_highbd_8_variance16x32_c
-unsigned int aom_highbd_8_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x4 aom_highbd_8_variance16x4_c
-unsigned int aom_highbd_8_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x64 aom_highbd_8_variance16x64_c
-unsigned int aom_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x8 aom_highbd_8_variance16x8_c
-unsigned int aom_highbd_8_variance2x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_8_variance2x2 aom_highbd_8_variance2x2_c
-
-unsigned int aom_highbd_8_variance2x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_8_variance2x4 aom_highbd_8_variance2x4_c
-
-unsigned int aom_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance32x16 aom_highbd_8_variance32x16_c
-unsigned int aom_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance32x32 aom_highbd_8_variance32x32_c
-unsigned int aom_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance32x64 aom_highbd_8_variance32x64_c
-unsigned int aom_highbd_8_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance32x8 aom_highbd_8_variance32x8_c
-unsigned int aom_highbd_8_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance4x16 aom_highbd_8_variance4x16_c
-unsigned int aom_highbd_8_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_8_variance4x2 aom_highbd_8_variance4x2_c
-
-unsigned int aom_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance4x4 aom_highbd_8_variance4x4_c
-unsigned int aom_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance4x8 aom_highbd_8_variance4x8_c
-unsigned int aom_highbd_8_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance64x128 aom_highbd_8_variance64x128_c
-unsigned int aom_highbd_8_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance64x16 aom_highbd_8_variance64x16_c
-unsigned int aom_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance64x32 aom_highbd_8_variance64x32_c
-unsigned int aom_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance64x64 aom_highbd_8_variance64x64_c
-unsigned int aom_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance8x16 aom_highbd_8_variance8x16_c
-unsigned int aom_highbd_8_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance8x32 aom_highbd_8_variance8x32_c
-unsigned int aom_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance8x4 aom_highbd_8_variance8x4_c
-unsigned int aom_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance8x8 aom_highbd_8_variance8x8_c
unsigned int aom_highbd_avg_4x4_c(const uint8_t *, int p);
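[Editor's note on the `sub_pixel` prototypes in the hunk above: `xoffset` and `yoffset` are eighth-pel offsets in [0, 8) that select a 2-tap bilinear interpolation, applied horizontally then vertically, before the variance core runs. A simplified sketch of that blend follows; the helper name is hypothetical, and the arithmetic is algebraically equivalent to the library's 7-bit filter taps reduced to eighths.

#include <stdint.h>

/* 2-tap bilinear blend in eighths of a pixel: taps (8 - k, k), k in [0, 8). */
static uint16_t bilinear_tap_sketch(uint16_t a, uint16_t b, int k) {
  return (uint16_t)((a * (8 - k) + b * k + 4) >> 3);
}
]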
@@ -2431,138 +2536,6 @@ unsigned int aom_highbd_obmc_sad8x4_c(const uint8_t *pre, int pre_stride, const
unsigned int aom_highbd_obmc_sad8x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
#define aom_highbd_obmc_sad8x8 aom_highbd_obmc_sad8x8_c
-unsigned int aom_highbd_obmc_sub_pixel_variance128x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance128x128 aom_highbd_obmc_sub_pixel_variance128x128_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance128x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance128x64 aom_highbd_obmc_sub_pixel_variance128x64_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x16 aom_highbd_obmc_sub_pixel_variance16x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x32 aom_highbd_obmc_sub_pixel_variance16x32_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x4 aom_highbd_obmc_sub_pixel_variance16x4_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x64 aom_highbd_obmc_sub_pixel_variance16x64_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x8 aom_highbd_obmc_sub_pixel_variance16x8_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance32x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance32x16 aom_highbd_obmc_sub_pixel_variance32x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance32x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance32x32 aom_highbd_obmc_sub_pixel_variance32x32_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance32x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance32x64 aom_highbd_obmc_sub_pixel_variance32x64_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance32x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance32x8 aom_highbd_obmc_sub_pixel_variance32x8_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance4x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance4x16 aom_highbd_obmc_sub_pixel_variance4x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance4x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance4x4 aom_highbd_obmc_sub_pixel_variance4x4_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance4x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance4x8 aom_highbd_obmc_sub_pixel_variance4x8_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance64x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance64x128 aom_highbd_obmc_sub_pixel_variance64x128_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance64x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance64x16 aom_highbd_obmc_sub_pixel_variance64x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance64x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance64x32 aom_highbd_obmc_sub_pixel_variance64x32_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance64x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance64x64 aom_highbd_obmc_sub_pixel_variance64x64_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance8x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance8x16 aom_highbd_obmc_sub_pixel_variance8x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance8x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance8x32 aom_highbd_obmc_sub_pixel_variance8x32_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance8x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance8x4 aom_highbd_obmc_sub_pixel_variance8x4_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance8x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance8x8 aom_highbd_obmc_sub_pixel_variance8x8_c
-
-unsigned int aom_highbd_obmc_variance128x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance128x128 aom_highbd_obmc_variance128x128_c
-
-unsigned int aom_highbd_obmc_variance128x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance128x64 aom_highbd_obmc_variance128x64_c
-
-unsigned int aom_highbd_obmc_variance16x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x16 aom_highbd_obmc_variance16x16_c
-
-unsigned int aom_highbd_obmc_variance16x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x32 aom_highbd_obmc_variance16x32_c
-
-unsigned int aom_highbd_obmc_variance16x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x4 aom_highbd_obmc_variance16x4_c
-
-unsigned int aom_highbd_obmc_variance16x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x64 aom_highbd_obmc_variance16x64_c
-
-unsigned int aom_highbd_obmc_variance16x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x8 aom_highbd_obmc_variance16x8_c
-
-unsigned int aom_highbd_obmc_variance32x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance32x16 aom_highbd_obmc_variance32x16_c
-
-unsigned int aom_highbd_obmc_variance32x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance32x32 aom_highbd_obmc_variance32x32_c
-
-unsigned int aom_highbd_obmc_variance32x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance32x64 aom_highbd_obmc_variance32x64_c
-
-unsigned int aom_highbd_obmc_variance32x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance32x8 aom_highbd_obmc_variance32x8_c
-
-unsigned int aom_highbd_obmc_variance4x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance4x16 aom_highbd_obmc_variance4x16_c
-
-unsigned int aom_highbd_obmc_variance4x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance4x4 aom_highbd_obmc_variance4x4_c
-
-unsigned int aom_highbd_obmc_variance4x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance4x8 aom_highbd_obmc_variance4x8_c
-
-unsigned int aom_highbd_obmc_variance64x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance64x128 aom_highbd_obmc_variance64x128_c
-
-unsigned int aom_highbd_obmc_variance64x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance64x16 aom_highbd_obmc_variance64x16_c
-
-unsigned int aom_highbd_obmc_variance64x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance64x32 aom_highbd_obmc_variance64x32_c
-
-unsigned int aom_highbd_obmc_variance64x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance64x64 aom_highbd_obmc_variance64x64_c
-
-unsigned int aom_highbd_obmc_variance8x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance8x16 aom_highbd_obmc_variance8x16_c
-
-unsigned int aom_highbd_obmc_variance8x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance8x32 aom_highbd_obmc_variance8x32_c
-
-unsigned int aom_highbd_obmc_variance8x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance8x4 aom_highbd_obmc_variance8x4_c
-
-unsigned int aom_highbd_obmc_variance8x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance8x8 aom_highbd_obmc_variance8x8_c
-
void aom_highbd_paeth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define aom_highbd_paeth_predictor_16x16 aom_highbd_paeth_predictor_16x16_c
@@ -2644,10 +2617,10 @@ unsigned int aom_highbd_sad128x128_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad128x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad128x128_avg aom_highbd_sad128x128_avg_c
-void aom_highbd_sad128x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad128x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad128x128x3d aom_highbd_sad128x128x3d_c
-void aom_highbd_sad128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad128x128x4d aom_highbd_sad128x128x4d_c
unsigned int aom_highbd_sad128x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
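[Editor's note on the `x3d`/`x4d` signature changes above: replacing the unsized `ref_ptr[]` and `uint32_t *sad_array` with `ref_ptr[4]` and `sad_array[4]` documents that these multi-reference SAD entry points always deal in four candidate references (the `x3d` variants, as I understand the convention, fill only the first three results). A minimal sketch of the x4d shape, again assuming the highbd 16-bit-sample convention and a hypothetical function name:

#include <stdint.h>
#include <stdlib.h>

/* Illustrative x4d kernel: one source block against four candidate
 * references, producing one SAD per reference. */
static void highbd_sad_x4d_sketch(const uint16_t *src, int src_stride,
                                  const uint16_t *const ref[4], int ref_stride,
                                  int w, int h, uint32_t sad_array[4]) {
  for (int k = 0; k < 4; ++k) {
    const uint16_t *s = src;
    const uint16_t *r = ref[k];
    uint32_t sad = 0;
    for (int i = 0; i < h; ++i) {
      for (int j = 0; j < w; ++j) sad += (uint32_t)abs(s[j] - r[j]);
      s += src_stride;
      r += ref_stride;
    }
    sad_array[k] = sad;
  }
}
]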
@@ -2656,10 +2629,10 @@ unsigned int aom_highbd_sad128x64_c(const uint8_t *src_ptr, int src_stride, cons
unsigned int aom_highbd_sad128x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad128x64_avg aom_highbd_sad128x64_avg_c
-void aom_highbd_sad128x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad128x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad128x64x3d aom_highbd_sad128x64x3d_c
-void aom_highbd_sad128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad128x64x4d aom_highbd_sad128x64x4d_c
unsigned int aom_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2668,10 +2641,10 @@ unsigned int aom_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x16_avg aom_highbd_sad16x16_avg_c
-void aom_highbd_sad16x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x16x3d aom_highbd_sad16x16x3d_c
-void aom_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x16x4d aom_highbd_sad16x16x4d_c
unsigned int aom_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2680,10 +2653,10 @@ unsigned int aom_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x32_avg aom_highbd_sad16x32_avg_c
-void aom_highbd_sad16x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x32x3d aom_highbd_sad16x32x3d_c
-void aom_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x32x4d aom_highbd_sad16x32x4d_c
unsigned int aom_highbd_sad16x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2692,10 +2665,10 @@ unsigned int aom_highbd_sad16x4_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad16x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x4_avg aom_highbd_sad16x4_avg_c
-void aom_highbd_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x4x3d aom_highbd_sad16x4x3d_c
-void aom_highbd_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x4x4d aom_highbd_sad16x4x4d_c
unsigned int aom_highbd_sad16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2704,10 +2677,10 @@ unsigned int aom_highbd_sad16x64_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad16x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x64_avg aom_highbd_sad16x64_avg_c
-void aom_highbd_sad16x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x64x3d aom_highbd_sad16x64x3d_c
-void aom_highbd_sad16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x64x4d aom_highbd_sad16x64x4d_c
unsigned int aom_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2716,10 +2689,10 @@ unsigned int aom_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x8_avg aom_highbd_sad16x8_avg_c
-void aom_highbd_sad16x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x8x3d aom_highbd_sad16x8x3d_c
-void aom_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x8x4d aom_highbd_sad16x8x4d_c
unsigned int aom_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2728,10 +2701,10 @@ unsigned int aom_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x16_avg aom_highbd_sad32x16_avg_c
-void aom_highbd_sad32x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x16x3d aom_highbd_sad32x16x3d_c
-void aom_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x16x4d aom_highbd_sad32x16x4d_c
unsigned int aom_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2740,10 +2713,10 @@ unsigned int aom_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x32_avg aom_highbd_sad32x32_avg_c
-void aom_highbd_sad32x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x32x3d aom_highbd_sad32x32x3d_c
-void aom_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x32x4d aom_highbd_sad32x32x4d_c
unsigned int aom_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2752,10 +2725,10 @@ unsigned int aom_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x64_avg aom_highbd_sad32x64_avg_c
-void aom_highbd_sad32x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x64x3d aom_highbd_sad32x64x3d_c
-void aom_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x64x4d aom_highbd_sad32x64x4d_c
unsigned int aom_highbd_sad32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2764,10 +2737,10 @@ unsigned int aom_highbd_sad32x8_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad32x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x8_avg aom_highbd_sad32x8_avg_c
-void aom_highbd_sad32x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x8x3d aom_highbd_sad32x8x3d_c
-void aom_highbd_sad32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x8x4d aom_highbd_sad32x8x4d_c
unsigned int aom_highbd_sad4x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2776,10 +2749,10 @@ unsigned int aom_highbd_sad4x16_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad4x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad4x16_avg aom_highbd_sad4x16_avg_c
-void aom_highbd_sad4x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x16x3d aom_highbd_sad4x16x3d_c
-void aom_highbd_sad4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x16x4d aom_highbd_sad4x16x4d_c
unsigned int aom_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2788,10 +2761,10 @@ unsigned int aom_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const u
unsigned int aom_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad4x4_avg aom_highbd_sad4x4_avg_c
-void aom_highbd_sad4x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x4x3d aom_highbd_sad4x4x3d_c
-void aom_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x4x4d aom_highbd_sad4x4x4d_c
unsigned int aom_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2800,10 +2773,10 @@ unsigned int aom_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const u
unsigned int aom_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad4x8_avg aom_highbd_sad4x8_avg_c
-void aom_highbd_sad4x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x8x3d aom_highbd_sad4x8x3d_c
-void aom_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x8x4d aom_highbd_sad4x8x4d_c
unsigned int aom_highbd_sad64x128_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2812,10 +2785,10 @@ unsigned int aom_highbd_sad64x128_c(const uint8_t *src_ptr, int src_stride, cons
unsigned int aom_highbd_sad64x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x128_avg aom_highbd_sad64x128_avg_c
-void aom_highbd_sad64x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x128x3d aom_highbd_sad64x128x3d_c
-void aom_highbd_sad64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x128x4d aom_highbd_sad64x128x4d_c
unsigned int aom_highbd_sad64x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2824,10 +2797,10 @@ unsigned int aom_highbd_sad64x16_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad64x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x16_avg aom_highbd_sad64x16_avg_c
-void aom_highbd_sad64x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x16x3d aom_highbd_sad64x16x3d_c
-void aom_highbd_sad64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x16x4d aom_highbd_sad64x16x4d_c
unsigned int aom_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2836,10 +2809,10 @@ unsigned int aom_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x32_avg aom_highbd_sad64x32_avg_c
-void aom_highbd_sad64x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x32x3d aom_highbd_sad64x32x3d_c
-void aom_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x32x4d aom_highbd_sad64x32x4d_c
unsigned int aom_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2848,10 +2821,10 @@ unsigned int aom_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x64_avg aom_highbd_sad64x64_avg_c
-void aom_highbd_sad64x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x64x3d aom_highbd_sad64x64x3d_c
-void aom_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x64x4d aom_highbd_sad64x64x4d_c
unsigned int aom_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2860,10 +2833,10 @@ unsigned int aom_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x16_avg aom_highbd_sad8x16_avg_c
-void aom_highbd_sad8x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x16x3d aom_highbd_sad8x16x3d_c
-void aom_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x16x4d aom_highbd_sad8x16x4d_c
unsigned int aom_highbd_sad8x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2872,10 +2845,10 @@ unsigned int aom_highbd_sad8x32_c(const uint8_t *src_ptr, int src_stride, const
unsigned int aom_highbd_sad8x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x32_avg aom_highbd_sad8x32_avg_c
-void aom_highbd_sad8x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x32x3d aom_highbd_sad8x32x3d_c
-void aom_highbd_sad8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x32x4d aom_highbd_sad8x32x4d_c
unsigned int aom_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2884,10 +2857,10 @@ unsigned int aom_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const u
unsigned int aom_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x4_avg aom_highbd_sad8x4_avg_c
-void aom_highbd_sad8x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x4x3d aom_highbd_sad8x4x3d_c
-void aom_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x4x4d aom_highbd_sad8x4x4d_c
unsigned int aom_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -2896,142 +2869,142 @@ unsigned int aom_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const u
unsigned int aom_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x8_avg aom_highbd_sad8x8_avg_c
-void aom_highbd_sad8x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x8x3d aom_highbd_sad8x8x3d_c
-void aom_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x8x4d aom_highbd_sad8x8x4d_c
unsigned int aom_highbd_sad_skip_128x128_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_128x128 aom_highbd_sad_skip_128x128_c
-void aom_highbd_sad_skip_128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_128x128x4d aom_highbd_sad_skip_128x128x4d_c
unsigned int aom_highbd_sad_skip_128x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_128x64 aom_highbd_sad_skip_128x64_c
-void aom_highbd_sad_skip_128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_128x64x4d aom_highbd_sad_skip_128x64x4d_c
unsigned int aom_highbd_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x16 aom_highbd_sad_skip_16x16_c
-void aom_highbd_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x16x4d aom_highbd_sad_skip_16x16x4d_c
unsigned int aom_highbd_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x32 aom_highbd_sad_skip_16x32_c
-void aom_highbd_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x32x4d aom_highbd_sad_skip_16x32x4d_c
unsigned int aom_highbd_sad_skip_16x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x4 aom_highbd_sad_skip_16x4_c
-void aom_highbd_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x4x4d aom_highbd_sad_skip_16x4x4d_c
unsigned int aom_highbd_sad_skip_16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x64 aom_highbd_sad_skip_16x64_c
-void aom_highbd_sad_skip_16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x64x4d aom_highbd_sad_skip_16x64x4d_c
unsigned int aom_highbd_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x8 aom_highbd_sad_skip_16x8_c
-void aom_highbd_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x8x4d aom_highbd_sad_skip_16x8x4d_c
unsigned int aom_highbd_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_32x16 aom_highbd_sad_skip_32x16_c
-void aom_highbd_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_32x16x4d aom_highbd_sad_skip_32x16x4d_c
unsigned int aom_highbd_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_32x32 aom_highbd_sad_skip_32x32_c
-void aom_highbd_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_32x32x4d aom_highbd_sad_skip_32x32x4d_c
unsigned int aom_highbd_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_32x64 aom_highbd_sad_skip_32x64_c
-void aom_highbd_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_32x64x4d aom_highbd_sad_skip_32x64x4d_c
unsigned int aom_highbd_sad_skip_32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_32x8 aom_highbd_sad_skip_32x8_c
-void aom_highbd_sad_skip_32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_32x8x4d aom_highbd_sad_skip_32x8x4d_c
unsigned int aom_highbd_sad_skip_4x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_4x16 aom_highbd_sad_skip_4x16_c
-void aom_highbd_sad_skip_4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_4x16x4d aom_highbd_sad_skip_4x16x4d_c
unsigned int aom_highbd_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_4x4 aom_highbd_sad_skip_4x4_c
-void aom_highbd_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_4x4x4d aom_highbd_sad_skip_4x4x4d_c
unsigned int aom_highbd_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_4x8 aom_highbd_sad_skip_4x8_c
-void aom_highbd_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_4x8x4d aom_highbd_sad_skip_4x8x4d_c
unsigned int aom_highbd_sad_skip_64x128_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_64x128 aom_highbd_sad_skip_64x128_c
-void aom_highbd_sad_skip_64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_64x128x4d aom_highbd_sad_skip_64x128x4d_c
unsigned int aom_highbd_sad_skip_64x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_64x16 aom_highbd_sad_skip_64x16_c
-void aom_highbd_sad_skip_64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_64x16x4d aom_highbd_sad_skip_64x16x4d_c
unsigned int aom_highbd_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_64x32 aom_highbd_sad_skip_64x32_c
-void aom_highbd_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_64x32x4d aom_highbd_sad_skip_64x32x4d_c
unsigned int aom_highbd_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_64x64 aom_highbd_sad_skip_64x64_c
-void aom_highbd_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_64x64x4d aom_highbd_sad_skip_64x64x4d_c
unsigned int aom_highbd_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_8x16 aom_highbd_sad_skip_8x16_c
-void aom_highbd_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_8x16x4d aom_highbd_sad_skip_8x16x4d_c
unsigned int aom_highbd_sad_skip_8x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_8x32 aom_highbd_sad_skip_8x32_c
-void aom_highbd_sad_skip_8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_8x32x4d aom_highbd_sad_skip_8x32x4d_c
unsigned int aom_highbd_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_8x4 aom_highbd_sad_skip_8x4_c
-void aom_highbd_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_8x4x4d aom_highbd_sad_skip_8x4x4d_c
unsigned int aom_highbd_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_8x8 aom_highbd_sad_skip_8x8_c
-void aom_highbd_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_8x8x4d aom_highbd_sad_skip_8x8x4d_c
void aom_highbd_smooth_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
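Reviewer note: throughout this hunk the x3d/x4d SAD prototypes are tightened from open-ended pointers to explicit four-candidate arrays (`ref_ptr[4]`, `sad_array[4]`), documenting the contract in the type. A minimal caller sketch, assuming a plain C build where the macro resolves to the `_c` routine; the buffer names are illustrative and the buffers are assumed to already be in the library's expected high-bitdepth packing:

/* Sketch only: exercises the 4-candidate SAD contract. */
#include <stdint.h>

void check_four_candidates(const uint8_t *src, int src_stride,
                           const uint8_t *cand0, const uint8_t *cand1,
                           const uint8_t *cand2, const uint8_t *cand3,
                           int ref_stride) {
  const uint8_t *const refs[4] = { cand0, cand1, cand2, cand3 };
  uint32_t sads[4]; /* one SAD per candidate, filled by the kernel */
  aom_highbd_sad8x8x4d(src, src_stride, refs, ref_stride, sads);
}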
@@ -4672,12 +4645,6 @@ unsigned int aom_variance16x64_c(const uint8_t *src_ptr, int source_stride, cons
unsigned int aom_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance16x8 aom_variance16x8_c
-unsigned int aom_variance2x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance2x2 aom_variance2x2_c
-
-unsigned int aom_variance2x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance2x4 aom_variance2x4_c
-
unsigned int aom_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance32x16 aom_variance32x16_c
@@ -4693,9 +4660,6 @@ unsigned int aom_variance32x8_c(const uint8_t *src_ptr, int source_stride, const
unsigned int aom_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance4x16 aom_variance4x16_c
-unsigned int aom_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance4x2 aom_variance4x2_c
-
unsigned int aom_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance4x4 aom_variance4x4_c
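Reviewer note: the sub-4x4 variance kernels (2x2, 2x4, 4x2) are dropped here; the surviving prototypes keep the usual contract of returning the block variance while writing the sum of squared errors through `*sse`. A reference-style sketch for the 4x4 case, using the textbook definition as an illustration rather than the library's implementation:

#include <stdint.h>

unsigned int variance4x4_sketch(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride,
                                unsigned int *sse) {
  int sum = 0;
  uint32_t sq = 0;
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
      const int d = src[i * src_stride + j] - ref[i * ref_stride + j];
      sum += d;
      sq += (uint32_t)(d * d);
    }
  }
  *sse = sq;
  /* variance = E[d^2] - E[d]^2, here scaled by the 16-pixel block size */
  return sq - (uint32_t)(((int64_t)sum * sum) >> 4);
}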
diff --git a/config/riscv64/config/av1_rtcd.h b/config/riscv64/config/av1_rtcd.h
index 3d406ef3c..a6bddea37 100644
--- a/config/riscv64/config/av1_rtcd.h
+++ b/config/riscv64/config/av1_rtcd.h
@@ -112,7 +112,7 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const struct AV1Common *const cm, int
int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search);
#define aom_upsampled_pred aom_upsampled_pred_c
-void av1_apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+int av1_apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
#define av1_apply_selfguided_restoration av1_apply_selfguided_restoration_c
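Reviewer note: the selfguided restoration entry point now returns `int` rather than `void`, presumably a status code, so callers should stop discarding the result. A hedged forwarding sketch; the failure convention in the comment is an assumption, not taken from this diff:

/* Sketch: propagate the new status code instead of ignoring it. */
#include <stdint.h>

int apply_sgr_checked(const uint8_t *dat, int width, int height, int stride,
                      int eps, const int *xqd, uint8_t *dst, int dst_stride,
                      int32_t *tmpbuf, int bit_depth, int highbd) {
  const int ret = av1_apply_selfguided_restoration(
      dat, width, height, stride, eps, xqd, dst, dst_stride, tmpbuf,
      bit_depth, highbd);
  /* Assumption: non-zero signals failure; adjust to the real convention. */
  return ret;
}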
void av1_apply_temporal_filter_c(const struct yv12_buffer_config *frame_to_filter, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count);
@@ -133,37 +133,34 @@ void av1_build_compound_diffwtd_mask_d16_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask
void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
#define av1_build_compound_diffwtd_mask_highbd av1_build_compound_diffwtd_mask_highbd_c
-int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride);
-#define av1_calc_frame_error av1_calc_frame_error_c
-
void av1_calc_indices_dim1_c(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
#define av1_calc_indices_dim1 av1_calc_indices_dim1_c
void av1_calc_indices_dim2_c(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
#define av1_calc_indices_dim2 av1_calc_indices_dim2_c
-void av1_calc_proj_params_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
+void av1_calc_proj_params_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
#define av1_calc_proj_params av1_calc_proj_params_c
-void av1_calc_proj_params_high_bd_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
+void av1_calc_proj_params_high_bd_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
#define av1_calc_proj_params_high_bd av1_calc_proj_params_high_bd_c
-void av1_cnn_activate_c( float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation);
+void av1_cnn_activate_c(float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation);
#define av1_cnn_activate av1_cnn_activate_c
-void av1_cnn_add_c( float **input, int channels, int width, int height, int stride, const float **add);
+void av1_cnn_add_c(float **input, int channels, int width, int height, int stride, const float **add);
#define av1_cnn_add av1_cnn_add_c
void av1_cnn_batchnorm_c(float **image, int channels, int width, int height, int stride, const float *gamma, const float *beta, const float *mean, const float *std);
#define av1_cnn_batchnorm av1_cnn_batchnorm_c
-void av1_cnn_convolve_no_maxpool_padding_valid_c( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int cstep, int channel_step);
+void av1_cnn_convolve_no_maxpool_padding_valid_c(const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int cstep, int channel_step);
#define av1_cnn_convolve_no_maxpool_padding_valid av1_cnn_convolve_no_maxpool_padding_valid_c
-void av1_cnn_deconvolve_c( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride);
+void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride);
#define av1_cnn_deconvolve av1_cnn_deconvolve_c
-bool av1_cnn_predict_c( const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct);
+bool av1_cnn_predict_c(const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct);
#define av1_cnn_predict av1_cnn_predict_c
void av1_compute_stats_c(int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats);
@@ -178,15 +175,24 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, i
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
#define av1_convolve_2d_sr av1_convolve_2d_sr_c
+void av1_convolve_2d_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+#define av1_convolve_2d_sr_intrabc av1_convolve_2d_sr_intrabc_c
+
void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
#define av1_convolve_horiz_rs av1_convolve_horiz_rs_c
void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params);
#define av1_convolve_x_sr av1_convolve_x_sr_c
+void av1_convolve_x_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params);
+#define av1_convolve_x_sr_intrabc av1_convolve_x_sr_intrabc_c
+
void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
#define av1_convolve_y_sr av1_convolve_y_sr_c
+void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
+#define av1_convolve_y_sr_intrabc av1_convolve_y_sr_intrabc_c
+
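Reviewer note: three `_intrabc` single-reference convolve entry points are added above, mirroring the signatures of their plain `sr` counterparts. A dispatch sketch, assuming the caller already knows whether the block uses intra block copy; the `use_intrabc` flag below is hypothetical:

/* Sketch: identical signatures make kernel selection a simple branch. */
static void convolve_x_dispatch(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const int subpel_x_qn,
                                ConvolveParams *conv_params,
                                int use_intrabc) {
  if (use_intrabc)
    av1_convolve_x_sr_intrabc(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, subpel_x_qn, conv_params);
  else
    av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h,
                      filter_params_x, subpel_x_qn, conv_params);
}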
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
#define av1_dist_wtd_convolve_2d av1_dist_wtd_convolve_2d_c
@@ -214,9 +220,6 @@ double av1_estimate_noise_from_single_plane_c(const uint8_t *src, int height, in
void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
#define av1_filter_intra_edge av1_filter_intra_edge_c
-void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
-#define av1_filter_intra_edge_high av1_filter_intra_edge_high_c
-
void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
#define av1_filter_intra_predictor av1_filter_intra_predictor_c
@@ -283,7 +286,7 @@ void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
uint32_t av1_get_crc32c_value_c(void *crc_calculator, uint8_t *p, size_t length);
#define av1_get_crc32c_value av1_get_crc32c_value_c
-void av1_get_horver_correlation_full_c( const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr);
+void av1_get_horver_correlation_full_c(const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr);
#define av1_get_horver_correlation_full av1_get_horver_correlation_full_c
void av1_get_nz_map_contexts_c(const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts);
@@ -310,6 +313,9 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_
void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
#define av1_highbd_convolve_2d_sr av1_highbd_convolve_2d_sr_c
+void av1_highbd_convolve_2d_sr_intrabc_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_2d_sr_intrabc av1_highbd_convolve_2d_sr_intrabc_c
+
void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
@@ -322,9 +328,15 @@ void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_
void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd);
#define av1_highbd_convolve_x_sr av1_highbd_convolve_x_sr_c
+void av1_highbd_convolve_x_sr_intrabc_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_x_sr_intrabc av1_highbd_convolve_x_sr_intrabc_c
+
void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd);
#define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_c
+void av1_highbd_convolve_y_sr_intrabc_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd);
+#define av1_highbd_convolve_y_sr_intrabc av1_highbd_convolve_y_sr_intrabc_c
+
void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
#define av1_highbd_dist_wtd_convolve_2d av1_highbd_dist_wtd_convolve_2d_c
@@ -349,6 +361,9 @@ void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int
double av1_highbd_estimate_noise_from_single_plane_c(const uint16_t *src, int height, int width, int stride, int bit_depth, int edge_thresh);
#define av1_highbd_estimate_noise_from_single_plane av1_highbd_estimate_noise_from_single_plane_c
+void av1_highbd_filter_intra_edge_c(uint16_t *p, int sz, int strength);
+#define av1_highbd_filter_intra_edge av1_highbd_filter_intra_edge_c
+
void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add av1_highbd_inv_txfm_add_c
@@ -412,16 +427,19 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
-int64_t av1_highbd_pixel_proj_error_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
+int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
#define av1_highbd_pixel_proj_error av1_highbd_pixel_proj_error_c
void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
#define av1_highbd_quantize_fp av1_highbd_quantize_fp_c
+void av1_highbd_upsample_intra_edge_c(uint16_t *p, int sz, int bd);
+#define av1_highbd_upsample_intra_edge av1_highbd_upsample_intra_edge_c
+
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
#define av1_highbd_warp_affine av1_highbd_warp_affine_c
-void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd);
+void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params, int bd);
#define av1_highbd_wiener_convolve_add_src av1_highbd_wiener_convolve_add_src_c
void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
@@ -487,13 +505,13 @@ void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, con
void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
#define av1_lowbd_fwd_txfm av1_lowbd_fwd_txfm_c
-int64_t av1_lowbd_pixel_proj_error_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
+int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
#define av1_lowbd_pixel_proj_error av1_lowbd_pixel_proj_error_c
-void av1_nn_fast_softmax_16_c( const float *input_nodes, float *output);
+void av1_nn_fast_softmax_16_c(const float *input_nodes, float *output);
#define av1_nn_fast_softmax_16 av1_nn_fast_softmax_16_c
-void av1_nn_predict_c( const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output);
+void av1_nn_predict_c(const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output);
#define av1_nn_predict av1_nn_predict_c
void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale);
@@ -528,9 +546,6 @@ void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width, const
void av1_upsample_intra_edge_c(uint8_t *p, int sz);
#define av1_upsample_intra_edge av1_upsample_intra_edge_c
-void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
-#define av1_upsample_intra_edge_high av1_upsample_intra_edge_high_c
-
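Reviewer note: taken together with the additions earlier in this file, the high-bitdepth intra-edge helpers move from an `_high` suffix to the `highbd_` prefix used everywhere else (`av1_filter_intra_edge_high` becomes `av1_highbd_filter_intra_edge`, `av1_upsample_intra_edge_high` becomes `av1_highbd_upsample_intra_edge`). If out-of-tree code still uses the old names, a thin shim suffices, assuming only the names changed:

/* Hypothetical compatibility shims; the signatures are unchanged. */
#include <stdint.h>

static inline void av1_filter_intra_edge_high_compat(uint16_t *p, int sz,
                                                     int strength) {
  av1_highbd_filter_intra_edge(p, sz, strength);
}

static inline void av1_upsample_intra_edge_high_compat(uint16_t *p, int sz,
                                                       int bd) {
  av1_highbd_upsample_intra_edge(p, sz, bd);
}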
void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
#define av1_warp_affine av1_warp_affine_c
@@ -543,7 +558,7 @@ int8_t av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m, int
uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);
#define av1_wedge_sse_from_residuals av1_wedge_sse_from_residuals_c
-void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params);
#define av1_wiener_convolve_add_src av1_wiener_convolve_add_src_c
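Reviewer note: both Wiener convolve entry points (8-bit above, high-bitdepth earlier in this hunk) now take `const WienerConvolveParams *` instead of `const ConvolveParams *`, splitting the Wiener rounding state into its own type. A forwarding sketch that relies only on the prototypes shown here; no struct fields are touched, since the layout is not part of this diff:

/* Sketch: callers only need to thread the new parameter type through. */
#include <stddef.h>
#include <stdint.h>

static void wiener_add_src_wrap(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y, int y_step_q4,
                                int w, int h,
                                const WienerConvolveParams *conv_params) {
  av1_wiener_convolve_add_src(src, src_stride, dst, dst_stride, filter_x,
                              x_step_q4, filter_y, y_step_q4, w, h,
                              conv_params);
}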
void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
diff --git a/config/x86/config/aom_config.asm b/config/x86/config/aom_config.asm
index e01202e02..ee39d1d20 100644
--- a/config/x86/config/aom_config.asm
+++ b/config/x86/config/aom_config.asm
@@ -66,6 +66,8 @@
%define HAVE_FEXCEPT 1
%define HAVE_MMX 1
%define HAVE_NEON 0
+%define HAVE_NEON_DOTPROD 0
+%define HAVE_NEON_I8MM 0
%define HAVE_PTHREAD_H 1
%define HAVE_SSE 1
%define HAVE_SSE2 1
@@ -73,6 +75,7 @@
%define HAVE_SSE4_1 0
%define HAVE_SSE4_2 0
%define HAVE_SSSE3 1
+%define HAVE_SVE 0
%define HAVE_UNISTD_H 1
%define HAVE_VSX 0
%define HAVE_WXWIDGETS 0
diff --git a/config/x86/config/aom_config.c b/config/x86/config/aom_config.c
index 987319480..46444fa93 100644
--- a/config/x86/config/aom_config.c
+++ b/config/x86/config/aom_config.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -9,5 +9,5 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "aom/aom_codec.h"
-static const char* const cfg = "cmake ../libaom -G \"Unix Makefiles\" -DCMAKE_TOOLCHAIN_FILE=\"../libaom/build/cmake/toolchains/x86-linux.cmake\" -DCONFIG_AV1_ENCODER=1 -DCONFIG_AV1_HIGHBITDEPTH=1 -DCONFIG_PIC=1 -DCONFIG_RUNTIME_CPU_DETECT=0 -DCONFIG_MAX_DECODE_PROFILE=0 -DCONFIG_NORMAL_TILE_MODE=1 -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=16384 -DDECODE_WIDTH_LIMIT=16384 -DENABLE_SSE4_1=0";
+static const char* const cfg = "cmake ../libaom -G \"Unix Makefiles\" -DCMAKE_TOOLCHAIN_FILE=\"../libaom/build/cmake/toolchains/i686-linux-gcc.cmake\" -DCONFIG_AV1_ENCODER=1 -DCONFIG_AV1_HIGHBITDEPTH=1 -DCONFIG_PIC=1 -DCONFIG_RUNTIME_CPU_DETECT=0 -DCONFIG_MAX_DECODE_PROFILE=0 -DCONFIG_NORMAL_TILE_MODE=1 -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=16384 -DDECODE_WIDTH_LIMIT=16384 -DENABLE_SSE4_1=0";
const char *aom_codec_build_config(void) {return cfg;}
diff --git a/config/x86/config/aom_config.h b/config/x86/config/aom_config.h
index 502262efd..0edc5c526 100644
--- a/config/x86/config/aom_config.h
+++ b/config/x86/config/aom_config.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -78,6 +78,8 @@
#define HAVE_FEXCEPT 1
#define HAVE_MMX 1
#define HAVE_NEON 0
+#define HAVE_NEON_DOTPROD 0
+#define HAVE_NEON_I8MM 0
#define HAVE_PTHREAD_H 1
#define HAVE_SSE 1
#define HAVE_SSE2 1
@@ -85,6 +87,7 @@
#define HAVE_SSE4_1 0
#define HAVE_SSE4_2 0
#define HAVE_SSSE3 1
+#define HAVE_SVE 0
#define HAVE_UNISTD_H 1
#define HAVE_VSX 0
#define HAVE_WXWIDGETS 0
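Reviewer note: the x86 configs gain explicit Arm feature switches (`HAVE_NEON_DOTPROD`, `HAVE_NEON_I8MM`, `HAVE_SVE`), all pinned to 0 here, matching the new Neon dot-product/i8mm kernels added elsewhere in this snap. A generic sketch of the compile-time guard these macros enable:

/* Sketch: kernels gated the way the rtcd headers consume these macros. */
#include "config/aom_config.h"

#if HAVE_NEON_DOTPROD
/* build the *_neon_dotprod kernels */
#elif HAVE_NEON
/* fall back to plain Neon */
#else
/* C-only build, as in this x86 config */
#endif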
diff --git a/config/x86/config/aom_dsp_rtcd.h b/config/x86/config/aom_dsp_rtcd.h
index 4521b9d0d..733718dec 100644
--- a/config/x86/config/aom_dsp_rtcd.h
+++ b/config/x86/config/aom_dsp_rtcd.h
@@ -1168,96 +1168,87 @@ uint32_t aom_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int sourc
uint32_t aom_highbd_10_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_sub_pixel_variance8x8 aom_highbd_10_sub_pixel_variance8x8_sse2
-unsigned int aom_highbd_10_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance128x128 aom_highbd_10_variance128x128_sse2
-unsigned int aom_highbd_10_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance128x64 aom_highbd_10_variance128x64_sse2
-unsigned int aom_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x16 aom_highbd_10_variance16x16_sse2
-unsigned int aom_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x32 aom_highbd_10_variance16x32_sse2
-unsigned int aom_highbd_10_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x4 aom_highbd_10_variance16x4_c
-unsigned int aom_highbd_10_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x64 aom_highbd_10_variance16x64_sse2
-unsigned int aom_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x8 aom_highbd_10_variance16x8_sse2
-unsigned int aom_highbd_10_variance2x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_10_variance2x2 aom_highbd_10_variance2x2_c
-
-unsigned int aom_highbd_10_variance2x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_10_variance2x4 aom_highbd_10_variance2x4_c
-
-unsigned int aom_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance32x16 aom_highbd_10_variance32x16_sse2
-unsigned int aom_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance32x32 aom_highbd_10_variance32x32_sse2
-unsigned int aom_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance32x64 aom_highbd_10_variance32x64_sse2
-unsigned int aom_highbd_10_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance32x8 aom_highbd_10_variance32x8_sse2
-unsigned int aom_highbd_10_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance4x16 aom_highbd_10_variance4x16_c
-unsigned int aom_highbd_10_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_10_variance4x2 aom_highbd_10_variance4x2_c
-
-unsigned int aom_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance4x4 aom_highbd_10_variance4x4_c
-unsigned int aom_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance4x8 aom_highbd_10_variance4x8_c
-unsigned int aom_highbd_10_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance64x128 aom_highbd_10_variance64x128_sse2
-unsigned int aom_highbd_10_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance64x16 aom_highbd_10_variance64x16_sse2
-unsigned int aom_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance64x32 aom_highbd_10_variance64x32_sse2
-unsigned int aom_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance64x64 aom_highbd_10_variance64x64_sse2
-unsigned int aom_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance8x16 aom_highbd_10_variance8x16_sse2
-unsigned int aom_highbd_10_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance8x32 aom_highbd_10_variance8x32_sse2
-unsigned int aom_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance8x4 aom_highbd_10_variance8x4_c
-unsigned int aom_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance8x8 aom_highbd_10_variance8x8_sse2
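Reviewer note: across these high-bitdepth variance prototypes the `*sse` out-parameter changes from `unsigned int *` to `uint32_t *`. On the platforms libaom targets, `uint32_t` is a typedef for `unsigned int`, so the change is cosmetic for callers; a compile-time check makes that assumption explicit (a sketch, not from the source):

#include <stdint.h>

/* Holds on the ILP32/LP64/LLP64 targets libaom supports. */
_Static_assert(sizeof(unsigned int) == sizeof(uint32_t),
               "uint32_t must match unsigned int for these prototypes");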
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
@@ -1727,96 +1718,87 @@ uint32_t aom_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int sourc
uint32_t aom_highbd_12_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_sub_pixel_variance8x8 aom_highbd_12_sub_pixel_variance8x8_sse2
-unsigned int aom_highbd_12_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance128x128 aom_highbd_12_variance128x128_sse2
-unsigned int aom_highbd_12_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance128x64 aom_highbd_12_variance128x64_sse2
-unsigned int aom_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x16 aom_highbd_12_variance16x16_sse2
-unsigned int aom_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x32 aom_highbd_12_variance16x32_sse2
-unsigned int aom_highbd_12_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x4 aom_highbd_12_variance16x4_c
-unsigned int aom_highbd_12_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x64 aom_highbd_12_variance16x64_sse2
-unsigned int aom_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x8 aom_highbd_12_variance16x8_sse2
-unsigned int aom_highbd_12_variance2x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_12_variance2x2 aom_highbd_12_variance2x2_c
-
-unsigned int aom_highbd_12_variance2x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_12_variance2x4 aom_highbd_12_variance2x4_c
-
-unsigned int aom_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance32x16 aom_highbd_12_variance32x16_sse2
-unsigned int aom_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance32x32 aom_highbd_12_variance32x32_sse2
-unsigned int aom_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance32x64 aom_highbd_12_variance32x64_sse2
-unsigned int aom_highbd_12_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance32x8 aom_highbd_12_variance32x8_sse2
-unsigned int aom_highbd_12_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance4x16 aom_highbd_12_variance4x16_c
-unsigned int aom_highbd_12_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_12_variance4x2 aom_highbd_12_variance4x2_c
-
-unsigned int aom_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance4x4 aom_highbd_12_variance4x4_c
-unsigned int aom_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance4x8 aom_highbd_12_variance4x8_c
-unsigned int aom_highbd_12_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance64x128 aom_highbd_12_variance64x128_sse2
-unsigned int aom_highbd_12_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance64x16 aom_highbd_12_variance64x16_sse2
-unsigned int aom_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance64x32 aom_highbd_12_variance64x32_sse2
-unsigned int aom_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance64x64 aom_highbd_12_variance64x64_sse2
-unsigned int aom_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance8x16 aom_highbd_12_variance8x16_sse2
-unsigned int aom_highbd_12_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance8x32 aom_highbd_12_variance8x32_sse2
-unsigned int aom_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance8x4 aom_highbd_12_variance8x4_c
-unsigned int aom_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance8x8 aom_highbd_12_variance8x8_sse2
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
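
For readers unfamiliar with these generated RTCD headers: each prototype group above ends in a #define that binds the public symbol to the best implementation compiled in (SSE2 where one exists, otherwise the plain C fallback), and the substantive change in this hunk is only spelling the SSE output pointer as uint32_t * rather than the equivalent unsigned int *. A minimal caller sketch follows; the 8x8 block size is illustrative, and the CONVERT_TO_BYTEPTR wrapper for 16-bit samples is assumed from aom_dsp/aom_dsp_common.h elsewhere in libaom, not part of this diff.

#include <stdint.h>
#include "aom_dsp/aom_dsp_common.h" /* CONVERT_TO_BYTEPTR for high-bitdepth */
#include "config/aom_dsp_rtcd.h"    /* the generated header this diff edits */

/* Variance of an 8x8 block of 12-bit samples. aom_highbd_12_variance8x8
 * resolves, via the #define above, to the SSE2 kernel when it is built in,
 * and to aom_highbd_12_variance8x8_c otherwise. */
static unsigned int variance_8x8_12bit(const uint16_t *src, int src_stride,
                                       const uint16_t *ref, int ref_stride) {
  uint32_t sse = 0; /* receives the sum of squared errors */
  return aom_highbd_12_variance8x8(CONVERT_TO_BYTEPTR(src), src_stride,
                                   CONVERT_TO_BYTEPTR(ref), ref_stride, &sse);
}
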
@@ -1987,6 +1969,138 @@ unsigned int aom_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, c
unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
#define aom_highbd_8_mse8x8 aom_highbd_8_mse8x8_sse2
+unsigned int aom_highbd_8_obmc_sub_pixel_variance128x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance128x128 aom_highbd_8_obmc_sub_pixel_variance128x128_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance128x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance128x64 aom_highbd_8_obmc_sub_pixel_variance128x64_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x16 aom_highbd_8_obmc_sub_pixel_variance16x16_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x32 aom_highbd_8_obmc_sub_pixel_variance16x32_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x4 aom_highbd_8_obmc_sub_pixel_variance16x4_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x64 aom_highbd_8_obmc_sub_pixel_variance16x64_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x8 aom_highbd_8_obmc_sub_pixel_variance16x8_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance32x16 aom_highbd_8_obmc_sub_pixel_variance32x16_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance32x32 aom_highbd_8_obmc_sub_pixel_variance32x32_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance32x64 aom_highbd_8_obmc_sub_pixel_variance32x64_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance32x8 aom_highbd_8_obmc_sub_pixel_variance32x8_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance4x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance4x16 aom_highbd_8_obmc_sub_pixel_variance4x16_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance4x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance4x4 aom_highbd_8_obmc_sub_pixel_variance4x4_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance4x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance4x8 aom_highbd_8_obmc_sub_pixel_variance4x8_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance64x128 aom_highbd_8_obmc_sub_pixel_variance64x128_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance64x16 aom_highbd_8_obmc_sub_pixel_variance64x16_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance64x32 aom_highbd_8_obmc_sub_pixel_variance64x32_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance64x64 aom_highbd_8_obmc_sub_pixel_variance64x64_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance8x16 aom_highbd_8_obmc_sub_pixel_variance8x16_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance8x32 aom_highbd_8_obmc_sub_pixel_variance8x32_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance8x4 aom_highbd_8_obmc_sub_pixel_variance8x4_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance8x8 aom_highbd_8_obmc_sub_pixel_variance8x8_c
+
+unsigned int aom_highbd_8_obmc_variance128x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance128x128 aom_highbd_8_obmc_variance128x128_c
+
+unsigned int aom_highbd_8_obmc_variance128x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance128x64 aom_highbd_8_obmc_variance128x64_c
+
+unsigned int aom_highbd_8_obmc_variance16x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x16 aom_highbd_8_obmc_variance16x16_c
+
+unsigned int aom_highbd_8_obmc_variance16x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x32 aom_highbd_8_obmc_variance16x32_c
+
+unsigned int aom_highbd_8_obmc_variance16x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x4 aom_highbd_8_obmc_variance16x4_c
+
+unsigned int aom_highbd_8_obmc_variance16x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x64 aom_highbd_8_obmc_variance16x64_c
+
+unsigned int aom_highbd_8_obmc_variance16x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x8 aom_highbd_8_obmc_variance16x8_c
+
+unsigned int aom_highbd_8_obmc_variance32x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance32x16 aom_highbd_8_obmc_variance32x16_c
+
+unsigned int aom_highbd_8_obmc_variance32x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance32x32 aom_highbd_8_obmc_variance32x32_c
+
+unsigned int aom_highbd_8_obmc_variance32x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance32x64 aom_highbd_8_obmc_variance32x64_c
+
+unsigned int aom_highbd_8_obmc_variance32x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance32x8 aom_highbd_8_obmc_variance32x8_c
+
+unsigned int aom_highbd_8_obmc_variance4x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance4x16 aom_highbd_8_obmc_variance4x16_c
+
+unsigned int aom_highbd_8_obmc_variance4x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance4x4 aom_highbd_8_obmc_variance4x4_c
+
+unsigned int aom_highbd_8_obmc_variance4x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance4x8 aom_highbd_8_obmc_variance4x8_c
+
+unsigned int aom_highbd_8_obmc_variance64x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance64x128 aom_highbd_8_obmc_variance64x128_c
+
+unsigned int aom_highbd_8_obmc_variance64x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance64x16 aom_highbd_8_obmc_variance64x16_c
+
+unsigned int aom_highbd_8_obmc_variance64x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance64x32 aom_highbd_8_obmc_variance64x32_c
+
+unsigned int aom_highbd_8_obmc_variance64x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance64x64 aom_highbd_8_obmc_variance64x64_c
+
+unsigned int aom_highbd_8_obmc_variance8x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance8x16 aom_highbd_8_obmc_variance8x16_c
+
+unsigned int aom_highbd_8_obmc_variance8x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance8x32 aom_highbd_8_obmc_variance8x32_c
+
+unsigned int aom_highbd_8_obmc_variance8x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance8x4 aom_highbd_8_obmc_variance8x4_c
+
+unsigned int aom_highbd_8_obmc_variance8x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance8x8 aom_highbd_8_obmc_variance8x8_c
+
uint32_t aom_highbd_8_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
#define aom_highbd_8_sub_pixel_avg_variance128x128 aom_highbd_8_sub_pixel_avg_variance128x128_c
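
The OBMC (overlapped block motion compensation) variance entries added in the hunk above all bind to the C path; wsrc and mask are the caller-prepared 32-bit weighted source and blending weights, one entry per pixel. A minimal, hypothetical calling sketch, with the buffer preparation omitted:

#include <stdint.h>
#include "aom_dsp/aom_dsp_common.h" /* CONVERT_TO_BYTEPTR */
#include "config/aom_dsp_rtcd.h"

/* OBMC variance of an 8x8 high-bitdepth prediction. wsrc and mask are
 * assumed to hold 64 int32_t entries (one per pixel of the block);
 * filling them is the caller's job and is not shown here. */
static unsigned int obmc_variance_8x8(const uint16_t *pred, int pred_stride,
                                      const int32_t *wsrc,
                                      const int32_t *mask) {
  unsigned int sse = 0; /* matches the unsigned int *sse in the prototype */
  return aom_highbd_8_obmc_variance8x8(CONVERT_TO_BYTEPTR(pred), pred_stride,
                                       wsrc, mask, &sse);
}
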
@@ -2154,96 +2268,87 @@ uint32_t aom_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source
uint32_t aom_highbd_8_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_sub_pixel_variance8x8 aom_highbd_8_sub_pixel_variance8x8_sse2
-unsigned int aom_highbd_8_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance128x128 aom_highbd_8_variance128x128_sse2
-unsigned int aom_highbd_8_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance128x64 aom_highbd_8_variance128x64_sse2
-unsigned int aom_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x16 aom_highbd_8_variance16x16_sse2
-unsigned int aom_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x32 aom_highbd_8_variance16x32_sse2
-unsigned int aom_highbd_8_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x4 aom_highbd_8_variance16x4_c
-unsigned int aom_highbd_8_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x64 aom_highbd_8_variance16x64_sse2
-unsigned int aom_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x8 aom_highbd_8_variance16x8_sse2
-unsigned int aom_highbd_8_variance2x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_8_variance2x2 aom_highbd_8_variance2x2_c
-
-unsigned int aom_highbd_8_variance2x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_8_variance2x4 aom_highbd_8_variance2x4_c
-
-unsigned int aom_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance32x16 aom_highbd_8_variance32x16_sse2
-unsigned int aom_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance32x32 aom_highbd_8_variance32x32_sse2
-unsigned int aom_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance32x64 aom_highbd_8_variance32x64_sse2
-unsigned int aom_highbd_8_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance32x8 aom_highbd_8_variance32x8_sse2
-unsigned int aom_highbd_8_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance4x16 aom_highbd_8_variance4x16_c
-unsigned int aom_highbd_8_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_8_variance4x2 aom_highbd_8_variance4x2_c
-
-unsigned int aom_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance4x4 aom_highbd_8_variance4x4_c
-unsigned int aom_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance4x8 aom_highbd_8_variance4x8_c
-unsigned int aom_highbd_8_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance64x128 aom_highbd_8_variance64x128_sse2
-unsigned int aom_highbd_8_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance64x16 aom_highbd_8_variance64x16_sse2
-unsigned int aom_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance64x32 aom_highbd_8_variance64x32_sse2
-unsigned int aom_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance64x64 aom_highbd_8_variance64x64_sse2
-unsigned int aom_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance8x16 aom_highbd_8_variance8x16_sse2
-unsigned int aom_highbd_8_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance8x32 aom_highbd_8_variance8x32_sse2
-unsigned int aom_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance8x4 aom_highbd_8_variance8x4_c
-unsigned int aom_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance8x8 aom_highbd_8_variance8x8_sse2
unsigned int aom_highbd_avg_4x4_c(const uint8_t *, int p);
@@ -2918,138 +3023,6 @@ unsigned int aom_highbd_obmc_sad8x4_c(const uint8_t *pre, int pre_stride, const
unsigned int aom_highbd_obmc_sad8x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
#define aom_highbd_obmc_sad8x8 aom_highbd_obmc_sad8x8_c
-unsigned int aom_highbd_obmc_sub_pixel_variance128x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance128x128 aom_highbd_obmc_sub_pixel_variance128x128_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance128x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance128x64 aom_highbd_obmc_sub_pixel_variance128x64_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x16 aom_highbd_obmc_sub_pixel_variance16x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x32 aom_highbd_obmc_sub_pixel_variance16x32_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x4 aom_highbd_obmc_sub_pixel_variance16x4_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x64 aom_highbd_obmc_sub_pixel_variance16x64_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x8 aom_highbd_obmc_sub_pixel_variance16x8_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance32x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance32x16 aom_highbd_obmc_sub_pixel_variance32x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance32x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance32x32 aom_highbd_obmc_sub_pixel_variance32x32_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance32x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance32x64 aom_highbd_obmc_sub_pixel_variance32x64_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance32x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance32x8 aom_highbd_obmc_sub_pixel_variance32x8_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance4x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance4x16 aom_highbd_obmc_sub_pixel_variance4x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance4x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance4x4 aom_highbd_obmc_sub_pixel_variance4x4_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance4x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance4x8 aom_highbd_obmc_sub_pixel_variance4x8_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance64x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance64x128 aom_highbd_obmc_sub_pixel_variance64x128_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance64x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance64x16 aom_highbd_obmc_sub_pixel_variance64x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance64x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance64x32 aom_highbd_obmc_sub_pixel_variance64x32_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance64x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance64x64 aom_highbd_obmc_sub_pixel_variance64x64_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance8x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance8x16 aom_highbd_obmc_sub_pixel_variance8x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance8x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance8x32 aom_highbd_obmc_sub_pixel_variance8x32_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance8x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance8x4 aom_highbd_obmc_sub_pixel_variance8x4_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance8x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance8x8 aom_highbd_obmc_sub_pixel_variance8x8_c
-
-unsigned int aom_highbd_obmc_variance128x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance128x128 aom_highbd_obmc_variance128x128_c
-
-unsigned int aom_highbd_obmc_variance128x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance128x64 aom_highbd_obmc_variance128x64_c
-
-unsigned int aom_highbd_obmc_variance16x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x16 aom_highbd_obmc_variance16x16_c
-
-unsigned int aom_highbd_obmc_variance16x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x32 aom_highbd_obmc_variance16x32_c
-
-unsigned int aom_highbd_obmc_variance16x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x4 aom_highbd_obmc_variance16x4_c
-
-unsigned int aom_highbd_obmc_variance16x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x64 aom_highbd_obmc_variance16x64_c
-
-unsigned int aom_highbd_obmc_variance16x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x8 aom_highbd_obmc_variance16x8_c
-
-unsigned int aom_highbd_obmc_variance32x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance32x16 aom_highbd_obmc_variance32x16_c
-
-unsigned int aom_highbd_obmc_variance32x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance32x32 aom_highbd_obmc_variance32x32_c
-
-unsigned int aom_highbd_obmc_variance32x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance32x64 aom_highbd_obmc_variance32x64_c
-
-unsigned int aom_highbd_obmc_variance32x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance32x8 aom_highbd_obmc_variance32x8_c
-
-unsigned int aom_highbd_obmc_variance4x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance4x16 aom_highbd_obmc_variance4x16_c
-
-unsigned int aom_highbd_obmc_variance4x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance4x4 aom_highbd_obmc_variance4x4_c
-
-unsigned int aom_highbd_obmc_variance4x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance4x8 aom_highbd_obmc_variance4x8_c
-
-unsigned int aom_highbd_obmc_variance64x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance64x128 aom_highbd_obmc_variance64x128_c
-
-unsigned int aom_highbd_obmc_variance64x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance64x16 aom_highbd_obmc_variance64x16_c
-
-unsigned int aom_highbd_obmc_variance64x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance64x32 aom_highbd_obmc_variance64x32_c
-
-unsigned int aom_highbd_obmc_variance64x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance64x64 aom_highbd_obmc_variance64x64_c
-
-unsigned int aom_highbd_obmc_variance8x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance8x16 aom_highbd_obmc_variance8x16_c
-
-unsigned int aom_highbd_obmc_variance8x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance8x32 aom_highbd_obmc_variance8x32_c
-
-unsigned int aom_highbd_obmc_variance8x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance8x4 aom_highbd_obmc_variance8x4_c
-
-unsigned int aom_highbd_obmc_variance8x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance8x8 aom_highbd_obmc_variance8x8_c
-
void aom_highbd_paeth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define aom_highbd_paeth_predictor_16x16 aom_highbd_paeth_predictor_16x16_c
@@ -3137,10 +3110,10 @@ unsigned int aom_highbd_sad128x128_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad128x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad128x128_avg aom_highbd_sad128x128_avg_c
-void aom_highbd_sad128x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad128x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad128x128x3d aom_highbd_sad128x128x3d_c
-void aom_highbd_sad128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad128x128x4d aom_highbd_sad128x128x4d_c
unsigned int aom_highbd_sad128x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
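
The x3d/x4d hunks above and below only tighten the prototypes: the reference-pointer and output arrays gain an explicit [4] bound (even for the x3d variants), which documents the required length without changing the ABI, since array parameters still decay to pointers in C. A sketch of the calling pattern, with the reference setup assumed:

#include <stdint.h>
#include "config/aom_dsp_rtcd.h"

/* One x4d call returns the SAD of src against four candidate references,
 * letting the motion search reuse each source load four times. As with
 * the other highbd kernels, src and refs are assumed to carry
 * CONVERT_TO_BYTEPTR-wrapped 16-bit samples. */
static void sad_4_candidates(const uint8_t *src, int src_stride,
                             const uint8_t *const refs[4], int ref_stride,
                             uint32_t sads[4]) {
  aom_highbd_sad128x128x4d(src, src_stride, refs, ref_stride, sads);
}
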
@@ -3149,10 +3122,10 @@ unsigned int aom_highbd_sad128x64_c(const uint8_t *src_ptr, int src_stride, cons
unsigned int aom_highbd_sad128x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad128x64_avg aom_highbd_sad128x64_avg_c
-void aom_highbd_sad128x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad128x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad128x64x3d aom_highbd_sad128x64x3d_c
-void aom_highbd_sad128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad128x64x4d aom_highbd_sad128x64x4d_c
unsigned int aom_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3163,11 +3136,11 @@ unsigned int aom_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x16_avg aom_highbd_sad16x16_avg_sse2
-void aom_highbd_sad16x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x16x3d aom_highbd_sad16x16x3d_c
-void aom_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x16x4d aom_highbd_sad16x16x4d_sse2
unsigned int aom_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3178,11 +3151,11 @@ unsigned int aom_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x32_avg aom_highbd_sad16x32_avg_sse2
-void aom_highbd_sad16x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x32x3d aom_highbd_sad16x32x3d_c
-void aom_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x32x4d aom_highbd_sad16x32x4d_sse2
unsigned int aom_highbd_sad16x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3193,11 +3166,11 @@ unsigned int aom_highbd_sad16x4_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad16x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x4_avg aom_highbd_sad16x4_avg_sse2
-void aom_highbd_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x4x3d aom_highbd_sad16x4x3d_c
-void aom_highbd_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x4x4d aom_highbd_sad16x4x4d_sse2
unsigned int aom_highbd_sad16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3208,11 +3181,11 @@ unsigned int aom_highbd_sad16x64_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad16x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x64_avg aom_highbd_sad16x64_avg_sse2
-void aom_highbd_sad16x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x64x3d aom_highbd_sad16x64x3d_c
-void aom_highbd_sad16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad16x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x64x4d aom_highbd_sad16x64x4d_sse2
unsigned int aom_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3223,11 +3196,11 @@ unsigned int aom_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x8_avg aom_highbd_sad16x8_avg_sse2
-void aom_highbd_sad16x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x8x3d aom_highbd_sad16x8x3d_c
-void aom_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x8x4d aom_highbd_sad16x8x4d_sse2
unsigned int aom_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3238,11 +3211,11 @@ unsigned int aom_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x16_avg aom_highbd_sad32x16_avg_sse2
-void aom_highbd_sad32x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x16x3d aom_highbd_sad32x16x3d_c
-void aom_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x16x4d aom_highbd_sad32x16x4d_sse2
unsigned int aom_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3253,11 +3226,11 @@ unsigned int aom_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x32_avg aom_highbd_sad32x32_avg_sse2
-void aom_highbd_sad32x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x32x3d aom_highbd_sad32x32x3d_c
-void aom_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x32x4d aom_highbd_sad32x32x4d_sse2
unsigned int aom_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3268,11 +3241,11 @@ unsigned int aom_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x64_avg aom_highbd_sad32x64_avg_sse2
-void aom_highbd_sad32x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x64x3d aom_highbd_sad32x64x3d_c
-void aom_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x64x4d aom_highbd_sad32x64x4d_sse2
unsigned int aom_highbd_sad32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3283,11 +3256,11 @@ unsigned int aom_highbd_sad32x8_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad32x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x8_avg aom_highbd_sad32x8_avg_sse2
-void aom_highbd_sad32x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x8x3d aom_highbd_sad32x8x3d_c
-void aom_highbd_sad32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad32x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x8x4d aom_highbd_sad32x8x4d_sse2
unsigned int aom_highbd_sad4x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3298,11 +3271,11 @@ unsigned int aom_highbd_sad4x16_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad4x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad4x16_avg aom_highbd_sad4x16_avg_sse2
-void aom_highbd_sad4x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x16x3d aom_highbd_sad4x16x3d_c
-void aom_highbd_sad4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad4x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad4x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x16x4d aom_highbd_sad4x16x4d_sse2
unsigned int aom_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3313,11 +3286,11 @@ unsigned int aom_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad4x4_avg aom_highbd_sad4x4_avg_sse2
-void aom_highbd_sad4x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x4x3d aom_highbd_sad4x4x3d_c
-void aom_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x4x4d aom_highbd_sad4x4x4d_sse2
unsigned int aom_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3328,11 +3301,11 @@ unsigned int aom_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad4x8_avg aom_highbd_sad4x8_avg_sse2
-void aom_highbd_sad4x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x8x3d aom_highbd_sad4x8x3d_c
-void aom_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x8x4d aom_highbd_sad4x8x4d_sse2
unsigned int aom_highbd_sad64x128_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3341,10 +3314,10 @@ unsigned int aom_highbd_sad64x128_c(const uint8_t *src_ptr, int src_stride, cons
unsigned int aom_highbd_sad64x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x128_avg aom_highbd_sad64x128_avg_c
-void aom_highbd_sad64x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x128x3d aom_highbd_sad64x128x3d_c
-void aom_highbd_sad64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x128x4d aom_highbd_sad64x128x4d_c
unsigned int aom_highbd_sad64x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3355,11 +3328,11 @@ unsigned int aom_highbd_sad64x16_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad64x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x16_avg aom_highbd_sad64x16_avg_sse2
-void aom_highbd_sad64x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x16x3d aom_highbd_sad64x16x3d_c
-void aom_highbd_sad64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad64x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x16x4d aom_highbd_sad64x16x4d_sse2
unsigned int aom_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3370,11 +3343,11 @@ unsigned int aom_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x32_avg aom_highbd_sad64x32_avg_sse2
-void aom_highbd_sad64x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x32x3d aom_highbd_sad64x32x3d_c
-void aom_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x32x4d aom_highbd_sad64x32x4d_sse2
unsigned int aom_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3385,11 +3358,11 @@ unsigned int aom_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x64_avg aom_highbd_sad64x64_avg_sse2
-void aom_highbd_sad64x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x64x3d aom_highbd_sad64x64x3d_c
-void aom_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x64x4d aom_highbd_sad64x64x4d_sse2
unsigned int aom_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3400,11 +3373,11 @@ unsigned int aom_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x16_avg aom_highbd_sad8x16_avg_sse2
-void aom_highbd_sad8x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x16x3d aom_highbd_sad8x16x3d_c
-void aom_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x16x4d aom_highbd_sad8x16x4d_sse2
unsigned int aom_highbd_sad8x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3415,11 +3388,11 @@ unsigned int aom_highbd_sad8x32_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad8x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x32_avg aom_highbd_sad8x32_avg_sse2
-void aom_highbd_sad8x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x32x3d aom_highbd_sad8x32x3d_c
-void aom_highbd_sad8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad8x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x32x4d aom_highbd_sad8x32x4d_sse2
unsigned int aom_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3430,11 +3403,11 @@ unsigned int aom_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x4_avg aom_highbd_sad8x4_avg_sse2
-void aom_highbd_sad8x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x4x3d aom_highbd_sad8x4x3d_c
-void aom_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x4x4d aom_highbd_sad8x4x4d_sse2
unsigned int aom_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3445,175 +3418,175 @@ unsigned int aom_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x8_avg aom_highbd_sad8x8_avg_sse2
-void aom_highbd_sad8x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x8x3d aom_highbd_sad8x8x3d_c
-void aom_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x8x4d aom_highbd_sad8x8x4d_sse2
unsigned int aom_highbd_sad_skip_128x128_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_128x128 aom_highbd_sad_skip_128x128_c
-void aom_highbd_sad_skip_128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_128x128x4d aom_highbd_sad_skip_128x128x4d_c
unsigned int aom_highbd_sad_skip_128x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_128x64 aom_highbd_sad_skip_128x64_c
-void aom_highbd_sad_skip_128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_128x64x4d aom_highbd_sad_skip_128x64x4d_c
unsigned int aom_highbd_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x16 aom_highbd_sad_skip_16x16_sse2
-void aom_highbd_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x16x4d aom_highbd_sad_skip_16x16x4d_sse2
unsigned int aom_highbd_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x32 aom_highbd_sad_skip_16x32_sse2
-void aom_highbd_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x32x4d aom_highbd_sad_skip_16x32x4d_sse2
unsigned int aom_highbd_sad_skip_16x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x4 aom_highbd_sad_skip_16x4_c
-void aom_highbd_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x4x4d aom_highbd_sad_skip_16x4x4d_c
unsigned int aom_highbd_sad_skip_16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_16x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x64 aom_highbd_sad_skip_16x64_sse2
-void aom_highbd_sad_skip_16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_16x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_16x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x64x4d aom_highbd_sad_skip_16x64x4d_sse2
unsigned int aom_highbd_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x8 aom_highbd_sad_skip_16x8_sse2
-void aom_highbd_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x8x4d aom_highbd_sad_skip_16x8x4d_sse2
unsigned int aom_highbd_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_32x16 aom_highbd_sad_skip_32x16_sse2
-void aom_highbd_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_32x16x4d aom_highbd_sad_skip_32x16x4d_sse2
unsigned int aom_highbd_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_32x32 aom_highbd_sad_skip_32x32_sse2
-void aom_highbd_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_32x32x4d aom_highbd_sad_skip_32x32x4d_sse2
unsigned int aom_highbd_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_32x64 aom_highbd_sad_skip_32x64_sse2
-void aom_highbd_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_32x64x4d aom_highbd_sad_skip_32x64x4d_sse2
unsigned int aom_highbd_sad_skip_32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_32x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_32x8 aom_highbd_sad_skip_32x8_sse2
-void aom_highbd_sad_skip_32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_32x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_32x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_32x8x4d aom_highbd_sad_skip_32x8x4d_sse2
unsigned int aom_highbd_sad_skip_4x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_4x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_4x16 aom_highbd_sad_skip_4x16_sse2
-void aom_highbd_sad_skip_4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_4x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_4x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_4x16x4d aom_highbd_sad_skip_4x16x4d_sse2
unsigned int aom_highbd_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_4x4 aom_highbd_sad_skip_4x4_c
-void aom_highbd_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_4x4x4d aom_highbd_sad_skip_4x4x4d_c
unsigned int aom_highbd_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_4x8 aom_highbd_sad_skip_4x8_sse2
-void aom_highbd_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_4x8x4d aom_highbd_sad_skip_4x8x4d_sse2
unsigned int aom_highbd_sad_skip_64x128_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_64x128 aom_highbd_sad_skip_64x128_c
-void aom_highbd_sad_skip_64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_64x128x4d aom_highbd_sad_skip_64x128x4d_c
unsigned int aom_highbd_sad_skip_64x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_64x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_64x16 aom_highbd_sad_skip_64x16_sse2
-void aom_highbd_sad_skip_64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_64x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_64x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_64x16x4d aom_highbd_sad_skip_64x16x4d_sse2
unsigned int aom_highbd_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_64x32 aom_highbd_sad_skip_64x32_sse2
-void aom_highbd_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_64x32x4d aom_highbd_sad_skip_64x32x4d_sse2
unsigned int aom_highbd_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_64x64 aom_highbd_sad_skip_64x64_sse2
-void aom_highbd_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_64x64x4d aom_highbd_sad_skip_64x64x4d_sse2
unsigned int aom_highbd_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_8x16 aom_highbd_sad_skip_8x16_sse2
-void aom_highbd_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_8x16x4d aom_highbd_sad_skip_8x16x4d_sse2
unsigned int aom_highbd_sad_skip_8x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_8x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_8x32 aom_highbd_sad_skip_8x32_sse2
-void aom_highbd_sad_skip_8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_8x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_8x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_8x32x4d aom_highbd_sad_skip_8x32x4d_sse2
unsigned int aom_highbd_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_8x4 aom_highbd_sad_skip_8x4_c
-void aom_highbd_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_8x4x4d aom_highbd_sad_skip_8x4x4d_c
unsigned int aom_highbd_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_8x8 aom_highbd_sad_skip_8x8_sse2
-void aom_highbd_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_8x8x4d aom_highbd_sad_skip_8x8x4d_sse2
void aom_highbd_smooth_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
@@ -5677,12 +5650,6 @@ unsigned int aom_variance16x8_c(const uint8_t *src_ptr, int source_stride, const
unsigned int aom_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance16x8 aom_variance16x8_sse2
-unsigned int aom_variance2x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance2x2 aom_variance2x2_c
-
-unsigned int aom_variance2x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance2x4 aom_variance2x4_c
-
unsigned int aom_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int aom_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance32x16 aom_variance32x16_sse2
@@ -5703,9 +5670,6 @@ unsigned int aom_variance4x16_c(const uint8_t *src_ptr, int source_stride, const
unsigned int aom_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance4x16 aom_variance4x16_sse2
-unsigned int aom_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance4x2 aom_variance4x2_c
-
unsigned int aom_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int aom_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance4x4 aom_variance4x4_sse2
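Usage sketch (not part of the diff): the hunks above tighten every high-bit-depth SAD x3d/x4d prototype from "const uint8_t * const ref_ptr[]" and "uint32_t *sad_array" to explicit four-element arrays, documenting that these kernels read four reference pointers and write four results. A minimal caller, assuming the usual libaom conventions that the generated header is included as "config/aom_dsp_rtcd.h" and that high-bit-depth buffers arrive as CONVERT_TO_BYTEPTR-style uint8_t pointers; the wrapper name is illustrative only:

#include <stdint.h>
#include "config/aom_dsp_rtcd.h"

/* Illustrative helper, not part of the generated header: one call
 * evaluates src against all four candidate references at once. */
static void sad_4_refs_32x32(const uint8_t *src, int src_stride,
                             const uint8_t *const refs[4], int ref_stride,
                             uint32_t sads[4]) {
  /* Matches the x4d prototype above; sads[0..3] receive the four
   * 32x32 SAD values, one per reference pointer. */
  aom_highbd_sad32x32x4d(src, src_stride, refs, ref_stride, sads);
}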
diff --git a/config/x86/config/av1_rtcd.h b/config/x86/config/av1_rtcd.h
index d05b1d509..0cd44e9bf 100644
--- a/config/x86/config/av1_rtcd.h
+++ b/config/x86/config/av1_rtcd.h
@@ -133,7 +133,7 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search);
#define aom_upsampled_pred aom_upsampled_pred_sse2
-void av1_apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+int av1_apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
#define av1_apply_selfguided_restoration av1_apply_selfguided_restoration_c
void av1_apply_temporal_filter_c(const struct yv12_buffer_config *frame_to_filter, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count);
@@ -158,10 +158,6 @@ void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE m
void av1_build_compound_diffwtd_mask_highbd_ssse3(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
#define av1_build_compound_diffwtd_mask_highbd av1_build_compound_diffwtd_mask_highbd_ssse3
-int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride);
-int64_t av1_calc_frame_error_sse2(const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride);
-#define av1_calc_frame_error av1_calc_frame_error_sse2
-
void av1_calc_indices_dim1_c(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
void av1_calc_indices_dim1_sse2(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
#define av1_calc_indices_dim1 av1_calc_indices_dim1_sse2
@@ -170,28 +166,28 @@ void av1_calc_indices_dim2_c(const int16_t *data, const int16_t *centroids, uint
void av1_calc_indices_dim2_sse2(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
#define av1_calc_indices_dim2 av1_calc_indices_dim2_sse2
-void av1_calc_proj_params_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
+void av1_calc_proj_params_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
#define av1_calc_proj_params av1_calc_proj_params_c
-void av1_calc_proj_params_high_bd_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
+void av1_calc_proj_params_high_bd_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
#define av1_calc_proj_params_high_bd av1_calc_proj_params_high_bd_c
-void av1_cnn_activate_c( float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation);
+void av1_cnn_activate_c(float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation);
#define av1_cnn_activate av1_cnn_activate_c
-void av1_cnn_add_c( float **input, int channels, int width, int height, int stride, const float **add);
+void av1_cnn_add_c(float **input, int channels, int width, int height, int stride, const float **add);
#define av1_cnn_add av1_cnn_add_c
void av1_cnn_batchnorm_c(float **image, int channels, int width, int height, int stride, const float *gamma, const float *beta, const float *mean, const float *std);
#define av1_cnn_batchnorm av1_cnn_batchnorm_c
-void av1_cnn_convolve_no_maxpool_padding_valid_c( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int cstep, int channel_step);
+void av1_cnn_convolve_no_maxpool_padding_valid_c(const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int cstep, int channel_step);
#define av1_cnn_convolve_no_maxpool_padding_valid av1_cnn_convolve_no_maxpool_padding_valid_c
-void av1_cnn_deconvolve_c( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride);
+void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride);
#define av1_cnn_deconvolve av1_cnn_deconvolve_c
-bool av1_cnn_predict_c( const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct);
+bool av1_cnn_predict_c(const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct);
#define av1_cnn_predict av1_cnn_predict_c
void av1_compute_stats_c(int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats);
@@ -207,6 +203,9 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int
void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
#define av1_convolve_2d_sr av1_convolve_2d_sr_sse2
+void av1_convolve_2d_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+#define av1_convolve_2d_sr_intrabc av1_convolve_2d_sr_intrabc_c
+
void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
#define av1_convolve_horiz_rs av1_convolve_horiz_rs_c
@@ -214,10 +213,16 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int d
void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params);
#define av1_convolve_x_sr av1_convolve_x_sr_sse2
+void av1_convolve_x_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params);
+#define av1_convolve_x_sr_intrabc av1_convolve_x_sr_intrabc_c
+
void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
#define av1_convolve_y_sr av1_convolve_y_sr_sse2
+void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
+#define av1_convolve_y_sr_intrabc av1_convolve_y_sr_intrabc_c
+
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
@@ -250,9 +255,6 @@ double av1_estimate_noise_from_single_plane_c(const uint8_t *src, int height, in
void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
#define av1_filter_intra_edge av1_filter_intra_edge_c
-void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
-#define av1_filter_intra_edge_high av1_filter_intra_edge_high_c
-
void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
#define av1_filter_intra_predictor av1_filter_intra_predictor_c
@@ -319,7 +321,7 @@ void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
uint32_t av1_get_crc32c_value_c(void *crc_calculator, uint8_t *p, size_t length);
#define av1_get_crc32c_value av1_get_crc32c_value_c
-void av1_get_horver_correlation_full_c( const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr);
+void av1_get_horver_correlation_full_c(const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr);
#define av1_get_horver_correlation_full av1_get_horver_correlation_full_c
void av1_get_nz_map_contexts_c(const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts);
@@ -350,6 +352,9 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *
void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
#define av1_highbd_convolve_2d_sr av1_highbd_convolve_2d_sr_ssse3
+void av1_highbd_convolve_2d_sr_intrabc_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_2d_sr_intrabc av1_highbd_convolve_2d_sr_intrabc_c
+
void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
@@ -363,10 +368,16 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *d
void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd);
#define av1_highbd_convolve_x_sr av1_highbd_convolve_x_sr_ssse3
+void av1_highbd_convolve_x_sr_intrabc_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_x_sr_intrabc av1_highbd_convolve_x_sr_intrabc_c
+
void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd);
void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd);
#define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_ssse3
+void av1_highbd_convolve_y_sr_intrabc_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd);
+#define av1_highbd_convolve_y_sr_intrabc av1_highbd_convolve_y_sr_intrabc_c
+
void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
#define av1_highbd_dist_wtd_convolve_2d av1_highbd_dist_wtd_convolve_2d_c
@@ -391,6 +402,9 @@ void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int
double av1_highbd_estimate_noise_from_single_plane_c(const uint16_t *src, int height, int width, int stride, int bit_depth, int edge_thresh);
#define av1_highbd_estimate_noise_from_single_plane av1_highbd_estimate_noise_from_single_plane_c
+void av1_highbd_filter_intra_edge_c(uint16_t *p, int sz, int strength);
+#define av1_highbd_filter_intra_edge av1_highbd_filter_intra_edge_c
+
void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add av1_highbd_inv_txfm_add_c
@@ -454,17 +468,20 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
-int64_t av1_highbd_pixel_proj_error_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
+int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
#define av1_highbd_pixel_proj_error av1_highbd_pixel_proj_error_c
void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
#define av1_highbd_quantize_fp av1_highbd_quantize_fp_c
+void av1_highbd_upsample_intra_edge_c(uint16_t *p, int sz, int bd);
+#define av1_highbd_upsample_intra_edge av1_highbd_upsample_intra_edge_c
+
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
#define av1_highbd_warp_affine av1_highbd_warp_affine_c
-void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd);
-void av1_highbd_wiener_convolve_add_src_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd);
+void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params, int bd);
+void av1_highbd_wiener_convolve_add_src_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params, int bd);
#define av1_highbd_wiener_convolve_add_src av1_highbd_wiener_convolve_add_src_ssse3
void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
@@ -532,15 +549,15 @@ void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff, int diff_s
void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
#define av1_lowbd_fwd_txfm av1_lowbd_fwd_txfm_sse2
-int64_t av1_lowbd_pixel_proj_error_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
+int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
#define av1_lowbd_pixel_proj_error av1_lowbd_pixel_proj_error_c
-void av1_nn_fast_softmax_16_c( const float *input_nodes, float *output);
-void av1_nn_fast_softmax_16_sse3( const float *input_nodes, float *output);
+void av1_nn_fast_softmax_16_c(const float *input_nodes, float *output);
+void av1_nn_fast_softmax_16_sse3(const float *input_nodes, float *output);
#define av1_nn_fast_softmax_16 av1_nn_fast_softmax_16_sse3
-void av1_nn_predict_c( const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output);
-void av1_nn_predict_sse3( const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output);
+void av1_nn_predict_c(const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output);
+void av1_nn_predict_sse3(const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output);
#define av1_nn_predict av1_nn_predict_sse3
void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale);
@@ -578,9 +595,6 @@ void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width, const
void av1_upsample_intra_edge_c(uint8_t *p, int sz);
#define av1_upsample_intra_edge av1_upsample_intra_edge_c
-void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
-#define av1_upsample_intra_edge_high av1_upsample_intra_edge_high_c
-
void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
#define av1_warp_affine av1_warp_affine_c
@@ -596,8 +610,8 @@ uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, con
uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);
#define av1_wedge_sse_from_residuals av1_wedge_sse_from_residuals_sse2
-void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
-void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params);
#define av1_wiener_convolve_add_src av1_wiener_convolve_add_src_sse2
void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
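The Wiener convolve change above swaps the final parameter from const ConvolveParams * to const WienerConvolveParams *. The struct's definition is not shown in this diff, so the sketch below uses a stand-in with assumed round_0/round_1 fields and a copy-only stub kernel — it only demonstrates that the parameter type is the one thing that changes at the call site:

/* Hedged sketch of a caller after the signature change. The struct body is
 * a stand-in (assumed fields); the kernel is a local copy-only stub so the
 * example compiles and runs without libaom. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef struct { int round_0, round_1; } WienerConvolveParams; /* stand-in */

static void av1_wiener_convolve_add_src_sse2(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *fx, int x_step_q4, const int16_t *fy,
    int y_step_q4, int w, int h, const WienerConvolveParams *p) {
  (void)fx; (void)fy; (void)x_step_q4; (void)y_step_q4; (void)p;
  for (int r = 0; r < h; ++r)  /* stub body: copy rows, no filtering */
    memcpy(dst + r * dst_stride, src + r * src_stride, (size_t)w);
}
#define av1_wiener_convolve_add_src av1_wiener_convolve_add_src_sse2

int main(void) {
  uint8_t src[8 * 8] = { 0 }, dst[8 * 8];
  int16_t fx[8] = { 0 }, fy[8] = { 0 };
  WienerConvolveParams p = { 3, 11 };  /* illustrative rounding values */
  av1_wiener_convolve_add_src(src, 8, dst, 8, fx, 16, fy, 16, 8, 8, &p);
  return 0;
}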
diff --git a/config/x86_64/config/aom_config.asm b/config/x86_64/config/aom_config.asm
index dc45eaabc..5da5754ab 100644
--- a/config/x86_64/config/aom_config.asm
+++ b/config/x86_64/config/aom_config.asm
@@ -66,6 +66,8 @@
%define HAVE_FEXCEPT 1
%define HAVE_MMX 1
%define HAVE_NEON 0
+%define HAVE_NEON_DOTPROD 0
+%define HAVE_NEON_I8MM 0
%define HAVE_PTHREAD_H 1
%define HAVE_SSE 1
%define HAVE_SSE2 1
@@ -73,6 +75,7 @@
%define HAVE_SSE4_1 0
%define HAVE_SSE4_2 0
%define HAVE_SSSE3 1
+%define HAVE_SVE 0
%define HAVE_UNISTD_H 1
%define HAVE_VSX 0
%define HAVE_WXWIDGETS 0
diff --git a/config/x86_64/config/aom_config.c b/config/x86_64/config/aom_config.c
index 8a75212e1..ee37f5b4a 100644
--- a/config/x86_64/config/aom_config.c
+++ b/config/x86_64/config/aom_config.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/config/x86_64/config/aom_config.h b/config/x86_64/config/aom_config.h
index bad78614c..765481ed9 100644
--- a/config/x86_64/config/aom_config.h
+++ b/config/x86_64/config/aom_config.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -78,6 +78,8 @@
#define HAVE_FEXCEPT 1
#define HAVE_MMX 1
#define HAVE_NEON 0
+#define HAVE_NEON_DOTPROD 0
+#define HAVE_NEON_I8MM 0
#define HAVE_PTHREAD_H 1
#define HAVE_SSE 1
#define HAVE_SSE2 1
@@ -85,6 +87,7 @@
#define HAVE_SSE4_1 0
#define HAVE_SSE4_2 0
#define HAVE_SSSE3 1
+#define HAVE_SVE 0
#define HAVE_UNISTD_H 1
#define HAVE_VSX 0
#define HAVE_WXWIDGETS 0
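The new HAVE_NEON_DOTPROD, HAVE_NEON_I8MM and HAVE_SVE flags (mirrored in aom_config.asm above) gate whole code paths at compile time; on this x86_64 config the Arm paths are simply compiled out. A small self-contained sketch of the gating pattern, with the flag values copied from this config:

/* Sketch of configure-flag gating; in libaom the values come from
 * aom_config.h rather than being defined locally like this. */
#include <stdio.h>

#define HAVE_SSE2 1           /* values from the config above */
#define HAVE_NEON_DOTPROD 0
#define HAVE_SVE 0

int main(void) {
#if HAVE_NEON_DOTPROD
  puts("Arm dot-product kernels built");
#elif HAVE_SSE2
  puts("SSE2 kernels built");
#else
  puts("C-only build");
#endif
  return 0;
}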
diff --git a/config/x86_64/config/aom_dsp_rtcd.h b/config/x86_64/config/aom_dsp_rtcd.h
index cfb738003..518c19e53 100644
--- a/config/x86_64/config/aom_dsp_rtcd.h
+++ b/config/x86_64/config/aom_dsp_rtcd.h
@@ -1168,96 +1168,87 @@ uint32_t aom_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int sourc
uint32_t aom_highbd_10_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_sub_pixel_variance8x8 aom_highbd_10_sub_pixel_variance8x8_sse2
-unsigned int aom_highbd_10_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance128x128 aom_highbd_10_variance128x128_sse2
-unsigned int aom_highbd_10_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance128x64 aom_highbd_10_variance128x64_sse2
-unsigned int aom_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x16 aom_highbd_10_variance16x16_sse2
-unsigned int aom_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x32 aom_highbd_10_variance16x32_sse2
-unsigned int aom_highbd_10_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x4 aom_highbd_10_variance16x4_c
-unsigned int aom_highbd_10_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x64 aom_highbd_10_variance16x64_sse2
-unsigned int aom_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance16x8 aom_highbd_10_variance16x8_sse2
-unsigned int aom_highbd_10_variance2x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_10_variance2x2 aom_highbd_10_variance2x2_c
-
-unsigned int aom_highbd_10_variance2x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_10_variance2x4 aom_highbd_10_variance2x4_c
-
-unsigned int aom_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance32x16 aom_highbd_10_variance32x16_sse2
-unsigned int aom_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance32x32 aom_highbd_10_variance32x32_sse2
-unsigned int aom_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance32x64 aom_highbd_10_variance32x64_sse2
-unsigned int aom_highbd_10_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance32x8 aom_highbd_10_variance32x8_sse2
-unsigned int aom_highbd_10_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance4x16 aom_highbd_10_variance4x16_c
-unsigned int aom_highbd_10_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_10_variance4x2 aom_highbd_10_variance4x2_c
-
-unsigned int aom_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance4x4 aom_highbd_10_variance4x4_c
-unsigned int aom_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance4x8 aom_highbd_10_variance4x8_c
-unsigned int aom_highbd_10_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance64x128 aom_highbd_10_variance64x128_sse2
-unsigned int aom_highbd_10_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance64x16 aom_highbd_10_variance64x16_sse2
-unsigned int aom_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance64x32 aom_highbd_10_variance64x32_sse2
-unsigned int aom_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance64x64 aom_highbd_10_variance64x64_sse2
-unsigned int aom_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance8x16 aom_highbd_10_variance8x16_sse2
-unsigned int aom_highbd_10_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance8x32 aom_highbd_10_variance8x32_sse2
-unsigned int aom_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance8x4 aom_highbd_10_variance8x4_c
-unsigned int aom_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_10_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_10_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_10_variance8x8 aom_highbd_10_variance8x8_sse2
uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
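Apart from the deleted 2xN/Nx2 rows, the only change across these variance prototypes is the out-parameter type, unsigned int *sse becoming uint32_t *sse. On the x86_64 targets this config describes the two types have identical width, so existing callers are unaffected; a one-line check of that assumption (true here, though not guaranteed by ISO C in general):

/* Compile-time check that the *sse type swap above is ABI-neutral on
 * this target. */
#include <stdint.h>

_Static_assert(sizeof(unsigned int) == sizeof(uint32_t),
               "unsigned int and uint32_t must match for these prototypes");

int main(void) { return 0; }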
@@ -1727,96 +1718,87 @@ uint32_t aom_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int sourc
uint32_t aom_highbd_12_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_sub_pixel_variance8x8 aom_highbd_12_sub_pixel_variance8x8_sse2
-unsigned int aom_highbd_12_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance128x128 aom_highbd_12_variance128x128_sse2
-unsigned int aom_highbd_12_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance128x64 aom_highbd_12_variance128x64_sse2
-unsigned int aom_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x16 aom_highbd_12_variance16x16_sse2
-unsigned int aom_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x32 aom_highbd_12_variance16x32_sse2
-unsigned int aom_highbd_12_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x4 aom_highbd_12_variance16x4_c
-unsigned int aom_highbd_12_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x64 aom_highbd_12_variance16x64_sse2
-unsigned int aom_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance16x8 aom_highbd_12_variance16x8_sse2
-unsigned int aom_highbd_12_variance2x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_12_variance2x2 aom_highbd_12_variance2x2_c
-
-unsigned int aom_highbd_12_variance2x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_12_variance2x4 aom_highbd_12_variance2x4_c
-
-unsigned int aom_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance32x16 aom_highbd_12_variance32x16_sse2
-unsigned int aom_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance32x32 aom_highbd_12_variance32x32_sse2
-unsigned int aom_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance32x64 aom_highbd_12_variance32x64_sse2
-unsigned int aom_highbd_12_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance32x8 aom_highbd_12_variance32x8_sse2
-unsigned int aom_highbd_12_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance4x16 aom_highbd_12_variance4x16_c
-unsigned int aom_highbd_12_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_12_variance4x2 aom_highbd_12_variance4x2_c
-
-unsigned int aom_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance4x4 aom_highbd_12_variance4x4_c
-unsigned int aom_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance4x8 aom_highbd_12_variance4x8_c
-unsigned int aom_highbd_12_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance64x128 aom_highbd_12_variance64x128_sse2
-unsigned int aom_highbd_12_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance64x16 aom_highbd_12_variance64x16_sse2
-unsigned int aom_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance64x32 aom_highbd_12_variance64x32_sse2
-unsigned int aom_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance64x64 aom_highbd_12_variance64x64_sse2
-unsigned int aom_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance8x16 aom_highbd_12_variance8x16_sse2
-unsigned int aom_highbd_12_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance8x32 aom_highbd_12_variance8x32_sse2
-unsigned int aom_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance8x4 aom_highbd_12_variance8x4_c
-unsigned int aom_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_12_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_12_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_12_variance8x8 aom_highbd_12_variance8x8_sse2
uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param);
@@ -1987,6 +1969,138 @@ unsigned int aom_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, c
unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
#define aom_highbd_8_mse8x8 aom_highbd_8_mse8x8_sse2
+unsigned int aom_highbd_8_obmc_sub_pixel_variance128x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance128x128 aom_highbd_8_obmc_sub_pixel_variance128x128_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance128x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance128x64 aom_highbd_8_obmc_sub_pixel_variance128x64_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x16 aom_highbd_8_obmc_sub_pixel_variance16x16_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x32 aom_highbd_8_obmc_sub_pixel_variance16x32_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x4 aom_highbd_8_obmc_sub_pixel_variance16x4_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x64 aom_highbd_8_obmc_sub_pixel_variance16x64_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance16x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance16x8 aom_highbd_8_obmc_sub_pixel_variance16x8_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance32x16 aom_highbd_8_obmc_sub_pixel_variance32x16_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance32x32 aom_highbd_8_obmc_sub_pixel_variance32x32_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance32x64 aom_highbd_8_obmc_sub_pixel_variance32x64_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance32x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance32x8 aom_highbd_8_obmc_sub_pixel_variance32x8_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance4x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance4x16 aom_highbd_8_obmc_sub_pixel_variance4x16_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance4x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance4x4 aom_highbd_8_obmc_sub_pixel_variance4x4_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance4x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance4x8 aom_highbd_8_obmc_sub_pixel_variance4x8_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance64x128 aom_highbd_8_obmc_sub_pixel_variance64x128_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance64x16 aom_highbd_8_obmc_sub_pixel_variance64x16_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance64x32 aom_highbd_8_obmc_sub_pixel_variance64x32_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance64x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance64x64 aom_highbd_8_obmc_sub_pixel_variance64x64_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance8x16 aom_highbd_8_obmc_sub_pixel_variance8x16_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance8x32 aom_highbd_8_obmc_sub_pixel_variance8x32_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance8x4 aom_highbd_8_obmc_sub_pixel_variance8x4_c
+
+unsigned int aom_highbd_8_obmc_sub_pixel_variance8x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_sub_pixel_variance8x8 aom_highbd_8_obmc_sub_pixel_variance8x8_c
+
+unsigned int aom_highbd_8_obmc_variance128x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance128x128 aom_highbd_8_obmc_variance128x128_c
+
+unsigned int aom_highbd_8_obmc_variance128x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance128x64 aom_highbd_8_obmc_variance128x64_c
+
+unsigned int aom_highbd_8_obmc_variance16x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x16 aom_highbd_8_obmc_variance16x16_c
+
+unsigned int aom_highbd_8_obmc_variance16x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x32 aom_highbd_8_obmc_variance16x32_c
+
+unsigned int aom_highbd_8_obmc_variance16x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x4 aom_highbd_8_obmc_variance16x4_c
+
+unsigned int aom_highbd_8_obmc_variance16x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x64 aom_highbd_8_obmc_variance16x64_c
+
+unsigned int aom_highbd_8_obmc_variance16x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance16x8 aom_highbd_8_obmc_variance16x8_c
+
+unsigned int aom_highbd_8_obmc_variance32x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance32x16 aom_highbd_8_obmc_variance32x16_c
+
+unsigned int aom_highbd_8_obmc_variance32x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance32x32 aom_highbd_8_obmc_variance32x32_c
+
+unsigned int aom_highbd_8_obmc_variance32x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance32x64 aom_highbd_8_obmc_variance32x64_c
+
+unsigned int aom_highbd_8_obmc_variance32x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance32x8 aom_highbd_8_obmc_variance32x8_c
+
+unsigned int aom_highbd_8_obmc_variance4x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance4x16 aom_highbd_8_obmc_variance4x16_c
+
+unsigned int aom_highbd_8_obmc_variance4x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance4x4 aom_highbd_8_obmc_variance4x4_c
+
+unsigned int aom_highbd_8_obmc_variance4x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance4x8 aom_highbd_8_obmc_variance4x8_c
+
+unsigned int aom_highbd_8_obmc_variance64x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance64x128 aom_highbd_8_obmc_variance64x128_c
+
+unsigned int aom_highbd_8_obmc_variance64x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance64x16 aom_highbd_8_obmc_variance64x16_c
+
+unsigned int aom_highbd_8_obmc_variance64x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance64x32 aom_highbd_8_obmc_variance64x32_c
+
+unsigned int aom_highbd_8_obmc_variance64x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance64x64 aom_highbd_8_obmc_variance64x64_c
+
+unsigned int aom_highbd_8_obmc_variance8x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance8x16 aom_highbd_8_obmc_variance8x16_c
+
+unsigned int aom_highbd_8_obmc_variance8x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance8x32 aom_highbd_8_obmc_variance8x32_c
+
+unsigned int aom_highbd_8_obmc_variance8x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance8x4 aom_highbd_8_obmc_variance8x4_c
+
+unsigned int aom_highbd_8_obmc_variance8x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
+#define aom_highbd_8_obmc_variance8x8 aom_highbd_8_obmc_variance8x8_c
+
uint32_t aom_highbd_8_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
#define aom_highbd_8_sub_pixel_avg_variance128x128 aom_highbd_8_sub_pixel_avg_variance128x128_c
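The aom_highbd_8_obmc_* entry points added above are all bound to their C fallbacks in this x86_64 config; together with the removals of the bitdepth-unprefixed aom_highbd_obmc_* names further down, they make the bit depth explicit in the symbol. A simplified, self-contained sketch of what such an OBMC variance kernel computes — assuming the Q12 fixed-point wsrc/mask convention of libaom's C reference, and using plain 8-bit samples rather than the high-bitdepth CONVERT_TO_SHORTPTR convention:

/* Simplified OBMC variance sketch: wsrc holds a Q12 weighted source and
 * mask the Q12 weights, so each residual is (wsrc - pre*mask) rounded back
 * from Q12. Mock helper under the assumptions above, not libaom's code. */
#include <stdint.h>

static unsigned int obmc_variance_sketch(const uint8_t *pre, int pre_stride,
                                         const int32_t *wsrc,
                                         const int32_t *mask, int w, int h,
                                         unsigned int *sse) {
  int64_t sum = 0;
  uint64_t sse64 = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int32_t v = wsrc[j] - pre[j] * mask[j];
      const int32_t a = v < 0 ? -v : v;              /* signed round from Q12 */
      const int32_t diff = (v < 0 ? -1 : 1) * ((a + (1 << 11)) >> 12);
      sum += diff;
      sse64 += (uint64_t)((int64_t)diff * (int64_t)diff);
    }
    pre += pre_stride;
    wsrc += w;
    mask += w;
  }
  *sse = (unsigned int)sse64;
  const int64_t n = (int64_t)w * h;
  return (unsigned int)(sse64 - (uint64_t)((sum * sum) / n));
}

int main(void) {
  const uint8_t pre[4] = { 10, 10, 10, 10 };
  const int32_t wsrc[4] = { 10 << 12, 10 << 12, 10 << 12, 10 << 12 };
  const int32_t mask[4] = { 1 << 12, 1 << 12, 1 << 12, 1 << 12 };
  unsigned int sse;
  return (int)obmc_variance_sketch(pre, 2, wsrc, mask, 2, 2, &sse); /* 0 */
}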
@@ -2154,96 +2268,87 @@ uint32_t aom_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source
uint32_t aom_highbd_8_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_sub_pixel_variance8x8 aom_highbd_8_sub_pixel_variance8x8_sse2
-unsigned int aom_highbd_8_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance128x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance128x128 aom_highbd_8_variance128x128_sse2
-unsigned int aom_highbd_8_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance128x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance128x64 aom_highbd_8_variance128x64_sse2
-unsigned int aom_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x16 aom_highbd_8_variance16x16_sse2
-unsigned int aom_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x32 aom_highbd_8_variance16x32_sse2
-unsigned int aom_highbd_8_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x4 aom_highbd_8_variance16x4_c
-unsigned int aom_highbd_8_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x64 aom_highbd_8_variance16x64_sse2
-unsigned int aom_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance16x8 aom_highbd_8_variance16x8_sse2
-unsigned int aom_highbd_8_variance2x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_8_variance2x2 aom_highbd_8_variance2x2_c
-
-unsigned int aom_highbd_8_variance2x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_8_variance2x4 aom_highbd_8_variance2x4_c
-
-unsigned int aom_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance32x16 aom_highbd_8_variance32x16_sse2
-unsigned int aom_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance32x32 aom_highbd_8_variance32x32_sse2
-unsigned int aom_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance32x64 aom_highbd_8_variance32x64_sse2
-unsigned int aom_highbd_8_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance32x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance32x8 aom_highbd_8_variance32x8_sse2
-unsigned int aom_highbd_8_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance4x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance4x16 aom_highbd_8_variance4x16_c
-unsigned int aom_highbd_8_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_highbd_8_variance4x2 aom_highbd_8_variance4x2_c
-
-unsigned int aom_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance4x4 aom_highbd_8_variance4x4_c
-unsigned int aom_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance4x8 aom_highbd_8_variance4x8_c
-unsigned int aom_highbd_8_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance64x128_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance64x128 aom_highbd_8_variance64x128_sse2
-unsigned int aom_highbd_8_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance64x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance64x16 aom_highbd_8_variance64x16_sse2
-unsigned int aom_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance64x32 aom_highbd_8_variance64x32_sse2
-unsigned int aom_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance64x64 aom_highbd_8_variance64x64_sse2
-unsigned int aom_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance8x16 aom_highbd_8_variance8x16_sse2
-unsigned int aom_highbd_8_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance8x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance8x32 aom_highbd_8_variance8x32_sse2
-unsigned int aom_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance8x4 aom_highbd_8_variance8x4_c
-unsigned int aom_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int aom_highbd_8_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int aom_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+unsigned int aom_highbd_8_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define aom_highbd_8_variance8x8 aom_highbd_8_variance8x8_sse2
unsigned int aom_highbd_avg_4x4_c(const uint8_t *, int p);
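With the 2x2, 2x4 and 4x2 rows deleted above, 4x4 is the smallest variance block this header still exposes, so any caller that indexes kernels by block size must start its table there. A hypothetical sketch of such a size-keyed dispatch table (stub kernels stand in for the real bindings, e.g. aom_highbd_8_variance4x4 and aom_highbd_8_variance8x8):

/* Hypothetical size-keyed dispatch table; table shape and stub names are
 * illustrative, not libaom's actual dispatch mechanism. */
#include <stddef.h>
#include <stdint.h>

typedef unsigned int (*variance_fn)(const uint8_t *, int, const uint8_t *, int,
                                    uint32_t *);

static unsigned int var4x4_stub(const uint8_t *s, int ss, const uint8_t *r,
                                int rs, uint32_t *sse) {
  (void)s; (void)ss; (void)r; (void)rs; *sse = 0; return 0;
}
static unsigned int var8x8_stub(const uint8_t *s, int ss, const uint8_t *r,
                                int rs, uint32_t *sse) {
  (void)s; (void)ss; (void)r; (void)rs; *sse = 0; return 0;
}

typedef struct { int w, h; variance_fn fn; } variance_entry;

/* 4x4 is now the smallest entry: the 2x2/2x4/4x2 rows are gone. */
static const variance_entry kTable[] = {
  { 4, 4, var4x4_stub },
  { 8, 8, var8x8_stub },
};

static variance_fn lookup(int w, int h) {
  for (size_t i = 0; i < sizeof(kTable) / sizeof(kTable[0]); ++i)
    if (kTable[i].w == w && kTable[i].h == h) return kTable[i].fn;
  return NULL;  /* e.g. 2x2: no kernel exists anymore */
}

int main(void) { return lookup(2, 2) == NULL ? 0 : 1; }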
@@ -2918,138 +3023,6 @@ unsigned int aom_highbd_obmc_sad8x4_c(const uint8_t *pre, int pre_stride, const
unsigned int aom_highbd_obmc_sad8x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask);
#define aom_highbd_obmc_sad8x8 aom_highbd_obmc_sad8x8_c
-unsigned int aom_highbd_obmc_sub_pixel_variance128x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance128x128 aom_highbd_obmc_sub_pixel_variance128x128_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance128x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance128x64 aom_highbd_obmc_sub_pixel_variance128x64_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x16 aom_highbd_obmc_sub_pixel_variance16x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x32 aom_highbd_obmc_sub_pixel_variance16x32_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x4 aom_highbd_obmc_sub_pixel_variance16x4_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x64 aom_highbd_obmc_sub_pixel_variance16x64_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance16x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance16x8 aom_highbd_obmc_sub_pixel_variance16x8_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance32x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance32x16 aom_highbd_obmc_sub_pixel_variance32x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance32x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance32x32 aom_highbd_obmc_sub_pixel_variance32x32_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance32x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance32x64 aom_highbd_obmc_sub_pixel_variance32x64_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance32x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance32x8 aom_highbd_obmc_sub_pixel_variance32x8_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance4x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance4x16 aom_highbd_obmc_sub_pixel_variance4x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance4x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance4x4 aom_highbd_obmc_sub_pixel_variance4x4_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance4x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance4x8 aom_highbd_obmc_sub_pixel_variance4x8_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance64x128_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance64x128 aom_highbd_obmc_sub_pixel_variance64x128_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance64x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance64x16 aom_highbd_obmc_sub_pixel_variance64x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance64x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance64x32 aom_highbd_obmc_sub_pixel_variance64x32_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance64x64_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance64x64 aom_highbd_obmc_sub_pixel_variance64x64_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance8x16_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance8x16 aom_highbd_obmc_sub_pixel_variance8x16_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance8x32_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance8x32 aom_highbd_obmc_sub_pixel_variance8x32_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance8x4_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance8x4 aom_highbd_obmc_sub_pixel_variance8x4_c
-
-unsigned int aom_highbd_obmc_sub_pixel_variance8x8_c(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_sub_pixel_variance8x8 aom_highbd_obmc_sub_pixel_variance8x8_c
-
-unsigned int aom_highbd_obmc_variance128x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance128x128 aom_highbd_obmc_variance128x128_c
-
-unsigned int aom_highbd_obmc_variance128x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance128x64 aom_highbd_obmc_variance128x64_c
-
-unsigned int aom_highbd_obmc_variance16x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x16 aom_highbd_obmc_variance16x16_c
-
-unsigned int aom_highbd_obmc_variance16x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x32 aom_highbd_obmc_variance16x32_c
-
-unsigned int aom_highbd_obmc_variance16x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x4 aom_highbd_obmc_variance16x4_c
-
-unsigned int aom_highbd_obmc_variance16x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x64 aom_highbd_obmc_variance16x64_c
-
-unsigned int aom_highbd_obmc_variance16x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance16x8 aom_highbd_obmc_variance16x8_c
-
-unsigned int aom_highbd_obmc_variance32x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance32x16 aom_highbd_obmc_variance32x16_c
-
-unsigned int aom_highbd_obmc_variance32x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance32x32 aom_highbd_obmc_variance32x32_c
-
-unsigned int aom_highbd_obmc_variance32x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance32x64 aom_highbd_obmc_variance32x64_c
-
-unsigned int aom_highbd_obmc_variance32x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance32x8 aom_highbd_obmc_variance32x8_c
-
-unsigned int aom_highbd_obmc_variance4x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance4x16 aom_highbd_obmc_variance4x16_c
-
-unsigned int aom_highbd_obmc_variance4x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance4x4 aom_highbd_obmc_variance4x4_c
-
-unsigned int aom_highbd_obmc_variance4x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance4x8 aom_highbd_obmc_variance4x8_c
-
-unsigned int aom_highbd_obmc_variance64x128_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance64x128 aom_highbd_obmc_variance64x128_c
-
-unsigned int aom_highbd_obmc_variance64x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance64x16 aom_highbd_obmc_variance64x16_c
-
-unsigned int aom_highbd_obmc_variance64x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance64x32 aom_highbd_obmc_variance64x32_c
-
-unsigned int aom_highbd_obmc_variance64x64_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance64x64 aom_highbd_obmc_variance64x64_c
-
-unsigned int aom_highbd_obmc_variance8x16_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance8x16 aom_highbd_obmc_variance8x16_c
-
-unsigned int aom_highbd_obmc_variance8x32_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance8x32 aom_highbd_obmc_variance8x32_c
-
-unsigned int aom_highbd_obmc_variance8x4_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance8x4 aom_highbd_obmc_variance8x4_c
-
-unsigned int aom_highbd_obmc_variance8x8_c(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
-#define aom_highbd_obmc_variance8x8 aom_highbd_obmc_variance8x8_c
-
void aom_highbd_paeth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define aom_highbd_paeth_predictor_16x16 aom_highbd_paeth_predictor_16x16_c
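
The block of `aom_highbd_obmc_*` declarations removed above covered the OBMC (overlapped block motion compensation) distortion kernels: `pre` is the conventional prediction, while `wsrc` and `mask` carry the pre-weighted overlapped source and its blending weights. A reference sketch of that computation — an assumption modeled on libaom's generic `obmc_variance` pattern with 12-bit weights, not the removed code itself:

    #include <stdint.h>

    /* Signed rounding right-shift for the 12-bit OBMC weight scale. */
    static int round_pow2_signed(int64_t v, int n) {
      const int64_t r = (v < 0 ? -v : v) + ((int64_t)1 << (n - 1));
      const int64_t q = r >> n;
      return (int)(v < 0 ? -q : q);
    }

    /* Sketch of an OBMC variance core for a WxH block (8-bit prediction;
     * wsrc/mask assumed tightly packed with stride == block width). */
    static unsigned int obmc_variance_ref(const uint8_t *pre, int pre_stride,
                                          const int32_t *wsrc,
                                          const int32_t *mask, int w, int h,
                                          uint32_t *sse) {
      int64_t sum = 0;
      uint64_t sse_acc = 0;
      for (int i = 0; i < h; ++i) {
        for (int j = 0; j < w; ++j) {
          const int diff =
              round_pow2_signed((int64_t)wsrc[j] - pre[j] * mask[j], 12);
          sum += diff;
          sse_acc += (uint64_t)(diff * diff);
        }
        pre += pre_stride;
        wsrc += w;
        mask += w;
      }
      *sse = (uint32_t)sse_acc;
      return (unsigned int)(sse_acc - (uint64_t)((sum * sum) / (w * h)));
    }
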
@@ -3137,10 +3110,10 @@ unsigned int aom_highbd_sad128x128_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad128x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad128x128_avg aom_highbd_sad128x128_avg_c
-void aom_highbd_sad128x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad128x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad128x128x3d aom_highbd_sad128x128x3d_c
-void aom_highbd_sad128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad128x128x4d aom_highbd_sad128x128x4d_c
unsigned int aom_highbd_sad128x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
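
This hunk and the ones that follow tighten `const uint8_t * const ref_ptr[]` / `uint32_t *sad_array` into explicit four-element array parameters; the kernels still compute the SAD of one source block against up to four candidate references per call (three for the `x3d` variants), the bound is now just visible in the prototype. A usage sketch under those assumptions — `best_of_four` and its buffers are hypothetical, and highbd pointers follow the CONVERT_TO_BYTEPTR convention:

    #include <stdint.h>

    /* Prototype as declared in this header. */
    void aom_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride,
                                     const uint8_t *const ref_ptr[4],
                                     int ref_stride, uint32_t sad_array[4]);

    /* Hypothetical helper: pick the cheapest of four motion candidates. */
    static uint32_t best_of_four(const uint8_t *src, int src_stride,
                                 const uint8_t *const refs[4],
                                 int ref_stride) {
      uint32_t sad[4];
      aom_highbd_sad16x16x4d_sse2(src, src_stride, refs, ref_stride, sad);
      uint32_t best = sad[0];
      for (int i = 1; i < 4; ++i) {
        if (sad[i] < best) best = sad[i];
      }
      return best;
    }
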
@@ -3149,10 +3122,10 @@ unsigned int aom_highbd_sad128x64_c(const uint8_t *src_ptr, int src_stride, cons
unsigned int aom_highbd_sad128x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad128x64_avg aom_highbd_sad128x64_avg_c
-void aom_highbd_sad128x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad128x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad128x64x3d aom_highbd_sad128x64x3d_c
-void aom_highbd_sad128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad128x64x4d aom_highbd_sad128x64x4d_c
unsigned int aom_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3163,11 +3136,11 @@ unsigned int aom_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x16_avg aom_highbd_sad16x16_avg_sse2
-void aom_highbd_sad16x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x16x3d aom_highbd_sad16x16x3d_c
-void aom_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x16x4d aom_highbd_sad16x16x4d_sse2
unsigned int aom_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3178,11 +3151,11 @@ unsigned int aom_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x32_avg aom_highbd_sad16x32_avg_sse2
-void aom_highbd_sad16x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x32x3d aom_highbd_sad16x32x3d_c
-void aom_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x32x4d aom_highbd_sad16x32x4d_sse2
unsigned int aom_highbd_sad16x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3193,11 +3166,11 @@ unsigned int aom_highbd_sad16x4_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad16x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x4_avg aom_highbd_sad16x4_avg_sse2
-void aom_highbd_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x4x3d aom_highbd_sad16x4x3d_c
-void aom_highbd_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x4x4d aom_highbd_sad16x4x4d_sse2
unsigned int aom_highbd_sad16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3208,11 +3181,11 @@ unsigned int aom_highbd_sad16x64_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad16x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x64_avg aom_highbd_sad16x64_avg_sse2
-void aom_highbd_sad16x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x64x3d aom_highbd_sad16x64x3d_c
-void aom_highbd_sad16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad16x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x64x4d aom_highbd_sad16x64x4d_sse2
unsigned int aom_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3223,11 +3196,11 @@ unsigned int aom_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad16x8_avg aom_highbd_sad16x8_avg_sse2
-void aom_highbd_sad16x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x8x3d aom_highbd_sad16x8x3d_c
-void aom_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad16x8x4d aom_highbd_sad16x8x4d_sse2
unsigned int aom_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3238,11 +3211,11 @@ unsigned int aom_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x16_avg aom_highbd_sad32x16_avg_sse2
-void aom_highbd_sad32x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x16x3d aom_highbd_sad32x16x3d_c
-void aom_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x16x4d aom_highbd_sad32x16x4d_sse2
unsigned int aom_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3253,11 +3226,11 @@ unsigned int aom_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x32_avg aom_highbd_sad32x32_avg_sse2
-void aom_highbd_sad32x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x32x3d aom_highbd_sad32x32x3d_c
-void aom_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x32x4d aom_highbd_sad32x32x4d_sse2
unsigned int aom_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3268,11 +3241,11 @@ unsigned int aom_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x64_avg aom_highbd_sad32x64_avg_sse2
-void aom_highbd_sad32x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x64x3d aom_highbd_sad32x64x3d_c
-void aom_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x64x4d aom_highbd_sad32x64x4d_sse2
unsigned int aom_highbd_sad32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3283,11 +3256,11 @@ unsigned int aom_highbd_sad32x8_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad32x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad32x8_avg aom_highbd_sad32x8_avg_sse2
-void aom_highbd_sad32x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x8x3d aom_highbd_sad32x8x3d_c
-void aom_highbd_sad32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad32x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad32x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad32x8x4d aom_highbd_sad32x8x4d_sse2
unsigned int aom_highbd_sad4x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3298,11 +3271,11 @@ unsigned int aom_highbd_sad4x16_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad4x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad4x16_avg aom_highbd_sad4x16_avg_sse2
-void aom_highbd_sad4x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x16x3d aom_highbd_sad4x16x3d_c
-void aom_highbd_sad4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad4x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad4x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x16x4d aom_highbd_sad4x16x4d_sse2
unsigned int aom_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3313,11 +3286,11 @@ unsigned int aom_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad4x4_avg aom_highbd_sad4x4_avg_sse2
-void aom_highbd_sad4x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x4x3d aom_highbd_sad4x4x3d_c
-void aom_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x4x4d aom_highbd_sad4x4x4d_sse2
unsigned int aom_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3328,11 +3301,11 @@ unsigned int aom_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad4x8_avg aom_highbd_sad4x8_avg_sse2
-void aom_highbd_sad4x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x8x3d aom_highbd_sad4x8x3d_c
-void aom_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad4x8x4d aom_highbd_sad4x8x4d_sse2
unsigned int aom_highbd_sad64x128_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3341,10 +3314,10 @@ unsigned int aom_highbd_sad64x128_c(const uint8_t *src_ptr, int src_stride, cons
unsigned int aom_highbd_sad64x128_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x128_avg aom_highbd_sad64x128_avg_c
-void aom_highbd_sad64x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x128x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x128x3d aom_highbd_sad64x128x3d_c
-void aom_highbd_sad64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x128x4d aom_highbd_sad64x128x4d_c
unsigned int aom_highbd_sad64x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3355,11 +3328,11 @@ unsigned int aom_highbd_sad64x16_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad64x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x16_avg aom_highbd_sad64x16_avg_sse2
-void aom_highbd_sad64x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x16x3d aom_highbd_sad64x16x3d_c
-void aom_highbd_sad64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad64x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x16x4d aom_highbd_sad64x16x4d_sse2
unsigned int aom_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3370,11 +3343,11 @@ unsigned int aom_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x32_avg aom_highbd_sad64x32_avg_sse2
-void aom_highbd_sad64x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x32x3d aom_highbd_sad64x32x3d_c
-void aom_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x32x4d aom_highbd_sad64x32x4d_sse2
unsigned int aom_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3385,11 +3358,11 @@ unsigned int aom_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, c
unsigned int aom_highbd_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad64x64_avg aom_highbd_sad64x64_avg_sse2
-void aom_highbd_sad64x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x64x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x64x3d aom_highbd_sad64x64x3d_c
-void aom_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad64x64x4d aom_highbd_sad64x64x4d_sse2
unsigned int aom_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3400,11 +3373,11 @@ unsigned int aom_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x16_avg aom_highbd_sad8x16_avg_sse2
-void aom_highbd_sad8x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x16x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x16x3d aom_highbd_sad8x16x3d_c
-void aom_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x16x4d aom_highbd_sad8x16x4d_sse2
unsigned int aom_highbd_sad8x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3415,11 +3388,11 @@ unsigned int aom_highbd_sad8x32_avg_c(const uint8_t *src_ptr, int src_stride, co
unsigned int aom_highbd_sad8x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x32_avg aom_highbd_sad8x32_avg_sse2
-void aom_highbd_sad8x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x32x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x32x3d aom_highbd_sad8x32x3d_c
-void aom_highbd_sad8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad8x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x32x4d aom_highbd_sad8x32x4d_sse2
unsigned int aom_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3430,11 +3403,11 @@ unsigned int aom_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x4_avg aom_highbd_sad8x4_avg_sse2
-void aom_highbd_sad8x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x4x3d aom_highbd_sad8x4x3d_c
-void aom_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x4x4d aom_highbd_sad8x4x4d_sse2
unsigned int aom_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -3445,175 +3418,175 @@ unsigned int aom_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, con
unsigned int aom_highbd_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define aom_highbd_sad8x8_avg aom_highbd_sad8x8_avg_sse2
-void aom_highbd_sad8x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x8x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x8x3d aom_highbd_sad8x8x3d_c
-void aom_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad8x8x4d aom_highbd_sad8x8x4d_sse2
unsigned int aom_highbd_sad_skip_128x128_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_128x128 aom_highbd_sad_skip_128x128_c
-void aom_highbd_sad_skip_128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_128x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_128x128x4d aom_highbd_sad_skip_128x128x4d_c
unsigned int aom_highbd_sad_skip_128x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_128x64 aom_highbd_sad_skip_128x64_c
-void aom_highbd_sad_skip_128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_128x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_128x64x4d aom_highbd_sad_skip_128x64x4d_c
unsigned int aom_highbd_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x16 aom_highbd_sad_skip_16x16_sse2
-void aom_highbd_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x16x4d aom_highbd_sad_skip_16x16x4d_sse2
unsigned int aom_highbd_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x32 aom_highbd_sad_skip_16x32_sse2
-void aom_highbd_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x32x4d aom_highbd_sad_skip_16x32x4d_sse2
unsigned int aom_highbd_sad_skip_16x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x4 aom_highbd_sad_skip_16x4_c
-void aom_highbd_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x4x4d aom_highbd_sad_skip_16x4x4d_c
unsigned int aom_highbd_sad_skip_16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_16x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x64 aom_highbd_sad_skip_16x64_sse2
-void aom_highbd_sad_skip_16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_16x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_16x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x64x4d aom_highbd_sad_skip_16x64x4d_sse2
unsigned int aom_highbd_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_16x8 aom_highbd_sad_skip_16x8_sse2
-void aom_highbd_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_16x8x4d aom_highbd_sad_skip_16x8x4d_sse2
unsigned int aom_highbd_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_32x16 aom_highbd_sad_skip_32x16_sse2
-void aom_highbd_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_32x16x4d aom_highbd_sad_skip_32x16x4d_sse2
unsigned int aom_highbd_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_32x32 aom_highbd_sad_skip_32x32_sse2
-void aom_highbd_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_32x32x4d aom_highbd_sad_skip_32x32x4d_sse2
unsigned int aom_highbd_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_32x64 aom_highbd_sad_skip_32x64_sse2
-void aom_highbd_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_32x64x4d aom_highbd_sad_skip_32x64x4d_sse2
unsigned int aom_highbd_sad_skip_32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_32x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_32x8 aom_highbd_sad_skip_32x8_sse2
-void aom_highbd_sad_skip_32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_32x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_32x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_32x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_32x8x4d aom_highbd_sad_skip_32x8x4d_sse2
unsigned int aom_highbd_sad_skip_4x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_4x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_4x16 aom_highbd_sad_skip_4x16_sse2
-void aom_highbd_sad_skip_4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_4x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_4x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_4x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_4x16x4d aom_highbd_sad_skip_4x16x4d_sse2
unsigned int aom_highbd_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_4x4 aom_highbd_sad_skip_4x4_c
-void aom_highbd_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_4x4x4d aom_highbd_sad_skip_4x4x4d_c
unsigned int aom_highbd_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_4x8 aom_highbd_sad_skip_4x8_sse2
-void aom_highbd_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_4x8x4d aom_highbd_sad_skip_4x8x4d_sse2
unsigned int aom_highbd_sad_skip_64x128_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_64x128 aom_highbd_sad_skip_64x128_c
-void aom_highbd_sad_skip_64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_64x128x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_64x128x4d aom_highbd_sad_skip_64x128x4d_c
unsigned int aom_highbd_sad_skip_64x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_64x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_64x16 aom_highbd_sad_skip_64x16_sse2
-void aom_highbd_sad_skip_64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_64x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_64x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_64x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_64x16x4d aom_highbd_sad_skip_64x16x4d_sse2
unsigned int aom_highbd_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_64x32 aom_highbd_sad_skip_64x32_sse2
-void aom_highbd_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_64x32x4d aom_highbd_sad_skip_64x32x4d_sse2
unsigned int aom_highbd_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_64x64 aom_highbd_sad_skip_64x64_sse2
-void aom_highbd_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_64x64x4d aom_highbd_sad_skip_64x64x4d_sse2
unsigned int aom_highbd_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_8x16 aom_highbd_sad_skip_8x16_sse2
-void aom_highbd_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_8x16x4d aom_highbd_sad_skip_8x16x4d_sse2
unsigned int aom_highbd_sad_skip_8x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_8x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_8x32 aom_highbd_sad_skip_8x32_sse2
-void aom_highbd_sad_skip_8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_8x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_8x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_8x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_8x32x4d aom_highbd_sad_skip_8x32x4d_sse2
unsigned int aom_highbd_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_8x4 aom_highbd_sad_skip_8x4_c
-void aom_highbd_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_8x4x4d aom_highbd_sad_skip_8x4x4d_c
unsigned int aom_highbd_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_highbd_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define aom_highbd_sad_skip_8x8 aom_highbd_sad_skip_8x8_sse2
-void aom_highbd_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void aom_highbd_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void aom_highbd_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+void aom_highbd_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
#define aom_highbd_sad_skip_8x8x4d aom_highbd_sad_skip_8x8x4d_sse2
void aom_highbd_smooth_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
@@ -5680,12 +5653,6 @@ unsigned int aom_variance16x8_c(const uint8_t *src_ptr, int source_stride, const
unsigned int aom_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance16x8 aom_variance16x8_sse2
-unsigned int aom_variance2x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance2x2 aom_variance2x2_c
-
-unsigned int aom_variance2x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance2x4 aom_variance2x4_c
-
unsigned int aom_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int aom_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance32x16 aom_variance32x16_sse2
@@ -5706,9 +5673,6 @@ unsigned int aom_variance4x16_c(const uint8_t *src_ptr, int source_stride, const
unsigned int aom_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance4x16 aom_variance4x16_sse2
-unsigned int aom_variance4x2_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define aom_variance4x2 aom_variance4x2_c
-
unsigned int aom_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int aom_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define aom_variance4x4 aom_variance4x4_sse2
diff --git a/config/x86_64/config/av1_rtcd.h b/config/x86_64/config/av1_rtcd.h
index c64a024a1..044bc1937 100644
--- a/config/x86_64/config/av1_rtcd.h
+++ b/config/x86_64/config/av1_rtcd.h
@@ -133,7 +133,7 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search);
#define aom_upsampled_pred aom_upsampled_pred_sse2
-void av1_apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+int av1_apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
#define av1_apply_selfguided_restoration av1_apply_selfguided_restoration_c
void av1_apply_temporal_filter_c(const struct yv12_buffer_config *frame_to_filter, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count);
@@ -158,10 +158,6 @@ void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE m
void av1_build_compound_diffwtd_mask_highbd_ssse3(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
#define av1_build_compound_diffwtd_mask_highbd av1_build_compound_diffwtd_mask_highbd_ssse3
-int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride);
-int64_t av1_calc_frame_error_sse2(const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride);
-#define av1_calc_frame_error av1_calc_frame_error_sse2
-
void av1_calc_indices_dim1_c(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
void av1_calc_indices_dim1_sse2(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
#define av1_calc_indices_dim1 av1_calc_indices_dim1_sse2
@@ -170,28 +166,28 @@ void av1_calc_indices_dim2_c(const int16_t *data, const int16_t *centroids, uint
void av1_calc_indices_dim2_sse2(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k);
#define av1_calc_indices_dim2 av1_calc_indices_dim2_sse2
-void av1_calc_proj_params_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
+void av1_calc_proj_params_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
#define av1_calc_proj_params av1_calc_proj_params_c
-void av1_calc_proj_params_high_bd_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
+void av1_calc_proj_params_high_bd_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params);
#define av1_calc_proj_params_high_bd av1_calc_proj_params_high_bd_c
-void av1_cnn_activate_c( float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation);
+void av1_cnn_activate_c(float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation);
#define av1_cnn_activate av1_cnn_activate_c
-void av1_cnn_add_c( float **input, int channels, int width, int height, int stride, const float **add);
+void av1_cnn_add_c(float **input, int channels, int width, int height, int stride, const float **add);
#define av1_cnn_add av1_cnn_add_c
void av1_cnn_batchnorm_c(float **image, int channels, int width, int height, int stride, const float *gamma, const float *beta, const float *mean, const float *std);
#define av1_cnn_batchnorm av1_cnn_batchnorm_c
-void av1_cnn_convolve_no_maxpool_padding_valid_c( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int cstep, int channel_step);
+void av1_cnn_convolve_no_maxpool_padding_valid_c(const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int cstep, int channel_step);
#define av1_cnn_convolve_no_maxpool_padding_valid av1_cnn_convolve_no_maxpool_padding_valid_c
-void av1_cnn_deconvolve_c( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride);
+void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride);
#define av1_cnn_deconvolve av1_cnn_deconvolve_c
-bool av1_cnn_predict_c( const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct);
+bool av1_cnn_predict_c(const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct);
#define av1_cnn_predict av1_cnn_predict_c
void av1_compute_stats_c(int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats);
@@ -207,6 +203,9 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int
void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
#define av1_convolve_2d_sr av1_convolve_2d_sr_sse2
+void av1_convolve_2d_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+#define av1_convolve_2d_sr_intrabc av1_convolve_2d_sr_intrabc_c
+
void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
#define av1_convolve_horiz_rs av1_convolve_horiz_rs_c
@@ -214,10 +213,16 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int d
void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params);
#define av1_convolve_x_sr av1_convolve_x_sr_sse2
+void av1_convolve_x_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params);
+#define av1_convolve_x_sr_intrabc av1_convolve_x_sr_intrabc_c
+
void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
#define av1_convolve_y_sr av1_convolve_y_sr_sse2
+void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
+#define av1_convolve_y_sr_intrabc av1_convolve_y_sr_intrabc_c
+
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
@@ -250,9 +255,6 @@ double av1_estimate_noise_from_single_plane_c(const uint8_t *src, int height, in
void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
#define av1_filter_intra_edge av1_filter_intra_edge_c
-void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
-#define av1_filter_intra_edge_high av1_filter_intra_edge_high_c
-
void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
#define av1_filter_intra_predictor av1_filter_intra_predictor_c
@@ -319,7 +321,7 @@ void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
uint32_t av1_get_crc32c_value_c(void *crc_calculator, uint8_t *p, size_t length);
#define av1_get_crc32c_value av1_get_crc32c_value_c
-void av1_get_horver_correlation_full_c( const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr);
+void av1_get_horver_correlation_full_c(const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr);
#define av1_get_horver_correlation_full av1_get_horver_correlation_full_c
void av1_get_nz_map_contexts_c(const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts);
@@ -353,6 +355,9 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *
void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
#define av1_highbd_convolve_2d_sr av1_highbd_convolve_2d_sr_ssse3
+void av1_highbd_convolve_2d_sr_intrabc_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_2d_sr_intrabc av1_highbd_convolve_2d_sr_intrabc_c
+
void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
@@ -366,10 +371,16 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *d
void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd);
#define av1_highbd_convolve_x_sr av1_highbd_convolve_x_sr_ssse3
+void av1_highbd_convolve_x_sr_intrabc_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_x_sr_intrabc av1_highbd_convolve_x_sr_intrabc_c
+
void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd);
void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd);
#define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_ssse3
+void av1_highbd_convolve_y_sr_intrabc_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd);
+#define av1_highbd_convolve_y_sr_intrabc av1_highbd_convolve_y_sr_intrabc_c
+
void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
#define av1_highbd_dist_wtd_convolve_2d av1_highbd_dist_wtd_convolve_2d_c
@@ -394,6 +405,9 @@ void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int
double av1_highbd_estimate_noise_from_single_plane_c(const uint16_t *src, int height, int width, int stride, int bit_depth, int edge_thresh);
#define av1_highbd_estimate_noise_from_single_plane av1_highbd_estimate_noise_from_single_plane_c
+void av1_highbd_filter_intra_edge_c(uint16_t *p, int sz, int strength);
+#define av1_highbd_filter_intra_edge av1_highbd_filter_intra_edge_c
+
void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add av1_highbd_inv_txfm_add_c
@@ -457,17 +471,20 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
-int64_t av1_highbd_pixel_proj_error_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
+int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
#define av1_highbd_pixel_proj_error av1_highbd_pixel_proj_error_c
void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
#define av1_highbd_quantize_fp av1_highbd_quantize_fp_c
+void av1_highbd_upsample_intra_edge_c(uint16_t *p, int sz, int bd);
+#define av1_highbd_upsample_intra_edge av1_highbd_upsample_intra_edge_c
+
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
#define av1_highbd_warp_affine av1_highbd_warp_affine_c
-void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd);
-void av1_highbd_wiener_convolve_add_src_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd);
+void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params, int bd);
+void av1_highbd_wiener_convolve_add_src_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params, int bd);
#define av1_highbd_wiener_convolve_add_src av1_highbd_wiener_convolve_add_src_ssse3
void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
@@ -535,15 +552,15 @@ void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff, int diff_s
void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
#define av1_lowbd_fwd_txfm av1_lowbd_fwd_txfm_sse2
-int64_t av1_lowbd_pixel_proj_error_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
+int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
#define av1_lowbd_pixel_proj_error av1_lowbd_pixel_proj_error_c
-void av1_nn_fast_softmax_16_c( const float *input_nodes, float *output);
-void av1_nn_fast_softmax_16_sse3( const float *input_nodes, float *output);
+void av1_nn_fast_softmax_16_c(const float *input_nodes, float *output);
+void av1_nn_fast_softmax_16_sse3(const float *input_nodes, float *output);
#define av1_nn_fast_softmax_16 av1_nn_fast_softmax_16_sse3
-void av1_nn_predict_c( const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output);
-void av1_nn_predict_sse3( const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output);
+void av1_nn_predict_c(const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output);
+void av1_nn_predict_sse3(const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output);
#define av1_nn_predict av1_nn_predict_sse3
void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale);
@@ -581,9 +598,6 @@ void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width, const
void av1_upsample_intra_edge_c(uint8_t *p, int sz);
#define av1_upsample_intra_edge av1_upsample_intra_edge_c
-void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
-#define av1_upsample_intra_edge_high av1_upsample_intra_edge_high_c
-
void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
#define av1_warp_affine av1_warp_affine_c
@@ -599,8 +613,8 @@ uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, con
uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);
#define av1_wedge_sse_from_residuals av1_wedge_sse_from_residuals_sse2
-void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
-void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params);
#define av1_wiener_convolve_add_src av1_wiener_convolve_add_src_sse2
void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
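A recurring change in this header is tightening the x4d prototypes from the open-ended "const uint8_t * const ref_ptr[]" / "uint32_t *sad_array" to explicit four-element arrays, documenting that these kernels always score a block against exactly four references at once. A minimal caller sketch, not part of the patch (buffer names hypothetical; high-bit-depth planes are assumed to be wrapped with libaom's CONVERT_TO_BYTEPTR macro):

    // Score one 64x64 source block against four candidate references.
    const uint8_t *refs[4] = {
      CONVERT_TO_BYTEPTR(ref0), CONVERT_TO_BYTEPTR(ref1),
      CONVERT_TO_BYTEPTR(ref2), CONVERT_TO_BYTEPTR(ref3)
    };
    uint32_t sads[4];
    aom_highbd_sad_skip_64x64x4d(CONVERT_TO_BYTEPTR(src), src_stride, refs,
                                 ref_stride, sads);
    // sads[i] now holds the subsampled "skip" SAD against refs[i].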
diff --git a/docs.cmake b/docs.cmake
index 0d8db9283..0d7b4cfde 100644
--- a/docs.cmake
+++ b/docs.cmake
@@ -223,7 +223,7 @@ function(setup_documentation_targets)
list(LENGTH AOM_DOXYGEN_EXAMPLE_SOURCES num_sources)
list(LENGTH AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS num_descs)
if(NOT ${num_sources} EQUAL ${num_descs})
- message(FATAL_ERROR "Unqeual example and description totals.")
+ message(FATAL_ERROR "Unequal example and description totals.")
endif()
# Take the list of examples and produce example_basename.dox for each file in
diff --git a/examples/lightfield_tile_list_decoder.c b/examples/lightfield_tile_list_decoder.c
index 5b15ae00e..d71ff5b38 100644
--- a/examples/lightfield_tile_list_decoder.c
+++ b/examples/lightfield_tile_list_decoder.c
@@ -170,7 +170,7 @@ int main(int argc, char **argv) {
if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt,
frame_res[0], frame_res[1], 32, 8,
border)) {
- die("Failed to allocate references.");
+ fatal("Failed to allocate references.");
}
}
}
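The die() to fatal() change above is more than cosmetic: in libaom's tools_common, die() also prints the command-line usage text (via usage_exit()) before exiting, which is misleading for a runtime allocation failure, whereas fatal() only reports the error and exits. A sketch of the two declarations as found in common/tools_common.h:

    void die(const char *fmt, ...);    // message, usage_exit(), then exit
    void fatal(const char *fmt, ...);  // message, then exit(EXIT_FAILURE)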
diff --git a/examples/resize_util.c b/examples/resize_util.c
deleted file mode 100644
index 45a1db202..000000000
--- a/examples/resize_util.c
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <limits.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "av1/common/resize.h"
-#include "common/tools_common.h"
-
-static const char *exec_name = NULL;
-
-static void usage() {
- printf("Usage:\n");
- printf("%s <input_yuv> <width>x<height> <target_width>x<target_height> ",
- exec_name);
- printf("<output_yuv> [<frames>]\n");
-}
-
-void usage_exit(void) {
- usage();
- exit(EXIT_FAILURE);
-}
-
-static int parse_dim(char *v, int *width, int *height) {
- char *x = strchr(v, 'x');
- if (x == NULL) x = strchr(v, 'X');
- if (x == NULL) return 0;
- *width = atoi(v);
- *height = atoi(&x[1]);
- if (*width <= 0 || *height <= 0)
- return 0;
- else
- return 1;
-}
-
-int main(int argc, char *argv[]) {
- char *fin, *fout;
- FILE *fpin, *fpout;
- uint8_t *inbuf, *outbuf;
- uint8_t *inbuf_u, *outbuf_u;
- uint8_t *inbuf_v, *outbuf_v;
- int f, frames;
- int width, height, target_width, target_height;
- int failed = 0;
-
- exec_name = argv[0];
-
- if (argc < 5) {
- printf("Incorrect parameters:\n");
- usage();
- return 1;
- }
-
- fin = argv[1];
- fout = argv[4];
- if (!parse_dim(argv[2], &width, &height)) {
- printf("Incorrect parameters: %s\n", argv[2]);
- usage();
- return 1;
- }
- if (!parse_dim(argv[3], &target_width, &target_height)) {
- printf("Incorrect parameters: %s\n", argv[3]);
- usage();
- return 1;
- }
-
- fpin = fopen(fin, "rb");
- if (fpin == NULL) {
- printf("Can't open file %s to read\n", fin);
- usage();
- return 1;
- }
- fpout = fopen(fout, "wb");
- if (fpout == NULL) {
- fclose(fpin);
- printf("Can't open file %s to write\n", fout);
- usage();
- return 1;
- }
- if (argc >= 6)
- frames = atoi(argv[5]);
- else
- frames = INT_MAX;
-
- printf("Input size: %dx%d\n", width, height);
- printf("Target size: %dx%d, Frames: ", target_width, target_height);
- if (frames == INT_MAX)
- printf("All\n");
- else
- printf("%d\n", frames);
-
- inbuf = (uint8_t *)malloc(width * height * 3 / 2);
- outbuf = (uint8_t *)malloc(target_width * target_height * 3 / 2);
- if (!(inbuf && outbuf)) {
- printf("Failed to allocate buffers.\n");
- failed = 1;
- goto Error;
- }
- inbuf_u = inbuf + width * height;
- inbuf_v = inbuf_u + width * height / 4;
- outbuf_u = outbuf + target_width * target_height;
- outbuf_v = outbuf_u + target_width * target_height / 4;
- f = 0;
- while (f < frames) {
- if (fread(inbuf, width * height * 3 / 2, 1, fpin) != 1) break;
- av1_resize_frame420(inbuf, width, inbuf_u, inbuf_v, width / 2, height,
- width, outbuf, target_width, outbuf_u, outbuf_v,
- target_width / 2, target_height, target_width);
- fwrite(outbuf, target_width * target_height * 3 / 2, 1, fpout);
- f++;
- }
- printf("%d frames processed\n", f);
-Error:
- fclose(fpin);
- fclose(fpout);
-
- free(inbuf);
- free(outbuf);
- return failed;
-}
diff --git a/examples/svc_encoder_rtc.cc b/examples/svc_encoder_rtc.cc
index 1730f898a..c37df7918 100644
--- a/examples/svc_encoder_rtc.cc
+++ b/examples/svc_encoder_rtc.cc
@@ -18,6 +18,8 @@
#include <stdlib.h>
#include <string.h>
+#include <memory>
+
#include "config/aom_config.h"
#if CONFIG_AV1_DECODER
@@ -30,6 +32,7 @@
#include "common/video_writer.h"
#include "examples/encoder_util.h"
#include "aom_ports/aom_timer.h"
+#include "av1/ratectrl_rtc.h"
#define OPTION_BUFFER_SIZE 1024
@@ -44,6 +47,7 @@ typedef struct {
int decode;
int tune_content;
int show_psnr;
+ bool use_external_rc;
} AppInput;
typedef enum {
@@ -99,6 +103,8 @@ static const arg_def_t test_decode_arg =
"Attempt to test decoding the output when set to 1. Default is 1.");
static const arg_def_t psnr_arg =
ARG_DEF(NULL, "psnr", -1, "Show PSNR in status line.");
+static const arg_def_t ext_rc_arg =
+ ARG_DEF(NULL, "use-ext-rc", 0, "Use external rate control.");
static const struct arg_enum_list tune_content_enum[] = {
{ "default", AOM_CONTENT_DEFAULT },
{ "screen", AOM_CONTENT_SCREEN },
@@ -372,6 +378,8 @@ static void parse_command_line(int argc, const char **argv_,
printf("tune content %d\n", app_input->tune_content);
} else if (arg_match(&arg, &psnr_arg, argi)) {
app_input->show_psnr = 1;
+ } else if (arg_match(&arg, &ext_rc_arg, argi)) {
+ app_input->use_external_rc = true;
} else {
++argj;
}
@@ -429,10 +437,12 @@ static void parse_command_line(int argc, const char **argv_,
enc_cfg->rc_target_bitrate, enc_cfg->kf_max_dist);
}
-static int mode_to_num_temporal_layers[11] = {
- 1, 2, 3, 3, 2, 1, 1, 3, 3, 3, 3
+static int mode_to_num_temporal_layers[12] = {
+ 1, 2, 3, 3, 2, 1, 1, 3, 3, 3, 3, 3,
+};
+static int mode_to_num_spatial_layers[12] = {
+ 1, 1, 1, 1, 1, 2, 3, 2, 3, 3, 3, 3,
};
-static int mode_to_num_spatial_layers[11] = { 1, 1, 1, 1, 1, 2, 3, 2, 3, 3, 3 };
// For rate control encoding stats.
struct RateControlMetrics {
@@ -607,6 +617,7 @@ static void set_layer_pattern(
int i;
int enable_longterm_temporal_ref = 1;
int shift = (layering_mode == 8) ? 2 : 0;
+ int simulcast_mode = (layering_mode == 11);
*use_svc_control = 1;
layer_id->spatial_layer_id = spatial_layer_id;
int lag_index = 0;
@@ -1102,7 +1113,173 @@ static void set_layer_pattern(
ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 4;
}
}
- if (layer_id->spatial_layer_id > 0) {
+ break;
+ case 11:
+ // Simulcast mode for 3 spatial and 3 temporal layers.
+      // No inter-layer prediction; the only prediction is temporal, from a
+      // single reference (LAST).
+ // No overlap in buffer slots between spatial layers. So for example,
+ // SL0 only uses slots 0 and 1.
+ // SL1 only uses slots 2 and 3.
+ // SL2 only uses slots 4 and 5.
+ // All 7 references for each inter-frame must only access buffer slots
+ // for that spatial layer.
+ // On key (super)frames: SL1 and SL2 must have no references set
+ // and must refresh all the slots for that layer only (so 2 and 3
+ // for SL1, 4 and 5 for SL2). The base SL0 will be labelled internally
+ // as a Key frame (refresh all slots). SL1/SL2 will be labelled
+ // internally as Intra-only frames that allow that stream to be decoded.
+      // These conditions allow each spatial stream to be independently
+      // decodable.
+
+ // Initialize all references to 0 (don't use reference).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->reference[i] = 0;
+ // Initialize as no refresh/update for all slots.
+ for (i = 0; i < REF_FRAMES; i++) ref_frame_config->refresh[i] = 0;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 0;
+
+ if (is_key_frame) {
+ if (layer_id->spatial_layer_id == 0) {
+ // Assign LAST/GOLDEN to slot 0/1.
+          // Refresh slots 0 and 1 for SL0.
+ // SL0: this will get set to KEY frame internally.
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 0;
+ ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 1;
+ ref_frame_config->refresh[0] = 1;
+ ref_frame_config->refresh[1] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Assign LAST/GOLDEN to slot 2/3.
+          // Refresh slots 2 and 3 for SL1.
+ // This will get set to Intra-only frame internally.
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
+ ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ ref_frame_config->refresh[2] = 1;
+ ref_frame_config->refresh[3] = 1;
+ } else if (layer_id->spatial_layer_id == 2) {
+ // Assign LAST/GOLDEN to slot 4/5.
+ // Refresh slots 4 and 5 for SL2.
+ // This will get set to Intra-only frame internally.
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 4;
+ ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 5;
+ ref_frame_config->refresh[4] = 1;
+ ref_frame_config->refresh[5] = 1;
+ }
+ } else if (superframe_cnt % 4 == 0) {
+ // Base temporal layer: TL0
+ layer_id->temporal_layer_id = 0;
+ if (layer_id->spatial_layer_id == 0) { // SL0
+ // Reference LAST. Assign all references to either slot
+ // 0 or 1. Here we assign LAST to slot 0, all others to 1.
+ // Update slot 0 (LAST).
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 1;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 0;
+ ref_frame_config->refresh[0] = 1;
+ } else if (layer_id->spatial_layer_id == 1) { // SL1
+ // Reference LAST. Assign all references to either slot
+ // 2 or 3. Here we assign LAST to slot 2, all others to 3.
+ // Update slot 2 (LAST).
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 3;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
+ ref_frame_config->refresh[2] = 1;
+ } else if (layer_id->spatial_layer_id == 2) { // SL2
+ // Reference LAST. Assign all references to either slot
+ // 4 or 5. Here we assign LAST to slot 4, all others to 5.
+ // Update slot 4 (LAST).
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 5;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 4;
+ ref_frame_config->refresh[4] = 1;
+ }
+ } else if ((superframe_cnt - 1) % 4 == 0) {
+ // First top temporal enhancement layer: TL2
+ layer_id->temporal_layer_id = 2;
+ if (layer_id->spatial_layer_id == 0) { // SL0
+ // Reference LAST (slot 0). Assign other references to slot 1.
+ // No update/refresh on any slots.
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 1;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 0;
+ } else if (layer_id->spatial_layer_id == 1) { // SL1
+ // Reference LAST (slot 2). Assign other references to slot 3.
+ // No update/refresh on any slots.
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 3;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
+ } else if (layer_id->spatial_layer_id == 2) { // SL2
+          // Reference LAST (slot 4). Assign other references to slot 5.
+ // No update/refresh on any slots.
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 5;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 4;
+ }
+ } else if ((superframe_cnt - 2) % 4 == 0) {
+ // Middle temporal enhancement layer: TL1
+ layer_id->temporal_layer_id = 1;
+ if (layer_id->spatial_layer_id == 0) { // SL0
+ // Reference LAST (slot 0).
+ // Set GOLDEN to slot 1 and update slot 1.
+ // This will be used as reference for next TL2.
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 1;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 0;
+ ref_frame_config->refresh[1] = 1;
+ } else if (layer_id->spatial_layer_id == 1) { // SL1
+ // Reference LAST (slot 2).
+ // Set GOLDEN to slot 3 and update slot 3.
+ // This will be used as reference for next TL2.
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 3;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
+ ref_frame_config->refresh[3] = 1;
+ } else if (layer_id->spatial_layer_id == 2) { // SL2
+ // Reference LAST (slot 4).
+ // Set GOLDEN to slot 5 and update slot 5.
+ // This will be used as reference for next TL2.
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 5;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 4;
+ ref_frame_config->refresh[5] = 1;
+ }
+ } else if ((superframe_cnt - 3) % 4 == 0) {
+ // Second top temporal enhancement layer: TL2
+ layer_id->temporal_layer_id = 2;
+ if (layer_id->spatial_layer_id == 0) { // SL0
+ // Reference LAST (slot 1). Assign other references to slot 0.
+ // No update/refresh on any slots.
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+ } else if (layer_id->spatial_layer_id == 1) { // SL1
+ // Reference LAST (slot 3). Assign other references to slot 2.
+ // No update/refresh on any slots.
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 2;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 3;
+ } else if (layer_id->spatial_layer_id == 2) { // SL2
+ // Reference LAST (slot 5). Assign other references to slot 4.
+ // No update/refresh on any slots.
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 4;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 5;
+ }
+ }
+ if (!simulcast_mode && layer_id->spatial_layer_id > 0) {
// Always reference GOLDEN (inter-layer prediction).
ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
if (ksvc_mode) {
@@ -1120,8 +1297,8 @@ static void set_layer_pattern(
// allow for top spatial layer to use additional temporal reference.
// Additional reference is only updated on base temporal layer, every
// 10 TL0 frames here.
- if (enable_longterm_temporal_ref && layer_id->spatial_layer_id == 2 &&
- layering_mode == 8) {
+ if (!simulcast_mode && enable_longterm_temporal_ref &&
+ layer_id->spatial_layer_id == 2 && layering_mode == 8) {
ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = REF_FRAMES - 1;
if (!is_key_frame) ref_frame_config->reference[SVC_ALTREF_FRAME] = 1;
if (base_count % 10 == 0 && layer_id->temporal_layer_id == 0)
@@ -1220,6 +1397,51 @@ static void show_psnr(struct psnr_stats *psnr_stream, double peak) {
fprintf(stderr, "\n");
}
+static aom::AV1RateControlRtcConfig create_rtc_rc_config(
+ const aom_codec_enc_cfg_t &cfg, const AppInput &app_input) {
+ aom::AV1RateControlRtcConfig rc_cfg;
+ rc_cfg.width = cfg.g_w;
+ rc_cfg.height = cfg.g_h;
+ rc_cfg.max_quantizer = cfg.rc_max_quantizer;
+ rc_cfg.min_quantizer = cfg.rc_min_quantizer;
+ rc_cfg.target_bandwidth = cfg.rc_target_bitrate;
+ rc_cfg.buf_initial_sz = cfg.rc_buf_initial_sz;
+ rc_cfg.buf_optimal_sz = cfg.rc_buf_optimal_sz;
+ rc_cfg.buf_sz = cfg.rc_buf_sz;
+ rc_cfg.overshoot_pct = cfg.rc_overshoot_pct;
+ rc_cfg.undershoot_pct = cfg.rc_undershoot_pct;
+  // Hardcoded to match the value set via AOME_SET_MAX_INTRA_BITRATE_PCT.
+ rc_cfg.max_intra_bitrate_pct = 300;
+ rc_cfg.framerate = cfg.g_timebase.den;
+  // TODO(jianj): Add support for SVC.
+ rc_cfg.ss_number_layers = 1;
+ rc_cfg.ts_number_layers = 1;
+ rc_cfg.scaling_factor_num[0] = 1;
+ rc_cfg.scaling_factor_den[0] = 1;
+ rc_cfg.layer_target_bitrate[0] = static_cast<int>(rc_cfg.target_bandwidth);
+ rc_cfg.max_quantizers[0] = rc_cfg.max_quantizer;
+ rc_cfg.min_quantizers[0] = rc_cfg.min_quantizer;
+ rc_cfg.aq_mode = app_input.aq_mode;
+
+ return rc_cfg;
+}
+
+static int qindex_to_quantizer(int qindex) {
+  // Table mapping the external 0-63 quantizer scale to the 0-255 qindex
+  // range used internally; the loop below inverts the mapping.
+ static const int quantizer_to_qindex[] = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48,
+ 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100,
+ 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152,
+ 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204,
+ 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 249, 255,
+ };
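+  // Example: qindex 128 equals quantizer_to_qindex[32], so the loop below
+  // returns 32; a qindex that falls between entries rounds up to the next
+  // quantizer step.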
+ for (int quantizer = 0; quantizer < 64; ++quantizer)
+ if (quantizer_to_qindex[quantizer] >= qindex) return quantizer;
+
+ return 63;
+}
+
int main(int argc, const char **argv) {
AppInput app_input;
AvxVideoWriter *outfile[AOM_MAX_LAYERS] = { NULL };
@@ -1447,6 +1669,12 @@ int main(int argc, const char **argv) {
aom_codec_control(&codec, AV1E_SET_ENABLE_INTRABC, 0);
}
+ if (app_input.use_external_rc) {
+ aom_codec_control(&codec, AV1E_SET_RTC_EXTERNAL_RC, 1);
+ }
+
+ aom_codec_control(&codec, AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, INT_MAX);
+
svc_params.number_spatial_layers = ss_number_layers;
svc_params.number_temporal_layers = ts_number_layers;
for (i = 0; i < ss_number_layers * ts_number_layers; ++i) {
@@ -1483,6 +1711,13 @@ int main(int argc, const char **argv) {
frame_cnt_layer[lx] = 0;
}
+ std::unique_ptr<aom::AV1RateControlRTC> rc_api;
+ if (app_input.use_external_rc) {
+ const aom::AV1RateControlRtcConfig rc_cfg =
+ create_rtc_rc_config(cfg, app_input);
+ rc_api = aom::AV1RateControlRTC::Create(rc_cfg);
+ }
+
frame_avail = 1;
struct psnr_stats psnr_stream;
memset(&psnr_stream, 0, sizeof(psnr_stream));
@@ -1621,6 +1856,21 @@ int main(int argc, const char **argv) {
die_codec(&codec, "Failed to SET_BITRATE_ONE_PASS_CBR");
}
+ if (rc_api) {
+ aom::AV1FrameParamsRTC frame_params;
+ // TODO(jianj): Add support for SVC.
+ frame_params.spatial_layer_id = 0;
+ frame_params.temporal_layer_id = 0;
+ frame_params.frame_type =
+ is_key_frame ? aom::kKeyFrame : aom::kInterFrame;
+ rc_api->ComputeQP(frame_params);
+ const int current_qp = rc_api->GetQP();
+ if (aom_codec_control(&codec, AV1E_SET_QUANTIZER_ONE_PASS,
+ qindex_to_quantizer(current_qp))) {
+ die_codec(&codec, "Failed to SET_QUANTIZER_ONE_PASS");
+ }
+ }
+
// Do the layer encode.
aom_usec_timer_start(&timer);
if (aom_codec_encode(&codec, frame_avail ? &raw : NULL, pts, 1, flags))
@@ -1631,10 +1881,14 @@ int main(int argc, const char **argv) {
frame_cnt_layer[layer] += 1;
got_data = 0;
+      // For simulcast (mode 11): write each spatial layer only to its own
+      // output file rather than to all higher-layer files.
+ int ss_layers_write = (app_input.layering_mode == 11)
+ ? layer_id.spatial_layer_id + 1
+ : ss_number_layers;
while ((pkt = aom_codec_get_cx_data(&codec, &iter))) {
switch (pkt->kind) {
case AOM_CODEC_CX_FRAME_PKT:
- for (int sl = layer_id.spatial_layer_id; sl < ss_number_layers;
+ for (int sl = layer_id.spatial_layer_id; sl < ss_layers_write;
++sl) {
for (int tl = layer_id.temporal_layer_id; tl < ts_number_layers;
++tl) {
@@ -1675,6 +1929,9 @@ int main(int argc, const char **argv) {
if (slx == 0) ++rc.layer_enc_frames[layer_id.temporal_layer_id];
}
+ if (rc_api) {
+ rc_api->PostEncodeUpdate(pkt->data.frame.sz);
+ }
// Update for short-time encoding bitrate states, for moving window
// of size rc->window, shifted by rc->window / 2.
// Ignore first window segment, due to key frame.
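Taken together, the --use-ext-rc additions above form a small closed loop around each encode call. A condensed sketch of that flow (names as in the patch; error handling and the per-packet loop omitted, single layer assumed):

    std::unique_ptr<aom::AV1RateControlRTC> rc_api =
        aom::AV1RateControlRTC::Create(create_rtc_rc_config(cfg, app_input));

    aom::AV1FrameParamsRTC frame_params;
    frame_params.spatial_layer_id = 0;   // SVC not yet supported here
    frame_params.temporal_layer_id = 0;
    frame_params.frame_type = is_key_frame ? aom::kKeyFrame : aom::kInterFrame;
    rc_api->ComputeQP(frame_params);     // external controller picks a qindex

    // GetQP() returns a 0-255 qindex; AV1E_SET_QUANTIZER_ONE_PASS expects
    // the 0-63 quantizer scale, hence qindex_to_quantizer().
    aom_codec_control(&codec, AV1E_SET_QUANTIZER_ONE_PASS,
                      qindex_to_quantizer(rc_api->GetQP()));

    // ... aom_codec_encode(&codec, ...) and packet retrieval ...

    rc_api->PostEncodeUpdate(pkt->data.frame.sz);  // feed back actual size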
diff --git a/generate_config.sh b/generate_config.sh
index e4a3b6cce..f6bd049c5 100755
--- a/generate_config.sh
+++ b/generate_config.sh
@@ -24,10 +24,9 @@
# Toolchain for riscv64:
# - gcc-riscv64-linux-gnu
# - g++-riscv64-linux-gnu
-# 32bit build environment for cmake. Including but potentially not limited to:
-# - lib32gcc-7-dev
-# - lib32stdc++-7-dev
-# Alternatively: treat 32bit builds like Windows and manually tweak aom_config.h
+# Toolchain for x86:
+# - gcc-i686-linux-gnu
+# - g++-i686-linux-gnu
set -eE
@@ -110,7 +109,8 @@ all_platforms+=" -DCONFIG_RUNTIME_CPU_DETECT=0"
toolchain="-DCMAKE_TOOLCHAIN_FILE=${SRC}/build/cmake/toolchains"
reset_dirs x86
-gen_config_files x86 "${toolchain}/x86-linux.cmake ${all_platforms} -DCONFIG_PIC=1"
+gen_config_files x86 \
+ "${toolchain}/i686-linux-gcc.cmake ${all_platforms} -DCONFIG_PIC=1"
# libaom_srcs.gni and aom_version.h are shared.
cp libaom_srcs.gni "${BASE}"
@@ -123,7 +123,8 @@ reset_dirs arm
gen_config_files arm "${toolchain}/armv7-linux-gcc.cmake ${all_platforms}"
reset_dirs arm64
-gen_config_files arm64 "${toolchain}/arm64-linux-gcc.cmake ${all_platforms}"
+gen_config_files arm64 "${toolchain}/arm64-linux-gcc.cmake ${all_platforms} \
+ -DENABLE_ARM_CRC32=0 -DENABLE_NEON_DOTPROD=0 -DENABLE_NEON_I8MM=0"
reset_dirs riscv64
gen_config_files riscv64 "${toolchain}/riscv-linux-gcc.cmake ${all_platforms}"
diff --git a/libaom_blocklist.txt b/libaom_blocklist.txt
index 06a721b0d..02b850846 100644
--- a/libaom_blocklist.txt
+++ b/libaom_blocklist.txt
@@ -26,3 +26,5 @@ fun:av1_get_perpixel_variance_facade
fun:highbd_10_variance_sse2
# libaom/av1/encoder/encodeframe_utils.c: indirect call to assembly code on x86/x86_64 platform
fun:fast_detect_non_zero_motion
+# libaom/av1/common/reconintra.c: indirect call to assembly code on x86/x86_64 platform
+fun:highbd_build_intra_predictors
diff --git a/test/active_map_test.cc b/test/active_map_test.cc
index a9f7f85ac..979ee6b8b 100644
--- a/test/active_map_test.cc
+++ b/test/active_map_test.cc
@@ -27,15 +27,15 @@ class ActiveMapTest
static const int kHeight = 144;
ActiveMapTest() : EncoderTest(GET_PARAM(0)) {}
- virtual ~ActiveMapTest() {}
+ ~ActiveMapTest() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(GET_PARAM(1));
cpu_used_ = GET_PARAM(2);
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, cpu_used_);
encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0);
diff --git a/test/allintra_end_to_end_test.cc b/test/allintra_end_to_end_test.cc
index 98a7973c7..8ec24aa68 100644
--- a/test/allintra_end_to_end_test.cc
+++ b/test/allintra_end_to_end_test.cc
@@ -56,25 +56,25 @@ class AllIntraEndToEndTest
deltaq_mode_(GET_PARAM(3)), threads_(GET_PARAM(4)),
tile_columns_(GET_PARAM(5)), enable_tx_size_search_(GET_PARAM(6)) {}
- virtual ~AllIntraEndToEndTest() {}
+ ~AllIntraEndToEndTest() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(::libaom_test::kAllIntra);
cfg_.g_threads = threads_;
}
- virtual void BeginPassHook(unsigned int) {
+ void BeginPassHook(unsigned int) override {
psnr_ = 0.0;
nframes_ = 0;
}
- virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
psnr_ += pkt->data.psnr.psnr[0];
nframes_++;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AV1E_SET_ROW_MT, 1);
encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
diff --git a/test/altref_test.cc b/test/altref_test.cc
index 002a20696..081123cbe 100644
--- a/test/altref_test.cc
+++ b/test/altref_test.cc
@@ -58,9 +58,9 @@ class AltRefFramePresenceTestLarge
rc_end_usage_(GET_PARAM(2)) {
is_arf_frame_present_ = 0;
}
- virtual ~AltRefFramePresenceTestLarge() {}
+ ~AltRefFramePresenceTestLarge() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(altref_test_params_.encoding_mode);
const aom_rational timebase = { 1, 30 };
cfg_.g_timebase = timebase;
@@ -71,10 +71,10 @@ class AltRefFramePresenceTestLarge
cfg_.g_lag_in_frames = altref_test_params_.lag_in_frames;
}
- virtual bool DoDecode() const { return 1; }
+ bool DoDecode() const override { return true; }
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, 5);
encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
@@ -85,8 +85,8 @@ class AltRefFramePresenceTestLarge
}
}
- virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
- libaom_test::Decoder *decoder) {
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
if (is_arf_frame_present_ != 1 && AOM_CODEC_OK == res_dec) {
aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
@@ -149,9 +149,9 @@ class GoldenFrameIntervalTestLarge
limit_ = 60;
frame_num_ = 0;
}
- virtual ~GoldenFrameIntervalTestLarge() {}
+ ~GoldenFrameIntervalTestLarge() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(gf_interval_param_.encoding_mode);
const aom_rational timebase = { 1, 30 };
cfg_.g_timebase = timebase;
@@ -166,10 +166,10 @@ class GoldenFrameIntervalTestLarge
cfg_.rc_target_bitrate = 1000;
}
- virtual bool DoDecode() const { return 1; }
+ bool DoDecode() const override { return true; }
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, 5);
encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
@@ -189,7 +189,7 @@ class GoldenFrameIntervalTestLarge
}
}
- virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
(void)pkt;
++frame_num_;
}
diff --git a/test/aq_segment_test.cc b/test/aq_segment_test.cc
index b4a8b612b..674a883ea 100644
--- a/test/aq_segment_test.cc
+++ b/test/aq_segment_test.cc
@@ -32,16 +32,16 @@ class AqSegmentTest
public ::libaom_test::EncoderTest {
protected:
AqSegmentTest() : EncoderTest(GET_PARAM(0)) {}
- virtual ~AqSegmentTest() {}
+ ~AqSegmentTest() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(GET_PARAM(1));
set_cpu_used_ = GET_PARAM(2);
aq_mode_ = 0;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
diff --git a/test/arf_freq_test.cc b/test/arf_freq_test.cc
index 63ccdfc26..f51444da4 100644
--- a/test/arf_freq_test.cc
+++ b/test/arf_freq_test.cc
@@ -80,9 +80,9 @@ class ArfFreqTestLarge
: EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
test_encode_param_(GET_PARAM(2)), min_arf_requested_(GET_PARAM(3)) {}
- virtual ~ArfFreqTestLarge() {}
+ ~ArfFreqTestLarge() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(test_encode_param_.mode);
if (test_encode_param_.mode != ::libaom_test::kRealTime) {
cfg_.g_lag_in_frames = 25;
@@ -93,7 +93,7 @@ class ArfFreqTestLarge
}
}
- virtual void BeginPassHook(unsigned int) {
+ void BeginPassHook(unsigned int) override {
min_run_ = ARF_NOT_SEEN;
run_of_visible_frames_ = 0;
}
@@ -115,7 +115,7 @@ class ArfFreqTestLarge
return frames;
}
- virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) return;
const int frames = GetNumFramesInPkt(pkt);
if (frames == 1) {
@@ -134,8 +134,8 @@ class ArfFreqTestLarge
}
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
diff --git a/test/av1_c_vs_simd_encode.sh b/test/av1_c_vs_simd_encode.sh
new file mode 100644
index 000000000..cc547c890
--- /dev/null
+++ b/test/av1_c_vs_simd_encode.sh
@@ -0,0 +1,535 @@
+#!/bin/sh
+## Copyright (c) 2023, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This script checks the bit exactness between C and SIMD
+## implementations of the AV1 encoder.
+
+PRESETS="good rt"
+LOWBD_CLIPS="yuv_raw_input yuv_480p_raw_input y4m_720p_input y4m_screen_input"
+HIGHBD_CLIPS="y4m_360p_10bit_input"
+OUT_FILE_SUFFIX=".ivf"
+SCRIPT_DIR=$(dirname "$0")
+LIBAOM_SOURCE_DIR=$(cd ${SCRIPT_DIR}/..; pwd)
+devnull='> /dev/null 2>&1'
+
+# Clips used in test.
+YUV_RAW_INPUT="${LIBAOM_TEST_DATA_PATH}/hantro_collage_w352h288.yuv"
+YUV_480P_RAW_INPUT="${LIBAOM_TEST_DATA_PATH}/niklas_640_480_30.yuv"
+Y4M_360P_10BIT_INPUT="${LIBAOM_TEST_DATA_PATH}/crowd_run_360p_10_150f.y4m"
+Y4M_720P_INPUT="${LIBAOM_TEST_DATA_PATH}/niklas_1280_720_30.y4m"
+Y4M_SCREEN_INPUT="${LIBAOM_TEST_DATA_PATH}/wikipedia_420_360p_60f.y4m"
+
+# Number of frames to test.
+AV1_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT=35
+
+# Create a temporary directory for output files.
+if [ -n "${TMPDIR}" ]; then
+ AOM_TEST_TEMP_ROOT="${TMPDIR}"
+elif [ -n "${TEMPDIR}" ]; then
+ AOM_TEST_TEMP_ROOT="${TEMPDIR}"
+else
+ AOM_TEST_TEMP_ROOT=/tmp
+fi
+
+AOM_TEST_OUTPUT_DIR="${AOM_TEST_TEMP_ROOT}/av1_test_$$"
+
+if ! mkdir -p "${AOM_TEST_OUTPUT_DIR}" || \
+ [ ! -d "${AOM_TEST_OUTPUT_DIR}" ]; then
+ echo "${0##*/}: Cannot create output directory, giving up."
+ echo "${0##*/}: AOM_TEST_OUTPUT_DIR=${AOM_TEST_OUTPUT_DIR}"
+ exit 1
+fi
+
+elog() {
+ echo "$@" 1>&2
+}
+
+# Echoes the path to the aomenc binary built for target $1 with preset $2
+# under ${AOM_TEST_OUTPUT_DIR} when it exists and is executable, or an empty
+# string. Caller is responsible for testing the string once the function
+# returns.
+av1_enc_tool_path() {
+ local target="$1"
+ local preset="$2"
+ local tool_path="${AOM_TEST_OUTPUT_DIR}/build_target_${target}/aomenc_${preset}"
+
+ if [ ! -x "${tool_path}" ]; then
+ tool_path=""
+ fi
+ echo "${tool_path}"
+}
+
+# Environment check: Make sure input and source directories are available.
+av1_c_vs_simd_enc_verify_environment () {
+ if [ ! -e "${YUV_RAW_INPUT}" ]; then
+ elog "libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+ if [ ! -e "${Y4M_360P_10BIT_INPUT}" ]; then
+ elog "libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+ if [ ! -e "${YUV_480P_RAW_INPUT}" ]; then
+ elog "libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+ if [ ! -e "${Y4M_720P_INPUT}" ]; then
+ elog "libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+ if [ ! -e "${Y4M_SCREEN_INPUT}" ]; then
+ elog "libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+ if [ ! -d "$LIBAOM_SOURCE_DIR" ]; then
+ elog "LIBAOM_SOURCE_DIR does not exist."
+ return 1
+ fi
+}
+
+cleanup() {
+  rm -rf "${AOM_TEST_OUTPUT_DIR}"
+}
+
+# Echo the AOM_SIMD_CAPS_MASK value for each instruction set architecture.
+avx512f() {
+ echo "0x1FF"
+}
+
+avx2() {
+ echo "0x0FF"
+}
+
+avx() {
+ echo "0x07F"
+}
+
+sse4_1() {
+ echo "0x03F"
+}
+
+ssse3() {
+ echo "0x01F"
+}
+
+sse3() {
+ echo "0x00F"
+}
+
+sse2() {
+ echo "0x007"
+}
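+# Example: export AOM_SIMD_CAPS_MASK=$(sse4_1) restricts aomenc to SSE4_1 and
+# lower variants (see av1_test_x86 below).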
+
+get_bitrates() {
+ local content=$1
+ local preset=$2
+
+ # Bit-rates:
+ local bitrate_lowres_good="100 1000"
+ local bitrate_480p_good="200 2000"
+ local bitrate_720p_good="600 6000"
+ local bitrate_scc_360p_good="400 1200"
+ local bitrate_lowres_rt="50 400"
+ local bitrate_480p_rt="100 1800"
+ local bitrate_720p_rt="150 2000"
+ local bitrate_scc_360p_rt="400 800"
+ local bitrate_hbd_360p="100 1600"
+
+ if [ "${preset}" = "good" ]; then
+ if [ "${content}" = "yuv_raw_input" ]; then
+ echo "${bitrate_lowres_good}"
+ elif [ "${content}" = "yuv_480p_raw_input" ]; then
+ echo "${bitrate_480p_good}"
+ elif [ "${content}" = "y4m_720p_input" ]; then
+ echo "${bitrate_720p_good}"
+ elif [ "${content}" = "y4m_screen_input" ]; then
+ echo "${bitrate_scc_360p_good}"
+ elif [ "${content}" = "y4m_360p_10bit_input" ]; then
+ echo "${bitrate_hbd_360p}"
+ else
+ elog "Invalid content"
+ fi
+ elif [ "${preset}" = "rt" ]; then
+ if [ "${content}" = "yuv_raw_input" ]; then
+ echo "${bitrate_lowres_rt}"
+ elif [ "${content}" = "yuv_480p_raw_input" ]; then
+ echo "${bitrate_480p_rt}"
+ elif [ "${content}" = "y4m_720p_input" ]; then
+ echo "${bitrate_720p_rt}"
+ elif [ "${content}" = "y4m_screen_input" ]; then
+ echo "${bitrate_scc_360p_rt}"
+ elif [ "${content}" = "y4m_360p_10bit_input" ]; then
+ echo "${bitrate_hbd_360p}"
+ else
+ elog "Invalid content"
+ fi
+ else
+ elog "invalid preset"
+ fi
+}
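+# Example: `get_bitrates yuv_raw_input good` echoes "100 1000".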
+
+# Echo clip details to be used as input to aomenc.
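+# Example: $(yuv_raw_input) expands to the clip path plus its width, height
+# and bit-depth arguments.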
+yuv_raw_input() {
+ echo ""${YUV_RAW_INPUT}"
+ --width=352
+ --height=288
+ --bit-depth=8"
+}
+
+y4m_360p_10bit_input() {
+ echo ""${Y4M_360P_10BIT_INPUT}"
+ --bit-depth=10"
+}
+
+yuv_480p_raw_input() {
+ echo ""${YUV_480P_RAW_INPUT}"
+ --width=640
+ --height=480
+ --bit-depth=8"
+}
+
+y4m_720p_input() {
+ echo ""${Y4M_720P_INPUT}"
+ --bit-depth=8"
+}
+
+y4m_screen_input() {
+ echo ""${Y4M_SCREEN_INPUT}"
+ --tune-content=screen
+ --enable-palette=1
+ --bit-depth=8"
+}
+
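+# Returns 0 if the given instruction set extension is listed in
+# /proc/cpuinfo, 1 otherwise.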
+has_x86_isa_extn() {
+ instruction_set=$1
+ grep -q "$instruction_set" /proc/cpuinfo
+ if [ $? -eq 1 ]; then
+ return 1
+ fi
+}
+
+# Echo good encode params for use with AV1 encoder.
+av1_encode_good_params() {
+ echo "--good \
+ --ivf \
+ --profile=0 \
+ --static-thresh=0 \
+ --threads=1 \
+ --tile-columns=0 \
+ --tile-rows=0 \
+ --verbose \
+ --end-usage=vbr \
+ --kf-max-dist=160 \
+ --kf-min-dist=0 \
+ --max-q=63 \
+ --min-q=0 \
+ --overshoot-pct=100 \
+ --undershoot-pct=100 \
+ --passes=2 \
+ --arnr-maxframes=7 \
+ --arnr-strength=5 \
+ --auto-alt-ref=1 \
+ --drop-frame=0 \
+ --frame-parallel=0 \
+ --lag-in-frames=35 \
+ --maxsection-pct=2000 \
+ --minsection-pct=0 \
+ --sharpness=0"
+}
+
+# Echo realtime encode params for use with AV1 encoder.
+av1_encode_rt_params() {
+ echo "--rt \
+ --ivf \
+ --profile=0 \
+ --static-thresh=0 \
+ --threads=1 \
+ --tile-columns=0 \
+ --tile-rows=0 \
+ --verbose \
+ --end-usage=cbr \
+ --kf-max-dist=90000 \
+ --max-q=58 \
+ --min-q=2 \
+ --overshoot-pct=50 \
+ --undershoot-pct=50 \
+ --passes=1 \
+ --aq-mode=3 \
+ --buf-initial-sz=500 \
+ --buf-optimal-sz=600 \
+ --buf-sz=1000 \
+ --coeff-cost-upd-freq=3 \
+ --dv-cost-upd-freq=3 \
+ --mode-cost-upd-freq=3 \
+ --mv-cost-upd-freq=3 \
+ --deltaq-mode=0 \
+ --enable-global-motion=0 \
+ --enable-obmc=0 \
+ --enable-order-hint=0 \
+ --enable-ref-frame-mvs=0 \
+ --enable-tpl-model=0 \
+ --enable-warped-motion=0 \
+ --lag-in-frames=0 \
+ --max-intra-rate=300 \
+ --noise-sensitivity=0"
+}
+
+# Configures and builds aomenc for the given target in the
+# AOM_TEST_OUTPUT_DIR/build_target_${target} directory.
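+# On success the build directory contains the aomenc_good and aomenc_rt
+# binaries.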
+av1_enc_build() {
+ local target="$1"
+ local cmake_command="$2"
+ local tmp_build_dir=${AOM_TEST_OUTPUT_DIR}/build_target_${target}
+ if [ -d "$tmp_build_dir" ]; then
+ rm -rf $tmp_build_dir
+ fi
+
+ mkdir -p $tmp_build_dir
+ cd $tmp_build_dir
+
+ local cmake_common_args="-DCONFIG_EXCLUDE_SIMD_MISMATCH=1 \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DENABLE_CCACHE=1 \
+ '-DCMAKE_C_FLAGS_RELEASE=-O3 -g' \
+ '-DCMAKE_CXX_FLAGS_RELEASE=-O3 -g'"
+
+ for preset in $PRESETS; do
+ echo "Building target[${preset} encoding]: ${target}"
+ if [ "${preset}" = "good" ]; then
+ local cmake_extra_args="-DCONFIG_AV1_HIGHBITDEPTH=1"
+ elif [ "${preset}" = "rt" ]; then
+ local cmake_extra_args="-DCONFIG_REALTIME_ONLY=1 -DCONFIG_AV1_HIGHBITDEPTH=0"
+ else
+ elog "Invalid preset"
+ return 1
+ fi
+ eval "$cmake_command" "${cmake_common_args}" "${cmake_extra_args}" ${devnull}
+ eval make -j$(nproc) ${devnull}
+ mv aomenc aomenc_${preset}
+ done
+ echo "Done building target: ${target}"
+}
+
+compare_enc_output() {
+ local target=$1
+ local cpu=$2
+ local clip=$3
+ local bitrate=$4
+ local preset=$5
+ diff ${AOM_TEST_OUTPUT_DIR}/Out-generic-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \
+ ${AOM_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} > /dev/null
+ if [ $? -eq 1 ]; then
+ elog "C vs ${target} encode mismatches for ${clip}, at ${bitrate} kbps, speed ${cpu}, ${preset} preset"
+ return 1
+ fi
+}
+
+av1_enc_test() {
+ local encoder="$1"
+ local target="$2"
+ local preset="$3"
+ if [ -z "$(av1_enc_tool_path "${target}" "${preset}")" ]; then
+ elog "aomenc_{preset} not found. It must exist in ${AOM_TEST_OUTPUT_DIR}/build_target_${target} path"
+ return 1
+ fi
+
+ if [ "${preset}" = "good" ]; then
+ local min_cpu_used=0
+ local max_cpu_used=6
+ local test_params=av1_encode_good_params
+ if [ "${target}" = "armv8-linux-gcc" ]; then
+ # TODO(BUG=aomedia:3474): Enable testing of high bit-depth clips after
+ # fixing C vs SIMD mismatches.
+ local test_clips="${LOWBD_CLIPS}"
+ else
+ local test_clips="${LOWBD_CLIPS} ${HIGHBD_CLIPS}"
+ fi
+ elif [ "${preset}" = "rt" ]; then
+ local min_cpu_used=5
+ local max_cpu_used=10
+ local test_params=av1_encode_rt_params
+ local test_clips="${LOWBD_CLIPS}"
+ else
+ elog "Invalid preset"
+ return 1
+ fi
+
+ for cpu in $(seq $min_cpu_used $max_cpu_used); do
+ for clip in ${test_clips}; do
+ local test_bitrates=$(get_bitrates ${clip} ${preset})
+ for bitrate in ${test_bitrates}; do
+ eval "${encoder}" $($clip) $($test_params) \
+ "--limit=${AV1_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT}" \
+ "--cpu-used=${cpu}" "--target-bitrate=${bitrate}" "-o" \
+ ${AOM_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \
+ ${devnull}
+
+ if [ "${target}" != "generic" ]; then
+ compare_enc_output ${target} $cpu ${clip} $bitrate ${preset}
+ if [ $? -eq 1 ]; then
+ return 1
+ fi
+ fi
+ done
+ done
+ done
+}
+
+av1_test_generic() {
+ local arch=$1
+ local target="generic"
+ if [ $arch = "x86_64" ]; then
+ local cmake_command="cmake $LIBAOM_SOURCE_DIR -DAOM_TARGET_CPU=${target}"
+ elif [ $arch = "x86" ]; then
+    # As AV1 encode output differs between x86 32-bit and 64-bit platforms
+    # (BUG=aomedia:3479), the x86 32-bit C-only build is generated separately.
+    # The cmake option -DENABLE_MMX=0 disables all SIMD optimizations and
+    # generates a C-only binary.
+ local cmake_command="cmake $LIBAOM_SOURCE_DIR -DENABLE_MMX=0 \
+ -DCMAKE_TOOLCHAIN_FILE=${LIBAOM_SOURCE_DIR}/build/cmake/toolchains/${arch}-linux.cmake"
+ fi
+
+ echo "Build for: Generic ${arch}"
+ av1_enc_build "${target}" "${cmake_command}"
+
+ for preset in $PRESETS; do
+ local encoder="$(av1_enc_tool_path "${target}" "${preset}")"
+ av1_enc_test $encoder "${target}" "${preset}"
+ done
+}
+
+# This function encodes the AV1 bitstream with SSE2, SSE3, SSSE3, SSE4_1, AVX
+# and AVX2 enabled in turn, as no functions have MMX, SSE or AVX512
+# specializations. The environment variable 'AOM_SIMD_CAPS_MASK' controls
+# which instruction set extension optimizations are enabled; its values map
+# to extensions as follows:
+# AVX512 AVX2 AVX SSE4_1 SSSE3 SSE3 SSE2 SSE MMX
+# 1 1 1 1 1 1 1 1 1 -> 0x1FF -> Enable AVX512 and lower variants
+# 0 1 1 1 1 1 1 1 1 -> 0x0FF -> Enable AVX2 and lower variants
+# 0 0 1 1 1 1 1 1 1 -> 0x07F -> Enable AVX and lower variants
+# 0 0 0 1 1 1 1 1 1 -> 0x03F -> Enable SSE4_1 and lower variants
+# 0 0 0 0 1 1 1 1 1 -> 0x01F -> Enable SSSE3 and lower variants
+# 0 0 0 0 0 1 1 1 1 -> 0x00F -> Enable SSE3 and lower variants
+# 0 0 0 0 0 0 1 1 1 -> 0x007 -> Enable SSE2 and lower variants
+# 0 0 0 0 0 0 0 1 1 -> 0x003 -> Enable SSE and lower variants
+# 0 0 0 0 0 0 0 0 1 -> 0x001 -> Enable MMX
+# NOTE: On x86_64 platforms it is not possible to restrict execution to
+# sse/mmx/c via 'AOM_SIMD_CAPS_MASK', as all x86_64 platforms implement sse2.
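+#
+# Example of a manual run restricted to AVX2 and lower variants (a sketch;
+# the aomenc binary and output name are assumptions):
+#   export AOM_SIMD_CAPS_MASK=$(avx2)
+#   ./aomenc_rt $(yuv_raw_input) $(av1_encode_rt_params) --cpu-used=8 \
+#     --target-bitrate=400 --limit=35 -o Out-avx2.ivf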
+av1_test_x86() {
+ local arch=$1
+
+ uname -m | grep -q "x86"
+ if [ $? -eq 1 ]; then
+ elog "Machine architecture is not x86 or x86_64"
+ return 0
+ fi
+
+ if [ $arch = "x86" ]; then
+ local target="x86-linux"
+ local cmake_command="cmake \
+ $LIBAOM_SOURCE_DIR \
+ -DCMAKE_TOOLCHAIN_FILE=${LIBAOM_SOURCE_DIR}/build/cmake/toolchains/${target}.cmake"
+ elif [ $arch = "x86_64" ]; then
+ local target="x86_64-linux"
+ local cmake_command="cmake $LIBAOM_SOURCE_DIR"
+ fi
+
+ local x86_isa_variants="avx2 avx sse4_1 ssse3 sse3 sse2"
+
+ echo "Build for x86: ${target}"
+ av1_enc_build "${target}" "${cmake_command}"
+
+ for preset in $PRESETS; do
+ local encoder="$(av1_enc_tool_path "${target}" "${preset}")"
+ for isa in $x86_isa_variants; do
+ has_x86_isa_extn $isa
+ if [ $? -eq 1 ]; then
+ echo "${isa} is not supported in this machine"
+ continue
+ fi
+ export AOM_SIMD_CAPS_MASK=$($isa)
+ av1_enc_test $encoder "${target}" "${preset}"
+ if [ $? -eq 1 ]; then
+ return 1
+ fi
+ unset AOM_SIMD_CAPS_MASK
+ done
+ done
+}
+
+av1_test_arm() {
+ local target="arm64-linux-gcc"
+ local cmake_command="cmake $LIBAOM_SOURCE_DIR \
+ -DCMAKE_TOOLCHAIN_FILE=$LIBAOM_SOURCE_DIR/build/cmake/toolchains/${target}.cmake \
+ -DCMAKE_C_FLAGS=-Wno-maybe-uninitialized"
+ echo "Build for arm64: ${target}"
+ av1_enc_build "${target}" "${cmake_command}"
+
+ for preset in $PRESETS; do
+    # Enable the arm64 test for the real-time preset only.
+ # TODO(BUG=aomedia:3486, BUG=aomedia:3474): Enable testing for 'good' preset
+ # after fixing C vs NEON mismatches.
+ if [ "${preset}" = "good" ]; then
+ continue
+ fi
+ local encoder="$(av1_enc_tool_path "${target}" "${preset}")"
+ av1_enc_test "qemu-aarch64 -L /usr/aarch64-linux-gnu ${encoder}" "${target}" "${preset}"
+ if [ $? -eq 1 ]; then
+ return 1
+ fi
+ done
+}
+
+av1_c_vs_simd_enc_test () {
+ # Test x86 (32 bit)
+ echo "av1 test for x86 (32 bit): Started."
+ # Encode 'C' only
+ av1_test_generic "x86"
+
+ # Encode with SIMD optimizations enabled
+ av1_test_x86 "x86"
+ if [ $? -eq 1 ]; then
+ echo "av1 test for x86 (32 bit): Done, test failed."
+ else
+ echo "av1 test for x86 (32 bit): Done, all tests passed."
+ fi
+
+ # Test x86_64 (64 bit)
+ if [ "$(eval uname -m)" = "x86_64" ]; then
+ echo "av1 test for x86_64 (64 bit): Started."
+ # Encode 'C' only
+ av1_test_generic "x86_64"
+ # Encode with SIMD optimizations enabled
+ av1_test_x86 "x86_64"
+ if [ $? -eq 1 ]; then
+ echo "av1 test for x86_64 (64 bit): Done, test failed."
+ else
+ echo "av1 test for x86_64 (64 bit): Done, all tests passed."
+ fi
+ fi
+
+ # Test ARM
+ echo "av1_test_arm: Started."
+ av1_test_arm
+ if [ $? -eq 1 ]; then
+ echo "av1 test for arm: Done, test failed."
+ else
+ echo "av1 test for arm: Done, all tests passed."
+ fi
+}
+
+# Set up a trap to clean up the build and output files after tests complete.
+trap cleanup EXIT
+
+av1_c_vs_simd_enc_verify_environment
+if [ $? -eq 1 ]; then
+ echo "Environment check failed."
+ exit 1
+fi
+av1_c_vs_simd_enc_test
diff --git a/test/av1_convolve_scale_test.cc b/test/av1_convolve_scale_test.cc
index c321de26d..76cf77ab0 100644
--- a/test/av1_convolve_scale_test.cc
+++ b/test/av1_convolve_scale_test.cc
@@ -258,14 +258,13 @@ template <typename SrcPixel>
class ConvolveScaleTestBase : public ::testing::Test {
public:
ConvolveScaleTestBase() : image_(nullptr) {}
- virtual ~ConvolveScaleTestBase() { delete image_; }
- virtual void TearDown() {}
+ ~ConvolveScaleTestBase() override { delete image_; }
// Implemented by subclasses (SetUp depends on the parameters passed
// in and RunOne depends on the function to be tested. These can't
// be templated for low/high bit depths because they have different
// numbers of parameters)
- virtual void SetUp() = 0;
+ void SetUp() override = 0;
virtual void RunOne(bool ref) = 0;
protected:
@@ -407,9 +406,9 @@ class LowBDConvolveScaleTest
: public ConvolveScaleTestBase<uint8_t>,
public ::testing::WithParamInterface<LowBDParams> {
public:
- virtual ~LowBDConvolveScaleTest() {}
+ ~LowBDConvolveScaleTest() override = default;
- void SetUp() {
+ void SetUp() override {
tst_fun_ = GET_PARAM(0);
const BlockDimension &block = GET_PARAM(1);
@@ -421,7 +420,7 @@ class LowBDConvolveScaleTest
SetParams(BaseParams(block, ntaps_x, ntaps_y, avg), bd);
}
- void RunOne(bool ref) {
+ void RunOne(bool ref) override {
const uint8_t *src = image_->GetSrcData(ref, false);
uint8_t *dst = image_->GetDstData(ref, false);
convolve_params_.dst = image_->GetDst16Data(ref, false);
@@ -490,9 +489,9 @@ class HighBDConvolveScaleTest
: public ConvolveScaleTestBase<uint16_t>,
public ::testing::WithParamInterface<HighBDParams> {
public:
- virtual ~HighBDConvolveScaleTest() {}
+ ~HighBDConvolveScaleTest() override = default;
- void SetUp() {
+ void SetUp() override {
tst_fun_ = GET_PARAM(0);
const BlockDimension &block = GET_PARAM(1);
@@ -504,7 +503,7 @@ class HighBDConvolveScaleTest
SetParams(BaseParams(block, ntaps_x, ntaps_y, avg), bd);
}
- void RunOne(bool ref) {
+ void RunOne(bool ref) override {
const uint16_t *src = image_->GetSrcData(ref, false);
uint16_t *dst = image_->GetDstData(ref, false);
convolve_params_.dst = image_->GetDst16Data(ref, false);
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc
index 873960d79..5bbac2180 100644
--- a/test/av1_convolve_test.cc
+++ b/test/av1_convolve_test.cc
@@ -183,14 +183,12 @@ TEST_F(AV1ConvolveParametersTest, GetHighbdTestParams) {
template <typename T>
class AV1ConvolveTest : public ::testing::TestWithParam<TestParam<T>> {
public:
- ~AV1ConvolveTest() override { TearDown(); }
+ ~AV1ConvolveTest() override = default;
void SetUp() override {
rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
}
- void TearDown() override {}
-
// Randomizes the 8-bit input buffer and returns a pointer to it. Note that
// the pointer is safe to use with an 8-tap filter. The stride can range
// from width to (width + kPadding). Also note that the pointer is to the
@@ -427,6 +425,99 @@ INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveXTest,
BuildLowbdParams(av1_convolve_x_sr_neon));
#endif
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, AV1ConvolveXTest,
+ BuildLowbdParams(av1_convolve_x_sr_neon_dotprod));
+#endif
+
+#if HAVE_NEON_I8MM
+INSTANTIATE_TEST_SUITE_P(NEON_I8MM, AV1ConvolveXTest,
+ BuildLowbdParams(av1_convolve_x_sr_neon_i8mm));
+#endif
+
+////////////////////////////////////////////////////////////////
+// Single reference convolve-x IntraBC functions (low bit-depth)
+////////////////////////////////////////////////////////////////
+
+class AV1ConvolveXIntraBCTest : public AV1ConvolveTest<convolve_x_func> {
+ public:
+ void RunTest() {
+ // IntraBC functions only operate for subpel_x_qn = 8.
+ constexpr int kSubX = 8;
+ const int width = GetParam().Block().Width();
+ const int height = GetParam().Block().Height();
+ const InterpFilterParams *filter_params_x = &av1_intrabc_filter_params;
+ const uint8_t *input = FirstRandomInput8(GetParam());
+
+ ConvolveParams conv_params1 =
+ get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+ DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+ // Use a stride different from width to avoid potential storing errors that
+ // would go undetected. The input buffer is filled using a padding of 12, so
+ // the stride can be anywhere between width and width + 12.
+ av1_convolve_x_sr_intrabc_c(input, width + 2, reference, kOutputStride,
+ width, height, filter_params_x, kSubX,
+ &conv_params1);
+
+ ConvolveParams conv_params2 =
+ get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+ convolve_x_func test_func = GetParam().TestFunction();
+ DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+ test_func(input, width + 2, test, kOutputStride, width, height,
+ filter_params_x, kSubX, &conv_params2);
+
+ AssertOutputBufferEq(reference, test, width, height);
+ }
+
+ void SpeedTest() {
+ constexpr int kNumIters = 10000;
+ const InterpFilter filter = static_cast<InterpFilter>(BILINEAR);
+ const int width = GetParam().Block().Width();
+ const int height = GetParam().Block().Height();
+ const InterpFilterParams *filter_params_x = &av1_intrabc_filter_params;
+ const uint8_t *input = FirstRandomInput8(GetParam());
+
+ ConvolveParams conv_params1 =
+ get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+ DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < kNumIters; ++i) {
+ av1_convolve_x_sr_intrabc_c(input, width, reference, kOutputStride, width,
+ height, filter_params_x, 0, &conv_params1);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ ConvolveParams conv_params2 =
+ get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+ convolve_x_func test_func = GetParam().TestFunction();
+ DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < kNumIters; ++i) {
+ test_func(input, width, test, kOutputStride, width, height,
+ filter_params_x, 0, &conv_params2);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ printf("%d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", filter, width, height, time1,
+ time2, time1 / time2);
+ }
+};
+
+TEST_P(AV1ConvolveXIntraBCTest, RunTest) { RunTest(); }
+
+TEST_P(AV1ConvolveXIntraBCTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveXIntraBCTest,
+ BuildLowbdParams(av1_convolve_x_sr_intrabc_c));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveXIntraBCTest,
+ BuildLowbdParams(av1_convolve_x_sr_intrabc_neon));
+#endif
+
#if CONFIG_AV1_HIGHBITDEPTH
/////////////////////////////////////////////////////////
// Single reference convolve-x functions (high bit-depth)
@@ -540,6 +631,94 @@ INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveXHighbdTest,
BuildHighbdParams(av1_highbd_convolve_x_sr_neon));
#endif
+/////////////////////////////////////////////////////////////////
+// Single reference convolve-x IntraBC functions (high bit-depth)
+/////////////////////////////////////////////////////////////////
+
+class AV1ConvolveXHighbdIntraBCTest
+ : public AV1ConvolveTest<highbd_convolve_x_func> {
+ public:
+ void RunTest() {
+ // IntraBC functions only operate for subpel_x_qn = 8.
+ constexpr int kSubX = 8;
+ const int width = GetParam().Block().Width();
+ const int height = GetParam().Block().Height();
+ const int bit_depth = GetParam().BitDepth();
+ const InterpFilterParams *filter_params_x = &av1_intrabc_filter_params;
+ const uint16_t *input = FirstRandomInput16(GetParam());
+
+ ConvolveParams conv_params1 =
+ get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+ DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+ // Use a stride different from width to avoid potential storing errors that
+ // would go undetected. The input buffer is filled using a padding of 12, so
+ // the stride can be anywhere between width and width + 12.
+ av1_highbd_convolve_x_sr_intrabc_c(
+ input, width + 2, reference, kOutputStride, width, height,
+ filter_params_x, kSubX, &conv_params1, bit_depth);
+
+ ConvolveParams conv_params2 =
+ get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+ DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+ GetParam().TestFunction()(input, width + 2, test, kOutputStride, width,
+ height, filter_params_x, kSubX, &conv_params2,
+ bit_depth);
+
+ AssertOutputBufferEq(reference, test, width, height);
+ }
+
+ void SpeedTest() {
+ constexpr int kNumIters = 10000;
+ const InterpFilter filter = static_cast<InterpFilter>(BILINEAR);
+ const int width = GetParam().Block().Width();
+ const int height = GetParam().Block().Height();
+ const int bit_depth = GetParam().BitDepth();
+ const InterpFilterParams *filter_params_x = &av1_intrabc_filter_params;
+ const uint16_t *input = FirstRandomInput16(GetParam());
+
+ ConvolveParams conv_params1 =
+ get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+ DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < kNumIters; ++i) {
+ av1_highbd_convolve_x_sr_intrabc_c(input, width, reference, kOutputStride,
+ width, height, filter_params_x, 0,
+ &conv_params1, bit_depth);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ ConvolveParams conv_params2 =
+ get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+ highbd_convolve_x_func test_func = GetParam().TestFunction();
+ DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < kNumIters; ++i) {
+ test_func(input, width, test, kOutputStride, width, height,
+ filter_params_x, 0, &conv_params2, bit_depth);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ printf("%d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", filter, width, height, time1,
+ time2, time1 / time2);
+ }
+};
+
+TEST_P(AV1ConvolveXHighbdIntraBCTest, RunTest) { RunTest(); }
+
+TEST_P(AV1ConvolveXHighbdIntraBCTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveXHighbdIntraBCTest,
+ BuildHighbdParams(av1_highbd_convolve_x_sr_intrabc_c));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1ConvolveXHighbdIntraBCTest,
+ BuildHighbdParams(av1_highbd_convolve_x_sr_intrabc_neon));
+#endif
+
#endif // CONFIG_AV1_HIGHBITDEPTH
////////////////////////////////////////////////////////
@@ -643,6 +822,80 @@ INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveYTest,
BuildLowbdParams(av1_convolve_y_sr_neon));
#endif
+////////////////////////////////////////////////////////////////
+// Single reference convolve-y IntraBC functions (low bit-depth)
+////////////////////////////////////////////////////////////////
+
+class AV1ConvolveYIntraBCTest : public AV1ConvolveTest<convolve_y_func> {
+ public:
+ void RunTest() {
+ // IntraBC functions only operate for subpel_y_qn = 8.
+ constexpr int kSubY = 8;
+ const int width = GetParam().Block().Width();
+ const int height = GetParam().Block().Height();
+ const InterpFilterParams *filter_params_y = &av1_intrabc_filter_params;
+ const uint8_t *input = FirstRandomInput8(GetParam());
+
+ DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+ // Use a stride different from width to avoid potential storing errors that
+ // would go undetected. The input buffer is filled using a padding of 12, so
+ // the stride can be anywhere between width and width + 12.
+ av1_convolve_y_sr_intrabc_c(input, width + 2, reference, kOutputStride,
+ width, height, filter_params_y, kSubY);
+
+ DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+ GetParam().TestFunction()(input, width + 2, test, kOutputStride, width,
+ height, filter_params_y, kSubY);
+
+ AssertOutputBufferEq(reference, test, width, height);
+ }
+
+ void SpeedTest() {
+ constexpr int kNumIters = 10000;
+ const InterpFilter filter = static_cast<InterpFilter>(BILINEAR);
+ const int width = GetParam().Block().Width();
+ const int height = GetParam().Block().Height();
+
+ const InterpFilterParams *filter_params_y = &av1_intrabc_filter_params;
+ const uint8_t *input = FirstRandomInput8(GetParam());
+ DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < kNumIters; ++i) {
+ av1_convolve_y_sr_intrabc_c(input, width, reference, kOutputStride, width,
+ height, filter_params_y, 0);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+ convolve_y_func test_func = GetParam().TestFunction();
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < kNumIters; ++i) {
+ test_func(input, width, test, kOutputStride, width, height,
+ filter_params_y, 0);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ printf("%d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", filter, width, height, time1,
+ time2, time1 / time2);
+ }
+};
+
+TEST_P(AV1ConvolveYIntraBCTest, RunTest) { RunTest(); }
+
+TEST_P(AV1ConvolveYIntraBCTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYIntraBCTest,
+ BuildLowbdParams(av1_convolve_y_sr_intrabc_c));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveYIntraBCTest,
+ BuildLowbdParams(av1_convolve_y_sr_intrabc_neon));
+#endif
+
#if CONFIG_AV1_HIGHBITDEPTH
/////////////////////////////////////////////////////////
// Single reference convolve-y functions (high bit-depth)
@@ -745,6 +998,86 @@ INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveYHighbdTest,
BuildHighbdParams(av1_highbd_convolve_y_sr_neon));
#endif
+/////////////////////////////////////////////////////////////////
+// Single reference convolve-y IntraBC functions (high bit-depth)
+/////////////////////////////////////////////////////////////////
+
+class AV1ConvolveYHighbdIntraBCTest
+ : public AV1ConvolveTest<highbd_convolve_y_func> {
+ public:
+ void RunTest() {
+ // IntraBC functions only operate for subpel_y_qn = 8.
+ constexpr int kSubY = 8;
+ const int width = GetParam().Block().Width();
+ const int height = GetParam().Block().Height();
+ const int bit_depth = GetParam().BitDepth();
+ const InterpFilterParams *filter_params_y = &av1_intrabc_filter_params;
+ const uint16_t *input = FirstRandomInput16(GetParam());
+
+ DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+ // Use a stride different from width to avoid potential storing errors that
+ // would go undetected. The input buffer is filled using a padding of 12, so
+ // the stride can be anywhere between width and width + 12.
+ av1_highbd_convolve_y_sr_intrabc_c(input, width + 2, reference,
+ kOutputStride, width, height,
+ filter_params_y, kSubY, bit_depth);
+
+ DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+ GetParam().TestFunction()(input, width + 2, test, kOutputStride, width,
+ height, filter_params_y, kSubY, bit_depth);
+
+ AssertOutputBufferEq(reference, test, width, height);
+ }
+
+ void SpeedTest() {
+ constexpr int kNumIters = 10000;
+ const InterpFilter filter = static_cast<InterpFilter>(BILINEAR);
+ const int width = GetParam().Block().Width();
+ const int height = GetParam().Block().Height();
+ const int bit_depth = GetParam().BitDepth();
+ const InterpFilterParams *filter_params_y =
+ av1_get_interp_filter_params_with_block_size(filter, width);
+ const uint16_t *input = FirstRandomInput16(GetParam());
+
+ DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < kNumIters; ++i) {
+ av1_highbd_convolve_y_sr_intrabc_c(input, width, reference, kOutputStride,
+ width, height, filter_params_y, 0,
+ bit_depth);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ highbd_convolve_y_func test_func = GetParam().TestFunction();
+ DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < kNumIters; ++i) {
+ test_func(input, width, test, kOutputStride, width, height,
+ filter_params_y, 0, bit_depth);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ printf("%d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", filter, width, height, time1,
+ time2, time1 / time2);
+ }
+};
+
+TEST_P(AV1ConvolveYHighbdIntraBCTest, RunTest) { RunTest(); }
+
+TEST_P(AV1ConvolveYHighbdIntraBCTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYHighbdIntraBCTest,
+ BuildHighbdParams(av1_highbd_convolve_y_sr_intrabc_c));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1ConvolveYHighbdIntraBCTest,
+ BuildHighbdParams(av1_highbd_convolve_y_sr_intrabc_neon));
+#endif
+
#endif // CONFIG_AV1_HIGHBITDEPTH
//////////////////////////////////////////////////////////////
@@ -830,6 +1163,11 @@ INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveCopyHighbdTest,
BuildHighbdParams(aom_highbd_convolve_copy_avx2));
#endif
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveCopyHighbdTest,
+ BuildHighbdParams(aom_highbd_convolve_copy_neon));
+#endif
+
#endif // CONFIG_AV1_HIGHBITDEPTH
/////////////////////////////////////////////////////////
@@ -958,6 +1296,104 @@ INSTANTIATE_TEST_SUITE_P(NEON, AV1Convolve2DTest,
BuildLowbdParams(av1_convolve_2d_sr_neon));
#endif
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, AV1Convolve2DTest,
+ BuildLowbdParams(av1_convolve_2d_sr_neon_dotprod));
+#endif
+
+#if HAVE_NEON_I8MM
+INSTANTIATE_TEST_SUITE_P(NEON_I8MM, AV1Convolve2DTest,
+ BuildLowbdParams(av1_convolve_2d_sr_neon_i8mm));
+#endif
+
+/////////////////////////////////////////////////////////////////
+// Single reference convolve-2D IntraBC functions (low bit-depth)
+/////////////////////////////////////////////////////////////////
+
+class AV1Convolve2DIntraBCTest : public AV1ConvolveTest<convolve_2d_func> {
+ public:
+ void RunTest() {
+ // IntraBC functions only operate for subpel_x_qn = 8 and subpel_y_qn = 8.
+ constexpr int kSubX = 8;
+ constexpr int kSubY = 8;
+ const int width = GetParam().Block().Width();
+ const int height = GetParam().Block().Height();
+ const InterpFilterParams *filter_params_x = &av1_intrabc_filter_params;
+ const InterpFilterParams *filter_params_y = &av1_intrabc_filter_params;
+ const uint8_t *input = FirstRandomInput8(GetParam());
+
+ DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+ ConvolveParams conv_params1 =
+ get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+ // Use a stride different from width to avoid potential storing errors that
+ // would go undetected. The input buffer is filled using a padding of 12, so
+ // the stride can be anywhere between width and width + 12.
+ av1_convolve_2d_sr_intrabc_c(input, width + 2, reference, kOutputStride,
+ width, height, filter_params_x,
+ filter_params_y, kSubX, kSubY, &conv_params1);
+
+ DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+ ConvolveParams conv_params2 =
+ get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+ GetParam().TestFunction()(input, width + 2, test, kOutputStride, width,
+ height, filter_params_x, filter_params_y, kSubX,
+ kSubY, &conv_params2);
+
+ AssertOutputBufferEq(reference, test, width, height);
+ }
+
+ void SpeedTest() {
+ constexpr int kNumIters = 10000;
+ const InterpFilter h_f = static_cast<InterpFilter>(BILINEAR);
+ const InterpFilter v_f = static_cast<InterpFilter>(BILINEAR);
+ const int width = GetParam().Block().Width();
+ const int height = GetParam().Block().Height();
+ const InterpFilterParams *filter_params_x = &av1_intrabc_filter_params;
+ const InterpFilterParams *filter_params_y = &av1_intrabc_filter_params;
+ const uint8_t *input = FirstRandomInput8(GetParam());
+
+ DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+ ConvolveParams conv_params1 =
+ get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < kNumIters; ++i) {
+ av1_convolve_2d_sr_intrabc_c(input, width, reference, kOutputStride,
+ width, height, filter_params_x,
+ filter_params_y, 8, 8, &conv_params1);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ convolve_2d_func test_func = GetParam().TestFunction();
+ DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+ ConvolveParams conv_params2 =
+ get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < kNumIters; ++i) {
+ test_func(input, width, test, kOutputStride, width, height,
+ filter_params_x, filter_params_y, 8, 8, &conv_params2);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ printf("%d - %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", h_f, v_f, width, height,
+ time1, time2, time1 / time2);
+ }
+};
+
+TEST_P(AV1Convolve2DIntraBCTest, RunTest) { RunTest(); }
+
+TEST_P(AV1Convolve2DIntraBCTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DIntraBCTest,
+ BuildLowbdParams(av1_convolve_2d_sr_intrabc_c));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1Convolve2DIntraBCTest,
+ BuildLowbdParams(av1_convolve_2d_sr_intrabc_neon));
+#endif
+
#if CONFIG_AV1_HIGHBITDEPTH
//////////////////////////////////////////////////////////
// Single reference convolve-2d functions (high bit-depth)
@@ -1087,6 +1523,103 @@ INSTANTIATE_TEST_SUITE_P(NEON, AV1Convolve2DHighbdTest,
BuildHighbdParams(av1_highbd_convolve_2d_sr_neon));
#endif
+//////////////////////////////////////////////////////////////////
+// Single reference convolve-2d IntraBC functions (high bit-depth)
+//////////////////////////////////////////////////////////////////
+
+class AV1Convolve2DHighbdIntraBCTest
+ : public AV1ConvolveTest<highbd_convolve_2d_func> {
+ public:
+ void RunTest() {
+ // IntraBC functions only operate for subpel_x_qn = 8 and subpel_y_qn = 8.
+ constexpr int kSubX = 8;
+ constexpr int kSubY = 8;
+ const int width = GetParam().Block().Width();
+ const int height = GetParam().Block().Height();
+ const int bit_depth = GetParam().BitDepth();
+ const InterpFilterParams *filter_params_x = &av1_intrabc_filter_params;
+ const InterpFilterParams *filter_params_y = &av1_intrabc_filter_params;
+ const uint16_t *input = FirstRandomInput16(GetParam());
+
+ DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+ ConvolveParams conv_params1 =
+ get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+ // Use a stride different from width to avoid potential storing errors that
+ // would go undetected. The input buffer is filled using a padding of 12, so
+ // the stride can be anywhere between width and width + 12.
+ av1_highbd_convolve_2d_sr_intrabc_c(input, width + 2, reference,
+ kOutputStride, width, height,
+ filter_params_x, filter_params_y, kSubX,
+ kSubY, &conv_params1, bit_depth);
+
+ DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+ ConvolveParams conv_params2 =
+ get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+ GetParam().TestFunction()(input, width + 2, test, kOutputStride, width,
+ height, filter_params_x, filter_params_y, kSubX,
+ kSubY, &conv_params2, bit_depth);
+
+ AssertOutputBufferEq(reference, test, width, height);
+ }
+
+ void SpeedTest() {
+ constexpr int kNumIters = 10000;
+ const InterpFilter h_f = static_cast<InterpFilter>(BILINEAR);
+ const InterpFilter v_f = static_cast<InterpFilter>(BILINEAR);
+ const int width = GetParam().Block().Width();
+ const int height = GetParam().Block().Height();
+ const int bit_depth = GetParam().BitDepth();
+ const InterpFilterParams *filter_params_x =
+ av1_get_interp_filter_params_with_block_size(h_f, width);
+ const InterpFilterParams *filter_params_y =
+ av1_get_interp_filter_params_with_block_size(v_f, height);
+ const uint16_t *input = FirstRandomInput16(GetParam());
+
+ DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+ ConvolveParams conv_params1 =
+ get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < kNumIters; ++i) {
+ av1_highbd_convolve_2d_sr_intrabc_c(
+ input, width, reference, kOutputStride, width, height,
+ filter_params_x, filter_params_y, 0, 0, &conv_params1, bit_depth);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+ highbd_convolve_2d_func test_func = GetParam().TestFunction();
+ ConvolveParams conv_params2 =
+ get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < kNumIters; ++i) {
+ test_func(input, width, test, kOutputStride, width, height,
+ filter_params_x, filter_params_y, 0, 0, &conv_params2,
+ bit_depth);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ printf("%d - %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", h_f, v_f, width, height,
+ time1, time2, time1 / time2);
+ }
+};
+
+TEST_P(AV1Convolve2DHighbdIntraBCTest, RunTest) { RunTest(); }
+
+TEST_P(AV1Convolve2DHighbdIntraBCTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(
+ C, AV1Convolve2DHighbdIntraBCTest,
+ BuildHighbdParams(av1_highbd_convolve_2d_sr_intrabc_c));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1Convolve2DHighbdIntraBCTest,
+ BuildHighbdParams(av1_highbd_convolve_2d_sr_intrabc_neon));
+#endif
+
#endif // CONFIG_AV1_HIGHBITDEPTH
//////////////////////////
@@ -1304,6 +1837,18 @@ INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveXCompoundTest,
BuildLowbdLumaParams(av1_dist_wtd_convolve_x_neon));
#endif
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(
+ NEON_DOTPROD, AV1ConvolveXCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_x_neon_dotprod));
+#endif
+
+#if HAVE_NEON_I8MM
+INSTANTIATE_TEST_SUITE_P(
+ NEON_I8MM, AV1ConvolveXCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_x_neon_i8mm));
+#endif
+
#if CONFIG_AV1_HIGHBITDEPTH
/////////////////////////////////////////////////
// Compound convolve-x functions (high bit-depth)
@@ -1787,6 +2332,18 @@ INSTANTIATE_TEST_SUITE_P(NEON, AV1Convolve2DCompoundTest,
BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_neon));
#endif
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(
+ NEON_DOTPROD, AV1Convolve2DCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_neon_dotprod));
+#endif
+
+#if HAVE_NEON_I8MM
+INSTANTIATE_TEST_SUITE_P(
+ NEON_I8MM, AV1Convolve2DCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_neon_i8mm));
+#endif
+
#if CONFIG_AV1_HIGHBITDEPTH
//////////////////////////////////////////////////
// Compound convolve-2d functions (high bit-depth)
diff --git a/test/av1_encoder_parms_get_to_decoder.cc b/test/av1_encoder_parms_get_to_decoder.cc
index e81ad87e7..402e70c34 100644
--- a/test/av1_encoder_parms_get_to_decoder.cc
+++ b/test/av1_encoder_parms_get_to_decoder.cc
@@ -85,17 +85,17 @@ class AVxEncoderParmsGetToDecoder
AVxEncoderParmsGetToDecoder()
: EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {}
- virtual ~AVxEncoderParmsGetToDecoder() {}
+ ~AVxEncoderParmsGetToDecoder() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(::libaom_test::kTwoPassGood);
cfg_.g_lag_in_frames = 25;
test_video_ = kAV1ParamPassingTestVector;
cfg_.rc_target_bitrate = test_video_.bitrate;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, 3);
encoder->Control(AV1E_SET_COLOR_PRIMARIES, encode_parms.color_primaries);
@@ -113,8 +113,8 @@ class AVxEncoderParmsGetToDecoder
}
}
- virtual void DecompressedFrameHook(const aom_image_t &img,
- aom_codec_pts_t pts) {
+ void DecompressedFrameHook(const aom_image_t &img,
+ aom_codec_pts_t pts) override {
(void)pts;
if (encode_parms.render_size[0] > 0 && encode_parms.render_size[1] > 0) {
EXPECT_EQ(encode_parms.render_size[0], (int)img.r_w);
@@ -127,14 +127,14 @@ class AVxEncoderParmsGetToDecoder
EXPECT_EQ(encode_parms.chroma_sample_position, img.csp);
}
- virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
if (encode_parms.lossless) {
EXPECT_EQ(kMaxPsnr, pkt->data.psnr.psnr[0]);
}
}
- virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
- libaom_test::Decoder *decoder) {
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
return AOM_CODEC_OK == res_dec;
}
diff --git a/test/av1_ext_tile_test.cc b/test/av1_ext_tile_test.cc
index 5eaf38222..59c44cad1 100644
--- a/test/av1_ext_tile_test.cc
+++ b/test/av1_ext_tile_test.cc
@@ -58,12 +58,12 @@ class AV1ExtTileTest
tile_md5_.clear();
}
- virtual ~AV1ExtTileTest() {
+ ~AV1ExtTileTest() override {
aom_img_free(&tile_img_);
delete decoder_;
}
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(encoding_mode_);
cfg_.g_lag_in_frames = 0;
@@ -74,8 +74,8 @@ class AV1ExtTileTest
cfg_.rc_min_quantizer = 0;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
// Encode setting
encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
@@ -96,8 +96,8 @@ class AV1ExtTileTest
}
}
- virtual void DecompressedFrameHook(const aom_image_t &img,
- aom_codec_pts_t pts) {
+ void DecompressedFrameHook(const aom_image_t &img,
+ aom_codec_pts_t pts) override {
// Skip 1 already decoded frame to be consistent with the decoder in this
// test.
if (pts == (aom_codec_pts_t)kSkip) return;
@@ -108,7 +108,7 @@ class AV1ExtTileTest
md5_.push_back(md5_res.Get());
}
- virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
// Skip decoding 1 frame.
if (pkt->data.frame.pts == (aom_codec_pts_t)kSkip) return;
diff --git a/test/av1_external_partition_test.cc b/test/av1_external_partition_test.cc
index 41fc96c05..88f6216fa 100644
--- a/test/av1_external_partition_test.cc
+++ b/test/av1_external_partition_test.cc
@@ -247,9 +247,9 @@ class ExternalPartitionTestAPI
ExternalPartitionTestAPI()
: EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0) {}
- virtual ~ExternalPartitionTestAPI() {}
+  ~ExternalPartitionTestAPI() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(encoding_mode_);
const aom_rational timebase = { 1, 30 };
cfg_.g_timebase = timebase;
@@ -260,14 +260,14 @@ class ExternalPartitionTestAPI
init_flags_ = AOM_CODEC_USE_PSNR;
}
- virtual bool DoDecode() const { return false; }
+ bool DoDecode() const override { return false; }
- virtual void BeginPassHook(unsigned int) {
+ void BeginPassHook(unsigned int) override {
psnr_ = 0.0;
nframes_ = 0;
}
- virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
psnr_ += pkt->data.psnr.psnr[0];
nframes_++;
}
@@ -287,8 +287,8 @@ class ExternalPartitionTestAPI
decision_mode_ = mode;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
if (decision_mode_ == AOM_EXT_PART_WHOLE_TREE) {
aom_ext_part_funcs_t ext_part_funcs;
@@ -559,9 +559,9 @@ class ExternalPartitionTestDfsAPI
ExternalPartitionTestDfsAPI()
: EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0) {}
- virtual ~ExternalPartitionTestDfsAPI() {}
+ ~ExternalPartitionTestDfsAPI() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(encoding_mode_);
const aom_rational timebase = { 1, 30 };
cfg_.g_timebase = timebase;
@@ -572,14 +572,14 @@ class ExternalPartitionTestDfsAPI
init_flags_ = AOM_CODEC_USE_PSNR;
}
- virtual bool DoDecode() const { return false; }
+ bool DoDecode() const override { return false; }
- virtual void BeginPassHook(unsigned int) {
+ void BeginPassHook(unsigned int) override {
psnr_ = 0.0;
nframes_ = 0;
}
- virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
psnr_ += pkt->data.psnr.psnr[0];
nframes_++;
}
@@ -597,8 +597,8 @@ class ExternalPartitionTestDfsAPI
test_send_features_ = test_send_features;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
aom_ext_part_funcs_t ext_part_funcs;
ext_part_funcs.priv = reinterpret_cast<void *>(&test_data_);
diff --git a/test/av1_fwd_txfm1d_test.cc b/test/av1_fwd_txfm1d_test.cc
index 885a6dbc1..6bae9f836 100644
--- a/test/av1_fwd_txfm1d_test.cc
+++ b/test/av1_fwd_txfm1d_test.cc
@@ -41,7 +41,7 @@ const TxfmFunc fwd_txfm_func_ls[][txfm_type_num] = {
};
// the maximum stage number of fwd/inv 1d dct/adst txfm is 12
-const int8_t cos_bit = 14;
+const int8_t cos_bit = 13;
const int8_t range_bit[12] = { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 };
TEST(av1_fwd_txfm1d, round_shift) {
@@ -56,7 +56,7 @@ TEST(av1_fwd_txfm1d, round_shift) {
}
TEST(av1_fwd_txfm1d, av1_cospi_arr_data) {
- for (int i = 0; i < 7; i++) {
+ for (int i = 0; i < 4; i++) {
for (int j = 0; j < 64; j++) {
EXPECT_EQ(av1_cospi_arr_data[i][j],
(int32_t)round(cos(PI * j / 128) * (1 << (cos_bit_min + i))));
diff --git a/test/av1_fwd_txfm2d_test.cc b/test/av1_fwd_txfm2d_test.cc
index 7b84eb9c6..2ed5d94db 100644
--- a/test/av1_fwd_txfm2d_test.cc
+++ b/test/av1_fwd_txfm2d_test.cc
@@ -38,7 +38,7 @@ typedef std::tuple<TX_TYPE, TX_SIZE, double, double> AV1FwdTxfm2dParam;
class AV1FwdTxfm2d : public ::testing::TestWithParam<AV1FwdTxfm2dParam> {
public:
- virtual void SetUp() {
+ void SetUp() override {
tx_type_ = GET_PARAM(0);
tx_size_ = GET_PARAM(1);
max_error_ = GET_PARAM(2);
@@ -116,7 +116,7 @@ class AV1FwdTxfm2d : public ::testing::TestWithParam<AV1FwdTxfm2dParam> {
<< "tx_size = " << tx_size_ << ", tx_type = " << tx_type_;
}
- virtual void TearDown() {
+ void TearDown() override {
aom_free(input_);
aom_free(output_);
aom_free(ref_input_);
diff --git a/test/av1_highbd_iht_test.cc b/test/av1_highbd_iht_test.cc
index dae53ea54..2c57362a8 100644
--- a/test/av1_highbd_iht_test.cc
+++ b/test/av1_highbd_iht_test.cc
@@ -63,9 +63,9 @@ typedef tuple<HbdHtFunc, IHbdHtFunc, IHbdHtFunc, int, TX_TYPE, int> IHbdHtParam;
class AV1HighbdInvHTNxN : public ::testing::TestWithParam<IHbdHtParam> {
public:
- virtual ~AV1HighbdInvHTNxN() {}
+ ~AV1HighbdInvHTNxN() override = default;
- virtual void SetUp() {
+ void SetUp() override {
txfm_ref_ = GET_PARAM(0);
inv_txfm_ = GET_PARAM(1);
inv_txfm_ref_ = GET_PARAM(2);
@@ -92,7 +92,7 @@ class AV1HighbdInvHTNxN : public ::testing::TestWithParam<IHbdHtParam> {
ASSERT_NE(output_ref_, nullptr);
}
- virtual void TearDown() {
+ void TearDown() override {
aom_free(input_);
aom_free(coeffs_);
aom_free(output_);
@@ -200,7 +200,7 @@ typedef std::tuple<const HighbdInvTxfm2dFunc> AV1HighbdInvTxfm2dParam;
class AV1HighbdInvTxfm2d
: public ::testing::TestWithParam<AV1HighbdInvTxfm2dParam> {
public:
- virtual void SetUp() { target_func_ = GET_PARAM(0); }
+ void SetUp() override { target_func_ = GET_PARAM(0); }
void RunAV1InvTxfm2dTest(TX_TYPE tx_type, TX_SIZE tx_size, int run_times,
int bit_depth, int gt_int16 = 0);
diff --git a/test/av1_horz_only_frame_superres_test.cc b/test/av1_horz_only_frame_superres_test.cc
index 28ee534d4..e9cf02e20 100644
--- a/test/av1_horz_only_frame_superres_test.cc
+++ b/test/av1_horz_only_frame_superres_test.cc
@@ -162,14 +162,13 @@ template <typename Pixel>
class ConvolveHorizRSTestBase : public ::testing::Test {
public:
ConvolveHorizRSTestBase() : image_(nullptr) {}
- virtual ~ConvolveHorizRSTestBase() {}
- virtual void TearDown() {}
+ ~ConvolveHorizRSTestBase() override = default;
// Implemented by subclasses (SetUp depends on the parameters passed
// in and RunOne depends on the function to be tested. These can't
// be templated for low/high bit depths because they have different
// numbers of parameters)
- virtual void SetUp() = 0;
+ void SetUp() override = 0;
virtual void RunOne(bool ref) = 0;
protected:
@@ -261,15 +260,15 @@ class LowBDConvolveHorizRSTest
: public ConvolveHorizRSTestBase<uint8_t>,
public ::testing::WithParamInterface<LowBDParams> {
public:
- virtual ~LowBDConvolveHorizRSTest() {}
+ ~LowBDConvolveHorizRSTest() override = default;
- void SetUp() {
+ void SetUp() override {
tst_fun_ = GET_PARAM(0);
const int bd = 8;
SetBitDepth(bd);
}
- void RunOne(bool ref) {
+ void RunOne(bool ref) override {
const uint8_t *src = image_->GetSrcData(ref, false);
uint8_t *dst = image_->GetDstData(ref, false);
const int src_stride = image_->src_stride();
@@ -322,15 +321,15 @@ class HighBDConvolveHorizRSTest
: public ConvolveHorizRSTestBase<uint16_t>,
public ::testing::WithParamInterface<HighBDParams> {
public:
- virtual ~HighBDConvolveHorizRSTest() {}
+ ~HighBDConvolveHorizRSTest() override = default;
- void SetUp() {
+ void SetUp() override {
tst_fun_ = GET_PARAM(0);
const int bd = GET_PARAM(1);
SetBitDepth(bd);
}
- void RunOne(bool ref) {
+ void RunOne(bool ref) override {
const uint16_t *src = image_->GetSrcData(ref, false);
uint16_t *dst = image_->GetDstData(ref, false);
const int src_stride = image_->src_stride();
diff --git a/test/av1_inv_txfm2d_test.cc b/test/av1_inv_txfm2d_test.cc
index dfa0481d9..35a87a43b 100644
--- a/test/av1_inv_txfm2d_test.cc
+++ b/test/av1_inv_txfm2d_test.cc
@@ -49,7 +49,7 @@ typedef std::tuple<TxType, TxSize, int, double> AV1InvTxfm2dParam;
class AV1InvTxfm2d : public ::testing::TestWithParam<AV1InvTxfm2dParam> {
public:
- virtual void SetUp() {
+ void SetUp() override {
tx_type_ = GET_PARAM(0);
tx_size_ = GET_PARAM(1);
max_error_ = GET_PARAM(2);
@@ -249,7 +249,7 @@ TEST(AV1InvTxfm2d, CfgTest) {
typedef std::tuple<const LbdInvTxfm2dFunc> AV1LbdInvTxfm2dParam;
class AV1LbdInvTxfm2d : public ::testing::TestWithParam<AV1LbdInvTxfm2dParam> {
public:
- virtual void SetUp() { target_func_ = GET_PARAM(0); }
+ void SetUp() override { target_func_ = GET_PARAM(0); }
void RunAV1InvTxfm2dTest(TxType tx_type, TxSize tx_size, int run_times,
int gt_int16 = 0);
@@ -393,8 +393,6 @@ INSTANTIATE_TEST_SUITE_P(AVX2, AV1LbdInvTxfm2d,
::testing::Values(av1_lowbd_inv_txfm2d_add_avx2));
#endif // HAVE_AVX2
-// TODO(yunqing): Re-enable this unit test for NEON version after the functions
-// are fixed.
#if HAVE_NEON
extern "C" void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input,
uint8_t *output, int stride,
diff --git a/test/av1_k_means_test.cc b/test/av1_k_means_test.cc
index 99f0fba99..7e66a8e01 100644
--- a/test/av1_k_means_test.cc
+++ b/test/av1_k_means_test.cc
@@ -46,10 +46,8 @@ typedef std::tuple<av1_calc_indices_dim2_func, BLOCK_SIZE>
class AV1KmeansTest1
: public ::testing::TestWithParam<av1_calc_indices_dim1Param> {
public:
- ~AV1KmeansTest1();
- void SetUp();
-
- void TearDown();
+ ~AV1KmeansTest1() override;
+ void SetUp() override;
protected:
void RunCheckOutput(av1_calc_indices_dim1_func test_impl, BLOCK_SIZE bsize,
@@ -75,7 +73,7 @@ class AV1KmeansTest1
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1KmeansTest1);
-AV1KmeansTest1::~AV1KmeansTest1() {}
+AV1KmeansTest1::~AV1KmeansTest1() = default;
void AV1KmeansTest1::SetUp() {
rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
@@ -87,8 +85,6 @@ void AV1KmeansTest1::SetUp() {
}
}
-void AV1KmeansTest1::TearDown() {}
-
void AV1KmeansTest1::RunCheckOutput(av1_calc_indices_dim1_func test_impl,
BLOCK_SIZE bsize, int k) {
const int w = block_size_wide[bsize];
@@ -152,10 +148,8 @@ TEST_P(AV1KmeansTest1, DISABLED_Speed) {
class AV1KmeansTest2
: public ::testing::TestWithParam<av1_calc_indices_dim2Param> {
public:
- ~AV1KmeansTest2();
- void SetUp();
-
- void TearDown();
+ ~AV1KmeansTest2() override;
+ void SetUp() override;
protected:
void RunCheckOutput(av1_calc_indices_dim2_func test_impl, BLOCK_SIZE bsize,
@@ -185,7 +179,7 @@ class AV1KmeansTest2
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1KmeansTest2);
-AV1KmeansTest2::~AV1KmeansTest2() {}
+AV1KmeansTest2::~AV1KmeansTest2() = default;
void AV1KmeansTest2::SetUp() {
rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
@@ -197,8 +191,6 @@ void AV1KmeansTest2::SetUp() {
}
}
-void AV1KmeansTest2::TearDown() {}
-
void AV1KmeansTest2::RunCheckOutput(av1_calc_indices_dim2_func test_impl,
BLOCK_SIZE bsize, int k) {
const int w = block_size_wide[bsize];
diff --git a/test/av1_nn_predict_test.cc b/test/av1_nn_predict_test.cc
index 48504c80b..4201ea6ce 100644
--- a/test/av1_nn_predict_test.cc
+++ b/test/av1_nn_predict_test.cc
@@ -34,7 +34,7 @@ const float epsilon = 1e-3f; // Error threshold for functional equivalence
class NnPredictTest : public ::testing::TestWithParam<NnPredictTestParam> {
public:
- virtual void SetUp() {
+ void SetUp() override {
const int MAX_NODES2 = NN_MAX_NODES_PER_LAYER * NN_MAX_NODES_PER_LAYER;
// Allocate two massive buffers on the heap for edge weights and node bias
// Then set-up the double-dimension arrays pointing into the big buffers
@@ -51,7 +51,7 @@ class NnPredictTest : public ::testing::TestWithParam<NnPredictTestParam> {
}
target_func_ = GET_PARAM(0);
}
- virtual void TearDown() {
+ void TearDown() override {
aom_free(weights_buf);
aom_free(bias_buf);
}
@@ -65,8 +65,8 @@ class NnPredictTest : public ::testing::TestWithParam<NnPredictTestParam> {
private:
NnPredict_Func target_func_;
libaom_test::ACMRandom rng_;
- float *weights[NN_MAX_HIDDEN_LAYERS + 1] = { 0 };
- float *bias[NN_MAX_HIDDEN_LAYERS + 1] = { 0 };
+ float *weights[NN_MAX_HIDDEN_LAYERS + 1] = {};
+ float *bias[NN_MAX_HIDDEN_LAYERS + 1] = {};
float *weights_buf = nullptr, *bias_buf = nullptr;
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(NnPredictTest);
@@ -176,13 +176,15 @@ void NnPredictTest::RunNnPredictSpeedTest(const NN_CONFIG *const shape,
// runs of the encoder. It also conveniently covers all the kernels
// implemented.
static const NN_CONFIG kShapes[] = {
- { 10, 16, 1, { 64 }, { 0 }, { 0 } }, { 12, 1, 1, { 12 }, { 0 }, { 0 } },
- { 12, 1, 1, { 24 }, { 0 }, { 0 } }, { 12, 1, 1, { 32 }, { 0 }, { 0 } },
- { 18, 4, 1, { 24 }, { 0 }, { 0 } }, { 18, 4, 1, { 32 }, { 0 }, { 0 } },
- { 4, 1, 1, { 16 }, { 0 }, { 0 } }, { 8, 1, 1, { 16 }, { 0 }, { 0 } },
- { 8, 4, 1, { 16 }, { 0 }, { 0 } }, { 8, 1, 1, { 24 }, { 0 }, { 0 } },
- { 8, 1, 1, { 32 }, { 0 }, { 0 } }, { 8, 1, 1, { 64 }, { 0 }, { 0 } },
- { 9, 3, 1, { 32 }, { 0 }, { 0 } }, { 4, 4, 1, { 8 }, { 0 }, { 0 } },
+ { 37, 1, 2, { 16, 24 }, {}, {} }, { 24, 24, 1, { 12 }, {}, {} },
+ { 10, 16, 1, { 64 }, {}, {} }, { 12, 1, 1, { 12 }, {}, {} },
+ { 12, 1, 1, { 24 }, {}, {} }, { 12, 1, 1, { 32 }, {}, {} },
+ { 18, 4, 1, { 24 }, {}, {} }, { 18, 4, 1, { 32 }, {}, {} },
+ { 4, 1, 1, { 16 }, {}, {} }, { 8, 1, 0, { 0 }, {}, {} },
+ { 8, 4, 1, { 16 }, {}, {} }, { 8, 1, 1, { 32 }, {}, {} },
+ { 9, 3, 1, { 32 }, {}, {} }, { 8, 4, 0, { 0 }, {}, {} },
+ { 8, 8, 0, { 0 }, {}, {} }, { 4, 4, 1, { 8 }, {}, {} },
+ { 4, 3, 0, { 64 }, {}, {} },
};
void NnPredictTest::RunNnPredictTest_all(const NN_CONFIG *const shapes,
@@ -206,14 +208,21 @@ TEST_P(NnPredictTest, DISABLED_Speed) {
10000000);
}
-#if HAVE_SSE3 && !CONFIG_EXCLUDE_SIMD_MISMATCH
+#if !CONFIG_EXCLUDE_SIMD_MISMATCH
+#if HAVE_SSE3
INSTANTIATE_TEST_SUITE_P(SSE3, NnPredictTest,
::testing::Values(av1_nn_predict_sse3));
#endif
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, NnPredictTest,
+ ::testing::Values(av1_nn_predict_avx2));
+#endif
+
#if HAVE_NEON
INSTANTIATE_TEST_SUITE_P(NEON, NnPredictTest,
::testing::Values(av1_nn_predict_neon));
#endif
+#endif // !CONFIG_EXCLUDE_SIMD_MISMATCH
} // namespace
diff --git a/test/av1_quantize_test.cc b/test/av1_quantize_test.cc
index 582364703..c8af14a35 100644
--- a/test/av1_quantize_test.cc
+++ b/test/av1_quantize_test.cc
@@ -183,11 +183,9 @@ class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
}
}
- virtual void SetUp() { params_ = GetParam(); }
+ void SetUp() override { params_ = GetParam(); }
- virtual void TearDown() {}
-
- virtual ~AV1QuantizeTest() {}
+ ~AV1QuantizeTest() override = default;
private:
TX_SIZE getTxSize(int count) {
diff --git a/test/av1_round_shift_array_test.cc b/test/av1_round_shift_array_test.cc
index facb84b55..937e8645a 100644
--- a/test/av1_round_shift_array_test.cc
+++ b/test/av1_round_shift_array_test.cc
@@ -39,10 +39,11 @@ typedef std::tuple<comp_round_shift_array_func, BLOCK_SIZE, int>
class AV1CompRoundShiftTest
: public ::testing::TestWithParam<CompRoundShiftParam> {
public:
- ~AV1CompRoundShiftTest();
+ ~AV1CompRoundShiftTest() override;
- void SetUp() { rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed()); }
- void TearDown() {}
+ void SetUp() override {
+ rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+ }
protected:
void RunCheckOutput(comp_round_shift_array_func test_impl, BLOCK_SIZE bsize,
@@ -54,7 +55,7 @@ class AV1CompRoundShiftTest
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1CompRoundShiftTest);
-AV1CompRoundShiftTest::~AV1CompRoundShiftTest() {}
+AV1CompRoundShiftTest::~AV1CompRoundShiftTest() = default;
void AV1CompRoundShiftTest::RunCheckOutput(
comp_round_shift_array_func test_impl, BLOCK_SIZE bsize, int bit) {
diff --git a/test/av1_softmax_test.cc b/test/av1_softmax_test.cc
index 60c7b6f81..2b04af134 100644
--- a/test/av1_softmax_test.cc
+++ b/test/av1_softmax_test.cc
@@ -35,7 +35,7 @@ constexpr float kAbsEpsilon = 5e-3f;
class FastSoftmaxTest : public ::testing::TestWithParam<FastSoftmaxTestParams> {
public:
FastSoftmaxTest() : target_fn_(GET_PARAM(0)), num_classes_(GET_PARAM(1)) {}
- virtual void SetUp() {
+ void SetUp() override {
ref_buf_.reset(new (std::nothrow) float[num_classes_]());
ASSERT_NE(ref_buf_, nullptr);
dst_buf_.reset(new (std::nothrow) float[num_classes_]());
diff --git a/test/av1_temporal_denoiser_test.cc b/test/av1_temporal_denoiser_test.cc
index 571fd926a..7aa8fb6a6 100644
--- a/test/av1_temporal_denoiser_test.cc
+++ b/test/av1_temporal_denoiser_test.cc
@@ -43,11 +43,9 @@ class AV1DenoiserTest
: public ::testing::Test,
public ::testing::WithParamInterface<AV1DenoiserTestParam> {
public:
- virtual ~AV1DenoiserTest() {}
+ ~AV1DenoiserTest() override = default;
- virtual void SetUp() { bs_ = GET_PARAM(1); }
-
- virtual void TearDown() {}
+ void SetUp() override { bs_ = GET_PARAM(1); }
protected:
BLOCK_SIZE bs_;
diff --git a/test/av1_wedge_utils_test.cc b/test/av1_wedge_utils_test.cc
index 46f6d923c..1055ff35b 100644
--- a/test/av1_wedge_utils_test.cc
+++ b/test/av1_wedge_utils_test.cc
@@ -379,6 +379,16 @@ INSTANTIATE_TEST_SUITE_P(
NEON, WedgeUtilsSSEOptTest,
::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_c,
av1_wedge_sse_from_residuals_neon)));
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON, WedgeUtilsSignOptTest,
+ ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_c,
+ av1_wedge_sign_from_residuals_neon)));
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON, WedgeUtilsDeltaSquaresOptTest,
+ ::testing::Values(TestFuncsFDS(av1_wedge_compute_delta_squares_c,
+ av1_wedge_compute_delta_squares_neon)));
#endif // HAVE_NEON
#if HAVE_AVX2
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 886591583..d7817a858 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -35,7 +35,7 @@ class AverageTestBase : public ::testing::Test {
: width_(width), height_(height), source_data_(nullptr),
source_stride_(0), bit_depth_(bit_depth) {}
- virtual void TearDown() {
+ void TearDown() override {
aom_free(source_data_);
source_data_ = nullptr;
}
@@ -47,7 +47,7 @@ class AverageTestBase : public ::testing::Test {
static const int kDataBlockHeight = 128;
static const int kDataBlockSize = kDataBlockWidth * kDataBlockHeight;
- virtual void SetUp() {
+ void SetUp() override {
const testing::TestInfo *const test_info =
testing::UnitTest::GetInstance()->current_test_info();
// Skip the speed test for C code as the baseline uses the same function.
@@ -378,7 +378,7 @@ class IntProRowTest : public AverageTestBase<uint8_t>,
}
protected:
- virtual void SetUp() {
+ void SetUp() override {
source_data_ = static_cast<uint8_t *>(
aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
ASSERT_NE(source_data_, nullptr);
@@ -391,7 +391,7 @@ class IntProRowTest : public AverageTestBase<uint8_t>,
ASSERT_NE(hbuf_c_, nullptr);
}
- virtual void TearDown() {
+ void TearDown() override {
aom_free(source_data_);
source_data_ = nullptr;
aom_free(hbuf_c_);
@@ -469,7 +469,7 @@ class IntProColTest : public AverageTestBase<uint8_t>,
}
protected:
- virtual void SetUp() {
+ void SetUp() override {
source_data_ = static_cast<uint8_t *>(
aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
ASSERT_NE(source_data_, nullptr);
@@ -482,7 +482,7 @@ class IntProColTest : public AverageTestBase<uint8_t>,
ASSERT_NE(vbuf_c_, nullptr);
}
- virtual void TearDown() {
+ void TearDown() override {
aom_free(source_data_);
source_data_ = nullptr;
aom_free(vbuf_c_);
@@ -582,13 +582,13 @@ TEST_P(IntProColTest, DISABLED_Speed) {
class VectorVarTestBase : public ::testing::Test {
public:
explicit VectorVarTestBase(int bwl) { m_bwl = bwl; }
- VectorVarTestBase() {}
- ~VectorVarTestBase() {}
+ VectorVarTestBase() = default;
+ ~VectorVarTestBase() override = default;
protected:
static const int kDataAlignment = 16;
- virtual void SetUp() {
+ void SetUp() override {
width = 4 << m_bwl;
ref_vector = static_cast<int16_t *>(
@@ -600,7 +600,7 @@ class VectorVarTestBase : public ::testing::Test {
rnd_.Reset(ACMRandom::DeterministicSeed());
}
- virtual void TearDown() {
+ void TearDown() override {
aom_free(ref_vector);
ref_vector = nullptr;
aom_free(src_vector);
@@ -883,13 +883,13 @@ class SatdTestBase
satd_func_ref_ = func_param.func_ref;
satd_func_simd_ = func_param.func_simd;
}
- virtual void SetUp() {
+ void SetUp() override {
rnd_.Reset(ACMRandom::DeterministicSeed());
src_ = reinterpret_cast<CoeffType *>(
aom_memalign(32, sizeof(*src_) * satd_size_));
ASSERT_NE(src_, nullptr);
}
- virtual void TearDown() { aom_free(src_); }
+ void TearDown() override { aom_free(src_); }
void FillConstant(const CoeffType val) {
for (int i = 0; i < satd_size_; ++i) src_[i] = val;
}
@@ -963,13 +963,13 @@ class SatdTest : public SatdTestBase<tran_low_t, SatdFunc> {
};
TEST_P(SatdTest, MinValue) {
- const int kMin = -32640;
+ const int kMin = -524287;
const int expected = -kMin * satd_size_;
FillConstant(kMin);
Check(expected);
}
TEST_P(SatdTest, MaxValue) {
- const int kMax = 32640;
+ const int kMax = 524287;
const int expected = kMax * satd_size_;
FillConstant(kMax);
Check(expected);
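The widened SATD bounds line up with known constants; the bit-depth interpretation below is an editorial assumption, not stated in the patch:

// Hypothetical sanity checks on where the bounds come from (assumption:
// 32640 is the 8-bit input limit, 524287 the 12-bit coefficient limit).
static_assert(32640 == 255 * (1 << 7), "old bound: max 8-bit sample << 7");
static_assert(524287 == (1 << 19) - 1, "new bound: 19-bit signed maximum");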
diff --git a/test/blend_a64_mask_1d_test.cc b/test/blend_a64_mask_1d_test.cc
index 9a9598704..f9549bccb 100644
--- a/test/blend_a64_mask_1d_test.cc
+++ b/test/blend_a64_mask_1d_test.cc
@@ -41,13 +41,13 @@ class BlendA64Mask1DTest : public FunctionEquivalenceTest<F> {
static const int kMaxMaskWidth = 2 * MAX_SB_SIZE;
static const int kMaxMaskSize = kMaxMaskWidth;
- virtual ~BlendA64Mask1DTest() {}
+ ~BlendA64Mask1DTest() override = default;
virtual void Execute(const T *p_src0, const T *p_src1) = 0;
- void Common() {
- w_ = 2 << this->rng_(MAX_SB_SIZE_LOG2);
- h_ = 2 << this->rng_(MAX_SB_SIZE_LOG2);
+ void Common(int block_size) {
+ w_ = block_size_wide[block_size];
+ h_ = block_size_high[block_size];
dst_offset_ = this->rng_(33);
dst_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
@@ -121,7 +121,7 @@ typedef libaom_test::FuncParam<F8B> TestFuncs;
class BlendA64Mask1DTest8B : public BlendA64Mask1DTest<F8B, uint8_t> {
protected:
- void Execute(const uint8_t *p_src0, const uint8_t *p_src1) {
+ void Execute(const uint8_t *p_src0, const uint8_t *p_src1) override {
params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_,
w_, h_);
@@ -132,7 +132,7 @@ class BlendA64Mask1DTest8B : public BlendA64Mask1DTest<F8B, uint8_t> {
};
TEST_P(BlendA64Mask1DTest8B, RandomValues) {
- for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
for (int i = 0; i < kBufSize; ++i) {
dst_ref_[i] = rng_.Rand8();
dst_tst_[i] = rng_.Rand8();
@@ -144,23 +144,23 @@ TEST_P(BlendA64Mask1DTest8B, RandomValues) {
for (int i = 0; i < kMaxMaskSize; ++i)
mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
- Common();
+ Common(bsize);
}
}
TEST_P(BlendA64Mask1DTest8B, ExtremeValues) {
- for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
- for (int i = 0; i < kBufSize; ++i) {
- dst_ref_[i] = rng_(2) + 254;
- dst_tst_[i] = rng_(2) + 254;
- src0_[i] = rng_(2) + 254;
- src1_[i] = rng_(2) + 254;
- }
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = rng_(2) + 254;
+ dst_tst_[i] = rng_(2) + 254;
+ src0_[i] = rng_(2) + 254;
+ src1_[i] = rng_(2) + 254;
+ }
- for (int i = 0; i < kMaxMaskSize; ++i)
- mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
- Common();
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ Common(bsize);
}
}
@@ -227,7 +227,7 @@ typedef libaom_test::FuncParam<FHBD> TestFuncsHBD;
class BlendA64Mask1DTestHBD : public BlendA64Mask1DTest<FHBD, uint16_t> {
protected:
- void Execute(const uint16_t *p_src0, const uint16_t *p_src1) {
+ void Execute(const uint16_t *p_src0, const uint16_t *p_src1) override {
params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
@@ -243,37 +243,27 @@ class BlendA64Mask1DTestHBD : public BlendA64Mask1DTest<FHBD, uint16_t> {
};
TEST_P(BlendA64Mask1DTestHBD, RandomValues) {
- for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
- switch (rng_(3)) {
- case 0: bit_depth_ = 8; break;
- case 1: bit_depth_ = 10; break;
- default: bit_depth_ = 12; break;
- }
+ for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) {
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ const int hi = 1 << bit_depth_;
+
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = rng_(hi);
+ dst_tst_[i] = rng_(hi);
+ src0_[i] = rng_(hi);
+ src1_[i] = rng_(hi);
+ }
- const int hi = 1 << bit_depth_;
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
- for (int i = 0; i < kBufSize; ++i) {
- dst_ref_[i] = rng_(hi);
- dst_tst_[i] = rng_(hi);
- src0_[i] = rng_(hi);
- src1_[i] = rng_(hi);
+ Common(bsize);
}
-
- for (int i = 0; i < kMaxMaskSize; ++i)
- mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
-
- Common();
}
}
TEST_P(BlendA64Mask1DTestHBD, ExtremeValues) {
- for (int iter = 0; iter < 1000 && !HasFatalFailure(); ++iter) {
- switch (rng_(3)) {
- case 0: bit_depth_ = 8; break;
- case 1: bit_depth_ = 10; break;
- default: bit_depth_ = 12; break;
- }
-
+ for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) {
const int hi = 1 << bit_depth_;
const int lo = hi - 2;
@@ -287,7 +277,9 @@ TEST_P(BlendA64Mask1DTestHBD, ExtremeValues) {
for (int i = 0; i < kMaxMaskSize; ++i)
mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
- Common();
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ Common(bsize);
+ }
}
}
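The same deterministic bit-depth sweep replaces rng_(3) sampling in every high-bit-depth test touched by this patch. The shape of the loop, as a sketch:

// 8, 10 and 12 are the only AV1 bit depths, so stepping by 2 visits each
// exactly once per run instead of randomly sampling via rng_(3).
for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) {
  const int hi = 1 << bit_depth_;  // exclusive upper bound for sample values
  // ... fill buffers with rng_(hi) and run every block size ...
}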
@@ -336,5 +328,15 @@ INSTANTIATE_TEST_SUITE_P(
TestFuncsHBD(highbd_blend_a64_vmask_ref,
aom_highbd_blend_a64_vmask_sse4_1)));
#endif // HAVE_SSE4_1
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, BlendA64Mask1DTestHBD,
+ ::testing::Values(TestFuncsHBD(highbd_blend_a64_hmask_ref,
+ aom_highbd_blend_a64_hmask_neon),
+ TestFuncsHBD(highbd_blend_a64_vmask_ref,
+ aom_highbd_blend_a64_vmask_neon)));
+#endif // HAVE_NEON
+
#endif // CONFIG_AV1_HIGHBITDEPTH
} // namespace
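Both 1D blend tests now derive their dimensions from libaom's block-size tables instead of random powers of two. A sketch of the lookup:

// block_size_wide[] / block_size_high[] map a BLOCK_SIZE enum value to its
// pixel dimensions, so the sweep covers every real AV1 block shape.
for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
  const int w = block_size_wide[bsize];  // e.g. BLOCK_8X4 -> 8
  const int h = block_size_high[bsize];  // e.g. BLOCK_8X4 -> 4
  // ... run the blend kernels on a w x h block ...
}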
diff --git a/test/blend_a64_mask_test.cc b/test/blend_a64_mask_test.cc
index 9dece5737..fafc7f032 100644
--- a/test/blend_a64_mask_test.cc
+++ b/test/blend_a64_mask_test.cc
@@ -41,7 +41,7 @@ class BlendA64MaskTest : public FunctionEquivalenceTest<BlendA64Func> {
static const int kMaxMaskWidth = 2 * MAX_SB_SIZE;
static const int kMaxMaskSize = kMaxMaskWidth * kMaxMaskWidth;
- virtual ~BlendA64MaskTest() {}
+ ~BlendA64MaskTest() override = default;
virtual void Execute(const SrcPixel *p_src0, const SrcPixel *p_src1,
int run_times) = 0;
@@ -123,9 +123,11 @@ class BlendA64MaskTest : public FunctionEquivalenceTest<BlendA64Func> {
}
void RunTest(int block_size, int run_times) {
- subx_ = Rand1();
- suby_ = Rand1();
- RunOneTest(block_size, subx_, suby_, run_times);
+ for (subx_ = 0; subx_ <= 1; subx_++) {
+ for (suby_ = 0; suby_ <= 1; suby_++) {
+ RunOneTest(block_size, subx_, suby_, run_times);
+ }
+ }
}
DstPixel dst_ref_[kBufSize];
@@ -163,7 +165,8 @@ typedef libaom_test::FuncParam<F8B> TestFuncs;
class BlendA64MaskTest8B : public BlendA64MaskTest<F8B, uint8_t, uint8_t> {
protected:
- void Execute(const uint8_t *p_src0, const uint8_t *p_src1, int run_times) {
+ void Execute(const uint8_t *p_src0, const uint8_t *p_src1,
+ int run_times) override {
aom_usec_timer timer;
aom_usec_timer_start(&timer);
for (int i = 0; i < run_times; ++i) {
@@ -193,8 +196,7 @@ class BlendA64MaskTest8B : public BlendA64MaskTest<F8B, uint8_t, uint8_t> {
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlendA64MaskTest8B);
TEST_P(BlendA64MaskTest8B, RandomValues) {
- for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
- int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL && !HasFatalFailure(); ++bsize) {
for (int i = 0; i < kBufSize; ++i) {
dst_ref_[i] = rng_.Rand8();
dst_tst_[i] = rng_.Rand8();
@@ -211,21 +213,20 @@ TEST_P(BlendA64MaskTest8B, RandomValues) {
}
TEST_P(BlendA64MaskTest8B, ExtremeValues) {
- for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
- int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
- for (int i = 0; i < kBufSize; ++i) {
- dst_ref_[i] = rng_(2) + 254;
- dst_tst_[i] = rng_(2) + 254;
- src0_[i] = rng_(2) + 254;
- src1_[i] = rng_(2) + 254;
- }
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = rng_(2) + 254;
+ dst_tst_[i] = rng_(2) + 254;
+ src0_[i] = rng_(2) + 254;
+ src1_[i] = rng_(2) + 254;
+ }
- for (int i = 0; i < kMaxMaskSize; ++i)
- mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL && !HasFatalFailure(); ++bsize)
RunTest(bsize, 1);
- }
}
+
TEST_P(BlendA64MaskTest8B, DISABLED_Speed) {
const int kRunTimes = 10000000;
for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
@@ -240,10 +241,7 @@ TEST_P(BlendA64MaskTest8B, DISABLED_Speed) {
for (int i = 0; i < kMaxMaskSize; ++i)
mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
- RunOneTest(bsize, 1, 1, kRunTimes);
- RunOneTest(bsize, 1, 0, kRunTimes);
- RunOneTest(bsize, 0, 1, kRunTimes);
- RunOneTest(bsize, 0, 0, kRunTimes);
+ RunTest(bsize, kRunTimes);
}
}
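RunTest() now owns the sub-pel sweep, so call sites shrink to a single line:

// RunTest(bsize, kRunTimes) is equivalent to the four explicit calls it
// replaces:
//   RunOneTest(bsize, 0, 0, kRunTimes);  RunOneTest(bsize, 0, 1, kRunTimes);
//   RunOneTest(bsize, 1, 0, kRunTimes);  RunOneTest(bsize, 1, 1, kRunTimes);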
#if HAVE_SSE4_1
@@ -258,6 +256,12 @@ INSTANTIATE_TEST_SUITE_P(AVX2, BlendA64MaskTest8B,
aom_blend_a64_mask_avx2)));
#endif // HAVE_AVX2
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, BlendA64MaskTest8B,
+ ::testing::Values(TestFuncs(aom_blend_a64_mask_c,
+ aom_blend_a64_mask_neon)));
+#endif // HAVE_NEON
+
//////////////////////////////////////////////////////////////////////////////
// 8 bit _d16 version
//////////////////////////////////////////////////////////////////////////////
@@ -275,7 +279,8 @@ class BlendA64MaskTest8B_d16
// max number of bits used by the source
static const int kSrcMaxBitsMask = 0x3fff;
- void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) {
+ void Execute(const uint16_t *p_src0, const uint16_t *p_src1,
+ int run_times) override {
ConvolveParams conv_params;
conv_params.round_0 = ROUND0_BITS;
conv_params.round_1 = COMPOUND_ROUND1_BITS;
@@ -308,8 +313,7 @@ class BlendA64MaskTest8B_d16
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlendA64MaskTest8B_d16);
TEST_P(BlendA64MaskTest8B_d16, RandomValues) {
- for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
- int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL && !HasFatalFailure(); ++bsize) {
for (int i = 0; i < kBufSize; ++i) {
dst_ref_[i] = rng_.Rand8();
dst_tst_[i] = rng_.Rand8();
@@ -326,20 +330,35 @@ TEST_P(BlendA64MaskTest8B_d16, RandomValues) {
}
TEST_P(BlendA64MaskTest8B_d16, ExtremeValues) {
- for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
- int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = 255;
+ dst_tst_[i] = 255;
+
+ src0_[i] = kSrcMaxBitsMask;
+ src1_[i] = kSrcMaxBitsMask;
+ }
+
+ for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = AOM_BLEND_A64_MAX_ALPHA - 1;
+
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL && !HasFatalFailure(); ++bsize)
+ RunTest(bsize, 1);
+}
+
+TEST_P(BlendA64MaskTest8B_d16, DISABLED_Speed) {
+ const int kRunTimes = 10000000;
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
for (int i = 0; i < kBufSize; ++i) {
- dst_ref_[i] = 255;
- dst_tst_[i] = 255;
+ dst_ref_[i] = rng_.Rand8();
+ dst_tst_[i] = rng_.Rand8();
- src0_[i] = kSrcMaxBitsMask;
- src1_[i] = kSrcMaxBitsMask;
+ src0_[i] = rng_.Rand16() & kSrcMaxBitsMask;
+ src1_[i] = rng_.Rand16() & kSrcMaxBitsMask;
}
for (int i = 0; i < kMaxMaskSize; ++i)
- mask_[i] = AOM_BLEND_A64_MAX_ALPHA - 1;
+ mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
- RunTest(bsize, 1);
+ RunTest(bsize, kRunTimes);
}
}
@@ -377,7 +396,8 @@ typedef libaom_test::FuncParam<FHBD> TestFuncsHBD;
class BlendA64MaskTestHBD : public BlendA64MaskTest<FHBD, uint16_t, uint16_t> {
protected:
- void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) {
+ void Execute(const uint16_t *p_src0, const uint16_t *p_src1,
+ int run_times) override {
aom_usec_timer timer;
aom_usec_timer_start(&timer);
for (int i = 0; i < run_times; ++i) {
@@ -409,53 +429,46 @@ class BlendA64MaskTestHBD : public BlendA64MaskTest<FHBD, uint16_t, uint16_t> {
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlendA64MaskTestHBD);
TEST_P(BlendA64MaskTestHBD, RandomValues) {
- for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
- int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
- switch (rng_(3)) {
- case 0: bit_depth_ = 8; break;
- case 1: bit_depth_ = 10; break;
- default: bit_depth_ = 12; break;
- }
-
+ for (bit_depth_ = 8; bit_depth_ <= 12 && !HasFatalFailure();
+ bit_depth_ += 2) {
const int hi = 1 << bit_depth_;
- for (int i = 0; i < kBufSize; ++i) {
- dst_ref_[i] = rng_(hi);
- dst_tst_[i] = rng_(hi);
- src0_[i] = rng_(hi);
- src1_[i] = rng_(hi);
- }
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = rng_(hi);
+ dst_tst_[i] = rng_(hi);
+ src0_[i] = rng_(hi);
+ src1_[i] = rng_(hi);
+ }
- for (int i = 0; i < kMaxMaskSize; ++i)
- mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
- RunTest(bsize, 1);
+ RunTest(bsize, 1);
+ }
}
}
TEST_P(BlendA64MaskTestHBD, ExtremeValues) {
- for (int iter = 0; iter < 1000 && !HasFatalFailure(); ++iter) {
- int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
- switch (rng_(3)) {
- case 0: bit_depth_ = 8; break;
- case 1: bit_depth_ = 10; break;
- default: bit_depth_ = 12; break;
- }
-
+ for (bit_depth_ = 8; bit_depth_ <= 12 && !HasFatalFailure();
+ bit_depth_ += 2) {
const int hi = 1 << bit_depth_;
const int lo = hi - 2;
- for (int i = 0; i < kBufSize; ++i) {
- dst_ref_[i] = rng_(hi - lo) + lo;
- dst_tst_[i] = rng_(hi - lo) + lo;
- src0_[i] = rng_(hi - lo) + lo;
- src1_[i] = rng_(hi - lo) + lo;
- }
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL && !HasFatalFailure();
+ ++bsize) {
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = rng_(hi - lo) + lo;
+ dst_tst_[i] = rng_(hi - lo) + lo;
+ src0_[i] = rng_(hi - lo) + lo;
+ src1_[i] = rng_(hi - lo) + lo;
+ }
- for (int i = 0; i < kMaxMaskSize; ++i)
- mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
- RunTest(bsize, 1);
+ RunTest(bsize, 1);
+ }
}
}
@@ -466,6 +479,13 @@ INSTANTIATE_TEST_SUITE_P(
aom_highbd_blend_a64_mask_sse4_1)));
#endif // HAVE_SSE4_1
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, BlendA64MaskTestHBD,
+ ::testing::Values(TestFuncsHBD(aom_highbd_blend_a64_mask_c,
+ aom_highbd_blend_a64_mask_neon)));
+#endif // HAVE_NEON
+
//////////////////////////////////////////////////////////////////////////////
// HBD _d16 version
//////////////////////////////////////////////////////////////////////////////
@@ -485,7 +505,8 @@ class BlendA64MaskTestHBD_d16
static const int kSrcMaxBitsMask = (1 << 14) - 1;
static const int kSrcMaxBitsMaskHBD = (1 << 16) - 1;
- void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) {
+ void Execute(const uint16_t *p_src0, const uint16_t *p_src1,
+ int run_times) override {
ASSERT_GT(run_times, 0) << "Cannot run 0 iterations of the test.";
ConvolveParams conv_params;
conv_params.round_0 = (bit_depth_ == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
@@ -526,51 +547,49 @@ class BlendA64MaskTestHBD_d16
TEST_P(BlendA64MaskTestHBD_d16, RandomValues) {
if (params_.tst_func == nullptr) return;
- for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
- int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
- switch (rng_(3)) {
- case 0: bit_depth_ = 8; break;
- case 1: bit_depth_ = 10; break;
- default: bit_depth_ = 12; break;
- }
+ for (bit_depth_ = 8; bit_depth_ <= 12 && !HasFatalFailure();
+ bit_depth_ += 2) {
src_max_bits_mask_ =
(bit_depth_ == 8) ? kSrcMaxBitsMask : kSrcMaxBitsMaskHBD;
- for (int i = 0; i < kBufSize; ++i) {
- dst_ref_[i] = rng_.Rand8();
- dst_tst_[i] = rng_.Rand8();
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL && !HasFatalFailure();
+ ++bsize) {
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = rng_.Rand8();
+ dst_tst_[i] = rng_.Rand8();
- src0_[i] = rng_.Rand16() & src_max_bits_mask_;
- src1_[i] = rng_.Rand16() & src_max_bits_mask_;
- }
+ src0_[i] = rng_.Rand16() & src_max_bits_mask_;
+ src1_[i] = rng_.Rand16() & src_max_bits_mask_;
+ }
- for (int i = 0; i < kMaxMaskSize; ++i)
- mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
- RunTest(bsize, 1);
+ RunTest(bsize, 1);
+ }
}
}
-// TODO (Scott LaVarnway), fix this test
-TEST_P(BlendA64MaskTestHBD_d16, DISABLED_SaturatedValues) {
- for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
- for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) {
- src_max_bits_mask_ =
- (bit_depth_ == 8) ? kSrcMaxBitsMask : kSrcMaxBitsMaskHBD;
- for (int i = 0; i < kBufSize; ++i) {
- dst_ref_[i] = 0;
- dst_tst_[i] = (1 << bit_depth_) - 1;
+TEST_P(BlendA64MaskTestHBD_d16, ExtremeValues) {
+ for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) {
+ src_max_bits_mask_ =
+ (bit_depth_ == 8) ? kSrcMaxBitsMask : kSrcMaxBitsMaskHBD;
- src0_[i] = src_max_bits_mask_;
- src1_[i] = src_max_bits_mask_;
- }
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = 0;
+ dst_tst_[i] = (1 << bit_depth_) - 1;
- for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = AOM_BLEND_A64_MAX_ALPHA;
+ src0_[i] = src_max_bits_mask_;
+ src1_[i] = src_max_bits_mask_;
+ }
+ for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = AOM_BLEND_A64_MAX_ALPHA;
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
RunTest(bsize, 1);
}
}
}
+
TEST_P(BlendA64MaskTestHBD_d16, DISABLED_Speed) {
const int kRunTimes = 10000000;
for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
@@ -586,15 +605,15 @@ TEST_P(BlendA64MaskTestHBD_d16, DISABLED_Speed) {
for (int i = 0; i < kMaxMaskSize; ++i)
mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
- RunOneTest(bsize, 1, 1, kRunTimes);
- RunOneTest(bsize, 0, 0, kRunTimes);
+ RunTest(bsize, kRunTimes);
}
}
}
-INSTANTIATE_TEST_SUITE_P(C, BlendA64MaskTestHBD_d16,
- ::testing::Values(TestFuncsHBD_d16(
- aom_highbd_blend_a64_d16_mask_c, nullptr)));
+INSTANTIATE_TEST_SUITE_P(
+ C, BlendA64MaskTestHBD_d16,
+ ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c,
+ aom_highbd_blend_a64_d16_mask_c)));
#if HAVE_SSE4_1
INSTANTIATE_TEST_SUITE_P(
@@ -610,6 +629,13 @@ INSTANTIATE_TEST_SUITE_P(
aom_highbd_blend_a64_d16_mask_avx2)));
#endif // HAVE_AVX2
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, BlendA64MaskTestHBD_d16,
+ ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c,
+ aom_highbd_blend_a64_d16_mask_neon)));
+#endif // HAVE_NEON
+
// TODO(slavarnway): Enable the following in the avx2 commit. (56501)
#if 0
#if HAVE_AVX2
diff --git a/test/block_test.cc b/test/block_test.cc
index 74deee3f5..686180cf8 100644
--- a/test/block_test.cc
+++ b/test/block_test.cc
@@ -140,9 +140,9 @@ class SuperBlockSizeTestLarge
superblock_size_(GET_PARAM(2)), rc_end_usage_(GET_PARAM(3)) {
sb_size_violated_ = false;
}
- virtual ~SuperBlockSizeTestLarge() {}
+ ~SuperBlockSizeTestLarge() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(encoding_mode_);
const aom_rational timebase = { 1, 30 };
cfg_.g_timebase = timebase;
@@ -152,10 +152,10 @@ class SuperBlockSizeTestLarge
cfg_.rc_target_bitrate = 1000;
}
- virtual bool DoDecode() const { return 1; }
+ bool DoDecode() const override { return true; }
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, 5);
encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
@@ -163,8 +163,8 @@ class SuperBlockSizeTestLarge
}
}
- virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
- libaom_test::Decoder *decoder) {
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
if (AOM_CODEC_OK == res_dec &&
superblock_size_ != AOM_SUPERBLOCK_SIZE_DYNAMIC) {
diff --git a/test/borders_test.cc b/test/borders_test.cc
index bf9cc8b1a..594c3e842 100644
--- a/test/borders_test.cc
+++ b/test/borders_test.cc
@@ -24,12 +24,12 @@ class BordersTestLarge
public ::libaom_test::EncoderTest {
protected:
BordersTestLarge() : EncoderTest(GET_PARAM(0)) {}
- virtual ~BordersTestLarge() {}
+ ~BordersTestLarge() override = default;
- virtual void SetUp() { InitializeConfig(GET_PARAM(1)); }
+ void SetUp() override { InitializeConfig(GET_PARAM(1)); }
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, 1);
encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
@@ -38,7 +38,7 @@ class BordersTestLarge
}
}
- virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
if (pkt->data.frame.flags & AOM_FRAME_IS_KEY) {
}
}
diff --git a/test/cdef_test.cc b/test/cdef_test.cc
index 3f971be43..5959dabd4 100644
--- a/test/cdef_test.cc
+++ b/test/cdef_test.cc
@@ -38,8 +38,8 @@ typedef std::tuple<CdefFilterBlockFunctions, CdefFilterBlockFunctions,
class CDEFBlockTest : public ::testing::TestWithParam<cdef_dir_param_t> {
public:
- virtual ~CDEFBlockTest() {}
- virtual void SetUp() {
+ ~CDEFBlockTest() override = default;
+ void SetUp() override {
cdef = GET_PARAM(0);
ref_cdef = GET_PARAM(1);
bsize = GET_PARAM(2);
@@ -47,10 +47,8 @@ class CDEFBlockTest : public ::testing::TestWithParam<cdef_dir_param_t> {
depth = GET_PARAM(4);
}
- virtual void TearDown() {}
-
protected:
- int bsize;
+ BLOCK_SIZE bsize;
int boundary;
int depth;
CdefFilterBlockFunctions cdef;
@@ -67,7 +65,8 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFSpeedTest);
typedef CDEFBlockTest CDEFSpeedHighbdTest;
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFSpeedHighbdTest);
-int64_t test_cdef(int bsize, int iterations, CdefFilterBlockFunctions cdef,
+int64_t test_cdef(BLOCK_SIZE bsize, int iterations,
+ CdefFilterBlockFunctions cdef,
CdefFilterBlockFunctions ref_cdef, int boundary, int depth) {
aom_usec_timer ref_timer;
int64_t ref_elapsed_time = 0;
@@ -188,7 +187,8 @@ int64_t test_cdef(int bsize, int iterations, CdefFilterBlockFunctions cdef,
return ref_elapsed_time;
}
-void test_cdef_speed(int bsize, int iterations, CdefFilterBlockFunctions cdef,
+void test_cdef_speed(BLOCK_SIZE bsize, int iterations,
+ CdefFilterBlockFunctions cdef,
CdefFilterBlockFunctions ref_cdef, int boundary,
int depth) {
int64_t ref_elapsed_time =
@@ -213,14 +213,12 @@ typedef std::tuple<find_dir_t, find_dir_t> find_dir_param_t;
class CDEFFindDirTest : public ::testing::TestWithParam<find_dir_param_t> {
public:
- virtual ~CDEFFindDirTest() {}
- virtual void SetUp() {
+ ~CDEFFindDirTest() override = default;
+ void SetUp() override {
finddir = GET_PARAM(0);
ref_finddir = GET_PARAM(1);
}
- virtual void TearDown() {}
-
protected:
find_dir_t finddir;
find_dir_t ref_finddir;
@@ -304,14 +302,12 @@ typedef std::tuple<find_dir_dual_t, find_dir_dual_t> find_dir_dual_param_t;
class CDEFFindDirDualTest
: public ::testing::TestWithParam<find_dir_dual_param_t> {
public:
- virtual ~CDEFFindDirDualTest() {}
- virtual void SetUp() {
+ ~CDEFFindDirDualTest() override = default;
+ void SetUp() override {
finddir = GET_PARAM(0);
ref_finddir = GET_PARAM(1);
}
- virtual void TearDown() {}
-
protected:
find_dir_dual_t finddir;
find_dir_dual_t ref_finddir;
@@ -405,6 +401,175 @@ void test_finddir_dual_speed(
ref_elapsed_time, elapsed_time, ref_elapsed_time / elapsed_time);
}
+#define MAX_CDEF_BLOCK 256
+
+constexpr int kIterations = 100;
+
+using CDEFCopyRect8To16 = void (*)(uint16_t *dst, int dstride,
+ const uint8_t *src, int sstride, int width,
+ int height);
+
+using CDEFCopyRect8To16Param = std::tuple<CDEFCopyRect8To16, CDEFCopyRect8To16>;
+
+class CDEFCopyRect8to16Test
+ : public ::testing::TestWithParam<CDEFCopyRect8To16Param> {
+ public:
+ CDEFCopyRect8to16Test()
+ : rnd_(libaom_test::ACMRandom::DeterministicSeed()),
+ test_func_(GET_PARAM(0)), ref_func_(GET_PARAM(1)) {}
+ ~CDEFCopyRect8to16Test() override = default;
+ void SetUp() override {
+ src_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(8, sizeof(uint8_t) * MAX_CDEF_BLOCK * MAX_CDEF_BLOCK));
+ ASSERT_NE(src_, nullptr);
+ ref_dst_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * MAX_CDEF_BLOCK * MAX_CDEF_BLOCK));
+ ASSERT_NE(ref_dst_, nullptr);
+ test_dst_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * MAX_CDEF_BLOCK * MAX_CDEF_BLOCK));
+ ASSERT_NE(test_dst_, nullptr);
+ }
+
+ void TearDown() override {
+ aom_free(src_);
+ aom_free(ref_dst_);
+ aom_free(test_dst_);
+ }
+
+ void test_copy_rect_8_to_16(CDEFCopyRect8To16 test_func,
+ CDEFCopyRect8To16 ref_func) {
+ constexpr int stride = MAX_CDEF_BLOCK;
+ int error = 0;
+ for (int k = 0; k < kIterations && !error; k++) {
+      // Generate random dimensions: width in [1, 256], height even in [2, 256].
+      // Iteration 0 uses very small (2x2) values to avoid potential overflows.
+ const int width = k == 0 ? 2 : rnd_.Rand8() % 256 + 1;
+ const int height = k == 0 ? 2 : (rnd_.Rand8() % 128 + 1) * 2;
+ for (int i = 0; i < height; i++) {
+ for (int j = 0; j < width; j++) {
+ src_[i * stride + j] = rnd_.Rand8();
+ }
+ }
+
+ ref_func(ref_dst_, stride, src_, stride, width, height);
+ test_func(test_dst_, stride, src_, stride, width, height);
+
+ int i, j;
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ if (test_dst_[i * stride + j] != ref_dst_[i * stride + j]) {
+ error = 1;
+ break;
+ }
+ }
+ if (error) {
+ break;
+ }
+ }
+ EXPECT_EQ(0, error)
+ << "Error: CDEFCopyRect8to16Test, SIMD and C mismatch." << std::endl
+ << "First error at " << i << "," << j << " ("
+ << ref_dst_[i * stride + j] << " : " << test_dst_[i * stride + j]
+ << ") " << std::endl
+ << "width: " << width << std::endl
+ << "height: " << height << std::endl
+ << std::endl;
+ }
+ }
+
+ protected:
+ libaom_test::ACMRandom rnd_;
+ uint8_t *src_;
+ uint16_t *ref_dst_;
+ uint16_t *test_dst_;
+ CDEFCopyRect8To16 test_func_;
+ CDEFCopyRect8To16 ref_func_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFCopyRect8to16Test);
+
+using CDEFCopyRect16To16 = void (*)(uint16_t *dst, int dstride,
+ const uint16_t *src, int sstride, int width,
+ int height);
+
+using CDEFCopyRect16To16Param =
+ std::tuple<CDEFCopyRect16To16, CDEFCopyRect16To16>;
+
+class CDEFCopyRect16to16Test
+ : public ::testing::TestWithParam<CDEFCopyRect16To16Param> {
+ public:
+ CDEFCopyRect16to16Test()
+ : rnd_(libaom_test::ACMRandom::DeterministicSeed()),
+ test_func_(GET_PARAM(0)), ref_func_(GET_PARAM(1)) {}
+ ~CDEFCopyRect16to16Test() override = default;
+ void SetUp() override {
+ src_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * MAX_CDEF_BLOCK * MAX_CDEF_BLOCK));
+ ASSERT_NE(src_, nullptr);
+ ref_dst_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * MAX_CDEF_BLOCK * MAX_CDEF_BLOCK));
+ ASSERT_NE(ref_dst_, nullptr);
+ test_dst_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * MAX_CDEF_BLOCK * MAX_CDEF_BLOCK));
+ ASSERT_NE(test_dst_, nullptr);
+ }
+
+ void TearDown() override {
+ aom_free(src_);
+ aom_free(ref_dst_);
+ aom_free(test_dst_);
+ }
+
+ void test_copy_rect_16_to_16(CDEFCopyRect16To16 test_func,
+ CDEFCopyRect16To16 ref_func) {
+ constexpr int stride = MAX_CDEF_BLOCK;
+ int error = 0;
+ for (int k = 0; k < kIterations && !error; k++) {
+      // Generate random dimensions: width in [1, 256], height even in [2, 256].
+      // Iteration 0 uses very small (2x2) values to avoid potential overflows.
+ const int width = k == 0 ? 2 : rnd_.Rand8() % 256 + 1;
+ const int height = k == 0 ? 2 : (rnd_.Rand8() % 128 + 1) * 2;
+ for (int i = 0; i < height; i++) {
+ for (int j = 0; j < width; j++) {
+ src_[i * stride + j] = rnd_.Rand16();
+ }
+ }
+
+ ref_func(ref_dst_, stride, src_, stride, width, height);
+ test_func(test_dst_, stride, src_, stride, width, height);
+
+ int i, j;
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ if (test_dst_[i * stride + j] != ref_dst_[i * stride + j]) {
+ error = 1;
+ break;
+ }
+ }
+ if (error) {
+ break;
+ }
+ }
+ EXPECT_EQ(0, error)
+ << "Error: CDEFCopyRect16to16Test, SIMD and C mismatch." << std::endl
+ << "First error at " << i << "," << j << " ("
+ << ref_dst_[i * stride + j] << " : " << test_dst_[i * stride + j]
+ << ") " << std::endl
+ << "width: " << width << std::endl
+ << "height: " << height << std::endl
+ << std::endl;
+ }
+ }
+
+ protected:
+ libaom_test::ACMRandom rnd_;
+ uint16_t *src_;
+ uint16_t *ref_dst_;
+ uint16_t *test_dst_;
+ CDEFCopyRect16To16 test_func_;
+ CDEFCopyRect16To16 ref_func_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFCopyRect16to16Test);
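A worked note on the dimension generation shared by both copy-rect tests:

// Rand8() returns a value in [0, 255], so in test_copy_rect_*():
//   width  = Rand8() % 256 + 1;        // [1, 256]
//   height = (Rand8() % 128 + 1) * 2;  // [2, 256], always even
// and iteration k == 0 pins both to 2, the smallest dimensions tested.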
+
TEST_P(CDEFBlockTest, TestSIMDNoMismatch) {
test_cdef(bsize, 1, cdef, ref_cdef, boundary, depth);
}
@@ -437,6 +602,14 @@ TEST_P(CDEFFindDirDualSpeedTest, DISABLED_TestSpeed) {
test_finddir_dual_speed(finddir, ref_finddir);
}
+TEST_P(CDEFCopyRect8to16Test, TestSIMDNoMismatch) {
+ test_copy_rect_8_to_16(test_func_, ref_func_);
+}
+
+TEST_P(CDEFCopyRect16to16Test, TestSIMDNoMismatch) {
+ test_copy_rect_16_to_16(test_func_, ref_func_);
+}
+
using std::make_tuple;
#if (HAVE_SSE2 || HAVE_SSSE3 || HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON)
@@ -482,6 +655,16 @@ INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirTest,
INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirDualTest,
::testing::Values(make_tuple(&cdef_find_dir_dual_sse2,
&cdef_find_dir_dual_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, CDEFCopyRect8to16Test,
+ ::testing::Values(make_tuple(&cdef_copy_rect8_8bit_to_16bit_c,
+ &cdef_copy_rect8_8bit_to_16bit_sse2)));
+
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, CDEFCopyRect16to16Test,
+ ::testing::Values(make_tuple(&cdef_copy_rect8_16bit_to_16bit_c,
+ &cdef_copy_rect8_16bit_to_16bit_sse2)));
#endif
#if HAVE_SSSE3
@@ -515,6 +698,16 @@ INSTANTIATE_TEST_SUITE_P(SSSE3, CDEFFindDirTest,
INSTANTIATE_TEST_SUITE_P(SSSE3, CDEFFindDirDualTest,
::testing::Values(make_tuple(&cdef_find_dir_dual_ssse3,
&cdef_find_dir_dual_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+ SSSE3, CDEFCopyRect8to16Test,
+ ::testing::Values(make_tuple(&cdef_copy_rect8_8bit_to_16bit_c,
+ &cdef_copy_rect8_8bit_to_16bit_ssse3)));
+
+INSTANTIATE_TEST_SUITE_P(
+ SSSE3, CDEFCopyRect16to16Test,
+ ::testing::Values(make_tuple(&cdef_copy_rect8_16bit_to_16bit_c,
+ &cdef_copy_rect8_16bit_to_16bit_ssse3)));
#endif
#if HAVE_SSE4_1
@@ -549,6 +742,16 @@ INSTANTIATE_TEST_SUITE_P(
SSE4_1, CDEFFindDirDualTest,
::testing::Values(make_tuple(&cdef_find_dir_dual_sse4_1,
&cdef_find_dir_dual_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, CDEFCopyRect8to16Test,
+ ::testing::Values(make_tuple(&cdef_copy_rect8_8bit_to_16bit_c,
+ &cdef_copy_rect8_8bit_to_16bit_sse4_1)));
+
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, CDEFCopyRect16to16Test,
+ ::testing::Values(make_tuple(&cdef_copy_rect8_16bit_to_16bit_c,
+ &cdef_copy_rect8_16bit_to_16bit_sse4_1)));
#endif
#if HAVE_AVX2
@@ -582,6 +785,16 @@ INSTANTIATE_TEST_SUITE_P(AVX2, CDEFFindDirTest,
INSTANTIATE_TEST_SUITE_P(AVX2, CDEFFindDirDualTest,
::testing::Values(make_tuple(&cdef_find_dir_dual_avx2,
&cdef_find_dir_dual_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, CDEFCopyRect8to16Test,
+ ::testing::Values(make_tuple(&cdef_copy_rect8_8bit_to_16bit_c,
+ &cdef_copy_rect8_8bit_to_16bit_avx2)));
+
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, CDEFCopyRect16to16Test,
+ ::testing::Values(make_tuple(&cdef_copy_rect8_16bit_to_16bit_c,
+ &cdef_copy_rect8_16bit_to_16bit_avx2)));
#endif
#if HAVE_NEON
@@ -615,6 +828,16 @@ INSTANTIATE_TEST_SUITE_P(NEON, CDEFFindDirTest,
INSTANTIATE_TEST_SUITE_P(NEON, CDEFFindDirDualTest,
::testing::Values(make_tuple(&cdef_find_dir_dual_neon,
&cdef_find_dir_dual_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON, CDEFCopyRect8to16Test,
+ ::testing::Values(make_tuple(&cdef_copy_rect8_8bit_to_16bit_c,
+ &cdef_copy_rect8_8bit_to_16bit_neon)));
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON, CDEFCopyRect16to16Test,
+ ::testing::Values(make_tuple(&cdef_copy_rect8_16bit_to_16bit_c,
+ &cdef_copy_rect8_16bit_to_16bit_neon)));
#endif
// Test speed for all supported architectures
diff --git a/test/cfl_test.cc b/test/cfl_test.cc
index 97533da5b..7fdea04c3 100644
--- a/test/cfl_test.cc
+++ b/test/cfl_test.cc
@@ -88,7 +88,7 @@ static void printSpeed(int ref_elapsed_time, int elapsed_time, int width,
class CFLTest {
public:
- virtual ~CFLTest() {}
+ virtual ~CFLTest() = default;
void init(TX_SIZE tx) {
tx_size = tx;
width = tx_size_wide[tx_size];
@@ -106,7 +106,7 @@ class CFLTest {
template <typename I>
class CFLTestWithData : public CFLTest {
public:
- virtual ~CFLTestWithData() {}
+ ~CFLTestWithData() override = default;
protected:
I data[CFL_BUF_SQUARE];
@@ -125,7 +125,7 @@ class CFLTestWithData : public CFLTest {
template <typename I>
class CFLTestWithAlignedData : public CFLTest {
public:
- ~CFLTestWithAlignedData() {
+ ~CFLTestWithAlignedData() override {
aom_free(chroma_pels_ref);
aom_free(sub_luma_pels_ref);
aom_free(chroma_pels);
@@ -177,12 +177,12 @@ typedef std::tuple<TX_SIZE, sub_avg_fn> sub_avg_param;
class CFLSubAvgTest : public ::testing::TestWithParam<sub_avg_param>,
public CFLTestWithData<int16_t> {
public:
- virtual void SetUp() {
+ void SetUp() override {
CFLTest::init(std::get<0>(this->GetParam()));
sub_avg = std::get<1>(this->GetParam())(tx_size);
sub_avg_ref = cfl_get_subtract_average_fn_c(tx_size);
}
- virtual ~CFLSubAvgTest() {}
+ ~CFLSubAvgTest() override = default;
protected:
cfl_subtract_average_fn sub_avg;
@@ -223,7 +223,7 @@ template <typename S, typename T, typename I>
class CFLSubsampleTest : public ::testing::TestWithParam<S>,
public CFLTestWithData<I> {
public:
- virtual void SetUp() {
+ void SetUp() override {
CFLTest::init(std::get<0>(this->GetParam()));
fun_420 = std::get<1>(this->GetParam())(this->tx_size);
fun_422 = std::get<2>(this->GetParam())(this->tx_size);
@@ -284,8 +284,8 @@ class CFLSubsampleLBDTest
: public CFLSubsampleTest<subsample_lbd_param, cfl_subsample_lbd_fn,
uint8_t> {
public:
- virtual ~CFLSubsampleLBDTest() {}
- virtual void SetUp() {
+ ~CFLSubsampleLBDTest() override = default;
+ void SetUp() override {
CFLSubsampleTest::SetUp();
fun_420_ref = cfl_get_luma_subsampling_420_lbd_c(tx_size);
fun_422_ref = cfl_get_luma_subsampling_422_lbd_c(tx_size);
@@ -328,8 +328,8 @@ class CFLSubsampleHBDTest
: public CFLSubsampleTest<subsample_hbd_param, cfl_subsample_hbd_fn,
uint16_t> {
public:
- virtual ~CFLSubsampleHBDTest() {}
- virtual void SetUp() {
+ ~CFLSubsampleHBDTest() override = default;
+ void SetUp() override {
CFLSubsampleTest::SetUp();
fun_420_ref = cfl_get_luma_subsampling_420_hbd_c(tx_size);
fun_422_ref = cfl_get_luma_subsampling_422_hbd_c(tx_size);
@@ -369,13 +369,13 @@ typedef std::tuple<TX_SIZE, get_predict_fn> predict_param;
class CFLPredictTest : public ::testing::TestWithParam<predict_param>,
public CFLTestWithAlignedData<uint8_t> {
public:
- virtual void SetUp() {
+ void SetUp() override {
CFLTest::init(std::get<0>(this->GetParam()));
CFLTestWithAlignedData::init();
predict = std::get<1>(this->GetParam())(tx_size);
predict_ref = cfl_get_predict_lbd_fn_c(tx_size);
}
- virtual ~CFLPredictTest() {}
+ ~CFLPredictTest() override = default;
protected:
cfl_predict_lbd_fn predict;
@@ -418,13 +418,13 @@ typedef std::tuple<TX_SIZE, get_predict_fn_hbd> predict_param_hbd;
class CFLPredictHBDTest : public ::testing::TestWithParam<predict_param_hbd>,
public CFLTestWithAlignedData<uint16_t> {
public:
- virtual void SetUp() {
+ void SetUp() override {
CFLTest::init(std::get<0>(this->GetParam()));
CFLTestWithAlignedData::init();
predict = std::get<1>(this->GetParam())(tx_size);
predict_ref = cfl_get_predict_hbd_fn_c(tx_size);
}
- virtual ~CFLPredictHBDTest() {}
+ ~CFLPredictHBDTest() override = default;
protected:
cfl_predict_hbd_fn predict;
diff --git a/test/cnn_test.cc b/test/cnn_test.cc
index 77d8d5562..127ed3d84 100644
--- a/test/cnn_test.cc
+++ b/test/cnn_test.cc
@@ -2520,7 +2520,7 @@ typedef libaom_test::FuncParam<CNNConvolveNoMaxpoolPaddingValidFunc>
class CNNConvolveTest : public ::testing::TestWithParam<CNNConvolveTestFuncs> {
protected:
- virtual void SetUp() { params_ = GetParam(); }
+ void SetUp() override { params_ = GetParam(); }
void RunCNNConvolveSetup(int run_times) {
int in_width = 65;
diff --git a/test/codec_factory.h b/test/codec_factory.h
index d768d2e31..7ffc465a7 100644
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@@ -32,9 +32,9 @@ const int kCodecFactoryParam = 0;
class CodecFactory {
public:
- CodecFactory() {}
+ CodecFactory() = default;
- virtual ~CodecFactory() {}
+ virtual ~CodecFactory() = default;
virtual Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg) const = 0;
@@ -95,7 +95,7 @@ class AV1Decoder : public Decoder {
: Decoder(cfg, flag) {}
protected:
- virtual aom_codec_iface_t *CodecInterface() const {
+ aom_codec_iface_t *CodecInterface() const override {
#if CONFIG_AV1_DECODER
return aom_codec_av1_dx();
#else
@@ -111,7 +111,7 @@ class AV1Encoder : public Encoder {
: Encoder(cfg, init_flags, stats) {}
protected:
- virtual aom_codec_iface_t *CodecInterface() const {
+ aom_codec_iface_t *CodecInterface() const override {
#if CONFIG_AV1_ENCODER
return aom_codec_av1_cx();
#else
@@ -124,12 +124,12 @@ class AV1CodecFactory : public CodecFactory {
public:
AV1CodecFactory() : CodecFactory() {}
- virtual Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg) const {
+ Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg) const override {
return CreateDecoder(cfg, 0);
}
- virtual Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg,
- const aom_codec_flags_t flags) const {
+ Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg,
+ const aom_codec_flags_t flags) const override {
#if CONFIG_AV1_DECODER
return new AV1Decoder(cfg, flags);
#else
@@ -139,9 +139,9 @@ class AV1CodecFactory : public CodecFactory {
#endif
}
- virtual Encoder *CreateEncoder(aom_codec_enc_cfg_t cfg,
- const aom_codec_flags_t init_flags,
- TwopassStatsStore *stats) const {
+ Encoder *CreateEncoder(aom_codec_enc_cfg_t cfg,
+ const aom_codec_flags_t init_flags,
+ TwopassStatsStore *stats) const override {
#if CONFIG_AV1_ENCODER
return new AV1Encoder(cfg, init_flags, stats);
#else
@@ -152,8 +152,8 @@ class AV1CodecFactory : public CodecFactory {
#endif
}
- virtual aom_codec_err_t DefaultEncoderConfig(aom_codec_enc_cfg_t *cfg,
- unsigned int usage) const {
+ aom_codec_err_t DefaultEncoderConfig(aom_codec_enc_cfg_t *cfg,
+ unsigned int usage) const override {
#if CONFIG_AV1_ENCODER
return aom_codec_enc_config_default(aom_codec_av1_cx(), cfg, usage);
#else
diff --git a/test/coding_path_sync.cc b/test/coding_path_sync.cc
index c3e51fd56..f7b7eace9 100644
--- a/test/coding_path_sync.cc
+++ b/test/coding_path_sync.cc
@@ -120,9 +120,9 @@ class CompressedSource {
int width_, height_;
};
-// lowers an aom_image_t to a easily comparable/printable form
-std::vector<int16_t> Serialize(const aom_image_t *img) {
- std::vector<int16_t> bytes;
+// lowers an aom_image_t to an easily comparable/printable form
+std::vector<uint16_t> Serialize(const aom_image_t *img) {
+ std::vector<uint16_t> bytes;
bytes.reserve(img->d_w * img->d_h * 3);
for (int plane = 0; plane < 3; ++plane) {
const int w = aom_img_plane_width(img, plane);
@@ -130,11 +130,13 @@ std::vector<int16_t> Serialize(const aom_image_t *img) {
for (int r = 0; r < h; ++r) {
for (int c = 0; c < w; ++c) {
- unsigned char *row = img->planes[plane] + r * img->stride[plane];
- if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH)
- bytes.push_back(row[c * 2]);
- else
+ const unsigned char *row = img->planes[plane] + r * img->stride[plane];
+ if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+ const uint16_t *row16 = reinterpret_cast<const uint16_t *>(row);
+ bytes.push_back(row16[c]);
+ } else {
bytes.push_back(row[c]);
+ }
}
}
}
@@ -155,7 +157,7 @@ class Decoder {
~Decoder() { aom_codec_destroy(&dec_); }
- std::vector<int16_t> decode(const aom_codec_cx_pkt_t *pkt) {
+ std::vector<uint16_t> decode(const aom_codec_cx_pkt_t *pkt) {
aom_codec_decode(&dec_, static_cast<uint8_t *>(pkt->data.frame.buf),
pkt->data.frame.sz, nullptr);
@@ -179,8 +181,8 @@ TEST(CodingPathSync, SearchForHbdLbdMismatch) {
for (int k = 0; k < 3; ++k) {
const aom_codec_cx_pkt_t *frame = enc.ReadFrame();
- std::vector<int16_t> lbd_yuv = dec_lbd.decode(frame);
- std::vector<int16_t> hbd_yuv = dec_hbd.decode(frame);
+ std::vector<uint16_t> lbd_yuv = dec_lbd.decode(frame);
+ std::vector<uint16_t> hbd_yuv = dec_hbd.decode(frame);
ASSERT_EQ(lbd_yuv, hbd_yuv);
}
@@ -199,8 +201,8 @@ TEST(CodingPathSyncLarge, SearchForHbdLbdMismatchLarge) {
for (int k = 0; k < 5; ++k) {
const aom_codec_cx_pkt_t *frame = enc.ReadFrame();
- std::vector<int16_t> lbd_yuv = dec_lbd.decode(frame);
- std::vector<int16_t> hbd_yuv = dec_hbd.decode(frame);
+ std::vector<uint16_t> lbd_yuv = dec_lbd.decode(frame);
+ std::vector<uint16_t> hbd_yuv = dec_hbd.decode(frame);
ASSERT_EQ(lbd_yuv, hbd_yuv);
}
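The core of the Serialize() fix above: high-bit-depth planes store one uint16_t per sample, so the old code, which pushed only row[c * 2] (the low byte on little-endian hosts), silently truncated samples. The corrected read, distilled:

const unsigned char *row = img->planes[plane] + r * img->stride[plane];
if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
  // Reinterpret the row as 16-bit samples and keep the whole value.
  const uint16_t *row16 = reinterpret_cast<const uint16_t *>(row);
  bytes.push_back(row16[c]);
} else {
  bytes.push_back(row[c]);
}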
diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc
index 4218ac316..2f81d7e9b 100644
--- a/test/comp_avg_pred_test.cc
+++ b/test/comp_avg_pred_test.cc
@@ -13,9 +13,12 @@
using libaom_test::ACMRandom;
using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGTest;
+using libaom_test::AV1DISTWTDCOMPAVG::DistWtdCompAvgParam;
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DISTWTDCOMPAVGTest);
using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGUPSAMPLEDTest;
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DISTWTDCOMPAVGUPSAMPLEDTest);
+using libaom_test::AV1DISTWTDCOMPAVG::DistWtdCompAvgTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DistWtdCompAvgTest);
#if CONFIG_AV1_HIGHBITDEPTH
using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGTest;
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighBDDISTWTDCOMPAVGTest);
@@ -26,6 +29,19 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(
using std::make_tuple;
using std::tuple;
+uint8_t *DistWtdCompAvgTest::reference_data_ = nullptr;
+uint8_t *DistWtdCompAvgTest::second_pred_ = nullptr;
+uint8_t *DistWtdCompAvgTest::comp_pred_ = nullptr;
+uint8_t *DistWtdCompAvgTest::comp_pred_test_ = nullptr;
+uint8_t *DistWtdCompAvgTest::reference_data8_ = nullptr;
+uint8_t *DistWtdCompAvgTest::second_pred8_ = nullptr;
+uint8_t *DistWtdCompAvgTest::comp_pred8_ = nullptr;
+uint8_t *DistWtdCompAvgTest::comp_pred8_test_ = nullptr;
+uint16_t *DistWtdCompAvgTest::reference_data16_ = nullptr;
+uint16_t *DistWtdCompAvgTest::second_pred16_ = nullptr;
+uint16_t *DistWtdCompAvgTest::comp_pred16_ = nullptr;
+uint16_t *DistWtdCompAvgTest::comp_pred16_test_ = nullptr;
+
namespace {
TEST_P(AV1DISTWTDCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
@@ -52,6 +68,141 @@ INSTANTIATE_TEST_SUITE_P(SSSE3, AV1DISTWTDCOMPAVGUPSAMPLEDTest,
aom_dist_wtd_comp_avg_upsampled_pred_ssse3));
#endif
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1DISTWTDCOMPAVGUPSAMPLEDTest,
+ libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+ aom_dist_wtd_comp_avg_upsampled_pred_neon));
+#endif // HAVE_NEON
+
+TEST_P(DistWtdCompAvgTest, MaxRef) {
+ FillConstant(reference_data_, reference_stride_, mask_);
+ FillConstant(second_pred_, width_, 0);
+ CheckCompAvg();
+}
+
+TEST_P(DistWtdCompAvgTest, MaxSecondPred) {
+ FillConstant(reference_data_, reference_stride_, 0);
+ FillConstant(second_pred_, width_, mask_);
+ CheckCompAvg();
+}
+
+TEST_P(DistWtdCompAvgTest, ShortRef) {
+ const int tmp_stride = reference_stride_;
+ reference_stride_ >>= 1;
+ FillRandom(reference_data_, reference_stride_);
+ FillRandom(second_pred_, width_);
+ CheckCompAvg();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(DistWtdCompAvgTest, UnalignedRef) {
+ // The reference frame, but not the source frame, may be unaligned for
+ // certain types of searches.
+ const int tmp_stride = reference_stride_;
+ reference_stride_ -= 1;
+ FillRandom(reference_data_, reference_stride_);
+ FillRandom(second_pred_, width_);
+ CheckCompAvg();
+ reference_stride_ = tmp_stride;
+}
+
+// TODO(chengchen): add highbd tests
+const DistWtdCompAvgParam dist_wtd_comp_avg_c_tests[] = {
+ make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
+
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+#endif
+};
+
+INSTANTIATE_TEST_SUITE_P(C, DistWtdCompAvgTest,
+ ::testing::ValuesIn(dist_wtd_comp_avg_c_tests));
+
+#if HAVE_SSSE3
+const DistWtdCompAvgParam dist_wtd_comp_avg_ssse3_tests[] = {
+ make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+#endif
+};
+
+INSTANTIATE_TEST_SUITE_P(SSSE3, DistWtdCompAvgTest,
+ ::testing::ValuesIn(dist_wtd_comp_avg_ssse3_tests));
+#endif // HAVE_SSSE3
+
+#if HAVE_NEON
+const DistWtdCompAvgParam dist_wtd_comp_avg_neon_tests[] = {
+ make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_neon, -1),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_neon, -1),
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, DistWtdCompAvgTest,
+ ::testing::ValuesIn(dist_wtd_comp_avg_neon_tests));
+#endif // HAVE_NEON
+
#if CONFIG_AV1_HIGHBITDEPTH
TEST_P(AV1HighBDDISTWTDCOMPAVGTest, DISABLED_Speed) {
RunSpeedTest(GET_PARAM(1));
@@ -67,6 +218,12 @@ INSTANTIATE_TEST_SUITE_P(SSE2, AV1HighBDDISTWTDCOMPAVGTest,
aom_highbd_dist_wtd_comp_avg_pred_sse2, 1));
#endif
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1HighBDDISTWTDCOMPAVGTest,
+ libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+ aom_highbd_dist_wtd_comp_avg_pred_neon, 1));
+#endif
+
TEST_P(AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, DISABLED_Speed) {
RunSpeedTest(GET_PARAM(1));
}
@@ -80,6 +237,13 @@ INSTANTIATE_TEST_SUITE_P(SSE2, AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest,
libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2));
#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest,
+ libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+ aom_highbd_dist_wtd_comp_avg_upsampled_pred_neon));
+#endif
+
#endif // CONFIG_AV1_HIGHBITDEPTH
} // namespace
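DistWtdCompAvgTest allocates its buffers once per suite rather than once per test. A minimal sketch of the googletest pattern it relies on; class and buffer names here are illustrative:

class SuiteBufferTest : public ::testing::Test {
 protected:
  static void SetUpTestSuite() {     // runs once before the first test
    buf_ = static_cast<uint8_t *>(aom_memalign(16, 128 * 128));
  }
  static void TearDownTestSuite() {  // runs once after the last test
    aom_free(buf_);
    buf_ = nullptr;
  }
  static uint8_t *buf_;  // shared by every test in the suite
};
// Static members need an out-of-class definition, which is why the
// DistWtdCompAvgTest::* definitions appear at the top of this file's diff.
uint8_t *SuiteBufferTest::buf_ = nullptr;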
diff --git a/test/comp_avg_pred_test.h b/test/comp_avg_pred_test.h
index c1526d80b..396df2e2d 100644
--- a/test/comp_avg_pred_test.h
+++ b/test/comp_avg_pred_test.h
@@ -40,11 +40,18 @@ typedef void (*distwtdcompavgupsampled_func)(
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search);
+typedef void (*DistWtdCompAvgFunc)(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param);
+
typedef std::tuple<distwtdcompavg_func, BLOCK_SIZE> DISTWTDCOMPAVGParam;
typedef std::tuple<distwtdcompavgupsampled_func, BLOCK_SIZE>
DISTWTDCOMPAVGUPSAMPLEDParam;
+typedef std::tuple<int, int, DistWtdCompAvgFunc, int> DistWtdCompAvgParam;
+
#if CONFIG_AV1_HIGHBITDEPTH
typedef void (*highbddistwtdcompavgupsampled_func)(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
@@ -90,8 +97,8 @@ BuildParams(highbddistwtdcompavgupsampled_func filter) {
class AV1DISTWTDCOMPAVGTest
: public ::testing::TestWithParam<DISTWTDCOMPAVGParam> {
public:
- ~AV1DISTWTDCOMPAVGTest() {}
- void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+ ~AV1DISTWTDCOMPAVGTest() override = default;
+ void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); }
protected:
void RunCheckOutput(distwtdcompavg_func test_impl) {
@@ -193,8 +200,8 @@ class AV1DISTWTDCOMPAVGTest
class AV1DISTWTDCOMPAVGUPSAMPLEDTest
: public ::testing::TestWithParam<DISTWTDCOMPAVGUPSAMPLEDParam> {
public:
- ~AV1DISTWTDCOMPAVGUPSAMPLEDTest() {}
- void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+ ~AV1DISTWTDCOMPAVGUPSAMPLEDTest() override = default;
+ void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); }
protected:
void RunCheckOutput(distwtdcompavgupsampled_func test_impl) {
@@ -317,12 +324,198 @@ class AV1DISTWTDCOMPAVGUPSAMPLEDTest
libaom_test::ACMRandom rnd_;
}; // class AV1DISTWTDCOMPAVGUPSAMPLEDTest
+class DistWtdCompAvgTest
+ : public ::testing::WithParamInterface<DistWtdCompAvgParam>,
+ public ::testing::Test {
+ public:
+ DistWtdCompAvgTest()
+ : width_(GET_PARAM(0)), height_(GET_PARAM(1)), bd_(GET_PARAM(3)) {}
+
+ static void SetUpTestSuite() {
+ reference_data8_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(kDataAlignment, kDataBufferSize));
+ ASSERT_NE(reference_data8_, nullptr);
+ second_pred8_ =
+ reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+ ASSERT_NE(second_pred8_, nullptr);
+ comp_pred8_ =
+ reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+ ASSERT_NE(comp_pred8_, nullptr);
+ comp_pred8_test_ =
+ reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+ ASSERT_NE(comp_pred8_test_, nullptr);
+ reference_data16_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(kDataAlignment, kDataBufferSize * sizeof(uint16_t)));
+ ASSERT_NE(reference_data16_, nullptr);
+ second_pred16_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+ ASSERT_NE(second_pred16_, nullptr);
+ comp_pred16_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+ ASSERT_NE(comp_pred16_, nullptr);
+ comp_pred16_test_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+ ASSERT_NE(comp_pred16_test_, nullptr);
+ }
+
+ static void TearDownTestSuite() {
+ aom_free(reference_data8_);
+ reference_data8_ = nullptr;
+ aom_free(second_pred8_);
+ second_pred8_ = nullptr;
+ aom_free(comp_pred8_);
+ comp_pred8_ = nullptr;
+ aom_free(comp_pred8_test_);
+ comp_pred8_test_ = nullptr;
+ aom_free(reference_data16_);
+ reference_data16_ = nullptr;
+ aom_free(second_pred16_);
+ second_pred16_ = nullptr;
+ aom_free(comp_pred16_);
+ comp_pred16_ = nullptr;
+ aom_free(comp_pred16_test_);
+ comp_pred16_test_ = nullptr;
+ }
+
+ protected:
+ // Handle up to 4 128x128 blocks, with stride up to 256
+ static const int kDataAlignment = 16;
+ static const int kDataBlockSize = 128 * 256;
+ static const int kDataBufferSize = 4 * kDataBlockSize;
+
+ void SetUp() override {
+ if (bd_ == -1) {
+ use_high_bit_depth_ = false;
+ bit_depth_ = AOM_BITS_8;
+ reference_data_ = reference_data8_;
+ second_pred_ = second_pred8_;
+ comp_pred_ = comp_pred8_;
+ comp_pred_test_ = comp_pred8_test_;
+ } else {
+ use_high_bit_depth_ = true;
+ bit_depth_ = static_cast<aom_bit_depth_t>(bd_);
+ reference_data_ = CONVERT_TO_BYTEPTR(reference_data16_);
+ second_pred_ = CONVERT_TO_BYTEPTR(second_pred16_);
+ comp_pred_ = CONVERT_TO_BYTEPTR(comp_pred16_);
+ comp_pred_test_ = CONVERT_TO_BYTEPTR(comp_pred16_test_);
+ }
+ mask_ = (1 << bit_depth_) - 1;
+ reference_stride_ = width_ * 2;
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ }
+
+ virtual uint8_t *GetReference(int block_idx) {
+ if (use_high_bit_depth_)
+ return CONVERT_TO_BYTEPTR(CONVERT_TO_SHORTPTR(reference_data_) +
+ block_idx * kDataBlockSize);
+ return reference_data_ + block_idx * kDataBlockSize;
+ }
+
+ void ReferenceDistWtdCompAvg(int block_idx) {
+ const uint8_t *const reference8 = GetReference(block_idx);
+ const uint8_t *const second_pred8 = second_pred_;
+ uint8_t *const comp_pred8 = comp_pred_;
+ const uint16_t *const reference16 =
+ CONVERT_TO_SHORTPTR(GetReference(block_idx));
+ const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_);
+ uint16_t *const comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred_);
+ for (int h = 0; h < height_; ++h) {
+ for (int w = 0; w < width_; ++w) {
+ if (!use_high_bit_depth_) {
+ const int tmp =
+ second_pred8[h * width_ + w] * jcp_param_.bck_offset +
+ reference8[h * reference_stride_ + w] * jcp_param_.fwd_offset;
+ comp_pred8[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4);
+ } else {
+ const int tmp =
+ second_pred16[h * width_ + w] * jcp_param_.bck_offset +
+ reference16[h * reference_stride_ + w] * jcp_param_.fwd_offset;
+ comp_pred16[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4);
+ }
+ }
+ }
+ }
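+
+ // A worked example of the weighting above (a sketch: the offset pair shown
+ // is an illustrative value; real pairs come from quant_dist_lookup_table
+ // and sum to 16, so ROUND_POWER_OF_TWO(tmp, 4) renormalizes the sum):
+ //   fwd_offset = 9, bck_offset = 7, ref = 100, pred = 50
+ //   tmp = 50 * 7 + 100 * 9 = 1250
+ //   comp_pred = ROUND_POWER_OF_TWO(1250, 4) = (1250 + 8) >> 4 = 78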
+
+ void FillConstant(uint8_t *data, int stride, uint16_t fill_constant) {
+ uint8_t *data8 = data;
+ uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
+ for (int h = 0; h < height_; ++h) {
+ for (int w = 0; w < width_; ++w) {
+ if (!use_high_bit_depth_) {
+ data8[h * stride + w] = static_cast<uint8_t>(fill_constant);
+ } else {
+ data16[h * stride + w] = fill_constant;
+ }
+ }
+ }
+ }
+
+ void FillRandom(uint8_t *data, int stride) {
+ uint8_t *data8 = data;
+ uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
+ for (int h = 0; h < height_; ++h) {
+ for (int w = 0; w < width_; ++w) {
+ if (!use_high_bit_depth_) {
+ data8[h * stride + w] = rnd_.Rand8();
+ } else {
+ data16[h * stride + w] = rnd_.Rand16() & mask_;
+ }
+ }
+ }
+ }
+
+ void dist_wtd_comp_avg(int block_idx) {
+ const uint8_t *const reference = GetReference(block_idx);
+
+ API_REGISTER_STATE_CHECK(GET_PARAM(2)(comp_pred_test_, second_pred_, width_,
+ height_, reference, reference_stride_,
+ &jcp_param_));
+ }
+
+ void CheckCompAvg() {
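+ // Try every row of quant_dist_lookup_table in both (fwd, bck) orders,
+ // comparing the reference C loop against the implementation under test.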
+ for (int j = 0; j < 2; ++j) {
+ for (int i = 0; i < 4; ++i) {
+ jcp_param_.fwd_offset = quant_dist_lookup_table[i][j];
+ jcp_param_.bck_offset = quant_dist_lookup_table[i][1 - j];
+
+ ReferenceDistWtdCompAvg(0);
+ dist_wtd_comp_avg(0);
+
+ for (int y = 0; y < height_; ++y)
+ for (int x = 0; x < width_; ++x)
+ ASSERT_EQ(comp_pred_[y * width_ + x],
+ comp_pred_test_[y * width_ + x]);
+ }
+ }
+ }
+
+ int width_, height_, mask_, bd_;
+ aom_bit_depth_t bit_depth_;
+ static uint8_t *reference_data_;
+ static uint8_t *second_pred_;
+ bool use_high_bit_depth_;
+ static uint8_t *reference_data8_;
+ static uint8_t *second_pred8_;
+ static uint16_t *reference_data16_;
+ static uint16_t *second_pred16_;
+ int reference_stride_;
+ static uint8_t *comp_pred_;
+ static uint8_t *comp_pred8_;
+ static uint16_t *comp_pred16_;
+ static uint8_t *comp_pred_test_;
+ static uint8_t *comp_pred8_test_;
+ static uint16_t *comp_pred16_test_;
+ DIST_WTD_COMP_PARAMS jcp_param_;
+
+ ACMRandom rnd_;
+};
+
#if CONFIG_AV1_HIGHBITDEPTH
class AV1HighBDDISTWTDCOMPAVGTest
: public ::testing::TestWithParam<HighbdDISTWTDCOMPAVGParam> {
public:
- ~AV1HighBDDISTWTDCOMPAVGTest() {}
- void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+ ~AV1HighBDDISTWTDCOMPAVGTest() override = default;
+ void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); }
protected:
void RunCheckOutput(distwtdcompavg_func test_impl) {
@@ -430,8 +623,8 @@ class AV1HighBDDISTWTDCOMPAVGTest
class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest
: public ::testing::TestWithParam<HighbdDISTWTDCOMPAVGUPSAMPLEDParam> {
public:
- ~AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest() {}
- void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+ ~AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest() override = default;
+ void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); }
protected:
void RunCheckOutput(highbddistwtdcompavgupsampled_func test_impl) {
diff --git a/test/comp_mask_pred_test.cc b/test/comp_mask_pred_test.cc
index 06c319203..b65730aa5 100644
--- a/test/comp_mask_pred_test.cc
+++ b/test/comp_mask_pred_test.cc
@@ -48,10 +48,10 @@ const BLOCK_SIZE kCompMaskPredParams[] = {
class AV1CompMaskPredBase : public ::testing::Test {
public:
- ~AV1CompMaskPredBase();
- void SetUp();
+ ~AV1CompMaskPredBase() override;
+ void SetUp() override;
- void TearDown();
+ void TearDown() override;
protected:
bool CheckResult(int width, int height) {
@@ -76,7 +76,7 @@ class AV1CompMaskPredBase : public ::testing::Test {
uint8_t *ref_;
};
-AV1CompMaskPredBase::~AV1CompMaskPredBase() {}
+AV1CompMaskPredBase::~AV1CompMaskPredBase() = default;
void AV1CompMaskPredBase::SetUp() {
rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
@@ -303,10 +303,10 @@ typedef std::tuple<comp_avg_pred_func, BLOCK_SIZE> CompAvgPredParam;
class AV1CompAvgPredTest : public ::testing::TestWithParam<CompAvgPredParam> {
public:
- ~AV1CompAvgPredTest();
- void SetUp();
+ ~AV1CompAvgPredTest() override;
+ void SetUp() override;
- void TearDown();
+ void TearDown() override;
protected:
void RunCheckOutput(comp_avg_pred_func test_impl, BLOCK_SIZE bsize);
@@ -333,7 +333,7 @@ class AV1CompAvgPredTest : public ::testing::TestWithParam<CompAvgPredParam> {
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1CompAvgPredTest);
-AV1CompAvgPredTest::~AV1CompAvgPredTest() {}
+AV1CompAvgPredTest::~AV1CompAvgPredTest() = default;
void AV1CompAvgPredTest::SetUp() {
rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
@@ -390,7 +390,7 @@ void AV1CompAvgPredTest::RunSpeedTest(comp_avg_pred_func test_impl,
const double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
elapsed_time[i] = 1000.0 * time;
}
- printf("compMask %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0],
+ printf("CompAvgPred %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0],
elapsed_time[1]);
printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
}
@@ -420,10 +420,10 @@ INSTANTIATE_TEST_SUITE_P(
#if CONFIG_AV1_HIGHBITDEPTH
class AV1HighbdCompMaskPredTestBase : public ::testing::Test {
public:
- ~AV1HighbdCompMaskPredTestBase();
- void SetUp();
+ ~AV1HighbdCompMaskPredTestBase() override;
+ void SetUp() override;
- void TearDown();
+ void TearDown() override;
protected:
bool CheckResult(int width, int height) {
@@ -448,7 +448,7 @@ class AV1HighbdCompMaskPredTestBase : public ::testing::Test {
uint16_t *ref_;
};
-AV1HighbdCompMaskPredTestBase::~AV1HighbdCompMaskPredTestBase() {}
+AV1HighbdCompMaskPredTestBase::~AV1HighbdCompMaskPredTestBase() = default;
void AV1HighbdCompMaskPredTestBase::SetUp() {
rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
@@ -494,14 +494,14 @@ class AV1HighbdCompMaskPredTest
: public AV1HighbdCompMaskPredTestBase,
public ::testing::WithParamInterface<HighbdCompMaskPredParam> {
public:
- ~AV1HighbdCompMaskPredTest();
+ ~AV1HighbdCompMaskPredTest() override;
protected:
void RunCheckOutput(comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv);
void RunSpeedTest(comp_mask_pred_func test_impl, BLOCK_SIZE bsize);
};
-AV1HighbdCompMaskPredTest::~AV1HighbdCompMaskPredTest() {}
+AV1HighbdCompMaskPredTest::~AV1HighbdCompMaskPredTest() = default;
void AV1HighbdCompMaskPredTest::RunCheckOutput(
highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv) {
@@ -583,6 +583,14 @@ TEST_P(AV1HighbdCompMaskPredTest, DISABLED_Speed) {
RunSpeedTest(GET_PARAM(0), GET_PARAM(1));
}
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1HighbdCompMaskPredTest,
+ ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_neon),
+ ::testing::ValuesIn(kCompMaskPredParams),
+ ::testing::Range(8, 13, 2)));
+#endif
+
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(
AVX2, AV1HighbdCompMaskPredTest,
@@ -612,7 +620,7 @@ class AV1HighbdUpsampledPredTest
: public AV1HighbdCompMaskPredTestBase,
public ::testing::WithParamInterface<HighbdUpsampledPredParam> {
public:
- ~AV1HighbdUpsampledPredTest();
+ ~AV1HighbdUpsampledPredTest() override;
protected:
void RunCheckOutput(highbd_upsampled_pred_func test_impl, BLOCK_SIZE bsize);
@@ -620,7 +628,7 @@ class AV1HighbdUpsampledPredTest
int havSub);
};
-AV1HighbdUpsampledPredTest::~AV1HighbdUpsampledPredTest() {}
+AV1HighbdUpsampledPredTest::~AV1HighbdUpsampledPredTest() = default;
void AV1HighbdUpsampledPredTest::RunCheckOutput(
highbd_upsampled_pred_func test_impl, BLOCK_SIZE bsize) {
@@ -712,5 +720,137 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Range(8, 13, 2)));
#endif
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1HighbdUpsampledPredTest,
+ ::testing::Combine(::testing::Values(&aom_highbd_upsampled_pred_neon),
+ ::testing::ValuesIn(kValidBlockSize),
+ ::testing::Range(8, 13, 2)));
+#endif
+
+typedef void (*highbd_comp_avg_pred_func)(uint8_t *comp_pred,
+ const uint8_t *pred, int width,
+ int height, const uint8_t *ref,
+ int ref_stride);
+
+typedef std::tuple<highbd_comp_avg_pred_func, BLOCK_SIZE, int>
+ HighbdCompAvgPredParam;
+
+class AV1HighbdCompAvgPredTest
+ : public ::testing::TestWithParam<HighbdCompAvgPredParam> {
+ public:
+ ~AV1HighbdCompAvgPredTest() override;
+ void SetUp() override;
+
+ protected:
+ void RunCheckOutput(highbd_comp_avg_pred_func test_impl, BLOCK_SIZE bsize);
+ void RunSpeedTest(highbd_comp_avg_pred_func test_impl, BLOCK_SIZE bsize);
+ bool CheckResult(int width, int height) const {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ const int idx = y * width + x;
+ if (comp_pred1_[idx] != comp_pred2_[idx]) {
+ printf("%dx%d mismatch @%d(%d,%d) ", width, height, idx, x, y);
+ printf("%d != %d ", comp_pred1_[idx], comp_pred2_[idx]);
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ libaom_test::ACMRandom rnd_;
+ uint16_t *comp_pred1_;
+ uint16_t *comp_pred2_;
+ uint16_t *pred_;
+ uint16_t *ref_;
+};
+
+AV1HighbdCompAvgPredTest::~AV1HighbdCompAvgPredTest() {
+ aom_free(comp_pred1_);
+ aom_free(comp_pred2_);
+ aom_free(pred_);
+ aom_free(ref_);
+}
+
+void AV1HighbdCompAvgPredTest::SetUp() {
+ int bd_ = GET_PARAM(2);
+ rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+
+ comp_pred1_ =
+ (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred1_));
+ ASSERT_NE(comp_pred1_, nullptr);
+ comp_pred2_ =
+ (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred2_));
+ ASSERT_NE(comp_pred2_, nullptr);
+ pred_ = (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*pred_));
+ ASSERT_NE(pred_, nullptr);
+ ref_ = (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*ref_));
+ ASSERT_NE(ref_, nullptr);
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+ }
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ ref_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+ }
+}
+
+void AV1HighbdCompAvgPredTest::RunCheckOutput(
+ highbd_comp_avg_pred_func test_impl, BLOCK_SIZE bsize) {
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(comp_pred1_),
+ CONVERT_TO_BYTEPTR(pred_), w, h,
+ CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE);
+ test_impl(CONVERT_TO_BYTEPTR(comp_pred2_), CONVERT_TO_BYTEPTR(pred_), w, h,
+ CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE);
+
+ ASSERT_EQ(CheckResult(w, h), true);
+}
+
+void AV1HighbdCompAvgPredTest::RunSpeedTest(highbd_comp_avg_pred_func test_impl,
+ BLOCK_SIZE bsize) {
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
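+ // Scale the iteration count inversely with block size so each
+ // configuration runs for a comparable amount of time.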
+ const int num_loops = 1000000000 / (w + h);
+
+ highbd_comp_avg_pred_func functions[2] = { aom_highbd_comp_avg_pred_c,
+ test_impl };
+ double elapsed_time[2] = { 0.0 };
+ for (int i = 0; i < 2; ++i) {
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ highbd_comp_avg_pred_func func = functions[i];
+ for (int j = 0; j < num_loops; ++j) {
+ func(CONVERT_TO_BYTEPTR(comp_pred1_), CONVERT_TO_BYTEPTR(pred_), w, h,
+ CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ elapsed_time[i] = 1000.0 * time;
+ }
+ printf("HighbdCompAvg %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0],
+ elapsed_time[1]);
+ printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdCompAvgPredTest);
+
+TEST_P(AV1HighbdCompAvgPredTest, CheckOutput) {
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1));
+}
+
+TEST_P(AV1HighbdCompAvgPredTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(0), GET_PARAM(1));
+}
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1HighbdCompAvgPredTest,
+ ::testing::Combine(::testing::Values(&aom_highbd_comp_avg_pred_neon),
+ ::testing::ValuesIn(kValidBlockSize),
+ ::testing::Range(8, 13, 2)));
+#endif
+
#endif // CONFIG_AV1_HIGHBITDEPTH
} // namespace
diff --git a/test/convolve_round_test.cc b/test/convolve_round_test.cc
deleted file mode 100644
index 05807441c..000000000
--- a/test/convolve_round_test.cc
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <tuple>
-
-#include "config/av1_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_ports/aom_timer.h"
-#include "test/acm_random.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-#define CONVOLVE_ROUNDING_PARAM \
- const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, \
- int h, int bits
-
-typedef void (*ConvolveRoundFunc)(CONVOLVE_ROUNDING_PARAM);
-
-typedef void (*ConvolveRoundFuncHbd)(CONVOLVE_ROUNDING_PARAM, int bd);
-
-template <ConvolveRoundFuncHbd fn>
-void highbd_convolve_rounding_8(CONVOLVE_ROUNDING_PARAM) {
- const int bd = 8;
- fn(src, src_stride, dst, dst_stride, w, h, bits, bd);
-}
-
-template <ConvolveRoundFuncHbd fn>
-void highbd_convolve_rounding_10(CONVOLVE_ROUNDING_PARAM) {
- const int bd = 10;
- fn(src, src_stride, dst, dst_stride, w, h, bits, bd);
-}
-
-template <ConvolveRoundFuncHbd fn>
-void highbd_convolve_rounding_12(CONVOLVE_ROUNDING_PARAM) {
- const int bd = 12;
- fn(src, src_stride, dst, dst_stride, w, h, bits, bd);
-}
-
-typedef enum { LOWBITDEPTH_TEST, HIGHBITDEPTH_TEST } DataPathType;
-
-using std::tuple;
-
-typedef tuple<ConvolveRoundFunc, ConvolveRoundFunc, DataPathType>
- ConvolveRoundParam;
-
-const int kTestNum = 5000;
-
-class ConvolveRoundTest : public ::testing::TestWithParam<ConvolveRoundParam> {
- protected:
- ConvolveRoundTest()
- : func_ref_(GET_PARAM(0)), func_(GET_PARAM(1)), data_path_(GET_PARAM(2)) {
- }
- virtual ~ConvolveRoundTest() {}
-
- virtual void SetUp() {
- const size_t block_size = 128 * 128;
- src_ = reinterpret_cast<int32_t *>(
- aom_memalign(16, block_size * sizeof(*src_)));
- ASSERT_NE(src_, nullptr);
- dst_ref_ = reinterpret_cast<uint16_t *>(
- aom_memalign(16, block_size * sizeof(*dst_ref_)));
- ASSERT_NE(dst_ref_, nullptr);
- dst_ = reinterpret_cast<uint16_t *>(
- aom_memalign(16, block_size * sizeof(*dst_)));
- ASSERT_NE(dst_, nullptr);
- }
-
- virtual void TearDown() {
- aom_free(src_);
- aom_free(dst_ref_);
- aom_free(dst_);
- }
-
- void ConvolveRoundingRun() {
- int test_num = 0;
- const int src_stride = 128;
- const int dst_stride = 128;
- int bits = 13;
- uint8_t *dst = 0;
- uint8_t *dst_ref = 0;
-
- if (data_path_ == LOWBITDEPTH_TEST) {
- dst = reinterpret_cast<uint8_t *>(dst_);
- dst_ref = reinterpret_cast<uint8_t *>(dst_ref_);
- } else if (data_path_ == HIGHBITDEPTH_TEST) {
- dst = CONVERT_TO_BYTEPTR(dst_);
- dst_ref = CONVERT_TO_BYTEPTR(dst_ref_);
- } else {
- assert(0);
- }
-
- while (test_num < kTestNum) {
- int block_size = test_num % BLOCK_SIZES_ALL;
- int w = block_size_wide[block_size];
- int h = block_size_high[block_size];
-
- if (test_num % 2 == 0)
- bits -= 1;
- else
- bits += 1;
-
- GenerateBufferWithRandom(src_, src_stride, bits, w, h);
-
- func_ref_(src_, src_stride, dst_ref, dst_stride, w, h, bits);
- API_REGISTER_STATE_CHECK(
- func_(src_, src_stride, dst, dst_stride, w, h, bits));
-
- if (data_path_ == LOWBITDEPTH_TEST) {
- for (int r = 0; r < h; ++r) {
- for (int c = 0; c < w; ++c) {
- ASSERT_EQ(dst_ref[r * dst_stride + c], dst[r * dst_stride + c])
- << "Mismatch at r: " << r << " c: " << c << " w: " << w
- << " h: " << h << " test: " << test_num;
- }
- }
- } else {
- for (int r = 0; r < h; ++r) {
- for (int c = 0; c < w; ++c) {
- ASSERT_EQ(dst_ref_[r * dst_stride + c], dst_[r * dst_stride + c])
- << "Mismatch at r: " << r << " c: " << c << " w: " << w
- << " h: " << h << " test: " << test_num;
- }
- }
- }
-
- test_num++;
- }
- }
-
- void GenerateBufferWithRandom(int32_t *src, int src_stride, int bits, int w,
- int h) {
- int32_t number;
- for (int r = 0; r < h; ++r) {
- for (int c = 0; c < w; ++c) {
- number = static_cast<int32_t>(rand_.Rand31());
- number %= 1 << (bits + 9);
- src[r * src_stride + c] = number;
- }
- }
- }
-
- ACMRandom rand_;
- int32_t *src_;
- uint16_t *dst_ref_;
- uint16_t *dst_;
-
- ConvolveRoundFunc func_ref_;
- ConvolveRoundFunc func_;
- DataPathType data_path_;
-};
-
-TEST_P(ConvolveRoundTest, BitExactCheck) { ConvolveRoundingRun(); }
-
-using std::make_tuple;
-#if HAVE_AVX2
-const ConvolveRoundParam kConvRndParamArray[] = {
- make_tuple(&av1_convolve_rounding_c, &av1_convolve_rounding_avx2,
- LOWBITDEPTH_TEST),
- make_tuple(&highbd_convolve_rounding_8<av1_highbd_convolve_rounding_c>,
- &highbd_convolve_rounding_8<av1_highbd_convolve_rounding_avx2>,
- HIGHBITDEPTH_TEST),
- make_tuple(&highbd_convolve_rounding_10<av1_highbd_convolve_rounding_c>,
- &highbd_convolve_rounding_10<av1_highbd_convolve_rounding_avx2>,
- HIGHBITDEPTH_TEST),
- make_tuple(&highbd_convolve_rounding_12<av1_highbd_convolve_rounding_c>,
- &highbd_convolve_rounding_12<av1_highbd_convolve_rounding_avx2>,
- HIGHBITDEPTH_TEST)
-};
-INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveRoundTest,
- ::testing::ValuesIn(kConvRndParamArray));
-#endif // HAVE_AVX2
-} // namespace
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 8aed17123..c97f81405 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -301,8 +301,6 @@ class ConvolveTestBase : public ::testing::TestWithParam<ConvolveParam> {
ASSERT_NE(output16_ref_, nullptr);
}
- virtual void TearDown() {}
-
static void TearDownTestSuite() {
aom_free(input_ - 1);
input_ = nullptr;
@@ -345,7 +343,7 @@ class ConvolveTestBase : public ::testing::TestWithParam<ConvolveParam> {
i % kOuterBlockSize >= (BorderLeft() + Width()));
}
- virtual void SetUp() {
+ void SetUp() override {
UUT_ = GET_PARAM(2);
if (UUT_->use_highbd_ != 0)
mask_ = (1 << UUT_->use_highbd_) - 1;
@@ -764,6 +762,17 @@ WRAP(convolve8_vert_avx2, 10)
WRAP(convolve8_horiz_avx2, 12)
WRAP(convolve8_vert_avx2, 12)
#endif // HAVE_AVX2
+
+#if HAVE_NEON
+WRAP(convolve8_horiz_neon, 8)
+WRAP(convolve8_vert_neon, 8)
+
+WRAP(convolve8_horiz_neon, 10)
+WRAP(convolve8_vert_neon, 10)
+
+WRAP(convolve8_horiz_neon, 12)
+WRAP(convolve8_vert_neon, 12)
+#endif // HAVE_NEON
#endif // CONFIG_AV1_HIGHBITDEPTH
#undef WRAP
@@ -866,6 +875,21 @@ INSTANTIATE_TEST_SUITE_P(AVX2, LowbdConvolveTest,
#endif // HAVE_AVX2
#if HAVE_NEON
+#if CONFIG_AV1_HIGHBITDEPTH
+const ConvolveFunctions wrap_convolve8_neon(wrap_convolve8_horiz_neon_8,
+ wrap_convolve8_vert_neon_8, 8);
+const ConvolveFunctions wrap_convolve10_neon(wrap_convolve8_horiz_neon_10,
+ wrap_convolve8_vert_neon_10, 10);
+const ConvolveFunctions wrap_convolve12_neon(wrap_convolve8_horiz_neon_12,
+ wrap_convolve8_vert_neon_12, 12);
+const ConvolveParam kArray_HighbdConvolve8_neon[] = {
+ ALL_SIZES_64(wrap_convolve8_neon), ALL_SIZES_64(wrap_convolve10_neon),
+ ALL_SIZES_64(wrap_convolve12_neon)
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, HighbdConvolveTest,
+ ::testing::ValuesIn(kArray_HighbdConvolve8_neon));
+#endif
const ConvolveFunctions convolve8_neon(aom_convolve8_horiz_neon,
aom_convolve8_vert_neon, 0);
const ConvolveParam kArray_Convolve8_neon[] = { ALL_SIZES(convolve8_neon) };
@@ -874,4 +898,25 @@ INSTANTIATE_TEST_SUITE_P(NEON, LowbdConvolveTest,
::testing::ValuesIn(kArray_Convolve8_neon));
#endif // HAVE_NEON
+#if HAVE_NEON_DOTPROD
+const ConvolveFunctions convolve8_neon_dotprod(aom_convolve8_horiz_neon_dotprod,
+ aom_convolve8_vert_neon_dotprod,
+ 0);
+const ConvolveParam kArray_Convolve8_neon_dotprod[] = { ALL_SIZES(
+ convolve8_neon_dotprod) };
+
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, LowbdConvolveTest,
+ ::testing::ValuesIn(kArray_Convolve8_neon_dotprod));
+#endif // HAVE_NEON_DOTPROD
+
+#if HAVE_NEON_I8MM
+const ConvolveFunctions convolve8_neon_i8mm(aom_convolve8_horiz_neon_i8mm,
+ aom_convolve8_vert_neon_i8mm, 0);
+const ConvolveParam kArray_Convolve8_neon_i8mm[] = { ALL_SIZES(
+ convolve8_neon_i8mm) };
+
+INSTANTIATE_TEST_SUITE_P(NEON_I8MM, LowbdConvolveTest,
+ ::testing::ValuesIn(kArray_Convolve8_neon_i8mm));
+#endif // HAVE_NEON_I8MM
+
} // namespace
diff --git a/test/corner_match_test.cc b/test/corner_match_test.cc
index 93ca8ec5a..973373218 100644
--- a/test/corner_match_test.cc
+++ b/test/corner_match_test.cc
@@ -37,10 +37,8 @@ typedef tuple<int, ComputeCrossCorrFunc> CornerMatchParam;
class AV1CornerMatchTest : public ::testing::TestWithParam<CornerMatchParam> {
public:
- virtual ~AV1CornerMatchTest();
- virtual void SetUp();
-
- virtual void TearDown();
+ ~AV1CornerMatchTest() override;
+ void SetUp() override;
protected:
void RunCheckOutput(int run_times);
@@ -50,12 +48,11 @@ class AV1CornerMatchTest : public ::testing::TestWithParam<CornerMatchParam> {
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1CornerMatchTest);
-AV1CornerMatchTest::~AV1CornerMatchTest() {}
+AV1CornerMatchTest::~AV1CornerMatchTest() = default;
void AV1CornerMatchTest::SetUp() {
rnd_.Reset(ACMRandom::DeterministicSeed());
target_func = GET_PARAM(1);
}
-void AV1CornerMatchTest::TearDown() {}
void AV1CornerMatchTest::RunCheckOutput(int run_times) {
const int w = 128, h = 128;
diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc
index 5396becf4..b5f5d2974 100644
--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc
@@ -28,19 +28,19 @@ class CpuSpeedTest
: EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
set_cpu_used_(GET_PARAM(2)), min_psnr_(kMaxPSNR),
tune_content_(AOM_CONTENT_DEFAULT) {}
- virtual ~CpuSpeedTest() {}
+ ~CpuSpeedTest() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(encoding_mode_);
if (encoding_mode_ != ::libaom_test::kRealTime) {
cfg_.g_lag_in_frames = 25;
}
}
- virtual void BeginPassHook(unsigned int /*pass*/) { min_psnr_ = kMaxPSNR; }
+ void BeginPassHook(unsigned int /*pass*/) override { min_psnr_ = kMaxPSNR; }
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
encoder->Control(AV1E_SET_TUNE_CONTENT, tune_content_);
@@ -52,7 +52,7 @@ class CpuSpeedTest
}
}
- virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
if (pkt->data.psnr.psnr[0] < min_psnr_) min_psnr_ = pkt->data.psnr.psnr[0];
}
diff --git a/test/cpu_used_firstpass_test.cc b/test/cpu_used_firstpass_test.cc
index cfffcd7af..53db8b0d1 100644
--- a/test/cpu_used_firstpass_test.cc
+++ b/test/cpu_used_firstpass_test.cc
@@ -27,9 +27,9 @@ class CpuUsedFirstpassTest
protected:
CpuUsedFirstpassTest()
: EncoderTest(GET_PARAM(0)), second_pass_cpu_used_(GET_PARAM(2)) {}
- virtual ~CpuUsedFirstpassTest() {}
+ ~CpuUsedFirstpassTest() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(::libaom_test::kTwoPassGood);
const aom_rational timebase = { 1, 30 };
cfg_.g_timebase = timebase;
@@ -40,7 +40,7 @@ class CpuUsedFirstpassTest
init_flags_ = AOM_CODEC_USE_PSNR;
}
- virtual void BeginPassHook(unsigned int pass) {
+ void BeginPassHook(unsigned int pass) override {
psnr_ = 0.0;
nframes_ = 0;
@@ -50,13 +50,13 @@ class CpuUsedFirstpassTest
cpu_used_ = second_pass_cpu_used_;
}
- virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
psnr_ += pkt->data.psnr.psnr[0];
nframes_++;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, cpu_used_);
encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 21b40d97a..a75a72fab 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -36,9 +36,9 @@ class DatarateTestLarge
}
protected:
- virtual ~DatarateTestLarge() {}
+ ~DatarateTestLarge() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(GET_PARAM(1));
ResetModel();
}
@@ -298,6 +298,72 @@ class DatarateTestLarge
<< " The datarate for the file missed the target!"
<< cfg_.rc_target_bitrate << " " << effective_datarate_;
}
+
+ virtual void BasicRateTargetingSuperresCBR() {
+ ::libaom_test::I420VideoSource video("desktopqvga2.320_240.yuv", 320, 240,
+ 30, 1, 0, 800);
+
+ cfg_.g_profile = 0;
+ cfg_.g_timebase = video.timebase();
+
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 1;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+
+ cfg_.rc_superres_mode = AOM_SUPERRES_FIXED;
+ cfg_.rc_superres_denominator = 16;
+ cfg_.rc_superres_kf_denominator = 16;
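+ // Note (assuming AV1's fixed superres numerator of 8): a denominator of 16
+ // encodes at half the source width and the decoder upscales the output, so
+ // rate targeting is exercised together with superres scaling.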
+
+ const int bitrate_array[2] = { 250, 650 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
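+ // The measured datarate may deviate from the target by at most 15%.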
+ ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
+ effective_datarate_ * 0.85)
+ << " The datarate for the file exceeds the target by too much!";
+ ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate),
+ effective_datarate_ * 1.15)
+ << " The datarate for the file missed the target!"
+ << cfg_.rc_target_bitrate << " " << effective_datarate_;
+ }
+
+ virtual void BasicRateTargetingSuperresCBRMultiThreads() {
+ ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+ 1, 0, 400);
+
+ cfg_.g_profile = 0;
+ cfg_.g_timebase = video.timebase();
+
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 1;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_threads = 2;
+
+ cfg_.rc_superres_mode = AOM_SUPERRES_FIXED;
+ cfg_.rc_superres_denominator = 16;
+ cfg_.rc_superres_kf_denominator = 16;
+
+ const int bitrate_array[2] = { 250, 650 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+ tile_column_ = 1;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
+ effective_datarate_ * 0.85)
+ << " The datarate for the file exceeds the target by too much!";
+ ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate),
+ effective_datarate_ * 1.15)
+ << " The datarate for the file missed the target!"
+ << cfg_.rc_target_bitrate << " " << effective_datarate_;
+ }
};
// Params: test mode, speed, aq mode.
@@ -312,9 +378,9 @@ class DatarateTestFrameDropLarge
}
protected:
- virtual ~DatarateTestFrameDropLarge() {}
+ ~DatarateTestFrameDropLarge() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(GET_PARAM(1));
ResetModel();
}
@@ -405,6 +471,16 @@ TEST_P(DatarateTestLarge, BasicRateTargeting444CBRScreen) {
BasicRateTargeting444CBRScreenTest();
}
+// Check basic rate targeting for Superres mode with CBR.
+TEST_P(DatarateTestLarge, BasicRateTargetingSuperresCBR) {
+ BasicRateTargetingSuperresCBR();
+}
+
+// Check basic rate targeting for Superres mode with CBR and multiple threads.
+TEST_P(DatarateTestLarge, BasicRateTargetingSuperresCBRMultiThreads) {
+ BasicRateTargetingSuperresCBRMultiThreads();
+}
+
// Check that (1) the first dropped frame gets earlier and earlier
// as the drop frame threshold is increased, and (2) that the total number of
// frame drops does not decrease as we increase frame drop threshold.
@@ -433,9 +509,9 @@ class DatarateTestSpeedChangeRealtime
}
protected:
- virtual ~DatarateTestSpeedChangeRealtime() {}
+ ~DatarateTestSpeedChangeRealtime() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(GET_PARAM(1));
ResetModel();
}
@@ -521,6 +597,16 @@ TEST_P(DatarateTestRealtime, BasicRateTargeting444CBRScreen) {
BasicRateTargeting444CBRScreenTest();
}
+// Check basic rate targeting for Superres mode with CBR.
+TEST_P(DatarateTestRealtime, BasicRateTargetingSuperresCBR) {
+ BasicRateTargetingSuperresCBR();
+}
+
+// Check basic rate targeting for Superres mode with CBR and multiple threads.
+TEST_P(DatarateTestRealtime, BasicRateTargetingSuperresCBRMultiThreads) {
+ BasicRateTargetingSuperresCBRMultiThreads();
+}
+
// Check that (1) the first dropped frame gets earlier and earlier
// as the drop frame threshold is increased, and (2) that the total number of
// frame drops does not decrease as we increase frame drop threshold.
@@ -540,15 +626,15 @@ class DatarateTestSetFrameQpRealtime
DatarateTestSetFrameQpRealtime() : DatarateTest(GetParam()), frame_(0) {}
protected:
- virtual ~DatarateTestSetFrameQpRealtime() {}
+ ~DatarateTestSetFrameQpRealtime() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(libaom_test::kRealTime);
ResetModel();
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
set_cpu_used_ = 7;
DatarateTest::PreEncodeFrameHook(video, encoder);
frame_qp_ = rnd_.PseudoUniform(63);
@@ -556,7 +642,7 @@ class DatarateTestSetFrameQpRealtime
frame_++;
}
- virtual void PostEncodeFrameHook(::libaom_test::Encoder *encoder) {
+ void PostEncodeFrameHook(::libaom_test::Encoder *encoder) override {
if (frame_ >= total_frames_) return;
int qp = 0;
encoder->Control(AOME_GET_LAST_QUANTIZER_64, &qp);
diff --git a/test/datarate_test.h b/test/datarate_test.h
index 4b74c6533..accc1ad86 100644
--- a/test/datarate_test.h
+++ b/test/datarate_test.h
@@ -28,7 +28,7 @@ class DatarateTest : public ::libaom_test::EncoderTest {
speed_change_test_(false) {}
protected:
- virtual ~DatarateTest() {}
+ ~DatarateTest() override = default;
virtual void ResetModel() {
last_pts_ = 0;
@@ -57,8 +57,8 @@ class DatarateTest : public ::libaom_test::EncoderTest {
}
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
@@ -122,7 +122,7 @@ class DatarateTest : public ::libaom_test::EncoderTest {
duration_ = 0;
}
- virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
// Time since last timestamp = duration.
aom_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
@@ -176,7 +176,7 @@ class DatarateTest : public ::libaom_test::EncoderTest {
}
}
- virtual void EndPassHook() {
+ void EndPassHook() override {
duration_ = (last_pts_ + 1) * timebase_;
// Effective file datarate:
effective_datarate_ = (bits_total_ / 1000.0) / duration_;
diff --git a/test/decode_multithreaded_test.cc b/test/decode_multithreaded_test.cc
index 5a13f75d0..4e06f1afa 100644
--- a/test/decode_multithreaded_test.cc
+++ b/test/decode_multithreaded_test.cc
@@ -63,16 +63,16 @@ class AV1DecodeMultiThreadedTest
}
}
- virtual ~AV1DecodeMultiThreadedTest() {
+ ~AV1DecodeMultiThreadedTest() override {
delete single_thread_dec_;
for (int i = 0; i < kNumMultiThreadDecoders; ++i)
delete multi_thread_dec_[i];
}
- virtual void SetUp() { InitializeConfig(libaom_test::kTwoPassGood); }
+ void SetUp() override { InitializeConfig(libaom_test::kTwoPassGood); }
- virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
- libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(libaom_test::VideoSource *video,
+ libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AV1E_SET_TILE_COLUMNS, n_tile_cols_);
encoder->Control(AV1E_SET_TILE_ROWS, n_tile_rows_);
@@ -93,7 +93,7 @@ class AV1DecodeMultiThreadedTest
md5->Add(img);
}
- virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
UpdateMD5(single_thread_dec_, pkt, &md5_single_thread_);
for (int i = 0; i < kNumMultiThreadDecoders; ++i)
diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc
index 900cb677f..030035466 100644
--- a/test/decode_perf_test.cc
+++ b/test/decode_perf_test.cc
@@ -101,13 +101,12 @@ class AV1NewEncodeDecodePerfTest
protected:
AV1NewEncodeDecodePerfTest()
: EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), speed_(0),
- outfile_(0), out_frames_(0) {}
+ outfile_(nullptr), out_frames_(0) {}
- virtual ~AV1NewEncodeDecodePerfTest() {}
+ ~AV1NewEncodeDecodePerfTest() override = default;
- virtual void SetUp() {
- InitializeConfig();
- SetMode(encoding_mode_);
+ void SetUp() override {
+ InitializeConfig(encoding_mode_);
cfg_.g_lag_in_frames = 25;
cfg_.rc_min_quantizer = 2;
@@ -121,8 +120,8 @@ class AV1NewEncodeDecodePerfTest
cfg_.rc_end_usage = AOM_VBR;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, speed_);
encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
@@ -130,7 +129,7 @@ class AV1NewEncodeDecodePerfTest
}
}
- virtual void BeginPassHook(unsigned int /*pass*/) {
+ void BeginPassHook(unsigned int /*pass*/) override {
const char *const env = getenv("LIBAOM_TEST_DATA_PATH");
const std::string data_path(env ? env : ".");
const std::string path_to_source = data_path + "/" + kNewEncodeOutputFile;
@@ -138,7 +137,7 @@ class AV1NewEncodeDecodePerfTest
ASSERT_NE(outfile_, nullptr);
}
- virtual void EndPassHook() {
+ void EndPassHook() override {
if (outfile_ != nullptr) {
if (!fseek(outfile_, 0, SEEK_SET))
ivf_write_file_header(outfile_, &cfg_, AV1_FOURCC, out_frames_);
@@ -147,7 +146,7 @@ class AV1NewEncodeDecodePerfTest
}
}
- virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
++out_frames_;
// Write initial file header if first frame.
@@ -160,7 +159,7 @@ class AV1NewEncodeDecodePerfTest
pkt->data.frame.sz);
}
- virtual bool DoDecode() const { return false; }
+ bool DoDecode() const override { return false; }
void set_speed(unsigned int speed) { speed_ = speed; }
diff --git a/test/decode_scalability_test.cc b/test/decode_scalability_test.cc
index c04d58b09..d66c8ec71 100644
--- a/test/decode_scalability_test.cc
+++ b/test/decode_scalability_test.cc
@@ -43,7 +43,7 @@ class DecodeScalabilityTest
: DecoderTest(GET_PARAM(0)), headers_(GET_PARAM(1).headers),
num_headers_(GET_PARAM(1).num_headers) {}
- ~DecodeScalabilityTest() override {}
+ ~DecodeScalabilityTest() override = default;
void PreDecodeFrameHook(const libaom_test::CompressedVideoSource &video,
libaom_test::Decoder *decoder) override {
diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h
index 9678f72a2..311898ecf 100644
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@@ -153,7 +153,7 @@ class DecoderTest {
explicit DecoderTest(const CodecFactory *codec)
: codec_(codec), cfg_(), flags_(0) {}
- virtual ~DecoderTest() {}
+ virtual ~DecoderTest() = default;
const CodecFactory *codec_;
aom_codec_dec_cfg_t cfg_;
diff --git a/test/disflow_test.cc b/test/disflow_test.cc
new file mode 100644
index 000000000..124c9a96c
--- /dev/null
+++ b/test/disflow_test.cc
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/flow_estimation/disflow.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+using ComputeFlowAtPointFunc = void (*)(const uint8_t *src, const uint8_t *ref,
+ int x, int y, int width, int height,
+ int stride, double *u, double *v);
+
+class ComputeFlowTest
+ : public ::testing::TestWithParam<ComputeFlowAtPointFunc> {
+ public:
+ ComputeFlowTest()
+ : target_func_(GetParam()),
+ rnd_(libaom_test::ACMRandom::DeterministicSeed()) {}
+
+ protected:
+ void RunCheckOutput(int run_times);
+ ComputeFlowAtPointFunc target_func_;
+
+ libaom_test::ACMRandom rnd_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ComputeFlowTest);
+
+void ComputeFlowTest::RunCheckOutput(int run_times) {
+ constexpr int kWidth = 352;
+ constexpr int kHeight = 288;
+
+ ::libaom_test::YUVVideoSource video("bus_352x288_420_f20_b8.yuv",
+ AOM_IMG_FMT_I420, kWidth, kHeight, 30, 1,
+ 0, 2);
+ // Use the Y (luminance) plane.
+ video.Begin();
+ uint8_t *src = video.img()->planes[0];
+ ASSERT_NE(src, nullptr);
+ video.Next();
+ uint8_t *ref = video.img()->planes[0];
+ ASSERT_NE(ref, nullptr);
+
+ // Pick a random value between -5 and 5. The range was chosen arbitrarily:
+ // u and v can take any value in practice, but the choice shouldn't change
+ // the outcome of the tests.
+ const double u_rand = (static_cast<double>(rnd_.Rand8()) / 255) * 10 - 5;
+ double u_ref = u_rand;
+ double u_test = u_rand;
+
+ const double v_rand = (static_cast<double>(rnd_.Rand8()) / 255) * 10 - 5;
+ double v_ref = v_rand;
+ double v_test = v_rand;
+
+ // Pick a random point in the frame. For a 352x288 frame the function can be
+ // called on any x between 8 and 344, and any y between 8 and 280.
+ const int x = rnd_((kWidth - 8) - 8 + 1) + 8;
+ const int y = rnd_((kHeight - 8) - 8 + 1) + 8;
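+ // (A sketch of the mapping: rnd_(n) returns a value in [0, n - 1], so
+ // rnd_((344 - 8) + 1) + 8 yields exactly the range [8, 344].)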
+
+ aom_usec_timer ref_timer, test_timer;
+
+ aom_compute_flow_at_point_c(src, ref, x, y, kWidth, kHeight, kWidth, &u_ref,
+ &v_ref);
+
+ target_func_(src, ref, x, y, kWidth, kHeight, kWidth, &u_test, &v_test);
+
+ if (run_times > 1) {
+ aom_usec_timer_start(&ref_timer);
+ for (int i = 0; i < run_times; ++i) {
+ aom_compute_flow_at_point_c(src, ref, x, y, kWidth, kHeight, kWidth,
+ &u_ref, &v_ref);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const double elapsed_time_c =
+ static_cast<double>(aom_usec_timer_elapsed(&ref_timer));
+
+ aom_usec_timer_start(&test_timer);
+ for (int i = 0; i < run_times; ++i) {
+ target_func_(src, ref, x, y, kWidth, kHeight, kWidth, &u_test, &v_test);
+ }
+ aom_usec_timer_mark(&test_timer);
+ const double elapsed_time_simd =
+ static_cast<double>(aom_usec_timer_elapsed(&test_timer));
+
+ printf("c_time=%fns \t simd_time=%fns \t speedup=%.2f\n", elapsed_time_c,
+ elapsed_time_simd, (elapsed_time_c / elapsed_time_simd));
+ } else {
+ ASSERT_EQ(u_ref, u_test);
+ ASSERT_EQ(v_ref, v_test);
+ }
+}
+
+TEST_P(ComputeFlowTest, CheckOutput) { RunCheckOutput(1); }
+
+TEST_P(ComputeFlowTest, DISABLED_Speed) { RunCheckOutput(10000000); }
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, ComputeFlowTest,
+ ::testing::Values(aom_compute_flow_at_point_sse4_1));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ComputeFlowTest,
+ ::testing::Values(aom_compute_flow_at_point_neon));
+#endif
+
+} // namespace
diff --git a/test/dr_prediction_test.cc b/test/dr_prediction_test.cc
index cbb31531a..3135d2a90 100644
--- a/test/dr_prediction_test.cc
+++ b/test/dr_prediction_test.cc
@@ -178,7 +178,7 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
}
}
- virtual ~DrPredTest() {}
+ ~DrPredTest() override = default;
void Predict(bool speedtest, int tx) {
const int kNumTests = speedtest ? kMaxNumTests : 1;
diff --git a/test/dropframe_encode_test.cc b/test/dropframe_encode_test.cc
index c7a801bbf..4a54c0b95 100644
--- a/test/dropframe_encode_test.cc
+++ b/test/dropframe_encode_test.cc
@@ -25,10 +25,10 @@ class DropFrameEncodeTestLarge
DropFrameEncodeTestLarge()
: EncoderTest(GET_PARAM(0)), frame_number_(0), threads_(GET_PARAM(2)) {}
- virtual void SetUp() { InitializeConfig(GET_PARAM(1)); }
+ void SetUp() override { InitializeConfig(GET_PARAM(1)); }
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
frame_number_ = video->frame();
if (frame_number_ == 0) {
encoder->Control(AOME_SET_CPUUSED, 1);
diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc
index 2a8c07286..9a447b55a 100644
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -9,6 +9,7 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include <cassert>
#include <cstdlib>
#include <cstring>
#include <tuple>
@@ -24,9 +25,9 @@
namespace {
#if CONFIG_REALTIME_ONLY
-const int kUsage = AOM_USAGE_REALTIME;
+const unsigned int kUsage = AOM_USAGE_REALTIME;
#else
-const int kUsage = AOM_USAGE_GOOD_QUALITY;
+const unsigned int kUsage = AOM_USAGE_GOOD_QUALITY;
#endif
static void *Memset16(void *dest, int val, size_t length) {
@@ -66,6 +67,22 @@ TEST(EncodeAPI, InvalidParams) {
EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
aom_codec_enc_config_default(iface, &cfg, 3));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+ cfg.g_w = 1 << 16;
+ cfg.g_h = (1 << 14) + 1;
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+ cfg.g_w = (1 << 14) + 1;
+ cfg.g_h = 1 << 16;
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+ cfg.g_forced_max_frame_width = 1 << 16;
+ cfg.g_forced_max_frame_height = (1 << 14) + 1;
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+ cfg.g_forced_max_frame_width = (1 << 14) + 1;
+ cfg.g_forced_max_frame_height = 1 << 16;
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
EXPECT_EQ(nullptr, aom_codec_get_global_headers(nullptr));
@@ -90,13 +107,12 @@ TEST(EncodeAPI, InvalidControlId) {
EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
}
-TEST(EncodeAPI, SetSFrameOnFirstFrame) {
+void EncodeSetSFrameOnFirstFrame(aom_img_fmt fmt, aom_codec_flags_t flag) {
constexpr int kWidth = 2;
constexpr int kHeight = 128;
unsigned char kBuffer[kWidth * kHeight * 3] = { 0 };
aom_image_t img;
- ASSERT_EQ(aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1, kBuffer),
- &img);
+ ASSERT_EQ(aom_img_wrap(&img, fmt, kWidth, kHeight, 1, kBuffer), &img);
aom_codec_iface_t *iface = aom_codec_av1_cx();
aom_codec_enc_cfg_t cfg;
@@ -105,15 +121,25 @@ TEST(EncodeAPI, SetSFrameOnFirstFrame) {
cfg.g_h = kHeight;
aom_codec_ctx_t enc;
- ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK);
+ ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, flag), AOM_CODEC_OK);
// One of these aom_codec_encode() calls should fail.
if (aom_codec_encode(&enc, &img, 0, 1, AOM_EFLAG_SET_S_FRAME) ==
AOM_CODEC_OK) {
- EXPECT_NE(aom_codec_encode(&enc, NULL, 0, 0, 0), AOM_CODEC_OK);
+ EXPECT_NE(aom_codec_encode(&enc, nullptr, 0, 0, 0), AOM_CODEC_OK);
}
EXPECT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
}
+TEST(EncodeAPI, SetSFrameOnFirstFrame) {
+ EncodeSetSFrameOnFirstFrame(AOM_IMG_FMT_I420, 0);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST(EncodeAPI, SetSFrameOnFirstFrameHighbd) {
+ EncodeSetSFrameOnFirstFrame(AOM_IMG_FMT_I42016, AOM_CODEC_USE_HIGHBITDEPTH);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
TEST(EncodeAPI, MonochromeInProfiles) {
aom_codec_iface_t *iface = aom_codec_av1_cx();
aom_codec_enc_cfg_t cfg;
@@ -147,7 +173,7 @@ TEST(EncodeAPI, LowBDEncoderLowBDImage) {
ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK);
aom_image_t *image =
- aom_img_alloc(NULL, AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h, 0);
+ aom_img_alloc(nullptr, AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h, 0);
ASSERT_NE(image, nullptr);
// Set the image to two colors so that av1_set_screen_content_options() will
@@ -184,7 +210,7 @@ TEST(EncodeAPI, HighBDEncoderHighBDImage) {
ASSERT_EQ(init_status, AOM_CODEC_OK);
aom_image_t *image =
- aom_img_alloc(NULL, AOM_IMG_FMT_I42016, cfg.g_w, cfg.g_h, 0);
+ aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, cfg.g_w, cfg.g_h, 0);
ASSERT_NE(image, nullptr);
// Set the image to two colors so that av1_set_screen_content_options() will
@@ -222,7 +248,7 @@ TEST(EncodeAPI, HighBDEncoderLowBDImage) {
ASSERT_EQ(init_status, AOM_CODEC_OK);
aom_image_t *image =
- aom_img_alloc(NULL, AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h, 0);
+ aom_img_alloc(nullptr, AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h, 0);
ASSERT_NE(image, nullptr);
// Set the image to two colors so that av1_set_screen_content_options() will
@@ -255,7 +281,7 @@ TEST(EncodeAPI, LowBDEncoderHighBDImage) {
ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK);
aom_image_t *image =
- aom_img_alloc(NULL, AOM_IMG_FMT_I42016, cfg.g_w, cfg.g_h, 0);
+ aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, cfg.g_w, cfg.g_h, 0);
ASSERT_NE(image, nullptr);
// Set the image to two colors so that av1_set_screen_content_options() will
@@ -278,14 +304,266 @@ TEST(EncodeAPI, LowBDEncoderHighBDImage) {
ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
}
+aom_image_t *CreateGrayImage(aom_img_fmt_t fmt, unsigned int w,
+ unsigned int h) {
+ aom_image_t *const image = aom_img_alloc(nullptr, fmt, w, h, 1);
+ if (!image) return image;
+
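+ // 128 is mid-gray for 8-bit samples in both the luma and chroma planes.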
+ for (unsigned int i = 0; i < image->d_h; ++i) {
+ memset(image->planes[0] + i * image->stride[0], 128, image->d_w);
+ }
+ const unsigned int uv_h = (image->d_h + 1) / 2;
+ const unsigned int uv_w = (image->d_w + 1) / 2;
+ for (unsigned int i = 0; i < uv_h; ++i) {
+ memset(image->planes[1] + i * image->stride[1], 128, uv_w);
+ memset(image->planes[2] + i * image->stride[2], 128, uv_w);
+ }
+ return image;
+}
+
+TEST(EncodeAPI, Buganizer310548198) {
+ aom_codec_iface_t *const iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ const unsigned int usage = AOM_USAGE_REALTIME;
+ ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, usage), AOM_CODEC_OK);
+ cfg.g_w = 1;
+ cfg.g_h = 444;
+ cfg.g_pass = AOM_RC_ONE_PASS;
+ cfg.g_lag_in_frames = 0;
+
+ aom_codec_ctx_t enc;
+ ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK);
+
+ const int speed = 6;
+ ASSERT_EQ(aom_codec_control(&enc, AOME_SET_CPUUSED, speed), AOM_CODEC_OK);
+
+ const aom_enc_frame_flags_t flags = 0;
+ int frame_index = 0;
+
+ // Encode a frame.
+ aom_image_t *image = CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+ ASSERT_NE(image, nullptr);
+ ASSERT_EQ(aom_codec_encode(&enc, image, frame_index, 1, flags), AOM_CODEC_OK);
+ frame_index++;
+ const aom_codec_cx_pkt_t *pkt;
+ aom_codec_iter_t iter = nullptr;
+ while ((pkt = aom_codec_get_cx_data(&enc, &iter)) != nullptr) {
+ ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ }
+ aom_img_free(image);
+
+ cfg.g_w = 1;
+ cfg.g_h = 254;
+ ASSERT_EQ(aom_codec_enc_config_set(&enc, &cfg), AOM_CODEC_OK)
+ << aom_codec_error_detail(&enc);
+
+ cfg.g_w = 1;
+ cfg.g_h = 154;
+ ASSERT_EQ(aom_codec_enc_config_set(&enc, &cfg), AOM_CODEC_OK)
+ << aom_codec_error_detail(&enc);
+
+ // Encode a frame.
+ image = CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+ ASSERT_EQ(aom_codec_encode(&enc, image, frame_index, 1, flags), AOM_CODEC_OK);
+ frame_index++;
+ iter = nullptr;
+ while ((pkt = aom_codec_get_cx_data(&enc, &iter)) != nullptr) {
+ ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ }
+ aom_img_free(image);
+
+ // Flush the encoder.
+ bool got_data;
+ do {
+ ASSERT_EQ(aom_codec_encode(&enc, nullptr, 0, 0, 0), AOM_CODEC_OK);
+ got_data = false;
+ iter = nullptr;
+ while ((pkt = aom_codec_get_cx_data(&enc, &iter)) != nullptr) {
+ ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ got_data = true;
+ }
+ } while (got_data);
+
+ ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+}
+
+// Emulates the WebCodecs VideoEncoder interface.
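+// (Roughly: Configure() plays the role of VideoEncoder.configure(), Encode()
+// of encode(), and Flush() of flush().)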
+class AV1Encoder {
+ public:
+ explicit AV1Encoder(int speed) : speed_(speed) {}
+ ~AV1Encoder();
+
+ void Configure(unsigned int threads, unsigned int width, unsigned int height,
+ aom_rc_mode end_usage, unsigned int usage);
+ void Encode(bool key_frame);
+
+ private:
+ // Flushes the encoder. Should be called after all the Encode() calls.
+ void Flush();
+
+ const int speed_;
+ bool initialized_ = false;
+ aom_codec_enc_cfg_t cfg_;
+ aom_codec_ctx_t enc_;
+ int frame_index_ = 0;
+};
+
+AV1Encoder::~AV1Encoder() {
+ if (initialized_) {
+ Flush();
+ EXPECT_EQ(aom_codec_destroy(&enc_), AOM_CODEC_OK);
+ }
+}
+
+void AV1Encoder::Configure(unsigned int threads, unsigned int width,
+ unsigned int height, aom_rc_mode end_usage,
+ unsigned int usage) {
+ if (!initialized_) {
+ aom_codec_iface_t *const iface = aom_codec_av1_cx();
+ ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg_, usage), AOM_CODEC_OK);
+ cfg_.g_threads = threads;
+ cfg_.g_w = width;
+ cfg_.g_h = height;
+ cfg_.g_forced_max_frame_width = cfg_.g_w;
+ cfg_.g_forced_max_frame_height = cfg_.g_h;
+ cfg_.g_timebase.num = 1;
+ cfg_.g_timebase.den = 1000 * 1000; // microseconds
+ cfg_.g_pass = AOM_RC_ONE_PASS;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.rc_end_usage = end_usage;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 58;
+ ASSERT_EQ(aom_codec_enc_init(&enc_, iface, &cfg_, 0), AOM_CODEC_OK);
+ ASSERT_EQ(aom_codec_control(&enc_, AOME_SET_CPUUSED, speed_), AOM_CODEC_OK);
+ initialized_ = true;
+ return;
+ }
+
+ ASSERT_EQ(usage, cfg_.g_usage);
+ cfg_.g_threads = threads;
+ cfg_.g_w = width;
+ cfg_.g_h = height;
+ cfg_.rc_end_usage = end_usage;
+ ASSERT_EQ(aom_codec_enc_config_set(&enc_, &cfg_), AOM_CODEC_OK)
+ << aom_codec_error_detail(&enc_);
+}
+
+void AV1Encoder::Encode(bool key_frame) {
+ assert(initialized_);
+ // TODO(wtc): Support high bit depths and other YUV formats.
+ aom_image_t *const image =
+ CreateGrayImage(AOM_IMG_FMT_I420, cfg_.g_w, cfg_.g_h);
+ ASSERT_NE(image, nullptr);
+ const aom_enc_frame_flags_t flags = key_frame ? AOM_EFLAG_FORCE_KF : 0;
+ ASSERT_EQ(aom_codec_encode(&enc_, image, frame_index_, 1, flags),
+ AOM_CODEC_OK);
+ frame_index_++;
+ const aom_codec_cx_pkt_t *pkt;
+ aom_codec_iter_t iter = nullptr;
+ while ((pkt = aom_codec_get_cx_data(&enc_, &iter)) != nullptr) {
+ ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ if (key_frame) {
+ ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+ }
+ }
+ aom_img_free(image);
+}
+
+void AV1Encoder::Flush() {
+ bool got_data;
+ do {
+ ASSERT_EQ(aom_codec_encode(&enc_, nullptr, 0, 0, 0), AOM_CODEC_OK);
+ got_data = false;
+ const aom_codec_cx_pkt_t *pkt;
+ aom_codec_iter_t iter = nullptr;
+ while ((pkt = aom_codec_get_cx_data(&enc_, &iter)) != nullptr) {
+ ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ got_data = true;
+ }
+ } while (got_data);
+}
+
+TEST(EncodeAPI, Buganizer314858909) {
+ AV1Encoder encoder(7);
+
+ encoder.Configure(6, 1582, 750, AOM_CBR, AOM_USAGE_REALTIME);
+
+ // Encode a frame.
+ encoder.Encode(false);
+
+ encoder.Configure(0, 1582, 23, AOM_CBR, AOM_USAGE_REALTIME);
+
+ // Encode a frame.
+ encoder.Encode(false);
+
+ encoder.Configure(16, 1542, 363, AOM_CBR, AOM_USAGE_REALTIME);
+
+ // Encode a frame.
+ encoder.Encode(false);
+}
+
+// Run this test to reproduce a fuzz test failure: the assertion
+// cpi->rec_sse != UINT64_MAX in av1_rc_bits_per_mb fires.
+TEST(EncodeAPI, Buganizer310766628) {
+ AV1Encoder encoder(7);
+
+ encoder.Configure(16, 759, 383, AOM_CBR, AOM_USAGE_REALTIME);
+
+ // Encode a frame.
+ encoder.Encode(false);
+
+ encoder.Configure(2, 759, 383, AOM_VBR, AOM_USAGE_REALTIME);
+
+ // Encode a frame. This will trigger the assertion failure.
+ encoder.Encode(false);
+}
+
+// This test covers a possible use case where frame sizes and thread counts
+// change both before and after the first frame is coded.
+TEST(EncodeAPI, Buganizer310455204) {
+ AV1Encoder encoder(7);
+
+ encoder.Configure(0, 1915, 503, AOM_VBR, AOM_USAGE_REALTIME);
+
+ encoder.Configure(4, 1, 1, AOM_VBR, AOM_USAGE_REALTIME);
+
+ encoder.Configure(6, 559, 503, AOM_CBR, AOM_USAGE_REALTIME);
+
+ // Encode a frame.
+ encoder.Encode(false);
+
+ // Increase the number of threads.
+ encoder.Configure(16, 1915, 503, AOM_CBR, AOM_USAGE_REALTIME);
+
+ // Encode a frame.
+ encoder.Encode(false);
+}
+
+// Run this test to reproduce the bug found by fuzzing: a float-cast-overflow
+// in av1_rc_bits_per_mb.
+TEST(EncodeAPI, Buganizer310457427) {
+ AV1Encoder encoder(7);
+
+ encoder.Configure(12, 896, 1076, AOM_CBR, AOM_USAGE_REALTIME);
+
+ encoder.Configure(6, 609, 1076, AOM_VBR, AOM_USAGE_REALTIME);
+
+ // Encode a frame.
+ encoder.Encode(false);
+
+  // Encode a frame. This triggers the float-cast-overflow bug, which was
+  // caused by a division by zero.
+ encoder.Encode(false);
+}
+
class EncodeAPIParameterized
- : public testing::TestWithParam<
- std::tuple</*usage=*/int, /*speed=*/int, /*aq_mode=*/int>> {};
+ : public testing::TestWithParam<std::tuple<
+ /*usage=*/unsigned int, /*speed=*/int, /*aq_mode=*/unsigned int>> {};
// Encodes two frames at a given usage, speed, and aq_mode setting.
// Reproduces b/303023614
TEST_P(EncodeAPIParameterized, HighBDEncoderHighBDFrames) {
- const int usage = std::get<0>(GetParam());
+ const unsigned int usage = std::get<0>(GetParam());
int speed = std::get<1>(GetParam());
if (speed == 10 && usage != AOM_USAGE_REALTIME) {
@@ -304,15 +582,15 @@ TEST_P(EncodeAPIParameterized, HighBDEncoderHighBDFrames) {
#if !CONFIG_AV1_HIGHBITDEPTH
ASSERT_EQ(init_status, AOM_CODEC_INCAPABLE);
#else
- const int aq_mode = std::get<2>(GetParam());
-
ASSERT_EQ(init_status, AOM_CODEC_OK);
+ const unsigned int aq_mode = std::get<2>(GetParam());
+
ASSERT_EQ(aom_codec_control(&enc, AOME_SET_CPUUSED, speed), AOM_CODEC_OK);
ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_AQ_MODE, aq_mode), AOM_CODEC_OK);
aom_image_t *image =
- aom_img_alloc(NULL, AOM_IMG_FMT_I42016, cfg.g_w, cfg.g_h, 0);
+ aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, cfg.g_w, cfg.g_h, 0);
ASSERT_NE(image, nullptr);
for (unsigned int i = 0; i < image->d_h; ++i) {
@@ -338,7 +616,7 @@ TEST_P(EncodeAPIParameterized, HighBDEncoderHighBDFrames) {
#endif
}
-const int kUsages[] = {
+const unsigned int kUsages[] = {
AOM_USAGE_REALTIME,
#if !CONFIG_REALTIME_ONLY
AOM_USAGE_GOOD_QUALITY,
@@ -376,6 +654,26 @@ TEST(EncodeAPI, AllIntraMode) {
cfg.kf_max_dist = 1;
EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
}
-#endif
+
+// A test that reproduces bug aomedia:3534.
+TEST(EncodeAPI, AllIntraAndNoRefLast) {
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_ALL_INTRA),
+ AOM_CODEC_OK);
+
+ aom_codec_ctx_t enc;
+ ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK);
+
+ aom_image_t *image = CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+ ASSERT_NE(image, nullptr);
+
+ ASSERT_EQ(aom_codec_encode(&enc, image, 0, 1, AOM_EFLAG_NO_REF_LAST),
+ AOM_CODEC_OK);
+
+ aom_img_free(image);
+ ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+}
+#endif // !CONFIG_REALTIME_ONLY
} // namespace
diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc
index b626acd04..b52cf3392 100644
--- a/test/encode_perf_test.cc
+++ b/test/encode_perf_test.cc
@@ -63,11 +63,10 @@ class AV1EncodePerfTest
: EncoderTest(GET_PARAM(0)), min_psnr_(kMaxPsnr), nframes_(0),
encoding_mode_(GET_PARAM(1)), speed_(0), threads_(1) {}
- virtual ~AV1EncodePerfTest() {}
+ ~AV1EncodePerfTest() override = default;
- virtual void SetUp() {
- InitializeConfig();
- SetMode(encoding_mode_);
+ void SetUp() override {
+ InitializeConfig(encoding_mode_);
cfg_.g_lag_in_frames = 0;
cfg_.rc_min_quantizer = 2;
@@ -83,8 +82,8 @@ class AV1EncodePerfTest
cfg_.g_threads = threads_;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
const int log2_tile_columns = 3;
encoder->Control(AOME_SET_CPUUSED, speed_);
@@ -94,19 +93,19 @@ class AV1EncodePerfTest
}
}
- virtual void BeginPassHook(unsigned int /*pass*/) {
+ void BeginPassHook(unsigned int /*pass*/) override {
min_psnr_ = kMaxPsnr;
nframes_ = 0;
}
- virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
if (pkt->data.psnr.psnr[0] < min_psnr_) {
min_psnr_ = pkt->data.psnr.psnr[0];
}
}
// for performance reasons don't decode
- virtual bool DoDecode() { return 0; }
+ bool DoDecode() const override { return false; }
double min_psnr() const { return min_psnr_; }
diff --git a/test/encode_small_width_height_test.cc b/test/encode_small_width_height_test.cc
index 3d00327a7..22f69396d 100644
--- a/test/encode_small_width_height_test.cc
+++ b/test/encode_small_width_height_test.cc
@@ -25,21 +25,21 @@
namespace {
// Dummy buffer of zero samples.
-constexpr unsigned char kBuffer[256 * 512 + 2 * 128 * 256] = { 0 };
+constexpr unsigned char kBuffer[2 * (256 * 512 + 2 * 128 * 256)] = { 0 };
#if CONFIG_REALTIME_ONLY
const int kUsage = 1;
#else
const int kUsage = 0;
#endif
-TEST(EncodeSmallWidthHeight, SmallWidthMultiThreaded) {
+void EncodeSmallWidthMultiThreaded(aom_img_fmt fmt, aom_codec_flags_t flag) {
// The image has only one tile and the tile is two AV1 superblocks wide.
// For speed >= 1, superblock size is 64x64 (see av1_select_sb_size()).
constexpr int kWidth = 128;
constexpr int kHeight = 512;
aom_image_t img;
- EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+ EXPECT_EQ(&img, aom_img_wrap(&img, fmt, kWidth, kHeight, 1,
const_cast<unsigned char *>(kBuffer)));
aom_codec_iface_t *iface = aom_codec_av1_cx();
@@ -49,22 +49,33 @@ TEST(EncodeSmallWidthHeight, SmallWidthMultiThreaded) {
cfg.g_w = kWidth;
cfg.g_h = kHeight;
aom_codec_ctx_t enc;
- EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, flag));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 5));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
}
+TEST(EncodeSmallWidthHeight, SmallWidthMultiThreaded) {
+ EncodeSmallWidthMultiThreaded(AOM_IMG_FMT_I420, 0);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST(HighbdEncodeSmallWidthHeight, SmallWidthMultiThreaded) {
+ EncodeSmallWidthMultiThreaded(AOM_IMG_FMT_I42016, AOM_CODEC_USE_HIGHBITDEPTH);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
#if !CONFIG_REALTIME_ONLY
-TEST(EncodeSmallWidthHeight, SmallWidthMultiThreadedSpeed0) {
+void EncodeSmallWidthMultiThreadedSpeed0(aom_img_fmt fmt,
+ aom_codec_flags_t flag) {
// The image has only one tile and the tile is two AV1 superblocks wide.
// For speed 0, superblock size is 128x128 (see av1_select_sb_size()).
constexpr int kWidth = 256;
constexpr int kHeight = 512;
aom_image_t img;
- EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+ EXPECT_EQ(&img, aom_img_wrap(&img, fmt, kWidth, kHeight, 1,
const_cast<unsigned char *>(kBuffer)));
aom_codec_iface_t *iface = aom_codec_av1_cx();
@@ -74,22 +85,34 @@ TEST(EncodeSmallWidthHeight, SmallWidthMultiThreadedSpeed0) {
cfg.g_w = kWidth;
cfg.g_h = kHeight;
aom_codec_ctx_t enc;
- EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, flag));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 0));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
}
+
+TEST(EncodeSmallWidthHeight, SmallWidthMultiThreadedSpeed0) {
+ EncodeSmallWidthMultiThreadedSpeed0(AOM_IMG_FMT_I420, 0);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST(HighbdEncodeSmallWidthHeight, SmallWidthMultiThreadedSpeed0) {
+ EncodeSmallWidthMultiThreadedSpeed0(AOM_IMG_FMT_I42016,
+ AOM_CODEC_USE_HIGHBITDEPTH);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
#endif
-TEST(EncodeSmallWidthHeight, SmallHeightMultiThreaded) {
+void EncodeSmallHeightMultiThreaded(aom_img_fmt fmt, aom_codec_flags_t flag) {
// The image has only one tile and the tile is one AV1 superblock tall.
// For speed >= 1, superblock size is 64x64 (see av1_select_sb_size()).
constexpr int kWidth = 512;
constexpr int kHeight = 64;
aom_image_t img;
- EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+ EXPECT_EQ(&img, aom_img_wrap(&img, fmt, kWidth, kHeight, 1,
const_cast<unsigned char *>(kBuffer)));
aom_codec_iface_t *iface = aom_codec_av1_cx();
@@ -99,22 +122,34 @@ TEST(EncodeSmallWidthHeight, SmallHeightMultiThreaded) {
cfg.g_w = kWidth;
cfg.g_h = kHeight;
aom_codec_ctx_t enc;
- EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, flag));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 5));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
}
+TEST(EncodeSmallWidthHeight, SmallHeightMultiThreaded) {
+ EncodeSmallHeightMultiThreaded(AOM_IMG_FMT_I420, 0);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST(HighbdEncodeSmallWidthHeight, SmallHeightMultiThreaded) {
+ EncodeSmallHeightMultiThreaded(AOM_IMG_FMT_I42016,
+ AOM_CODEC_USE_HIGHBITDEPTH);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
#if !CONFIG_REALTIME_ONLY
-TEST(EncodeSmallWidthHeight, SmallHeightMultiThreadedSpeed0) {
+void EncodeSmallHeightMultiThreadedSpeed0(aom_img_fmt fmt,
+ aom_codec_flags_t flag) {
// The image has only one tile and the tile is one AV1 superblock tall.
// For speed 0, superblock size is 128x128 (see av1_select_sb_size()).
constexpr int kWidth = 512;
constexpr int kHeight = 128;
aom_image_t img;
- EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+ EXPECT_EQ(&img, aom_img_wrap(&img, fmt, kWidth, kHeight, 1,
const_cast<unsigned char *>(kBuffer)));
aom_codec_iface_t *iface = aom_codec_av1_cx();
@@ -124,17 +159,28 @@ TEST(EncodeSmallWidthHeight, SmallHeightMultiThreadedSpeed0) {
cfg.g_w = kWidth;
cfg.g_h = kHeight;
aom_codec_ctx_t enc;
- EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, flag));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 0));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
}
+
+TEST(EncodeSmallWidthHeight, SmallHeightMultiThreadedSpeed0) {
+ EncodeSmallHeightMultiThreadedSpeed0(AOM_IMG_FMT_I420, 0);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST(HighbdEncodeSmallWidthHeight, SmallHeightMultiThreadedSpeed0) {
+ EncodeSmallHeightMultiThreadedSpeed0(AOM_IMG_FMT_I42016,
+ AOM_CODEC_USE_HIGHBITDEPTH);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
#endif
// A reproducer test for aomedia:3113. The test should complete without any
// memory errors.
-TEST(EncodeSmallWidthHeight, 1x1) {
+void Encode1x1(aom_img_fmt fmt, int bitdepth, aom_codec_flags_t flags) {
constexpr int kWidth = 1;
constexpr int kHeight = 1;
@@ -144,8 +190,8 @@ TEST(EncodeSmallWidthHeight, 1x1) {
// set up img manually.
aom_image_t img;
memset(&img, 0, sizeof(img));
- img.fmt = AOM_IMG_FMT_I420;
- img.bit_depth = 8;
+ img.fmt = fmt;
+ img.bit_depth = bitdepth;
img.w = kWidth;
img.h = kHeight;
img.d_w = kWidth;
@@ -153,10 +199,14 @@ TEST(EncodeSmallWidthHeight, 1x1) {
img.x_chroma_shift = 1;
img.y_chroma_shift = 1;
img.bps = 12;
- int y_stride = kWidth;
- int uv_stride = (kWidth + 1) >> 1;
+ const int y_stride = kWidth;
+ const int uv_stride = (kWidth + 1) >> 1;
int y_height = kHeight;
int uv_height = (kHeight + 1) >> 1;
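+  // High-bit-depth planes store 16-bit samples; double the plane heights so
+  // the byte buffers allocated below are large enough.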
+ if (bitdepth > 8) {
+ y_height <<= 1;
+ uv_height <<= 1;
+ }
img.stride[AOM_PLANE_Y] = y_stride;
img.stride[AOM_PLANE_U] = img.stride[AOM_PLANE_V] = uv_stride;
std::unique_ptr<unsigned char[]> y_plane(
@@ -178,11 +228,19 @@ TEST(EncodeSmallWidthHeight, 1x1) {
cfg.g_w = kWidth;
cfg.g_h = kHeight;
aom_codec_ctx_t enc;
- EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, flags));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 5));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
}
+TEST(EncodeSmallWidthHeight, 1x1) { Encode1x1(AOM_IMG_FMT_I420, 8, 0); }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST(HighbdEncodeSmallWidthHeight, 1x1) {
+ Encode1x1(AOM_IMG_FMT_I42016, 12, AOM_CODEC_USE_HIGHBITDEPTH);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
} // namespace
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 80be8ed06..d1e6615cd 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -199,7 +199,7 @@ class EncoderTest {
cfg_.g_threads = 1;
}
- virtual ~EncoderTest() {}
+ virtual ~EncoderTest() = default;
// Initialize the cfg_ member with the default configuration for the
// TestMode enum and maps the TestMode enum to the passes_ variable.
diff --git a/test/encodemb_test.cc b/test/encodemb_test.cc
index 4c725c7de..6165fc33f 100644
--- a/test/encodemb_test.cc
+++ b/test/encodemb_test.cc
@@ -82,7 +82,7 @@ void Dropout(TX_SIZE tx_size, TX_TYPE tx_type, int dropout_num_before,
while (new_eob > 0 && qcoeff_scan[new_eob - 1] == 0) --new_eob;
EXPECT_EQ(new_eob, mb.plane[kPlane].eobs[0]);
- // Check qqcoeff is still valid.
+ // Check dqcoeff is still valid.
for (int i = 0; i < max_eob; ++i) {
EXPECT_EQ(qcoeff[i] * kDequantFactor, dqcoeff[i]);
}
diff --git a/test/encodetxb_test.cc b/test/encodetxb_test.cc
index 0a5873732..49b0fba94 100644
--- a/test/encodetxb_test.cc
+++ b/test/encodetxb_test.cc
@@ -42,9 +42,9 @@ class EncodeTxbTest : public ::testing::TestWithParam<GetNzMapContextsFunc> {
public:
EncodeTxbTest() : get_nz_map_contexts_func_(GetParam()) {}
- virtual ~EncodeTxbTest() {}
+ ~EncodeTxbTest() override = default;
- virtual void SetUp() {
+ void SetUp() override {
coeff_contexts_ref_ = reinterpret_cast<int8_t *>(
aom_memalign(16, sizeof(*coeff_contexts_ref_) * MAX_TX_SQUARE));
ASSERT_NE(coeff_contexts_ref_, nullptr);
@@ -53,7 +53,7 @@ class EncodeTxbTest : public ::testing::TestWithParam<GetNzMapContextsFunc> {
ASSERT_NE(coeff_contexts_, nullptr);
}
- virtual void TearDown() {
+ void TearDown() override {
aom_free(coeff_contexts_ref_);
aom_free(coeff_contexts_);
}
@@ -211,8 +211,7 @@ typedef std::tuple<av1_txb_init_levels_func, int> TxbInitLevelParam;
class EncodeTxbInitLevelTest
: public ::testing::TestWithParam<TxbInitLevelParam> {
public:
- virtual ~EncodeTxbInitLevelTest() {}
- virtual void TearDown() {}
+ ~EncodeTxbInitLevelTest() override = default;
void RunTest(av1_txb_init_levels_func test_func, int tx_size, int is_speed);
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(EncodeTxbInitLevelTest);
diff --git a/test/end_to_end_psnr_test.cc b/test/end_to_end_psnr_test.cc
index 0396438e0..687308da8 100644
--- a/test/end_to_end_psnr_test.cc
+++ b/test/end_to_end_psnr_test.cc
@@ -86,9 +86,9 @@ class EndToEndTest
cpu_used_(GET_PARAM(3)), psnr_(0.0), nframes_(0),
encoding_mode_(GET_PARAM(1)) {}
- virtual ~EndToEndTest() {}
+ ~EndToEndTest() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(encoding_mode_);
if (encoding_mode_ == ::libaom_test::kOnePassGood ||
encoding_mode_ == ::libaom_test::kTwoPassGood) {
@@ -100,18 +100,18 @@ class EndToEndTest
}
}
- virtual void BeginPassHook(unsigned int) {
+ void BeginPassHook(unsigned int) override {
psnr_ = 0.0;
nframes_ = 0;
}
- virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
psnr_ += pkt->data.psnr.psnr[0];
nframes_++;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
diff --git a/test/end_to_end_qmpsnr_test.cc b/test/end_to_end_qmpsnr_test.cc
index de183adc5..7a755a7a5 100644
--- a/test/end_to_end_qmpsnr_test.cc
+++ b/test/end_to_end_qmpsnr_test.cc
@@ -69,7 +69,7 @@ class EndToEndQMPSNRTest
test_video_param_(GET_PARAM(2)), cpu_used_(GET_PARAM(3)), nframes_(0),
ssim_(0.0) {}
- ~EndToEndQMPSNRTest() override {}
+ ~EndToEndQMPSNRTest() override = default;
void SetUp() override { InitializeConfig(encoding_mode_); }
diff --git a/test/end_to_end_ssim_test.cc b/test/end_to_end_ssim_test.cc
index 2e40c9486..f1b0cae75 100644
--- a/test/end_to_end_ssim_test.cc
+++ b/test/end_to_end_ssim_test.cc
@@ -66,7 +66,7 @@ class EndToEndSSIMTest
test_video_param_(GET_PARAM(2)), cpu_used_(GET_PARAM(3)), nframes_(0),
ssim_(0.0) {}
- ~EndToEndSSIMTest() override {}
+ ~EndToEndSSIMTest() override = default;
void SetUp() override { InitializeConfig(encoding_mode_); }
diff --git a/test/error_block_test.cc b/test/error_block_test.cc
index aadbb4467..176efdf12 100644
--- a/test/error_block_test.cc
+++ b/test/error_block_test.cc
@@ -65,15 +65,13 @@ int64_t BlockErrorLpWrapper(const tran_low_t *coeff, const tran_low_t *dqcoeff,
class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {
public:
- virtual ~ErrorBlockTest() {}
- virtual void SetUp() {
+ ~ErrorBlockTest() override = default;
+ void SetUp() override {
error_block_op_ = GET_PARAM(0);
ref_error_block_op_ = GET_PARAM(1);
bit_depth_ = GET_PARAM(2);
}
- virtual void TearDown() {}
-
protected:
aom_bit_depth_t bit_depth_;
ErrorBlockFunc error_block_op_;
@@ -289,6 +287,14 @@ INSTANTIATE_TEST_SUITE_P(AVX2, ErrorBlockTest,
#if (HAVE_NEON)
const ErrorBlockParam kErrorBlockTestParamsNeon[] = {
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(&av1_highbd_block_error_neon, &av1_highbd_block_error_c,
+ AOM_BITS_10),
+ make_tuple(&av1_highbd_block_error_neon, &av1_highbd_block_error_c,
+ AOM_BITS_12),
+ make_tuple(&av1_highbd_block_error_neon, &av1_highbd_block_error_c,
+ AOM_BITS_8),
+#endif
make_tuple(&BlockError8BitWrapper<av1_block_error_neon>,
&BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8),
make_tuple(&BlockErrorLpWrapper<av1_block_error_lp_neon>,
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index 84330d623..d41884df2 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -37,7 +37,7 @@ class ErrorResilienceTestLarge
Reset();
}
- virtual ~ErrorResilienceTestLarge() {}
+ ~ErrorResilienceTestLarge() override = default;
void Reset() {
error_nframes_ = 0;
@@ -58,9 +58,9 @@ class ErrorResilienceTestLarge
init_flags_ = AOM_CODEC_USE_PSNR;
}
- virtual void SetUp() { InitializeConfig(encoding_mode_); }
+ void SetUp() override { InitializeConfig(encoding_mode_); }
- virtual void BeginPassHook(unsigned int /*pass*/) {
+ void BeginPassHook(unsigned int /*pass*/) override {
psnr_ = 0.0;
nframes_ = 0;
decoded_nframes_ = 0;
@@ -68,13 +68,13 @@ class ErrorResilienceTestLarge
mismatch_nframes_ = 0;
}
- virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
psnr_ += pkt->data.psnr.psnr[0];
nframes_++;
}
- virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
- libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(libaom_test::VideoSource *video,
+ libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
encoder->Control(AOME_SET_ENABLEAUTOALTREF, enable_altref_);
@@ -146,7 +146,7 @@ class ErrorResilienceTestLarge
}
}
- virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
// Check that the encode frame flags are correctly reflected
// in the output frame flags.
const int encode_flags = pkt->data.frame.flags >> 16;
@@ -176,21 +176,21 @@ class ErrorResilienceTestLarge
return 0.0;
}
- virtual bool DoDecode() const {
+ bool DoDecode() const override {
if (error_nframes_ > 0 &&
(cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
for (unsigned int i = 0; i < error_nframes_; ++i) {
if (error_frames_[i] == nframes_ - 1) {
std::cout << " Skipping decoding frame: "
<< error_frames_[i] << "\n";
- return 0;
+ return false;
}
}
}
- return 1;
+ return true;
}
- virtual bool DoDecodeInvisible() const {
+ bool DoDecodeInvisible() const override {
if (invisible_error_nframes_ > 0 &&
(cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
for (unsigned int i = 0; i < invisible_error_nframes_; ++i) {
@@ -198,14 +198,14 @@ class ErrorResilienceTestLarge
std::cout << " Skipping decoding all invisible frames in "
"frame pkt: "
<< invisible_error_frames_[i] << "\n";
- return 0;
+ return false;
}
}
}
- return 1;
+ return true;
}
- virtual void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) {
+ void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) override {
if (allow_mismatch_) {
double mismatch_psnr = compute_psnr(img1, img2);
mismatch_psnr_ += mismatch_psnr;
@@ -216,8 +216,8 @@ class ErrorResilienceTestLarge
}
}
- virtual void DecompressedFrameHook(const aom_image_t &img,
- aom_codec_pts_t pts) {
+ void DecompressedFrameHook(const aom_image_t &img,
+ aom_codec_pts_t pts) override {
(void)img;
(void)pts;
++decoded_nframes_;
diff --git a/test/ethread_test.cc b/test/ethread_test.cc
index 6b7fcceff..ce45394eb 100644
--- a/test/ethread_test.cc
+++ b/test/ethread_test.cc
@@ -40,9 +40,9 @@ class AVxFirstPassEncoderThreadTest
firstpass_stats_.buf = nullptr;
firstpass_stats_.sz = 0;
}
- virtual ~AVxFirstPassEncoderThreadTest() { free(firstpass_stats_.buf); }
+ ~AVxFirstPassEncoderThreadTest() override { free(firstpass_stats_.buf); }
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(encoding_mode_);
cfg_.g_lag_in_frames = 35;
@@ -53,18 +53,18 @@ class AVxFirstPassEncoderThreadTest
cfg_.rc_min_quantizer = 0;
}
- virtual void BeginPassHook(unsigned int /*pass*/) {
+ void BeginPassHook(unsigned int /*pass*/) override {
encoder_initialized_ = false;
abort_ = false;
}
- virtual void EndPassHook() {
+ void EndPassHook() override {
// For first pass stats test, only run first pass encoder.
if (cfg_.g_pass == AOM_RC_FIRST_PASS) abort_ = true;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
+ ::libaom_test::Encoder *encoder) override {
if (!encoder_initialized_) {
// Encode in 2-pass mode.
SetTileSize(encoder);
@@ -84,7 +84,7 @@ class AVxFirstPassEncoderThreadTest
encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
}
- virtual void StatsPktHook(const aom_codec_cx_pkt_t *pkt) {
+ void StatsPktHook(const aom_codec_cx_pkt_t *pkt) override {
const uint8_t *const pkt_buf =
reinterpret_cast<uint8_t *>(pkt->data.twopass_stats.buf);
const size_t pkt_size = pkt->data.twopass_stats.sz;
@@ -227,9 +227,9 @@ class AVxEncoderThreadTest
md5_dec_.clear();
md5_enc_.clear();
}
- virtual ~AVxEncoderThreadTest() { delete decoder_; }
+ ~AVxEncoderThreadTest() override { delete decoder_; }
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(encoding_mode_);
if (encoding_mode_ == ::libaom_test::kOnePassGood ||
@@ -244,12 +244,12 @@ class AVxEncoderThreadTest
cfg_.rc_min_quantizer = 0;
}
- virtual void BeginPassHook(unsigned int /*pass*/) {
+ void BeginPassHook(unsigned int /*pass*/) override {
encoder_initialized_ = false;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
+ ::libaom_test::Encoder *encoder) override {
if (!encoder_initialized_) {
SetTileSize(encoder);
encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
@@ -290,7 +290,7 @@ class AVxEncoderThreadTest
encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
}
- virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
size_enc_.push_back(pkt->data.frame.sz);
::libaom_test::MD5 md5_enc;
@@ -531,15 +531,15 @@ AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadAllIntraTestLarge,
#endif // !CONFIG_REALTIME_ONLY
class AVxEncoderThreadLSTest : public AVxEncoderThreadTest {
- virtual void SetTileSize(libaom_test::Encoder *encoder) {
+ void SetTileSize(libaom_test::Encoder *encoder) override {
encoder->Control(AV1E_SET_TILE_COLUMNS, tile_cols_);
encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
}
- virtual void DoTestMaxThreads(::libaom_test::YUVVideoSource *video,
- const std::vector<size_t> ref_size_enc,
- const std::vector<std::string> ref_md5_enc,
- const std::vector<std::string> ref_md5_dec) {
+ void DoTestMaxThreads(::libaom_test::YUVVideoSource *video,
+ const std::vector<size_t> ref_size_enc,
+ const std::vector<std::string> ref_md5_enc,
+ const std::vector<std::string> ref_md5_dec) override {
(void)video;
(void)ref_size_enc;
(void)ref_md5_enc;
diff --git a/test/examples.sh b/test/examples.sh
index 87d8c2b03..771a7b6dc 100755
--- a/test/examples.sh
+++ b/test/examples.sh
@@ -15,7 +15,7 @@
example_tests=$(ls -r $(dirname $0)/*.sh)
# List of script names to exclude.
-exclude_list="best_encode examples run_encodes tools_common"
+exclude_list="best_encode examples run_encodes tools_common av1_c_vs_simd_encode"
if [ "$(realtime_only_build)" = "yes" ]; then
exclude_list="${exclude_list} twopass_encoder simple_decoder lightfield_test"
diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc
index ea7ed5043..8f16c4e2d 100644
--- a/test/external_frame_buffer_test.cc
+++ b/test/external_frame_buffer_test.cc
@@ -214,13 +214,12 @@ class ExternalFrameBufferMD5Test
: DecoderTest(GET_PARAM(::libaom_test::kCodecFactoryParam)),
md5_file_(nullptr), num_buffers_(0) {}
- virtual ~ExternalFrameBufferMD5Test() {
+ ~ExternalFrameBufferMD5Test() override {
if (md5_file_ != nullptr) fclose(md5_file_);
}
- virtual void PreDecodeFrameHook(
- const libaom_test::CompressedVideoSource &video,
- libaom_test::Decoder *decoder) {
+ void PreDecodeFrameHook(const libaom_test::CompressedVideoSource &video,
+ libaom_test::Decoder *decoder) override {
if (num_buffers_ > 0 && video.frame_number() == 0) {
// Have libaom use frame buffers we create.
ASSERT_TRUE(fb_list_.CreateBufferList(num_buffers_));
@@ -236,8 +235,8 @@ class ExternalFrameBufferMD5Test
<< "Md5 file open failed. Filename: " << md5_file_name_;
}
- virtual void DecompressedFrameHook(const aom_image_t &img,
- const unsigned int frame_number) {
+ void DecompressedFrameHook(const aom_image_t &img,
+ const unsigned int frame_number) override {
ASSERT_NE(md5_file_, nullptr);
char expected_md5[33];
char junk[128];
@@ -315,7 +314,7 @@ class ExternalFrameBufferTest : public ::testing::Test {
ExternalFrameBufferTest()
: video_(nullptr), decoder_(nullptr), num_buffers_(0) {}
- virtual void SetUp() {
+ void SetUp() override {
video_ = new libaom_test::WebMVideoSource(kAV1TestFile);
ASSERT_NE(video_, nullptr);
video_->Init();
@@ -327,7 +326,7 @@ class ExternalFrameBufferTest : public ::testing::Test {
ASSERT_NE(decoder_, nullptr);
}
- virtual void TearDown() {
+ void TearDown() override {
delete decoder_;
decoder_ = nullptr;
delete video_;
@@ -383,7 +382,7 @@ class ExternalFrameBufferTest : public ::testing::Test {
class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest {
protected:
- virtual void SetUp() {
+ void SetUp() override {
video_ = new libaom_test::IVFVideoSource(kAV1NonRefTestFile);
ASSERT_NE(video_, nullptr);
video_->Init();
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 046d8107b..9cbf208ad 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -59,10 +59,10 @@ template <typename OutputType>
class Trans4x4FDCT : public libaom_test::TransformTestBase<OutputType>,
public ::testing::TestWithParam<Fdct4x4Param<OutputType>> {
public:
- virtual ~Trans4x4FDCT() {}
+ ~Trans4x4FDCT() override = default;
using TxfmBaseOutType = libaom_test::TransformTestBase<OutputType>;
- virtual void SetUp() {
+ void SetUp() override {
fwd_txfm_ = std::get<0>(this->GetParam());
TxfmBaseOutType::pitch_ = 4;
TxfmBaseOutType::height_ = 4;
@@ -71,14 +71,13 @@ class Trans4x4FDCT : public libaom_test::TransformTestBase<OutputType>,
TxfmBaseOutType::mask_ = (1 << TxfmBaseOutType::bit_depth_) - 1;
TxfmBaseOutType::num_coeffs_ = std::get<3>(this->GetParam());
}
- virtual void TearDown() {}
protected:
- void RunFwdTxfm(const int16_t *in, OutputType *out, int stride) {
+ void RunFwdTxfm(const int16_t *in, OutputType *out, int stride) override {
fwd_txfm_(in, out, stride);
}
- void RunInvTxfm(const OutputType *out, uint8_t *dst, int stride) {
+ void RunInvTxfm(const OutputType *out, uint8_t *dst, int stride) override {
(void)out;
(void)dst;
(void)stride;
diff --git a/test/fft_test.cc b/test/fft_test.cc
index 5443c99a9..06a17a3f8 100644
--- a/test/fft_test.cc
+++ b/test/fft_test.cc
@@ -88,7 +88,7 @@ std::ostream &operator<<(std::ostream &os, const FFTTestArg &test_arg) {
class FFT2DTest : public ::testing::TestWithParam<FFTTestArg> {
protected:
- void SetUp() {
+ void SetUp() override {
int n = GetParam().n;
input_ = (float *)aom_memalign(32, sizeof(*input_) * n * n);
temp_ = (float *)aom_memalign(32, sizeof(*temp_) * n * n);
@@ -100,7 +100,7 @@ class FFT2DTest : public ::testing::TestWithParam<FFTTestArg> {
memset(temp_, 0, sizeof(*temp_) * n * n);
memset(output_, 0, sizeof(*output_) * n * n * 2);
}
- void TearDown() {
+ void TearDown() override {
aom_free(input_);
aom_free(temp_);
aom_free(output_);
@@ -178,7 +178,7 @@ std::ostream &operator<<(std::ostream &os, const IFFTTestArg &test_arg) {
class IFFT2DTest : public ::testing::TestWithParam<IFFTTestArg> {
protected:
- void SetUp() {
+ void SetUp() override {
int n = GetParam().n;
input_ = (float *)aom_memalign(32, sizeof(*input_) * n * n * 2);
temp_ = (float *)aom_memalign(32, sizeof(*temp_) * n * n * 2);
@@ -190,7 +190,7 @@ class IFFT2DTest : public ::testing::TestWithParam<IFFTTestArg> {
memset(temp_, 0, sizeof(*temp_) * n * n * 2);
memset(output_, 0, sizeof(*output_) * n * n);
}
- void TearDown() {
+ void TearDown() override {
aom_free(input_);
aom_free(temp_);
aom_free(output_);
diff --git a/test/film_grain_table_test.cc b/test/film_grain_table_test.cc
index f8937f1df..808d966fe 100644
--- a/test/film_grain_table_test.cc
+++ b/test/film_grain_table_test.cc
@@ -91,7 +91,7 @@ TEST(FilmGrainTableTest, AddAndLookupSingleSegment) {
// Extend the existing segment
aom_film_grain_table_append(&table, 2000, 3000, film_grain_test_vectors + 0);
- EXPECT_EQ(0, table.head->next);
+ EXPECT_EQ(nullptr, table.head->next);
// Lookup and remove and check that the entry is no longer there
EXPECT_TRUE(aom_film_grain_table_lookup(&table, 1000, 2000, true, &grain));
@@ -100,8 +100,8 @@ TEST(FilmGrainTableTest, AddAndLookupSingleSegment) {
EXPECT_TRUE(aom_film_grain_table_lookup(&table, 2000, 3000, true, &grain));
EXPECT_FALSE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain));
- EXPECT_EQ(0, table.head);
- EXPECT_EQ(0, table.tail);
+ EXPECT_EQ(nullptr, table.head);
+ EXPECT_EQ(nullptr, table.tail);
aom_film_grain_table_free(&table);
}
@@ -114,8 +114,8 @@ TEST(FilmGrainTableTest, AddSingleSegmentRemoveBiggerSegment) {
aom_film_grain_table_append(&table, 0, 1000, film_grain_test_vectors + 0);
EXPECT_TRUE(aom_film_grain_table_lookup(&table, 0, 1100, true, &grain));
- EXPECT_EQ(0, table.head);
- EXPECT_EQ(0, table.tail);
+ EXPECT_EQ(nullptr, table.head);
+ EXPECT_EQ(nullptr, table.tail);
aom_film_grain_table_free(&table);
}
@@ -180,7 +180,7 @@ TEST(FilmGrainTableTest, AddAndLookupMultipleSegments) {
class FilmGrainTableIOTest : public ::testing::Test {
protected:
- void SetUp() { memset(&error_, 0, sizeof(error_)); }
+ void SetUp() override { memset(&error_, 0, sizeof(error_)); }
struct aom_internal_error_info error_;
};
@@ -280,32 +280,35 @@ const ::libaom_test::TestMode kFilmGrainEncodeTestModes[] = {
};
class FilmGrainEncodeTest
- : public ::libaom_test::CodecTestWith2Params<bool, ::libaom_test::TestMode>,
+ : public ::libaom_test::CodecTestWith3Params<int, int,
+ ::libaom_test::TestMode>,
public ::libaom_test::EncoderTest {
protected:
FilmGrainEncodeTest()
: EncoderTest(GET_PARAM(0)), test_monochrome_(GET_PARAM(1)),
- test_mode_(GET_PARAM(2)) {}
+ key_frame_dist_(GET_PARAM(2)), test_mode_(GET_PARAM(3)) {}
~FilmGrainEncodeTest() override = default;
void SetUp() override {
InitializeConfig(test_mode_);
- cfg_.monochrome = test_monochrome_;
+ cfg_.monochrome = test_monochrome_ == 1;
cfg_.rc_target_bitrate = 300;
- cfg_.kf_max_dist = 0;
+ cfg_.kf_max_dist = key_frame_dist_;
+ cfg_.g_lag_in_frames = 0;
}
void PreEncodeFrameHook(::libaom_test::VideoSource *video,
::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
- encoder->Control(AOME_SET_CPUUSED, 5);
+ encoder->Control(AOME_SET_CPUUSED,
+ test_mode_ == ::libaom_test::kRealTime ? 7 : 5);
encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_FILM);
encoder->Control(AV1E_SET_DENOISE_NOISE_LEVEL, 1);
} else if (video->frame() == 1) {
- cfg_.monochrome = 0;
+ cfg_.monochrome = (test_monochrome_ == 1 || test_monochrome_ == 2);
encoder->Config(&cfg_);
} else {
- cfg_.monochrome = test_monochrome_;
+ cfg_.monochrome = test_monochrome_ == 1;
encoder->Config(&cfg_);
}
}
@@ -313,11 +316,6 @@ class FilmGrainEncodeTest
bool DoDecode() const override { return false; }
void DoTest() {
- if (test_monochrome_ && test_mode_ == ::libaom_test::kRealTime) {
- // TODO(bohanli): Running real time mode with monochrome will cause the
- // encoder to crash. Check if this is intended or there is a bug.
- GTEST_SKIP();
- }
::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
288, 30, 1, 0, 3);
cfg_.g_w = video.img()->d_w;
@@ -326,11 +324,58 @@ class FilmGrainEncodeTest
}
private:
- bool test_monochrome_;
+  // 0: monochrome always off.
+  // 1: monochrome always on.
+  // 2: monochrome changes as 0, 1, 0 for encoded frames 0, 1, 2.
+  // The case where monochrome changes from 1 to 0 (i.e., the encoder is
+  // initialized with monochrome = 1 and a frame is subsequently encoded with
+  // monochrome = 0) will fail. The test InitMonochrome1EncodeMonochrome0
+  // below verifies this.
+ int test_monochrome_;
+ int key_frame_dist_;
::libaom_test::TestMode test_mode_;
};
TEST_P(FilmGrainEncodeTest, Test) { DoTest(); }
-AV1_INSTANTIATE_TEST_SUITE(FilmGrainEncodeTest, ::testing::Bool(),
+AV1_INSTANTIATE_TEST_SUITE(FilmGrainEncodeTest, ::testing::Range(0, 3),
+ ::testing::Values(0, 10),
::testing::ValuesIn(kFilmGrainEncodeTestModes));
+
+// Initialize the encoder with monochrome = 1, then encode a frame with
+// monochrome = 0. This results in an error: see the corresponding check
+// in encoder_set_config() in av1/av1_cx_iface.c.
+// TODO(marpan): Consider moving this test to another file, as the failure
+// has nothing to do with film grain mode.
+TEST(FilmGrainEncodeTest, InitMonochrome1EncodeMonochrome0) {
+ const int kWidth = 352;
+ const int kHeight = 288;
+ const int usage = AOM_USAGE_REALTIME;
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, usage), AOM_CODEC_OK);
+ aom_codec_ctx_t enc;
+ cfg.g_w = kWidth;
+ cfg.g_h = kHeight;
+  // Initialize the encoder with monochrome = 1.
+ cfg.monochrome = 1;
+ aom_codec_err_t init_status = aom_codec_enc_init(&enc, iface, &cfg, 0);
+ ASSERT_EQ(init_status, AOM_CODEC_OK);
+ ASSERT_EQ(aom_codec_control(&enc, AOME_SET_CPUUSED, 7), AOM_CODEC_OK);
+ ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_TUNE_CONTENT, AOM_CONTENT_FILM),
+ AOM_CODEC_OK);
+ ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_DENOISE_NOISE_LEVEL, 1),
+ AOM_CODEC_OK);
+  // Wrap a zero-filled I420 buffer: one full-resolution luma plane plus two
+  // half-resolution chroma planes.
+ constexpr size_t kBufferSize =
+ kWidth * kHeight + 2 * (kWidth + 1) / 2 * (kHeight + 1) / 2;
+ std::vector<unsigned char> buffer(kBufferSize);
+ aom_image_t img;
+ EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+ buffer.data()));
+ // Encode first frame.
+ ASSERT_EQ(aom_codec_encode(&enc, &img, 0, 1, 0), AOM_CODEC_OK);
+  // Second frame: update the config with monochrome = 0.
+ cfg.monochrome = 0;
+ ASSERT_EQ(aom_codec_enc_config_set(&enc, &cfg), AOM_CODEC_INVALID_PARAM);
+ ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+}
diff --git a/test/filterintra_test.cc b/test/filterintra_test.cc
index c54bec5e5..0a0ab11dc 100644
--- a/test/filterintra_test.cc
+++ b/test/filterintra_test.cc
@@ -41,8 +41,8 @@ const int MaxTestNum = 100;
class AV1FilterIntraPredTest : public ::testing::TestWithParam<PredParams> {
public:
- virtual ~AV1FilterIntraPredTest() {}
- virtual void SetUp() {
+ ~AV1FilterIntraPredTest() override = default;
+ void SetUp() override {
PredFuncMode funcMode = GET_PARAM(0);
predFuncRef_ = std::get<0>(funcMode);
predFunc_ = std::get<1>(funcMode);
@@ -57,7 +57,7 @@ class AV1FilterIntraPredTest : public ::testing::TestWithParam<PredParams> {
ASSERT_NE(pred_, nullptr);
}
- virtual void TearDown() {
+ void TearDown() override {
delete[] alloc_;
delete[] predRef_;
delete[] pred_;
diff --git a/test/forced_max_frame_width_height_test.cc b/test/forced_max_frame_width_height_test.cc
index 2e019b638..3347713c5 100644
--- a/test/forced_max_frame_width_height_test.cc
+++ b/test/forced_max_frame_width_height_test.cc
@@ -114,10 +114,7 @@ void FillImageGradient(aom_image_t *image, int bit_depth) {
}
}
-// A test that reproduces bug aomedia:3348: Assertion
-// `ms_params->ms_buffers.ref->stride == ms_params->search_sites->stride'
-// failed.
-TEST(EncodeForcedMaxFrameWidthHeight, DISABLED_DimensionDecreasing) {
+TEST(EncodeForcedMaxFrameWidthHeight, DimensionDecreasing) {
constexpr int kWidth = 128;
constexpr int kHeight = 128;
constexpr size_t kBufferSize = 3 * kWidth * kHeight;
diff --git a/test/frame_error_test.cc b/test/frame_error_test.cc
deleted file mode 100644
index c355efc8a..000000000
--- a/test/frame_error_test.cc
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <tuple>
-
-#include "config/av1_rtcd.h"
-
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/aom_timer.h"
-#include "aom_ports/mem.h"
-#include "test/acm_random.h"
-#include "test/util.h"
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-namespace {
-typedef int64_t (*frame_error_func)(const uint8_t *const ref, int stride,
- const uint8_t *const dst, int p_width,
- int p_height, int p_stride);
-#if HAVE_AVX2 || HAVE_SSE2
-const int kBlockWidth[] = {
- 832, 834, 640, 1280, 1920,
-};
-const int kBlockHeight[] = {
- 480, 482, 360, 720, 1080,
-};
-#endif
-typedef std::tuple<frame_error_func, int, int> FrameErrorParam;
-
-class AV1FrameErrorTest : public ::testing::TestWithParam<FrameErrorParam> {
- public:
- virtual ~AV1FrameErrorTest() {}
- virtual void SetUp() {
- rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
- }
- virtual void TearDown() {}
-
- protected:
- void RandomValues(frame_error_func test_impl, int width, int height);
- void ExtremeValues(frame_error_func test_impl, int width, int height);
- void RunSpeedTest(frame_error_func test_impl, int width, int height);
- libaom_test::ACMRandom rnd_;
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1FrameErrorTest);
-
-void AV1FrameErrorTest::RandomValues(frame_error_func test_impl, int width,
- int height) {
- const int stride = (((width * 3) / 2) + 15) & ~15;
- const int max_blk_size = stride * height;
- uint8_t *const dst =
- static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
- uint8_t *const ref =
- static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
- ASSERT_NE(dst, nullptr);
- ASSERT_NE(ref, nullptr);
- for (int i = 0; i < max_blk_size; ++i) {
- dst[i] = rnd_.Rand8();
- ref[i] = rnd_.Rand8();
- }
- const int64_t ref_error =
- av1_calc_frame_error_c(ref, stride, dst, width, height, stride);
- const int64_t test_error = test_impl(ref, stride, dst, width, height, stride);
- ASSERT_EQ(test_error, ref_error) << width << "x" << height;
- aom_free(dst);
- aom_free(ref);
-}
-
-void AV1FrameErrorTest::ExtremeValues(frame_error_func test_impl, int width,
- int height) {
- const int stride = (((width * 3) / 2) + 15) & ~15;
- const int max_blk_size = stride * height;
- uint8_t *const dst =
- static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
- uint8_t *const ref =
- static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
- ASSERT_NE(dst, nullptr);
- ASSERT_NE(ref, nullptr);
- for (int r = 0; r < 2; r++) {
- if (r == 0) {
- memset(dst, 0, max_blk_size);
- memset(ref, 255, max_blk_size);
- } else if (r == 1) {
- memset(dst, 255, max_blk_size);
- memset(ref, 0, max_blk_size);
- }
- const int64_t ref_error =
- av1_calc_frame_error_c(ref, stride, dst, width, height, stride);
- const int64_t test_error =
- test_impl(ref, stride, dst, width, height, stride);
- ASSERT_EQ(test_error, ref_error) << width << "x" << height;
- }
- aom_free(dst);
- aom_free(ref);
-}
-
-void AV1FrameErrorTest::RunSpeedTest(frame_error_func test_impl, int width,
- int height) {
- const int stride = (((width * 3) / 2) + 15) & ~15;
- const int max_blk_size = stride * height;
- uint8_t *const dst =
- static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
- uint8_t *const ref =
- static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
- ASSERT_NE(dst, nullptr);
- ASSERT_NE(ref, nullptr);
- for (int i = 0; i < max_blk_size; ++i) {
- dst[i] = ref[i] = rnd_.Rand8();
- }
- const int num_loops = 10000000 / (width + height);
- frame_error_func funcs[2] = { av1_calc_frame_error_c, test_impl };
- double elapsed_time[2] = { 0 };
- for (int i = 0; i < 2; ++i) {
- aom_usec_timer timer;
- aom_usec_timer_start(&timer);
- frame_error_func func = funcs[i];
- for (int j = 0; j < num_loops; ++j) {
- func(ref, stride, dst, width, height, stride);
- }
- aom_usec_timer_mark(&timer);
- double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
- elapsed_time[i] = 1000.0 * time / num_loops;
- }
- aom_free(dst);
- aom_free(ref);
- printf("av1_calc_frame_error %3dx%-3d: %7.2f/%7.2fns", width, height,
- elapsed_time[0], elapsed_time[1]);
- printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
-}
-
-TEST_P(AV1FrameErrorTest, CheckOutput) {
- RandomValues(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
- ExtremeValues(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
-}
-
-TEST_P(AV1FrameErrorTest, DISABLED_Speed) {
- RunSpeedTest(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
-}
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(
- SSE2, AV1FrameErrorTest,
- ::testing::Combine(::testing::Values(&av1_calc_frame_error_sse2),
- ::testing::ValuesIn(kBlockWidth),
- ::testing::ValuesIn(kBlockHeight)));
-#endif
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(
- AVX2, AV1FrameErrorTest,
- ::testing::Combine(::testing::Values(&av1_calc_frame_error_avx2),
- ::testing::ValuesIn(kBlockWidth),
- ::testing::ValuesIn(kBlockHeight)));
-#endif
-} // namespace
diff --git a/test/frame_parallel_enc_test.cc b/test/frame_parallel_enc_test.cc
index 7508eb746..86d5ddb7d 100644
--- a/test/frame_parallel_enc_test.cc
+++ b/test/frame_parallel_enc_test.cc
@@ -36,9 +36,9 @@ class AVxFrameParallelThreadEncodeTest
cfg.allow_lowbitdepth = 1;
decoder_ = codec_->CreateDecoder(cfg, 0);
}
- virtual ~AVxFrameParallelThreadEncodeTest() { delete decoder_; }
+ ~AVxFrameParallelThreadEncodeTest() override { delete decoder_; }
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(::libaom_test::kTwoPassGood);
cfg_.rc_end_usage = AOM_VBR;
cfg_.g_lag_in_frames = 35;
@@ -49,12 +49,12 @@ class AVxFrameParallelThreadEncodeTest
cfg_.g_threads = 16;
}
- virtual void BeginPassHook(unsigned int /*pass*/) {
+ void BeginPassHook(unsigned int /*pass*/) override {
encoder_initialized_ = false;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
+ ::libaom_test::Encoder *encoder) override {
if (encoder_initialized_) return;
SetTileSize(encoder);
encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
@@ -73,7 +73,7 @@ class AVxFrameParallelThreadEncodeTest
encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
}
- virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
size_enc_.push_back(pkt->data.frame.sz);
::libaom_test::MD5 md5_enc;
diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc
index 3b35db89e..ea8cf47ab 100644
--- a/test/frame_size_tests.cc
+++ b/test/frame_size_tests.cc
@@ -24,18 +24,18 @@ class AV1FrameSizeTests : public ::testing::Test,
protected:
AV1FrameSizeTests()
: EncoderTest(&::libaom_test::kAV1), expected_res_(AOM_CODEC_OK) {}
- virtual ~AV1FrameSizeTests() {}
+ ~AV1FrameSizeTests() override = default;
- virtual void SetUp() { InitializeConfig(::libaom_test::kRealTime); }
+ void SetUp() override { InitializeConfig(::libaom_test::kRealTime); }
- virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
- libaom_test::Decoder *decoder) {
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError();
return !::testing::Test::HasFailure();
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, 7);
encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
@@ -312,15 +312,13 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Range(6, 11)));
#if !CONFIG_REALTIME_ONLY
-// TODO(https://crbug.com/aomedia/3348): Modes that use av1_full_pixel_search()
-// will cause an assert.
INSTANTIATE_TEST_SUITE_P(
- DISABLED_GoodQuality, AV1ResolutionChange,
+ GoodQuality, AV1ResolutionChange,
::testing::Combine(::testing::Values(AOM_USAGE_GOOD_QUALITY),
::testing::Values(AOM_VBR, AOM_CBR, AOM_CQ, AOM_Q),
::testing::Range(2, 6)));
INSTANTIATE_TEST_SUITE_P(
- DISABLED_GoodQualityLarge, AV1ResolutionChange,
+ GoodQualityLarge, AV1ResolutionChange,
::testing::Combine(::testing::Values(AOM_USAGE_GOOD_QUALITY),
::testing::Values(AOM_VBR, AOM_CBR, AOM_CQ, AOM_Q),
::testing::Range(0, 2)));
@@ -350,18 +348,18 @@ class AV1LosslessFrameSizeTests
AV1LosslessFrameSizeTests()
: EncoderTest(GET_PARAM(0)), frame_size_param_(GET_PARAM(1)),
encoding_mode_(GET_PARAM(2)) {}
- virtual ~AV1LosslessFrameSizeTests() {}
+ ~AV1LosslessFrameSizeTests() override = default;
- virtual void SetUp() { InitializeConfig(encoding_mode_); }
+ void SetUp() override { InitializeConfig(encoding_mode_); }
- virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
- libaom_test::Decoder *decoder) {
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError();
return !::testing::Test::HasFailure();
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, 6);
encoder->Control(AV1E_SET_LOSSLESS, 1);
diff --git a/test/function_equivalence_test.h b/test/function_equivalence_test.h
index fc2a76933..2268b9f2a 100644
--- a/test/function_equivalence_test.h
+++ b/test/function_equivalence_test.h
@@ -55,11 +55,9 @@ class FunctionEquivalenceTest : public ::testing::TestWithParam<FuncParam<T> > {
public:
FunctionEquivalenceTest() : rng_(ACMRandom::DeterministicSeed()) {}
- virtual ~FunctionEquivalenceTest() {}
+ ~FunctionEquivalenceTest() override = default;
- virtual void SetUp() { params_ = this->GetParam(); }
-
- virtual void TearDown() {}
+ void SetUp() override { params_ = this->GetParam(); }
protected:
ACMRandom rng_;
diff --git a/test/fwht4x4_test.cc b/test/fwht4x4_test.cc
index 9d27db88e..bb9e218f6 100644
--- a/test/fwht4x4_test.cc
+++ b/test/fwht4x4_test.cc
@@ -67,9 +67,9 @@ void iwht4x4_12_sse4_1(const tran_low_t *in, uint8_t *out, int stride) {
class Trans4x4WHT : public libaom_test::TransformTestBase<tran_low_t>,
public ::testing::TestWithParam<Dct4x4Param> {
public:
- virtual ~Trans4x4WHT() {}
+ ~Trans4x4WHT() override = default;
- virtual void SetUp() {
+ void SetUp() override {
fwd_txfm_ = GET_PARAM(0);
inv_txfm_ = GET_PARAM(1);
pitch_ = 4;
@@ -80,13 +80,12 @@ class Trans4x4WHT : public libaom_test::TransformTestBase<tran_low_t>,
num_coeffs_ = GET_PARAM(4);
fwd_txfm_c_ = GET_PARAM(5);
}
- virtual void TearDown() {}
protected:
- void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+ void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) override {
fwd_txfm_(in, out, stride);
}
- void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+ void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) override {
inv_txfm_(out, dst, stride);
}
void RunSpeedTest() {
diff --git a/test/gf_pyr_height_test.cc b/test/gf_pyr_height_test.cc
index a2d1a8fe4..0996d80c2 100644
--- a/test/gf_pyr_height_test.cc
+++ b/test/gf_pyr_height_test.cc
@@ -79,9 +79,9 @@ class GFPyrHeightTest
gf_max_pyr_height_ = GET_PARAM(3).gf_max_pyr_height;
psnr_threshold_ = GET_PARAM(3).psnr_thresh;
}
- virtual ~GFPyrHeightTest() {}
+ ~GFPyrHeightTest() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(encoding_mode_);
const aom_rational timebase = { 1, 30 };
cfg_.g_timebase = timebase;
@@ -95,18 +95,18 @@ class GFPyrHeightTest
init_flags_ = AOM_CODEC_USE_PSNR;
}
- virtual void BeginPassHook(unsigned int) {
+ void BeginPassHook(unsigned int) override {
psnr_ = 0.0;
nframes_ = 0;
}
- virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
psnr_ += pkt->data.psnr.psnr[0];
nframes_++;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, cpu_used_);
if (rc_mode_ == AOM_Q) {
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index fc306e623..b01e78faa 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -240,7 +240,7 @@ class HadamardTestBase
shift_ = do_shift;
}
- virtual void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+ void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); }
// The Rand() function generates values in the range [-((1 << BitDepth) - 1),
// (1 << BitDepth) - 1]. This is because the input to the Hadamard transform
@@ -252,7 +252,7 @@ class HadamardTestBase
void CompareReferenceRandom() {
const int kMaxBlockSize = 32 * 32;
- const int block_size_ = bw_ * bh_;
+ const int block_size = bw_ * bh_;
DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize]);
DECLARE_ALIGNED(16, OutputType, b[kMaxBlockSize]);
@@ -262,13 +262,13 @@ class HadamardTestBase
OutputType b_ref[kMaxBlockSize];
memset(b_ref, 0, sizeof(b_ref));
- for (int i = 0; i < block_size_; ++i) a[i] = Rand();
+ for (int i = 0; i < block_size; ++i) a[i] = Rand();
ReferenceHadamard(a, bw_, b_ref, bw_, bh_, shift_);
API_REGISTER_STATE_CHECK(h_func_(a, bw_, b));
// The order of the output is not important. Sort before checking.
- std::sort(b, b + block_size_);
- std::sort(b_ref, b_ref + block_size_);
+ std::sort(b, b + block_size);
+ std::sort(b_ref, b_ref + block_size);
EXPECT_EQ(memcmp(b, b_ref, sizeof(b)), 0);
}
@@ -298,12 +298,12 @@ class HadamardTestBase
void VaryStride() {
const int kMaxBlockSize = 32 * 32;
- const int block_size_ = bw_ * bh_;
+ const int block_size = bw_ * bh_;
DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]);
DECLARE_ALIGNED(16, OutputType, b[kMaxBlockSize]);
memset(a, 0, sizeof(a));
- for (int i = 0; i < block_size_ * 8; ++i) a[i] = Rand();
+ for (int i = 0; i < block_size * 8; ++i) a[i] = Rand();
OutputType b_ref[kMaxBlockSize];
for (int i = 8; i < 64; i += 8) {
@@ -314,8 +314,8 @@ class HadamardTestBase
API_REGISTER_STATE_CHECK(h_func_(a, i, b));
// The order of the output is not important. Sort before checking.
- std::sort(b, b + block_size_);
- std::sort(b_ref, b_ref + block_size_);
+ std::sort(b, b + block_size);
+ std::sort(b_ref, b_ref + block_size);
EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
}
}
@@ -338,6 +338,7 @@ class HadamardTestBase
printf("Hadamard%dx%d[%12d runs]: %d us\n", bw_, bh_, times, elapsed_time);
}
+ protected:
ACMRandom rnd_;
private:
@@ -351,7 +352,7 @@ class HadamardLowbdTest : public HadamardTestBase<tran_low_t, HadamardFunc> {
public:
HadamardLowbdTest() : HadamardTestBase(GetParam(), /*do_shift=*/true) {}
// Use values between -255 (0xFF01) and 255 (0x00FF)
- virtual int16_t Rand() {
+ int16_t Rand() override {
int16_t src = rnd_.Rand8();
int16_t pred = rnd_.Rand8();
return src - pred;
@@ -407,7 +408,7 @@ class HadamardHighbdTest : public HadamardTestBase<tran_low_t, HadamardFunc> {
protected:
HadamardHighbdTest() : HadamardTestBase(GetParam(), /*do_shift=*/true) {}
// Use values between -4095 (0xF001) and 4095 (0x0FFF)
- virtual int16_t Rand() {
+ int16_t Rand() override {
int16_t src = rnd_.Rand12();
int16_t pred = rnd_.Rand12();
return src - pred;
@@ -431,6 +432,15 @@ INSTANTIATE_TEST_SUITE_P(
HadamardFuncWithSize(&aom_highbd_hadamard_16x16_c, 16, 16),
HadamardFuncWithSize(&aom_highbd_hadamard_32x32_c, 32, 32)));
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, HadamardHighbdTest,
+ ::testing::Values(
+ HadamardFuncWithSize(&aom_highbd_hadamard_8x8_avx2, 8, 8),
+ HadamardFuncWithSize(&aom_highbd_hadamard_16x16_avx2, 16, 16),
+ HadamardFuncWithSize(&aom_highbd_hadamard_32x32_avx2, 32, 32)));
+#endif // HAVE_AVX2
+
#if HAVE_NEON
INSTANTIATE_TEST_SUITE_P(
NEON, HadamardHighbdTest,
@@ -447,7 +457,7 @@ class HadamardLowbdLPTest : public HadamardTestBase<int16_t, HadamardLPFunc> {
public:
HadamardLowbdLPTest() : HadamardTestBase(GetParam(), /*do_shift=*/false) {}
// Use values between -255 (0xFF01) and 255 (0x00FF)
- virtual int16_t Rand() {
+ int16_t Rand() override {
int16_t src = rnd_.Rand8();
int16_t pred = rnd_.Rand8();
return src - pred;
@@ -497,7 +507,7 @@ class HadamardLowbdLP8x8DualTest
HadamardLowbdLP8x8DualTest()
: HadamardTestBase(GetParam(), /*do_shift=*/false) {}
// Use values between -255 (0xFF01) and 255 (0x00FF)
- virtual int16_t Rand() {
+ int16_t Rand() override {
int16_t src = rnd_.Rand8();
int16_t pred = rnd_.Rand8();
return src - pred;
diff --git a/test/hash_test.cc b/test/hash_test.cc
index 61e0b5179..a1de9323d 100644
--- a/test/hash_test.cc
+++ b/test/hash_test.cc
@@ -31,10 +31,10 @@ typedef std::tuple<get_crc32c_value_func, int> HashParam;
class AV1Crc32cHashTest : public ::testing::TestWithParam<HashParam> {
public:
- ~AV1Crc32cHashTest();
- void SetUp();
+ ~AV1Crc32cHashTest() override;
+ void SetUp() override;
- void TearDown();
+ void TearDown() override;
protected:
void RunCheckOutput(get_crc32c_value_func test_impl);
@@ -49,7 +49,7 @@ class AV1Crc32cHashTest : public ::testing::TestWithParam<HashParam> {
size_t length_;
};
-AV1Crc32cHashTest::~AV1Crc32cHashTest() {}
+AV1Crc32cHashTest::~AV1Crc32cHashTest() = default;
void AV1Crc32cHashTest::SetUp() {
rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
diff --git a/test/hbd_metrics_test.cc b/test/hbd_metrics_test.cc
index 074213a47..303d580c4 100644
--- a/test/hbd_metrics_test.cc
+++ b/test/hbd_metrics_test.cc
@@ -94,7 +94,7 @@ double compute_aomssim(const YV12_BUFFER_CONFIG *source,
class HBDMetricsTestBase {
public:
- virtual ~HBDMetricsTestBase() {}
+ virtual ~HBDMetricsTestBase() = default;
protected:
void RunAccuracyCheck() {
@@ -179,14 +179,13 @@ typedef std::tuple<LBDMetricFunc, HBDMetricFunc, int, int, double>
class HBDMetricsTest : public HBDMetricsTestBase,
public ::testing::TestWithParam<MetricTestTParam> {
public:
- virtual void SetUp() {
+ void SetUp() override {
lbd_metric_ = GET_PARAM(0);
hbd_metric_ = GET_PARAM(1);
input_bit_depth_ = GET_PARAM(2);
bit_depth_ = GET_PARAM(3);
threshold_ = GET_PARAM(4);
}
- virtual void TearDown() {}
};
TEST_P(HBDMetricsTest, RunAccuracyCheck) { RunAccuracyCheck(); }
diff --git a/test/hiprec_convolve_test.cc b/test/hiprec_convolve_test.cc
index 3e93a06b5..78883ccdd 100644
--- a/test/hiprec_convolve_test.cc
+++ b/test/hiprec_convolve_test.cc
@@ -47,7 +47,7 @@ INSTANTIATE_TEST_SUITE_P(NEON, AV1HiprecConvolveTest,
#endif
#if CONFIG_AV1_HIGHBITDEPTH
-#if HAVE_SSSE3 || HAVE_AVX2
+#if HAVE_SSSE3 || HAVE_AVX2 || HAVE_NEON
TEST_P(AV1HighbdHiprecConvolveTest, CheckOutput) {
RunCheckOutput(GET_PARAM(4));
}
@@ -64,6 +64,12 @@ INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdHiprecConvolveTest,
libaom_test::AV1HighbdHiprecConvolve::BuildParams(
av1_highbd_wiener_convolve_add_src_avx2));
#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1HighbdHiprecConvolveTest,
+ libaom_test::AV1HighbdHiprecConvolve::BuildParams(
+ av1_highbd_wiener_convolve_add_src_neon));
+#endif
#endif
#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/test/hiprec_convolve_test_util.cc b/test/hiprec_convolve_test_util.cc
index e2496b323..6d7902fd0 100644
--- a/test/hiprec_convolve_test_util.cc
+++ b/test/hiprec_convolve_test_util.cc
@@ -26,21 +26,21 @@ namespace libaom_test {
static void generate_kernels(ACMRandom *rnd, InterpKernel hkernel,
InterpKernel vkernel, int kernel_type = 2) {
if (kernel_type == 0) {
- // Low possible values for filter coefficients
+    // Lowest possible values for filter coefficients, 7-tap kernel
hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = WIENER_FILT_TAP0_MINV;
hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MINV;
hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MINV;
hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
hkernel[7] = vkernel[7] = 0;
} else if (kernel_type == 1) {
- // Max possible values for filter coefficients
+    // Maximum possible values for filter coefficients, 7-tap kernel
hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = WIENER_FILT_TAP0_MAXV;
hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MAXV;
hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MAXV;
hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
hkernel[7] = vkernel[7] = 0;
- } else {
- // Randomly generated values for filter coefficients
+ } else if (kernel_type == 2) {
+ // Randomly generated values for filter coefficients, 7-tap kernel
hkernel[0] = hkernel[6] =
WIENER_FILT_TAP0_MINV +
rnd->PseudoUniform(WIENER_FILT_TAP0_MAXV + 1 - WIENER_FILT_TAP0_MINV);
@@ -64,6 +64,41 @@ static void generate_kernels(ACMRandom *rnd, InterpKernel hkernel,
rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 2 - WIENER_FILT_TAP2_MINV);
vkernel[3] = -2 * (vkernel[0] + vkernel[1] + vkernel[2]);
vkernel[7] = 0;
+ } else if (kernel_type == 3) {
+    // Lowest possible values for filter coefficients, 5-tap kernel
+ hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = 0;
+ hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MINV;
+ hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MINV;
+ hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
+ hkernel[7] = vkernel[7] = 0;
+ } else if (kernel_type == 4) {
+    // Maximum possible values for filter coefficients, 5-tap kernel
+ hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = 0;
+ hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MAXV;
+ hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MAXV;
+ hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
+ hkernel[7] = vkernel[7] = 0;
+ } else {
+ // Randomly generated values for filter coefficients, 5-tap kernel
+ hkernel[0] = hkernel[6] = 0;
+ hkernel[1] = hkernel[5] =
+ WIENER_FILT_TAP1_MINV +
+ rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 1 - WIENER_FILT_TAP1_MINV);
+ hkernel[2] = hkernel[4] =
+ WIENER_FILT_TAP2_MINV +
+ rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 1 - WIENER_FILT_TAP2_MINV);
+ hkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
+ hkernel[7] = 0;
+
+ vkernel[0] = vkernel[6] = 0;
+ vkernel[1] = vkernel[5] =
+ WIENER_FILT_TAP1_MINV +
+ rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 2 - WIENER_FILT_TAP1_MINV);
+ vkernel[2] = vkernel[4] =
+ WIENER_FILT_TAP2_MINV +
+ rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 2 - WIENER_FILT_TAP2_MINV);
+ vkernel[3] = -2 * (vkernel[0] + vkernel[1] + vkernel[2]);
+ vkernel[7] = 0;
}
}
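Every branch above pins the center tap with hkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]). Our reading: AV1's Wiener filter is symmetric and the add-src convolve path contributes an implicit center weight of 128 (1 << FILTER_BITS), so the stored taps must sum to zero for the full filter to have unit DC gain; the 5-tap variants simply zero the outermost pair. A small self-contained check of that arithmetic, with arbitrary example taps:

    #include <cstdio>

    int main() {
      const int t0 = 3, t1 = -7, t2 = 15;                  // arbitrary outer taps
      const int center = -2 * (t0 + t1 + t2);              // stored center tap
      const int stored_sum = 2 * (t0 + t1 + t2) + center;  // symmetric pairs + center
      const int full_dc_gain = stored_sum + 128;           // implicit add-src weight
      std::printf("stored sum = %d, full DC gain = %d\n", stored_sum, full_dc_gain);
      return 0;  // prints: stored sum = 0, full DC gain = 128
    }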
@@ -83,19 +118,17 @@ namespace AV1HiprecConvolve {
return ::testing::ValuesIn(params);
}
-AV1HiprecConvolveTest::~AV1HiprecConvolveTest() {}
+AV1HiprecConvolveTest::~AV1HiprecConvolveTest() = default;
void AV1HiprecConvolveTest::SetUp() {
rnd_.Reset(ACMRandom::DeterministicSeed());
}
-void AV1HiprecConvolveTest::TearDown() {}
-
void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
const int w = 128, h = 128;
const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
const int num_iters = GET_PARAM(2);
int i, j, k, m;
- const ConvolveParams conv_params = get_conv_params_wiener(8);
+ const WienerConvolveParams conv_params = get_conv_params_wiener(8);
std::unique_ptr<uint8_t[]> input_(new (std::nothrow) uint8_t[h * w]);
ASSERT_NE(input_, nullptr);
@@ -114,7 +147,7 @@ void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
DECLARE_ALIGNED(16, InterpKernel, hkernel);
DECLARE_ALIGNED(16, InterpKernel, vkernel);
- for (int kernel_type = 0; kernel_type < 3; kernel_type++) {
+ for (int kernel_type = 0; kernel_type < 6; kernel_type++) {
generate_kernels(&rnd_, hkernel, vkernel, kernel_type);
for (i = 0; i < num_iters; ++i) {
for (k = 0; k < h; ++k)
@@ -141,7 +174,7 @@ void AV1HiprecConvolveTest::RunSpeedTest(hiprec_convolve_func test_impl) {
const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
const int num_iters = GET_PARAM(2) / 500;
int i, j, k;
- const ConvolveParams conv_params = get_conv_params_wiener(8);
+ const WienerConvolveParams conv_params = get_conv_params_wiener(8);
std::unique_ptr<uint8_t[]> input_(new (std::nothrow) uint8_t[h * w]);
ASSERT_NE(input_, nullptr);
@@ -217,13 +250,11 @@ namespace AV1HighbdHiprecConvolve {
return ::testing::ValuesIn(params);
}
-AV1HighbdHiprecConvolveTest::~AV1HighbdHiprecConvolveTest() {}
+AV1HighbdHiprecConvolveTest::~AV1HighbdHiprecConvolveTest() = default;
void AV1HighbdHiprecConvolveTest::SetUp() {
rnd_.Reset(ACMRandom::DeterministicSeed());
}
-void AV1HighbdHiprecConvolveTest::TearDown() {}
-
void AV1HighbdHiprecConvolveTest::RunCheckOutput(
highbd_hiprec_convolve_func test_impl) {
const int w = 128, h = 128;
@@ -231,7 +262,7 @@ void AV1HighbdHiprecConvolveTest::RunCheckOutput(
const int num_iters = GET_PARAM(2);
const int bd = GET_PARAM(3);
int i, j;
- const ConvolveParams conv_params = get_conv_params_wiener(bd);
+ const WienerConvolveParams conv_params = get_conv_params_wiener(bd);
std::unique_ptr<uint16_t[]> input(new (std::nothrow) uint16_t[h * w]);
ASSERT_NE(input, nullptr);
@@ -255,7 +286,7 @@ void AV1HighbdHiprecConvolveTest::RunCheckOutput(
uint8_t *input_ptr = CONVERT_TO_BYTEPTR(input.get());
uint8_t *output_ptr = CONVERT_TO_BYTEPTR(output.get());
uint8_t *output2_ptr = CONVERT_TO_BYTEPTR(output2.get());
- for (int kernel_type = 0; kernel_type < 3; kernel_type++) {
+ for (int kernel_type = 0; kernel_type < 6; kernel_type++) {
generate_kernels(&rnd_, hkernel, vkernel, kernel_type);
for (i = 0; i < num_iters; ++i) {
// Choose random locations within the source block
@@ -282,7 +313,7 @@ void AV1HighbdHiprecConvolveTest::RunSpeedTest(
const int num_iters = GET_PARAM(2) / 500;
const int bd = GET_PARAM(3);
int i, j, k;
- const ConvolveParams conv_params = get_conv_params_wiener(bd);
+ const WienerConvolveParams conv_params = get_conv_params_wiener(bd);
std::unique_ptr<uint16_t[]> input(new (std::nothrow) uint16_t[h * w]);
ASSERT_NE(input, nullptr);
diff --git a/test/hiprec_convolve_test_util.h b/test/hiprec_convolve_test_util.h
index e064ba64a..beae5c729 100644
--- a/test/hiprec_convolve_test_util.h
+++ b/test/hiprec_convolve_test_util.h
@@ -34,7 +34,7 @@ typedef void (*hiprec_convolve_func)(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h,
- const ConvolveParams *conv_params);
+ const WienerConvolveParams *conv_params);
typedef std::tuple<int, int, int, hiprec_convolve_func> HiprecConvolveParam;
@@ -44,10 +44,8 @@ typedef std::tuple<int, int, int, hiprec_convolve_func> HiprecConvolveParam;
class AV1HiprecConvolveTest
: public ::testing::TestWithParam<HiprecConvolveParam> {
public:
- virtual ~AV1HiprecConvolveTest();
- virtual void SetUp();
-
- virtual void TearDown();
+ ~AV1HiprecConvolveTest() override;
+ void SetUp() override;
protected:
void RunCheckOutput(hiprec_convolve_func test_impl);
@@ -64,7 +62,7 @@ typedef void (*highbd_hiprec_convolve_func)(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w, int h,
- const ConvolveParams *conv_params, int bps);
+ const WienerConvolveParams *conv_params, int bps);
typedef std::tuple<int, int, int, int, highbd_hiprec_convolve_func>
HighbdHiprecConvolveParam;
@@ -75,10 +73,8 @@ typedef std::tuple<int, int, int, int, highbd_hiprec_convolve_func>
class AV1HighbdHiprecConvolveTest
: public ::testing::TestWithParam<HighbdHiprecConvolveParam> {
public:
- virtual ~AV1HighbdHiprecConvolveTest();
- virtual void SetUp();
-
- virtual void TearDown();
+ ~AV1HighbdHiprecConvolveTest() override;
+ void SetUp() override;
protected:
void RunCheckOutput(highbd_hiprec_convolve_func test_impl);
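With the rename, test callers obtain the parameter block from the same helper as before, only with the dedicated type. A hedged fragment of the high-bitdepth call shape implied by the typedef above (buffer setup and bd elided; x_step_q4 = y_step_q4 = 16, i.e. unit step in Q4, is an assumption carried over from typical libaom callers):

    const WienerConvolveParams conv_params = get_conv_params_wiener(bd);
    test_impl(CONVERT_TO_BYTEPTR(input), w, CONVERT_TO_BYTEPTR(output), out_w,
              hkernel, /*x_step_q4=*/16, vkernel, /*y_step_q4=*/16, out_w,
              out_h, &conv_params, bd);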
diff --git a/test/horver_correlation_test.cc b/test/horver_correlation_test.cc
index 2873490db..5e397ffdf 100644
--- a/test/horver_correlation_test.cc
+++ b/test/horver_correlation_test.cc
@@ -33,12 +33,12 @@ typedef std::tuple<const HorverFunc> HorverTestParam;
class HorverTest : public ::testing::TestWithParam<HorverTestParam> {
public:
- virtual void SetUp() {
+ void SetUp() override {
data_buf_ = (int16_t *)aom_malloc(MAX_SB_SQUARE * sizeof(int16_t));
ASSERT_NE(data_buf_, nullptr);
target_func_ = GET_PARAM(0);
}
- virtual void TearDown() { aom_free(data_buf_); }
+ void TearDown() override { aom_free(data_buf_); }
void RunHorverTest();
void RunHorverTest_ExtremeValues();
void RunHorverSpeedTest(int run_times);
diff --git a/test/horz_superres_test.cc b/test/horz_superres_test.cc
index cba29e9df..595ed548c 100644
--- a/test/horz_superres_test.cc
+++ b/test/horz_superres_test.cc
@@ -100,9 +100,9 @@ class HorzSuperresEndToEndTest
: EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
superres_mode_(GET_PARAM(2)), psnr_(0.0), frame_count_(0) {}
- virtual ~HorzSuperresEndToEndTest() {}
+ ~HorzSuperresEndToEndTest() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(::libaom_test::kTwoPassGood);
cfg_.g_lag_in_frames = 5;
cfg_.rc_end_usage = AOM_Q;
@@ -118,18 +118,18 @@ class HorzSuperresEndToEndTest
cfg_.rc_superres_mode = superres_mode_;
}
- virtual void BeginPassHook(unsigned int) {
+ void BeginPassHook(unsigned int) override {
psnr_ = 0.0;
frame_count_ = 0;
}
- virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
psnr_ += pkt->data.psnr.psnr[0];
frame_count_++;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
@@ -203,9 +203,9 @@ class HorzSuperresFixedEndToEndTest
superres_kf_denom_ = std::get<1>(denoms);
}
- virtual ~HorzSuperresFixedEndToEndTest() {}
+ ~HorzSuperresFixedEndToEndTest() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(::libaom_test::kTwoPassGood);
cfg_.g_lag_in_frames = 5;
cfg_.rc_end_usage = AOM_VBR;
@@ -223,18 +223,18 @@ class HorzSuperresFixedEndToEndTest
cfg_.rc_superres_kf_denominator = superres_kf_denom_;
}
- virtual void BeginPassHook(unsigned int) {
+ void BeginPassHook(unsigned int) override {
psnr_ = 0.0;
frame_count_ = 0;
}
- virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
psnr_ += pkt->data.psnr.psnr[0];
frame_count_++;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
@@ -313,9 +313,9 @@ class HorzSuperresQThreshEndToEndTest
superres_kf_qthresh_ = std::get<1>(qthresholds);
}
- virtual ~HorzSuperresQThreshEndToEndTest() {}
+ ~HorzSuperresQThreshEndToEndTest() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(::libaom_test::kTwoPassGood);
cfg_.g_lag_in_frames = 5;
cfg_.rc_end_usage = AOM_VBR;
@@ -333,18 +333,18 @@ class HorzSuperresQThreshEndToEndTest
cfg_.rc_superres_kf_qthresh = superres_kf_qthresh_;
}
- virtual void BeginPassHook(unsigned int) {
+ void BeginPassHook(unsigned int) override {
psnr_ = 0.0;
frame_count_ = 0;
}
- virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
psnr_ += pkt->data.psnr.psnr[0];
frame_count_++;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
encoder->Control(AV1E_SET_TILE_COLUMNS, 0);
diff --git a/test/intra_edge_test.cc b/test/intra_edge_test.cc
index 84e712d1d..96ee65466 100644
--- a/test/intra_edge_test.cc
+++ b/test/intra_edge_test.cc
@@ -37,7 +37,7 @@ class UpsampleTest : public FunctionEquivalenceTest<F> {
static const int kBufSize = 2 * 64 + 32;
static const int kOffset = 16;
- virtual ~UpsampleTest() {}
+ ~UpsampleTest() override = default;
virtual void Execute(T *edge_tst) = 0;
@@ -62,16 +62,12 @@ class UpsampleTest : public FunctionEquivalenceTest<F> {
int size_;
};
-//////////////////////////////////////////////////////////////////////////////
-// 8 bit version
-//////////////////////////////////////////////////////////////////////////////
-
typedef void (*UP8B)(uint8_t *p, int size);
typedef libaom_test::FuncParam<UP8B> TestFuncs;
class UpsampleTest8B : public UpsampleTest<UP8B, uint8_t> {
protected:
- void Execute(uint8_t *edge_tst) {
+ void Execute(uint8_t *edge_tst) override {
params_.ref_func(edge_ref_, size_);
API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_));
}
@@ -99,6 +95,18 @@ TEST_P(UpsampleTest8B, RandomValues) {
}
}
+TEST_P(UpsampleTest8B, DISABLED_Speed) {
+ const int test_count = 10000000;
+ size_ = kMaxEdge;
+ for (int i = 0; i < kOffset + size_; ++i) {
+ edge_tst_data_[i] = rng_.Rand8();
+ }
+ edge_tst_ = &edge_tst_data_[kOffset];
+ for (int iter = 0; iter < test_count; ++iter) {
+ API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_));
+ }
+}
+
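The DISABLED_ prefix keeps these speed loops out of normal test runs; gtest executes them only on request, e.g. (assuming the standard test_libaom binary):

    ./test_libaom --gtest_also_run_disabled_tests \
        --gtest_filter='*UpsampleTest8B.DISABLED_Speed*'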
#if HAVE_SSE4_1
INSTANTIATE_TEST_SUITE_P(
SSE4_1, UpsampleTest8B,
@@ -106,57 +114,12 @@ INSTANTIATE_TEST_SUITE_P(
av1_upsample_intra_edge_sse4_1)));
#endif // HAVE_SSE4_1
-//////////////////////////////////////////////////////////////////////////////
-// High bit-depth version
-//////////////////////////////////////////////////////////////////////////////
-
-typedef void (*UPHB)(uint16_t *p, int size, int bd);
-typedef libaom_test::FuncParam<UPHB> TestFuncsHBD;
-
-class UpsampleTestHB : public UpsampleTest<UPHB, uint16_t> {
- protected:
- void Execute(uint16_t *edge_tst) {
- params_.ref_func(edge_ref_, size_, bit_depth_);
- API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, bit_depth_));
- }
- int bit_depth_;
-};
-
-TEST_P(UpsampleTestHB, RandomValues) {
- for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
- switch (rng_(3)) {
- case 0: bit_depth_ = 8; break;
- case 1: bit_depth_ = 10; break;
- default: bit_depth_ = 12; break;
- }
- const int hi = 1 << bit_depth_;
-
- size_ = 4 * (this->rng_(4) + 1);
-
- int i, pix = 0;
- for (i = 0; i < kOffset + size_; ++i) {
- pix = rng_(hi);
- edge_ref_data_[i] = pix;
- edge_tst_data_[i] = pix;
- }
-
- // Extend final sample
- while (i < kBufSize) {
- edge_ref_data_[i] = pix;
- edge_tst_data_[i] = pix;
- i++;
- }
-
- Common();
- }
-}
-
-#if HAVE_SSE4_1
+#if HAVE_NEON
INSTANTIATE_TEST_SUITE_P(
- SSE4_1, UpsampleTestHB,
- ::testing::Values(TestFuncsHBD(av1_upsample_intra_edge_high_c,
- av1_upsample_intra_edge_high_sse4_1)));
-#endif // HAVE_SSE4_1
+ NEON, UpsampleTest8B,
+ ::testing::Values(TestFuncs(av1_upsample_intra_edge_c,
+ av1_upsample_intra_edge_neon)));
+#endif // HAVE_NEON
template <typename F, typename T>
class FilterEdgeTest : public FunctionEquivalenceTest<F> {
@@ -166,7 +129,7 @@ class FilterEdgeTest : public FunctionEquivalenceTest<F> {
static const int kBufSize = kMaxEdge + 32;
static const int kOffset = 15;
- virtual ~FilterEdgeTest() {}
+ ~FilterEdgeTest() override = default;
virtual void Execute(T *edge_tst) = 0;
@@ -191,16 +154,12 @@ class FilterEdgeTest : public FunctionEquivalenceTest<F> {
int strength_;
};
-//////////////////////////////////////////////////////////////////////////////
-// 8 bit version
-//////////////////////////////////////////////////////////////////////////////
-
typedef void (*FE8B)(uint8_t *p, int size, int strength);
typedef libaom_test::FuncParam<FE8B> FilterEdgeTestFuncs;
class FilterEdgeTest8B : public FilterEdgeTest<FE8B, uint8_t> {
protected:
- void Execute(uint8_t *edge_tst) {
+ void Execute(uint8_t *edge_tst) override {
params_.ref_func(edge_ref_, size_, strength_);
API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
}
@@ -222,6 +181,21 @@ TEST_P(FilterEdgeTest8B, RandomValues) {
}
}
+TEST_P(FilterEdgeTest8B, DISABLED_Speed) {
+ const int test_count = 10000000;
+ size_ = kMaxEdge;
+ strength_ = 1;
+ for (int i = 0; i < kOffset + size_; ++i) {
+ edge_tst_data_[i] = rng_.Rand8();
+ }
+ edge_tst_ = &edge_tst_data_[kOffset];
+ for (int iter = 0; iter < test_count; ++iter) {
+ API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
+ // iterate over filter strengths (1,2,3)
+ strength_ = strength_ == 3 ? 1 : strength_ + 1;
+ }
+}
+
#if HAVE_SSE4_1
INSTANTIATE_TEST_SUITE_P(
SSE4_1, FilterEdgeTest8B,
@@ -229,23 +203,28 @@ INSTANTIATE_TEST_SUITE_P(
av1_filter_intra_edge_sse4_1)));
#endif // HAVE_SSE4_1
-//////////////////////////////////////////////////////////////////////////////
-// High bit-depth version
-//////////////////////////////////////////////////////////////////////////////
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, FilterEdgeTest8B,
+ ::testing::Values(FilterEdgeTestFuncs(av1_filter_intra_edge_c,
+ av1_filter_intra_edge_neon)));
+#endif // HAVE_NEON
-typedef void (*FEHB)(uint16_t *p, int size, int strength);
-typedef libaom_test::FuncParam<FEHB> FilterEdgeTestFuncsHBD;
+#if CONFIG_AV1_HIGHBITDEPTH
-class FilterEdgeTestHB : public FilterEdgeTest<FEHB, uint16_t> {
+typedef void (*UPHB)(uint16_t *p, int size, int bd);
+typedef libaom_test::FuncParam<UPHB> TestFuncsHBD;
+
+class UpsampleTestHB : public UpsampleTest<UPHB, uint16_t> {
protected:
- void Execute(uint16_t *edge_tst) {
- params_.ref_func(edge_ref_, size_, strength_);
- API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
+ void Execute(uint16_t *edge_tst) override {
+ params_.ref_func(edge_ref_, size_, bit_depth_);
+ API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, bit_depth_));
}
int bit_depth_;
};
-TEST_P(FilterEdgeTestHB, RandomValues) {
+TEST_P(UpsampleTestHB, RandomValues) {
for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
switch (rng_(3)) {
case 0: bit_depth_ = 8; break;
@@ -253,8 +232,8 @@ TEST_P(FilterEdgeTestHB, RandomValues) {
default: bit_depth_ = 12; break;
}
const int hi = 1 << bit_depth_;
- strength_ = this->rng_(4);
- size_ = 4 * (this->rng_(128 / 4) + 1) + 1;
+
+ size_ = 4 * (this->rng_(4) + 1);
int i, pix = 0;
for (i = 0; i < kOffset + size_; ++i) {
@@ -263,28 +242,14 @@ TEST_P(FilterEdgeTestHB, RandomValues) {
edge_tst_data_[i] = pix;
}
- Common();
- }
-}
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(SSE4_1, FilterEdgeTestHB,
- ::testing::Values(FilterEdgeTestFuncsHBD(
- av1_filter_intra_edge_high_c,
- av1_filter_intra_edge_high_sse4_1)));
-#endif // HAVE_SSE4_1
-
-// Speed tests
+ // Extend final sample
+ while (i < kBufSize) {
+ edge_ref_data_[i] = pix;
+ edge_tst_data_[i] = pix;
+ i++;
+ }
-TEST_P(UpsampleTest8B, DISABLED_Speed) {
- const int test_count = 10000000;
- size_ = kMaxEdge;
- for (int i = 0; i < kOffset + size_; ++i) {
- edge_tst_data_[i] = rng_.Rand8();
- }
- edge_tst_ = &edge_tst_data_[kOffset];
- for (int iter = 0; iter < test_count; ++iter) {
- API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_));
+ Common();
}
}
@@ -302,18 +267,51 @@ TEST_P(UpsampleTestHB, DISABLED_Speed) {
}
}
-TEST_P(FilterEdgeTest8B, DISABLED_Speed) {
- const int test_count = 10000000;
- size_ = kMaxEdge;
- strength_ = 1;
- for (int i = 0; i < kOffset + size_; ++i) {
- edge_tst_data_[i] = rng_.Rand8();
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, UpsampleTestHB,
+ ::testing::Values(TestFuncsHBD(av1_highbd_upsample_intra_edge_c,
+ av1_highbd_upsample_intra_edge_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, UpsampleTestHB,
+ ::testing::Values(TestFuncsHBD(av1_highbd_upsample_intra_edge_c,
+ av1_highbd_upsample_intra_edge_neon)));
+#endif // HAVE_NEON
+
+typedef void (*FEHB)(uint16_t *p, int size, int strength);
+typedef libaom_test::FuncParam<FEHB> FilterEdgeTestFuncsHBD;
+
+class FilterEdgeTestHB : public FilterEdgeTest<FEHB, uint16_t> {
+ protected:
+ void Execute(uint16_t *edge_tst) override {
+ params_.ref_func(edge_ref_, size_, strength_);
+ API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
}
- edge_tst_ = &edge_tst_data_[kOffset];
- for (int iter = 0; iter < test_count; ++iter) {
- API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
- // iterate over filter strengths (1,2,3)
- strength_ = (strength_ == 3) ? 1 : strength_ + 1;
+ int bit_depth_;
+};
+
+TEST_P(FilterEdgeTestHB, RandomValues) {
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ switch (rng_(3)) {
+ case 0: bit_depth_ = 8; break;
+ case 1: bit_depth_ = 10; break;
+ default: bit_depth_ = 12; break;
+ }
+ const int hi = 1 << bit_depth_;
+ strength_ = this->rng_(4);
+ size_ = 4 * (this->rng_(128 / 4) + 1) + 1;
+
+ int i, pix = 0;
+ for (i = 0; i < kOffset + size_; ++i) {
+ pix = rng_(hi);
+ edge_ref_data_[i] = pix;
+ edge_tst_data_[i] = pix;
+ }
+
+ Common();
}
}
@@ -330,8 +328,24 @@ TEST_P(FilterEdgeTestHB, DISABLED_Speed) {
for (int iter = 0; iter < test_count; ++iter) {
API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
// iterate over filter strengths (1,2,3)
- strength_ = (strength_ == 3) ? 1 : strength_ + 1;
+ strength_ = strength_ == 3 ? 1 : strength_ + 1;
}
}
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, FilterEdgeTestHB,
+ ::testing::Values(FilterEdgeTestFuncsHBD(
+ av1_highbd_filter_intra_edge_c,
+ av1_highbd_filter_intra_edge_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, FilterEdgeTestHB,
+ ::testing::Values(FilterEdgeTestFuncsHBD(
+ av1_highbd_filter_intra_edge_c,
+ av1_highbd_filter_intra_edge_neon)));
+#endif // HAVE_NEON
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
} // namespace
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index aced59385..8796e8ba6 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -155,7 +155,7 @@ class AV1IntraPredTest
}
protected:
- virtual void SetUp() {
+ void SetUp() override {
params_ = this->GetParam();
stride_ = params_.block_width * 3;
mask_ = (1 << params_.bit_depth) - 1;
@@ -195,19 +195,19 @@ class AV1IntraPredTest
#if CONFIG_AV1_HIGHBITDEPTH
class HighbdIntraPredTest : public AV1IntraPredTest<HighbdIntraPred, uint16_t> {
protected:
- void Predict() {
+ void Predict() override {
const int bit_depth = params_.bit_depth;
params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
API_REGISTER_STATE_CHECK(
params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth));
}
- void PredictRefSpeedTest(int num) {
+ void PredictRefSpeedTest(int num) override {
const int bit_depth = params_.bit_depth;
for (int i = 0; i < num; i++) {
params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
}
}
- void PredictFncSpeedTest(int num) {
+ void PredictFncSpeedTest(int num) override {
const int bit_depth = params_.bit_depth;
for (int i = 0; i < num; i++) {
params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth);
@@ -220,17 +220,17 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HighbdIntraPredTest);
class LowbdIntraPredTest : public AV1IntraPredTest<IntraPred, uint8_t> {
protected:
- void Predict() {
+ void Predict() override {
params_.ref_fn(ref_dst_, stride_, above_row_, left_col_);
API_REGISTER_STATE_CHECK(
params_.pred_fn(dst_, stride_, above_row_, left_col_));
}
- void PredictRefSpeedTest(int num) {
+ void PredictRefSpeedTest(int num) override {
for (int i = 0; i < num; i++) {
params_.ref_fn(ref_dst_, stride_, above_row_, left_col_);
}
}
- void PredictFncSpeedTest(int num) {
+ void PredictFncSpeedTest(int num) override {
for (int i = 0; i < num; i++) {
params_.pred_fn(dst_, stride_, above_row_, left_col_);
}
diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index 63e15ca30..791cdb892 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -45,7 +45,7 @@ class InvalidFileTest : public ::libaom_test::DecoderTest,
protected:
InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(nullptr) {}
- virtual ~InvalidFileTest() {
+ ~InvalidFileTest() override {
if (res_file_ != nullptr) fclose(res_file_);
}
@@ -55,15 +55,14 @@ class InvalidFileTest : public ::libaom_test::DecoderTest,
<< "Result file open failed. Filename: " << res_file_name;
}
- virtual void DecompressedFrameHook(const aom_image_t &img,
- const unsigned int /*frame_number*/) {
+ void DecompressedFrameHook(const aom_image_t &img,
+ const unsigned int /*frame_number*/) override {
EXPECT_NE(img.fb_priv, nullptr);
}
- virtual bool HandleDecodeResult(
- const aom_codec_err_t res_dec,
- const libaom_test::CompressedVideoSource &video,
- libaom_test::Decoder *decoder) {
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ const libaom_test::CompressedVideoSource &video,
+ libaom_test::Decoder *decoder) override {
EXPECT_NE(res_file_, nullptr);
int expected_res_dec = -1;
@@ -95,9 +94,9 @@ class InvalidFileTest : public ::libaom_test::DecoderTest,
return !HasFailure();
}
- virtual void HandlePeekResult(libaom_test::Decoder *const /*decoder*/,
- libaom_test::CompressedVideoSource * /*video*/,
- const aom_codec_err_t /*res_peek*/) {}
+ void HandlePeekResult(libaom_test::Decoder *const /*decoder*/,
+ libaom_test::CompressedVideoSource * /*video*/,
+ const aom_codec_err_t /*res_peek*/) override {}
void RunTest() {
const DecodeParam input = GET_PARAM(1);
diff --git a/test/ivf_video_source.h b/test/ivf_video_source.h
index 45828b5bb..85731f556 100644
--- a/test/ivf_video_source.h
+++ b/test/ivf_video_source.h
@@ -37,20 +37,20 @@ class IVFVideoSource : public CompressedVideoSource {
compressed_frame_buf_(nullptr), frame_sz_(0), frame_(0),
end_of_file_(false) {}
- virtual ~IVFVideoSource() {
+ ~IVFVideoSource() override {
delete[] compressed_frame_buf_;
if (input_file_) fclose(input_file_);
}
- virtual void Init() {
+ void Init() override {
// Allocate a buffer for reading in the compressed video frame.
compressed_frame_buf_ = new uint8_t[kCodeBufferSize];
ASSERT_NE(compressed_frame_buf_, nullptr) << "Allocate frame buffer failed";
ASAN_POISON_MEMORY_REGION(compressed_frame_buf_, kCodeBufferSize);
}
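ASAN_POISON_MEMORY_REGION above arms AddressSanitizer over the whole frame buffer, so reads beyond the bytes a later FillFrame() actually fills would trap (our reading of the intent). A minimal sketch of the idiom; the macros come from <sanitizer/asan_interface.h> and compile to no-ops in non-ASan builds:

    #include <sanitizer/asan_interface.h>

    void poison_demo() {
      char *buf = new char[1024];
      ASAN_POISON_MEMORY_REGION(buf, 1024);    // any access now reports
      ASAN_UNPOISON_MEMORY_REGION(buf, 64);    // re-enable the filled prefix
      buf[0] = 1;                              // OK: within unpoisoned bytes
      ASAN_UNPOISON_MEMORY_REGION(buf, 1024);  // unpoison before freeing
      delete[] buf;
    }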
- virtual void Begin() {
+ void Begin() override {
input_file_ = OpenTestDataFile(file_name_);
ASSERT_NE(input_file_, nullptr)
<< "Input file open failed. Filename: " << file_name_;
@@ -67,7 +67,7 @@ class IVFVideoSource : public CompressedVideoSource {
FillFrame();
}
- virtual void Next() {
+ void Next() override {
++frame_;
FillFrame();
}
@@ -94,11 +94,11 @@ class IVFVideoSource : public CompressedVideoSource {
}
}
- virtual const uint8_t *cxdata() const {
+ const uint8_t *cxdata() const override {
return end_of_file_ ? nullptr : compressed_frame_buf_;
}
- virtual size_t frame_size() const { return frame_sz_; }
- virtual unsigned int frame_number() const { return frame_; }
+ size_t frame_size() const override { return frame_sz_; }
+ unsigned int frame_number() const override { return frame_; }
protected:
std::string file_name_;
diff --git a/test/kf_test.cc b/test/kf_test.cc
index 5daf60068..bc475fda0 100644
--- a/test/kf_test.cc
+++ b/test/kf_test.cc
@@ -47,9 +47,9 @@ class KeyFrameIntervalTestLarge
kf_dist_ = -1;
is_kf_interval_violated_ = false;
}
- virtual ~KeyFrameIntervalTestLarge() {}
+ ~KeyFrameIntervalTestLarge() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(encoding_mode_);
const aom_rational timebase = { 1, 30 };
cfg_.g_timebase = timebase;
@@ -60,18 +60,18 @@ class KeyFrameIntervalTestLarge
cfg_.g_lag_in_frames = 19;
}
- virtual bool DoDecode() const { return 1; }
+ bool DoDecode() const override { return true; }
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, 5);
encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
}
}
- virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
- libaom_test::Decoder *decoder) {
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
if (AOM_CODEC_OK == res_dec) {
aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
@@ -149,9 +149,9 @@ class ForcedKeyTestLarge
frame_num_ = 0;
is_kf_placement_violated_ = false;
}
- virtual ~ForcedKeyTestLarge() {}
+ ~ForcedKeyTestLarge() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(encoding_mode_);
cfg_.rc_end_usage = rc_end_usage_;
cfg_.g_threads = 0;
@@ -160,8 +160,8 @@ class ForcedKeyTestLarge
cfg_.fwd_kf_enabled = fwd_kf_enabled_;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, cpu_used_);
encoder->Control(AOME_SET_ENABLEAUTOALTREF, auto_alt_ref_);
@@ -176,8 +176,8 @@ class ForcedKeyTestLarge
((int)video->frame() == forced_kf_frame_num_) ? AOM_EFLAG_FORCE_KF : 0;
}
- virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
- libaom_test::Decoder *decoder) {
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
if (AOM_CODEC_OK == res_dec) {
if ((int)frame_num_ == forced_kf_frame_num_) {
diff --git a/test/level_test.cc b/test/level_test.cc
index cc799261f..a7c26d230 100644
--- a/test/level_test.cc
+++ b/test/level_test.cc
@@ -40,9 +40,9 @@ class LevelTest
: EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
cpu_used_(GET_PARAM(2)), target_level_(31) {}
- virtual ~LevelTest() {}
+ ~LevelTest() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(encoding_mode_);
if (encoding_mode_ != ::libaom_test::kRealTime) {
cfg_.g_lag_in_frames = 5;
@@ -53,8 +53,8 @@ class LevelTest
}
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, cpu_used_);
encoder->Control(AV1E_SET_TARGET_SEQ_LEVEL_IDX, target_level_);
diff --git a/test/loopfilter_control_test.cc b/test/loopfilter_control_test.cc
index 5f0134045..9c00235e1 100644
--- a/test/loopfilter_control_test.cc
+++ b/test/loopfilter_control_test.cc
@@ -79,9 +79,9 @@ class LFControlEndToEndTest
aq_mode_(GET_PARAM(3)), threads_(GET_PARAM(4)),
tile_columns_(GET_PARAM(5)) {}
- virtual ~LFControlEndToEndTest() {}
+ ~LFControlEndToEndTest() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(::libaom_test::kRealTime);
cfg_.g_threads = threads_;
@@ -92,18 +92,18 @@ class LFControlEndToEndTest
cfg_.kf_min_dist = 9999;
}
- virtual void BeginPassHook(unsigned int) {
+ void BeginPassHook(unsigned int) override {
psnr_ = 0.0;
nframes_ = 0;
}
- virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
psnr_ += pkt->data.psnr.psnr[0];
nframes_++;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AV1E_SET_ENABLE_RESTORATION, 0);
encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
diff --git a/test/lossless_test.cc b/test/lossless_test.cc
index ef4e19ff0..756ad0501 100644
--- a/test/lossless_test.cc
+++ b/test/lossless_test.cc
@@ -33,15 +33,15 @@ class LosslessTestLarge
encoding_mode_(GET_PARAM(1)), rc_end_usage_(GET_PARAM(2)),
cpu_used_(GET_PARAM(3)) {}
- virtual ~LosslessTestLarge() {}
+ ~LosslessTestLarge() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(encoding_mode_);
cfg_.rc_end_usage = rc_end_usage_;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
// Only call Control if quantizer > 0 to verify that using quantizer
// alone will activate lossless
@@ -52,19 +52,19 @@ class LosslessTestLarge
}
}
- virtual void BeginPassHook(unsigned int /*pass*/) {
+ void BeginPassHook(unsigned int /*pass*/) override {
psnr_ = kMaxPsnr;
nframes_ = 0;
}
- virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
if (pkt->data.psnr.psnr[0] < psnr_) psnr_ = pkt->data.psnr.psnr[0];
}
double GetMinPsnr() const { return psnr_; }
- virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
- libaom_test::Decoder *decoder) {
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
if (AOM_CODEC_OK == res_dec) {
aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
diff --git a/test/lpf_test.cc b/test/lpf_test.cc
index 421fdef52..04b1c86d4 100644
--- a/test/lpf_test.cc
+++ b/test/lpf_test.cc
@@ -127,16 +127,14 @@ uint8_t GetHevThresh(ACMRandom *rnd) {
template <typename func_type_t, typename params_t>
class LoopTestParam : public ::testing::TestWithParam<params_t> {
public:
- virtual ~LoopTestParam() {}
- virtual void SetUp() {
+ ~LoopTestParam() override = default;
+ void SetUp() override {
loopfilter_op_ = std::get<0>(this->GetParam());
ref_loopfilter_op_ = std::get<1>(this->GetParam());
bit_depth_ = std::get<2>(this->GetParam());
mask_ = (1 << bit_depth_) - 1;
}
- virtual void TearDown() {}
-
protected:
int bit_depth_;
int mask_;
diff --git a/test/masked_sad_test.cc b/test/masked_sad_test.cc
index 2ef3e4ddd..bb037460d 100644
--- a/test/masked_sad_test.cc
+++ b/test/masked_sad_test.cc
@@ -45,8 +45,8 @@ typedef std::tuple<MaskedSADx4Func, MaskedSADx4Func> MaskedSADx4Param;
class MaskedSADTestBase : public ::testing::Test {
public:
- virtual ~MaskedSADTestBase() {}
- virtual void SetUp() = 0;
+ ~MaskedSADTestBase() override = default;
+ void SetUp() override = 0;
virtual void runRef(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr[], int ref_stride,
const uint8_t *second_pred, const uint8_t *msk,
@@ -58,28 +58,26 @@ class MaskedSADTestBase : public ::testing::Test {
int msk_stride, int inv_mask, unsigned sads[],
int times) = 0;
- virtual void TearDown() {}
void runMaskedSADTest(int run_times);
};
class MaskedSADTest : public MaskedSADTestBase,
public ::testing::WithParamInterface<MaskedSADParam> {
public:
- virtual ~MaskedSADTest() {}
- virtual void SetUp() {
+ ~MaskedSADTest() override = default;
+ void SetUp() override {
maskedSAD_op_ = GET_PARAM(0);
ref_maskedSAD_op_ = GET_PARAM(1);
}
- virtual void runRef(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr[], int ref_stride,
- const uint8_t *second_pred, const uint8_t *msk,
- int msk_stride, int inv_mask, unsigned sads[], int times);
- virtual void runTest(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr[], int ref_stride,
- const uint8_t *second_pred, const uint8_t *msk,
- int msk_stride, int inv_mask, unsigned sads[],
- int times);
+ void runRef(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr[],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk,
+ int msk_stride, int inv_mask, unsigned sads[],
+ int times) override;
+ void runTest(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr[],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk,
+ int msk_stride, int inv_mask, unsigned sads[],
+ int times) override;
protected:
MaskedSADFunc maskedSAD_op_;
@@ -90,20 +88,19 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(MaskedSADTest);
class MaskedSADx4Test : public MaskedSADTestBase,
public ::testing::WithParamInterface<MaskedSADx4Param> {
public:
- virtual ~MaskedSADx4Test() {}
- virtual void SetUp() {
+ ~MaskedSADx4Test() override = default;
+ void SetUp() override {
maskedSAD_op_ = GET_PARAM(0);
ref_maskedSAD_op_ = GET_PARAM(1);
}
- virtual void runRef(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr[], int ref_stride,
- const uint8_t *second_pred, const uint8_t *msk,
- int msk_stride, int inv_mask, unsigned sads[], int times);
- virtual void runTest(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr[], int ref_stride,
- const uint8_t *second_pred, const uint8_t *msk,
- int msk_stride, int inv_mask, unsigned sads[],
- int times);
+ void runRef(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr[],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk,
+ int msk_stride, int inv_mask, unsigned sads[],
+ int times) override;
+ void runTest(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr[],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk,
+ int msk_stride, int inv_mask, unsigned sads[],
+ int times) override;
protected:
MaskedSADx4Func maskedSAD_op_;
@@ -264,13 +261,12 @@ typedef std::tuple<HighbdMaskedSADFunc, HighbdMaskedSADFunc>
class HighbdMaskedSADTest
: public ::testing::TestWithParam<HighbdMaskedSADParam> {
public:
- virtual ~HighbdMaskedSADTest() {}
- virtual void SetUp() {
+ ~HighbdMaskedSADTest() override = default;
+ void SetUp() override {
maskedSAD_op_ = GET_PARAM(0);
ref_maskedSAD_op_ = GET_PARAM(1);
}
- virtual void TearDown() {}
void runHighbdMaskedSADTest(int run_times);
protected:
@@ -581,6 +577,41 @@ const MaskedSADx4Param msadx4_test[] = {
INSTANTIATE_TEST_SUITE_P(NEON, MaskedSADx4Test,
::testing::ValuesIn(msadx4_test));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+const MaskedSADParam hbd_msad_neon_test[] = {
+ make_tuple(&aom_highbd_masked_sad4x4_neon, &aom_highbd_masked_sad4x4_c),
+ make_tuple(&aom_highbd_masked_sad4x8_neon, &aom_highbd_masked_sad4x8_c),
+ make_tuple(&aom_highbd_masked_sad8x4_neon, &aom_highbd_masked_sad8x4_c),
+ make_tuple(&aom_highbd_masked_sad8x8_neon, &aom_highbd_masked_sad8x8_c),
+ make_tuple(&aom_highbd_masked_sad8x16_neon, &aom_highbd_masked_sad8x16_c),
+ make_tuple(&aom_highbd_masked_sad16x8_neon, &aom_highbd_masked_sad16x8_c),
+ make_tuple(&aom_highbd_masked_sad16x16_neon, &aom_highbd_masked_sad16x16_c),
+ make_tuple(&aom_highbd_masked_sad16x32_neon, &aom_highbd_masked_sad16x32_c),
+ make_tuple(&aom_highbd_masked_sad32x16_neon, &aom_highbd_masked_sad32x16_c),
+ make_tuple(&aom_highbd_masked_sad32x32_neon, &aom_highbd_masked_sad32x32_c),
+ make_tuple(&aom_highbd_masked_sad32x64_neon, &aom_highbd_masked_sad32x64_c),
+ make_tuple(&aom_highbd_masked_sad64x32_neon, &aom_highbd_masked_sad64x32_c),
+ make_tuple(&aom_highbd_masked_sad64x64_neon, &aom_highbd_masked_sad64x64_c),
+ make_tuple(&aom_highbd_masked_sad64x128_neon, &aom_highbd_masked_sad64x128_c),
+ make_tuple(&aom_highbd_masked_sad128x64_neon, &aom_highbd_masked_sad128x64_c),
+ make_tuple(&aom_highbd_masked_sad128x128_neon,
+ &aom_highbd_masked_sad128x128_c),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(&aom_highbd_masked_sad4x16_neon, &aom_highbd_masked_sad4x16_c),
+ make_tuple(&aom_highbd_masked_sad16x4_neon, &aom_highbd_masked_sad16x4_c),
+ make_tuple(&aom_highbd_masked_sad8x32_neon, &aom_highbd_masked_sad8x32_c),
+ make_tuple(&aom_highbd_masked_sad32x8_neon, &aom_highbd_masked_sad32x8_c),
+ make_tuple(&aom_highbd_masked_sad16x64_neon, &aom_highbd_masked_sad16x64_c),
+ make_tuple(&aom_highbd_masked_sad64x16_neon, &aom_highbd_masked_sad64x16_c),
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, HighbdMaskedSADTest,
+ ::testing::ValuesIn(hbd_msad_neon_test));
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
#endif // HAVE_NEON
} // namespace
diff --git a/test/masked_variance_test.cc b/test/masked_variance_test.cc
index e76403ea7..8482a12f5 100644
--- a/test/masked_variance_test.cc
+++ b/test/masked_variance_test.cc
@@ -43,14 +43,12 @@ typedef std::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc>
class MaskedSubPixelVarianceTest
: public ::testing::TestWithParam<MaskedSubPixelVarianceParam> {
public:
- virtual ~MaskedSubPixelVarianceTest() {}
- virtual void SetUp() {
+ ~MaskedSubPixelVarianceTest() override = default;
+ void SetUp() override {
opt_func_ = GET_PARAM(0);
ref_func_ = GET_PARAM(1);
}
- virtual void TearDown() {}
-
protected:
MaskedSubPixelVarianceFunc opt_func_;
MaskedSubPixelVarianceFunc ref_func_;
@@ -179,15 +177,13 @@ typedef std::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc,
class HighbdMaskedSubPixelVarianceTest
: public ::testing::TestWithParam<HighbdMaskedSubPixelVarianceParam> {
public:
- virtual ~HighbdMaskedSubPixelVarianceTest() {}
- virtual void SetUp() {
+ ~HighbdMaskedSubPixelVarianceTest() override = default;
+ void SetUp() override {
opt_func_ = GET_PARAM(0);
ref_func_ = GET_PARAM(1);
bit_depth_ = GET_PARAM(2);
}
- virtual void TearDown() {}
-
protected:
MaskedSubPixelVarianceFunc opt_func_;
MaskedSubPixelVarianceFunc ref_func_;
@@ -568,5 +564,149 @@ const MaskedSubPixelVarianceParam sub_pel_var_test[] = {
INSTANTIATE_TEST_SUITE_P(NEON_C_COMPARE, MaskedSubPixelVarianceTest,
::testing::ValuesIn(sub_pel_var_test));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+const HighbdMaskedSubPixelVarianceParam hbd_sub_pel_var_test_neon[] = {
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance128x128_neon,
+ &aom_highbd_8_masked_sub_pixel_variance128x128_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance128x64_neon,
+ &aom_highbd_8_masked_sub_pixel_variance128x64_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x128_neon,
+ &aom_highbd_8_masked_sub_pixel_variance64x128_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x64_neon,
+ &aom_highbd_8_masked_sub_pixel_variance64x64_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x32_neon,
+ &aom_highbd_8_masked_sub_pixel_variance64x32_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x64_neon,
+ &aom_highbd_8_masked_sub_pixel_variance32x64_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x32_neon,
+ &aom_highbd_8_masked_sub_pixel_variance32x32_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x16_neon,
+ &aom_highbd_8_masked_sub_pixel_variance32x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x32_neon,
+ &aom_highbd_8_masked_sub_pixel_variance16x32_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x16_neon,
+ &aom_highbd_8_masked_sub_pixel_variance16x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x8_neon,
+ &aom_highbd_8_masked_sub_pixel_variance16x8_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x16_neon,
+ &aom_highbd_8_masked_sub_pixel_variance8x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x8_neon,
+ &aom_highbd_8_masked_sub_pixel_variance8x8_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x4_neon,
+ &aom_highbd_8_masked_sub_pixel_variance8x4_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x8_neon,
+ &aom_highbd_8_masked_sub_pixel_variance4x8_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x4_neon,
+ &aom_highbd_8_masked_sub_pixel_variance4x4_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x128_neon,
+ &aom_highbd_10_masked_sub_pixel_variance128x128_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x64_neon,
+ &aom_highbd_10_masked_sub_pixel_variance128x64_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x128_neon,
+ &aom_highbd_10_masked_sub_pixel_variance64x128_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x64_neon,
+ &aom_highbd_10_masked_sub_pixel_variance64x64_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x32_neon,
+ &aom_highbd_10_masked_sub_pixel_variance64x32_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x64_neon,
+ &aom_highbd_10_masked_sub_pixel_variance32x64_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x32_neon,
+ &aom_highbd_10_masked_sub_pixel_variance32x32_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x16_neon,
+ &aom_highbd_10_masked_sub_pixel_variance32x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x32_neon,
+ &aom_highbd_10_masked_sub_pixel_variance16x32_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x16_neon,
+ &aom_highbd_10_masked_sub_pixel_variance16x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x8_neon,
+ &aom_highbd_10_masked_sub_pixel_variance16x8_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x16_neon,
+ &aom_highbd_10_masked_sub_pixel_variance8x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x8_neon,
+ &aom_highbd_10_masked_sub_pixel_variance8x8_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x4_neon,
+ &aom_highbd_10_masked_sub_pixel_variance8x4_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x8_neon,
+ &aom_highbd_10_masked_sub_pixel_variance4x8_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x4_neon,
+ &aom_highbd_10_masked_sub_pixel_variance4x4_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x128_neon,
+ &aom_highbd_12_masked_sub_pixel_variance128x128_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x64_neon,
+ &aom_highbd_12_masked_sub_pixel_variance128x64_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x128_neon,
+ &aom_highbd_12_masked_sub_pixel_variance64x128_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x64_neon,
+ &aom_highbd_12_masked_sub_pixel_variance64x64_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x32_neon,
+ &aom_highbd_12_masked_sub_pixel_variance64x32_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x64_neon,
+ &aom_highbd_12_masked_sub_pixel_variance32x64_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x32_neon,
+ &aom_highbd_12_masked_sub_pixel_variance32x32_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x16_neon,
+ &aom_highbd_12_masked_sub_pixel_variance32x16_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x32_neon,
+ &aom_highbd_12_masked_sub_pixel_variance16x32_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x16_neon,
+ &aom_highbd_12_masked_sub_pixel_variance16x16_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x8_neon,
+ &aom_highbd_12_masked_sub_pixel_variance16x8_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x16_neon,
+ &aom_highbd_12_masked_sub_pixel_variance8x16_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x8_neon,
+ &aom_highbd_12_masked_sub_pixel_variance8x8_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x4_neon,
+ &aom_highbd_12_masked_sub_pixel_variance8x4_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x8_neon,
+ &aom_highbd_12_masked_sub_pixel_variance4x8_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x4_neon,
+ &aom_highbd_12_masked_sub_pixel_variance4x4_c, AOM_BITS_12),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x16_neon,
+ &aom_highbd_8_masked_sub_pixel_variance64x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x64_neon,
+ &aom_highbd_8_masked_sub_pixel_variance16x64_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x8_neon,
+ &aom_highbd_8_masked_sub_pixel_variance32x8_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x32_neon,
+ &aom_highbd_8_masked_sub_pixel_variance8x32_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x4_neon,
+ &aom_highbd_8_masked_sub_pixel_variance16x4_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x16_neon,
+ &aom_highbd_8_masked_sub_pixel_variance4x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x16_neon,
+ &aom_highbd_10_masked_sub_pixel_variance64x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x64_neon,
+ &aom_highbd_10_masked_sub_pixel_variance16x64_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x8_neon,
+ &aom_highbd_10_masked_sub_pixel_variance32x8_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x32_neon,
+ &aom_highbd_10_masked_sub_pixel_variance8x32_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x4_neon,
+ &aom_highbd_10_masked_sub_pixel_variance16x4_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x16_neon,
+ &aom_highbd_10_masked_sub_pixel_variance4x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x16_neon,
+ &aom_highbd_12_masked_sub_pixel_variance64x16_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x64_neon,
+ &aom_highbd_12_masked_sub_pixel_variance16x64_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x8_neon,
+ &aom_highbd_12_masked_sub_pixel_variance32x8_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x32_neon,
+ &aom_highbd_12_masked_sub_pixel_variance8x32_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x4_neon,
+ &aom_highbd_12_masked_sub_pixel_variance16x4_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x16_neon,
+ &aom_highbd_12_masked_sub_pixel_variance4x16_c, AOM_BITS_12),
+#endif
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON_C_COMPARE, HighbdMaskedSubPixelVarianceTest,
+ ::testing::ValuesIn(hbd_sub_pel_var_test_neon));
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
#endif // HAVE_NEON
} // namespace
diff --git a/test/metadata_test.cc b/test/metadata_test.cc
index e77e5d108..9467c29e8 100644
--- a/test/metadata_test.cc
+++ b/test/metadata_test.cc
@@ -56,12 +56,12 @@ class MetadataEncodeTest
protected:
MetadataEncodeTest() : EncoderTest(GET_PARAM(0)) {}
- virtual ~MetadataEncodeTest() {}
+ ~MetadataEncodeTest() override = default;
- virtual void SetUp() { InitializeConfig(GET_PARAM(1)); }
+ void SetUp() override { InitializeConfig(GET_PARAM(1)); }
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder * /*encoder*/) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder * /*encoder*/) override {
aom_image_t *current_frame = video->img();
if (current_frame) {
if (current_frame->metadata) aom_img_remove_metadata(current_frame);
@@ -95,7 +95,7 @@ class MetadataEncodeTest
}
}
- virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
const size_t bitstream_size = pkt->data.frame.sz;
const uint8_t *bitstream =
@@ -138,8 +138,8 @@ class MetadataEncodeTest
}
}
- virtual void DecompressedFrameHook(const aom_image_t &img,
- aom_codec_pts_t /*pts*/) {
+ void DecompressedFrameHook(const aom_image_t &img,
+ aom_codec_pts_t /*pts*/) override {
ASSERT_NE(img.metadata, nullptr);
ASSERT_EQ(img.metadata->sz, 3u);
diff --git a/test/minmax_test.cc b/test/minmax_test.cc
index cf67b7b92..33be4ff6d 100644
--- a/test/minmax_test.cc
+++ b/test/minmax_test.cc
@@ -31,7 +31,7 @@ typedef void (*MinMaxFunc)(const uint8_t *a, int a_stride, const uint8_t *b,
class MinMaxTest : public ::testing::TestWithParam<MinMaxFunc> {
public:
- virtual void SetUp() {
+ void SetUp() override {
mm_func_ = GetParam();
rnd_.Reset(ACMRandom::DeterministicSeed());
}
diff --git a/test/monochrome_test.cc b/test/monochrome_test.cc
index c5229fc1d..f22b5fe0f 100644
--- a/test/monochrome_test.cc
+++ b/test/monochrome_test.cc
@@ -42,12 +42,12 @@ class MonochromeTest
: EncoderTest(GET_PARAM(0)), lossless_(GET_PARAM(2)),
frame0_psnr_y_(0.0) {}
- virtual ~MonochromeTest() {}
+ ~MonochromeTest() override = default;
- virtual void SetUp() { InitializeConfig(GET_PARAM(1)); }
+ void SetUp() override { InitializeConfig(GET_PARAM(1)); }
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, GET_PARAM(3));
if (mode_ == ::libaom_test::kAllIntra) {
@@ -59,8 +59,8 @@ class MonochromeTest
}
}
- virtual void DecompressedFrameHook(const aom_image_t &img,
- aom_codec_pts_t pts) {
+ void DecompressedFrameHook(const aom_image_t &img,
+ aom_codec_pts_t pts) override {
(void)pts;
// Get value of top-left corner pixel of U plane
@@ -96,7 +96,7 @@ class MonochromeTest
return true;
}
- virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
// Check average PSNR value is >= 100 dB in case of lossless encoding.
if (lossless_) {
EXPECT_GE(pkt->data.psnr.psnr[0], kMaxPsnr);
diff --git a/test/motion_vector_test.cc b/test/motion_vector_test.cc
index bf10edefa..4fc8d53d9 100644
--- a/test/motion_vector_test.cc
+++ b/test/motion_vector_test.cc
@@ -43,9 +43,9 @@ class MotionVectorTestLarge
: EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
cpu_used_(GET_PARAM(2)), mv_test_mode_(GET_PARAM(3)) {}
- virtual ~MotionVectorTestLarge() {}
+ ~MotionVectorTestLarge() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(encoding_mode_);
if (encoding_mode_ != ::libaom_test::kRealTime) {
cfg_.g_lag_in_frames = 3;
@@ -56,8 +56,8 @@ class MotionVectorTestLarge
}
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, cpu_used_);
encoder->Control(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, mv_test_mode_);
diff --git a/test/mv_cost_test.cc b/test/mv_cost_test.cc
index 86e310cf1..73d56665b 100644
--- a/test/mv_cost_test.cc
+++ b/test/mv_cost_test.cc
@@ -23,18 +23,19 @@ void ReferenceBuildNmvComponentCostTable(int *mvcost,
int bits_cost[MV_OFFSET_BITS][2];
int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE];
int class0_hp_cost[2], hp_cost[2];
- av1_cost_tokens_from_cdf(sign_cost, mvcomp->sign_cdf, NULL);
- av1_cost_tokens_from_cdf(class_cost, mvcomp->classes_cdf, NULL);
- av1_cost_tokens_from_cdf(class0_cost, mvcomp->class0_cdf, NULL);
+ av1_cost_tokens_from_cdf(sign_cost, mvcomp->sign_cdf, nullptr);
+ av1_cost_tokens_from_cdf(class_cost, mvcomp->classes_cdf, nullptr);
+ av1_cost_tokens_from_cdf(class0_cost, mvcomp->class0_cdf, nullptr);
for (i = 0; i < MV_OFFSET_BITS; ++i) {
- av1_cost_tokens_from_cdf(bits_cost[i], mvcomp->bits_cdf[i], NULL);
+ av1_cost_tokens_from_cdf(bits_cost[i], mvcomp->bits_cdf[i], nullptr);
}
for (i = 0; i < CLASS0_SIZE; ++i)
- av1_cost_tokens_from_cdf(class0_fp_cost[i], mvcomp->class0_fp_cdf[i], NULL);
- av1_cost_tokens_from_cdf(fp_cost, mvcomp->fp_cdf, NULL);
+ av1_cost_tokens_from_cdf(class0_fp_cost[i], mvcomp->class0_fp_cdf[i],
+ nullptr);
+ av1_cost_tokens_from_cdf(fp_cost, mvcomp->fp_cdf, nullptr);
if (precision > MV_SUBPEL_LOW_PRECISION) {
- av1_cost_tokens_from_cdf(class0_hp_cost, mvcomp->class0_hp_cdf, NULL);
- av1_cost_tokens_from_cdf(hp_cost, mvcomp->hp_cdf, NULL);
+ av1_cost_tokens_from_cdf(class0_hp_cost, mvcomp->class0_hp_cdf, nullptr);
+ av1_cost_tokens_from_cdf(hp_cost, mvcomp->hp_cdf, nullptr);
}
mvcost[0] = 0;
for (v = 1; v <= MV_MAX; ++v) {
diff --git a/test/noise_model_test.cc b/test/noise_model_test.cc
index 650af79d1..b3edcc218 100644
--- a/test/noise_model_test.cc
+++ b/test/noise_model_test.cc
@@ -24,7 +24,7 @@ namespace {
// Return normally distributed values with standard deviation of sigma.
double randn(libaom_test::ACMRandom *random, double sigma) {
- while (1) {
+ while (true) {
const double u = 2.0 * ((double)random->Rand31() /
testing::internal::Random::kMaxRange) -
1.0;
@@ -367,7 +367,7 @@ struct BitDepthParams {
template <typename T>
class FlatBlockEstimatorTest : public ::testing::Test, public T {
public:
- virtual void SetUp() { random_.Reset(171); }
+ void SetUp() override { random_.Reset(171); }
typedef std::vector<typename T::data_type_t> VecType;
VecType data_;
libaom_test::ACMRandom random_;
@@ -544,7 +544,7 @@ class NoiseModelUpdateTest : public ::testing::Test, public T {
static const int kNumBlocksX = kWidth / kBlockSize;
static const int kNumBlocksY = kHeight / kBlockSize;
- virtual void SetUp() {
+ void SetUp() override {
const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3,
T::kBitDepth, T::kUseHighBD };
ASSERT_TRUE(aom_noise_model_init(&model_, params));
@@ -576,7 +576,7 @@ class NoiseModelUpdateTest : public ::testing::Test, public T {
&flat_blocks_[0], block_size);
}
- void TearDown() { aom_noise_model_free(&model_); }
+ void TearDown() override { aom_noise_model_free(&model_); }
protected:
aom_noise_model_t model_;
@@ -1186,7 +1186,7 @@ class WienerDenoiseTest : public ::testing::Test, public T {
static void SetUpTestSuite() { aom_dsp_rtcd(); }
protected:
- void SetUp() {
+ void SetUp() override {
static const float kNoiseLevel = 5.f;
static const float kStd = 4.0;
static const double kMaxValue = (1 << T::kBitDepth) - 1;
diff --git a/test/obmc_sad_test.cc b/test/obmc_sad_test.cc
index 8d13ac159..967b67766 100644
--- a/test/obmc_sad_test.cc
+++ b/test/obmc_sad_test.cc
@@ -236,6 +236,38 @@ TEST_P(ObmcSadHBDTest, ExtremeValues) {
}
}
+#if HAVE_NEON
+ObmcSadHBDTest::ParamType neon_functions_hbd[] = {
+ TestFuncs(aom_highbd_obmc_sad128x128_c, aom_highbd_obmc_sad128x128_neon),
+ TestFuncs(aom_highbd_obmc_sad128x64_c, aom_highbd_obmc_sad128x64_neon),
+ TestFuncs(aom_highbd_obmc_sad64x128_c, aom_highbd_obmc_sad64x128_neon),
+ TestFuncs(aom_highbd_obmc_sad64x64_c, aom_highbd_obmc_sad64x64_neon),
+ TestFuncs(aom_highbd_obmc_sad64x32_c, aom_highbd_obmc_sad64x32_neon),
+ TestFuncs(aom_highbd_obmc_sad32x64_c, aom_highbd_obmc_sad32x64_neon),
+ TestFuncs(aom_highbd_obmc_sad32x32_c, aom_highbd_obmc_sad32x32_neon),
+ TestFuncs(aom_highbd_obmc_sad32x16_c, aom_highbd_obmc_sad32x16_neon),
+ TestFuncs(aom_highbd_obmc_sad16x32_c, aom_highbd_obmc_sad16x32_neon),
+ TestFuncs(aom_highbd_obmc_sad16x16_c, aom_highbd_obmc_sad16x16_neon),
+ TestFuncs(aom_highbd_obmc_sad16x8_c, aom_highbd_obmc_sad16x8_neon),
+ TestFuncs(aom_highbd_obmc_sad8x16_c, aom_highbd_obmc_sad8x16_neon),
+ TestFuncs(aom_highbd_obmc_sad8x8_c, aom_highbd_obmc_sad8x8_neon),
+ TestFuncs(aom_highbd_obmc_sad8x4_c, aom_highbd_obmc_sad8x4_neon),
+ TestFuncs(aom_highbd_obmc_sad4x8_c, aom_highbd_obmc_sad4x8_neon),
+ TestFuncs(aom_highbd_obmc_sad4x4_c, aom_highbd_obmc_sad4x4_neon),
+#if !CONFIG_REALTIME_ONLY
+ TestFuncs(aom_highbd_obmc_sad64x16_c, aom_highbd_obmc_sad64x16_neon),
+ TestFuncs(aom_highbd_obmc_sad16x64_c, aom_highbd_obmc_sad16x64_neon),
+ TestFuncs(aom_highbd_obmc_sad32x8_c, aom_highbd_obmc_sad32x8_neon),
+ TestFuncs(aom_highbd_obmc_sad8x32_c, aom_highbd_obmc_sad8x32_neon),
+ TestFuncs(aom_highbd_obmc_sad16x4_c, aom_highbd_obmc_sad16x4_neon),
+ TestFuncs(aom_highbd_obmc_sad4x16_c, aom_highbd_obmc_sad4x16_neon),
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, ObmcSadHBDTest,
+ ::testing::ValuesIn(neon_functions_hbd));
+#endif // HAVE_NEON
+
#if HAVE_SSE4_1
ObmcSadHBDTest::ParamType sse4_functions_hbd[] = {
TestFuncs(aom_highbd_obmc_sad128x128_c, aom_highbd_obmc_sad128x128_sse4_1),
diff --git a/test/obmc_variance_test.cc b/test/obmc_variance_test.cc
index b2bf42a04..5f21a8a6c 100644
--- a/test/obmc_variance_test.cc
+++ b/test/obmc_variance_test.cc
@@ -228,7 +228,7 @@ INSTANTIATE_TEST_SUITE_P(NEON, ObmcVarianceTest,
////////////////////////////////////////////////////////////////////////////////
// High bit-depth
////////////////////////////////////////////////////////////////////////////////
-#if CONFIG_AV1_HIGHBITDEPTH
+#if CONFIG_AV1_HIGHBITDEPTH && !CONFIG_REALTIME_ONLY
class ObmcVarianceHBDTest : public FunctionEquivalenceTest<ObmcVarF> {};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ObmcVarianceHBDTest);
@@ -287,41 +287,181 @@ TEST_P(ObmcVarianceHBDTest, ExtremeValues) {
}
}
-#if HAVE_SSE4_1
-ObmcVarianceHBDTest::ParamType sse4_functions_hbd[] = {
- TestFuncs(aom_highbd_obmc_variance128x128_c,
- aom_highbd_obmc_variance128x128_sse4_1, 8),
- TestFuncs(aom_highbd_obmc_variance128x64_c,
- aom_highbd_obmc_variance128x64_sse4_1, 8),
- TestFuncs(aom_highbd_obmc_variance64x128_c,
- aom_highbd_obmc_variance64x128_sse4_1, 8),
- TestFuncs(aom_highbd_obmc_variance64x64_c,
- aom_highbd_obmc_variance64x64_sse4_1, 8),
- TestFuncs(aom_highbd_obmc_variance64x32_c,
- aom_highbd_obmc_variance64x32_sse4_1, 8),
- TestFuncs(aom_highbd_obmc_variance32x64_c,
- aom_highbd_obmc_variance32x64_sse4_1, 8),
- TestFuncs(aom_highbd_obmc_variance32x32_c,
- aom_highbd_obmc_variance32x32_sse4_1, 8),
- TestFuncs(aom_highbd_obmc_variance32x16_c,
- aom_highbd_obmc_variance32x16_sse4_1, 8),
- TestFuncs(aom_highbd_obmc_variance16x32_c,
- aom_highbd_obmc_variance16x32_sse4_1, 8),
- TestFuncs(aom_highbd_obmc_variance16x16_c,
- aom_highbd_obmc_variance16x16_sse4_1, 8),
- TestFuncs(aom_highbd_obmc_variance16x8_c, aom_highbd_obmc_variance16x8_sse4_1,
- 8),
- TestFuncs(aom_highbd_obmc_variance8x16_c, aom_highbd_obmc_variance8x16_sse4_1,
- 8),
- TestFuncs(aom_highbd_obmc_variance8x8_c, aom_highbd_obmc_variance8x8_sse4_1,
+#if HAVE_NEON
+ObmcVarianceHBDTest::ParamType neon_functions_hbd[] = {
+ TestFuncs(aom_highbd_8_obmc_variance128x128_c,
+ aom_highbd_8_obmc_variance128x128_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance128x64_c,
+ aom_highbd_8_obmc_variance128x64_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance64x128_c,
+ aom_highbd_8_obmc_variance64x128_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance64x64_c,
+ aom_highbd_8_obmc_variance64x64_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance64x32_c,
+ aom_highbd_8_obmc_variance64x32_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance32x64_c,
+ aom_highbd_8_obmc_variance32x64_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance32x32_c,
+ aom_highbd_8_obmc_variance32x32_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance32x16_c,
+ aom_highbd_8_obmc_variance32x16_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance16x32_c,
+ aom_highbd_8_obmc_variance16x32_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance16x16_c,
+ aom_highbd_8_obmc_variance16x16_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance16x8_c,
+ aom_highbd_8_obmc_variance16x8_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance8x16_c,
+ aom_highbd_8_obmc_variance8x16_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance8x8_c, aom_highbd_8_obmc_variance8x8_neon,
8),
- TestFuncs(aom_highbd_obmc_variance8x4_c, aom_highbd_obmc_variance8x4_sse4_1,
+ TestFuncs(aom_highbd_8_obmc_variance8x4_c, aom_highbd_8_obmc_variance8x4_neon,
8),
- TestFuncs(aom_highbd_obmc_variance4x8_c, aom_highbd_obmc_variance4x8_sse4_1,
+ TestFuncs(aom_highbd_8_obmc_variance4x8_c, aom_highbd_8_obmc_variance4x8_neon,
8),
- TestFuncs(aom_highbd_obmc_variance4x4_c, aom_highbd_obmc_variance4x4_sse4_1,
+ TestFuncs(aom_highbd_8_obmc_variance4x4_c, aom_highbd_8_obmc_variance4x4_neon,
8),
TestFuncs(aom_highbd_10_obmc_variance128x128_c,
+ aom_highbd_10_obmc_variance128x128_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance128x64_c,
+ aom_highbd_10_obmc_variance128x64_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance64x128_c,
+ aom_highbd_10_obmc_variance64x128_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance64x64_c,
+ aom_highbd_10_obmc_variance64x64_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance64x32_c,
+ aom_highbd_10_obmc_variance64x32_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance32x64_c,
+ aom_highbd_10_obmc_variance32x64_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance32x32_c,
+ aom_highbd_10_obmc_variance32x32_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance32x16_c,
+ aom_highbd_10_obmc_variance32x16_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance16x32_c,
+ aom_highbd_10_obmc_variance16x32_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance16x16_c,
+ aom_highbd_10_obmc_variance16x16_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance16x8_c,
+ aom_highbd_10_obmc_variance16x8_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance8x16_c,
+ aom_highbd_10_obmc_variance8x16_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance8x8_c,
+ aom_highbd_10_obmc_variance8x8_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance8x4_c,
+ aom_highbd_10_obmc_variance8x4_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance4x8_c,
+ aom_highbd_10_obmc_variance4x8_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance4x4_c,
+ aom_highbd_10_obmc_variance4x4_neon, 10),
+ TestFuncs(aom_highbd_12_obmc_variance128x128_c,
+ aom_highbd_12_obmc_variance128x128_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance128x64_c,
+ aom_highbd_12_obmc_variance128x64_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance64x128_c,
+ aom_highbd_12_obmc_variance64x128_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance64x64_c,
+ aom_highbd_12_obmc_variance64x64_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance64x32_c,
+ aom_highbd_12_obmc_variance64x32_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance32x64_c,
+ aom_highbd_12_obmc_variance32x64_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance32x32_c,
+ aom_highbd_12_obmc_variance32x32_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance32x16_c,
+ aom_highbd_12_obmc_variance32x16_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance16x32_c,
+ aom_highbd_12_obmc_variance16x32_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance16x16_c,
+ aom_highbd_12_obmc_variance16x16_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance16x8_c,
+ aom_highbd_12_obmc_variance16x8_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance8x16_c,
+ aom_highbd_12_obmc_variance8x16_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance8x8_c,
+ aom_highbd_12_obmc_variance8x8_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance8x4_c,
+ aom_highbd_12_obmc_variance8x4_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance4x8_c,
+ aom_highbd_12_obmc_variance4x8_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance4x4_c,
+ aom_highbd_12_obmc_variance4x4_neon, 12),
+ TestFuncs(aom_highbd_8_obmc_variance64x16_c,
+ aom_highbd_8_obmc_variance64x16_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance16x64_c,
+ aom_highbd_8_obmc_variance16x64_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance32x8_c,
+ aom_highbd_8_obmc_variance32x8_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance8x32_c,
+ aom_highbd_8_obmc_variance8x32_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance16x4_c,
+ aom_highbd_8_obmc_variance16x4_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance4x16_c,
+ aom_highbd_8_obmc_variance4x16_neon, 8),
+ TestFuncs(aom_highbd_10_obmc_variance64x16_c,
+ aom_highbd_10_obmc_variance64x16_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance16x64_c,
+ aom_highbd_10_obmc_variance16x64_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance32x8_c,
+ aom_highbd_10_obmc_variance32x8_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance8x32_c,
+ aom_highbd_10_obmc_variance8x32_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance16x4_c,
+ aom_highbd_10_obmc_variance16x4_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance4x16_c,
+ aom_highbd_10_obmc_variance4x16_neon, 10),
+ TestFuncs(aom_highbd_12_obmc_variance64x16_c,
+ aom_highbd_12_obmc_variance64x16_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance16x64_c,
+ aom_highbd_12_obmc_variance16x64_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance32x8_c,
+ aom_highbd_12_obmc_variance32x8_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance8x32_c,
+ aom_highbd_12_obmc_variance8x32_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance16x4_c,
+ aom_highbd_12_obmc_variance16x4_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance4x16_c,
+ aom_highbd_12_obmc_variance4x16_neon, 12),
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, ObmcVarianceHBDTest,
+ ::testing::ValuesIn(neon_functions_hbd));
+#endif // HAVE_NEON
+
+#if HAVE_SSE4_1
+ObmcVarianceHBDTest::ParamType sse4_functions_hbd[] = {
+ TestFuncs(aom_highbd_8_obmc_variance128x128_c,
+ aom_highbd_8_obmc_variance128x128_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance128x64_c,
+ aom_highbd_8_obmc_variance128x64_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance64x128_c,
+ aom_highbd_8_obmc_variance64x128_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance64x64_c,
+ aom_highbd_8_obmc_variance64x64_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance64x32_c,
+ aom_highbd_8_obmc_variance64x32_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance32x64_c,
+ aom_highbd_8_obmc_variance32x64_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance32x32_c,
+ aom_highbd_8_obmc_variance32x32_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance32x16_c,
+ aom_highbd_8_obmc_variance32x16_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance16x32_c,
+ aom_highbd_8_obmc_variance16x32_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance16x16_c,
+ aom_highbd_8_obmc_variance16x16_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance16x8_c,
+ aom_highbd_8_obmc_variance16x8_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance8x16_c,
+ aom_highbd_8_obmc_variance8x16_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance8x8_c,
+ aom_highbd_8_obmc_variance8x8_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance8x4_c,
+ aom_highbd_8_obmc_variance8x4_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance4x8_c,
+ aom_highbd_8_obmc_variance4x8_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance4x4_c,
+ aom_highbd_8_obmc_variance4x4_sse4_1, 8),
+ TestFuncs(aom_highbd_10_obmc_variance128x128_c,
aom_highbd_10_obmc_variance128x128_sse4_1, 10),
TestFuncs(aom_highbd_10_obmc_variance128x64_c,
aom_highbd_10_obmc_variance128x64_sse4_1, 10),
@@ -386,18 +526,18 @@ ObmcVarianceHBDTest::ParamType sse4_functions_hbd[] = {
TestFuncs(aom_highbd_12_obmc_variance4x4_c,
aom_highbd_12_obmc_variance4x4_sse4_1, 12),
- TestFuncs(aom_highbd_obmc_variance64x16_c,
- aom_highbd_obmc_variance64x16_sse4_1, 8),
- TestFuncs(aom_highbd_obmc_variance16x64_c,
- aom_highbd_obmc_variance16x64_sse4_1, 8),
- TestFuncs(aom_highbd_obmc_variance32x8_c, aom_highbd_obmc_variance32x8_sse4_1,
- 8),
- TestFuncs(aom_highbd_obmc_variance8x32_c, aom_highbd_obmc_variance8x32_sse4_1,
- 8),
- TestFuncs(aom_highbd_obmc_variance16x4_c, aom_highbd_obmc_variance16x4_sse4_1,
- 8),
- TestFuncs(aom_highbd_obmc_variance4x16_c, aom_highbd_obmc_variance4x16_sse4_1,
- 8),
+ TestFuncs(aom_highbd_8_obmc_variance64x16_c,
+ aom_highbd_8_obmc_variance64x16_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance16x64_c,
+ aom_highbd_8_obmc_variance16x64_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance32x8_c,
+ aom_highbd_8_obmc_variance32x8_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance8x32_c,
+ aom_highbd_8_obmc_variance8x32_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance16x4_c,
+ aom_highbd_8_obmc_variance16x4_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance4x16_c,
+ aom_highbd_8_obmc_variance4x16_sse4_1, 8),
TestFuncs(aom_highbd_10_obmc_variance64x16_c,
aom_highbd_10_obmc_variance64x16_sse4_1, 10),
TestFuncs(aom_highbd_10_obmc_variance16x64_c,
@@ -427,5 +567,5 @@ ObmcVarianceHBDTest::ParamType sse4_functions_hbd[] = {
INSTANTIATE_TEST_SUITE_P(SSE4_1, ObmcVarianceHBDTest,
::testing::ValuesIn(sse4_functions_hbd));
#endif // HAVE_SSE4_1
-#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // CONFIG_AV1_HIGHBITDEPTH && !CONFIG_REALTIME_ONLY
} // namespace
diff --git a/test/pickrst_test.cc b/test/pickrst_test.cc
index 131e1dd5c..534d9b1c8 100644
--- a/test/pickrst_test.cc
+++ b/test/pickrst_test.cc
@@ -43,7 +43,7 @@ typedef std::tuple<const lowbd_pixel_proj_error_func> PixelProjErrorTestParam;
class PixelProjErrorTest
: public ::testing::TestWithParam<PixelProjErrorTestParam> {
public:
- virtual void SetUp() {
+ void SetUp() override {
target_func_ = GET_PARAM(0);
src_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
sizeof(*src_)));
@@ -58,7 +58,7 @@ class PixelProjErrorTest
sizeof(*flt1_)));
ASSERT_NE(flt1_, nullptr);
}
- virtual void TearDown() {
+ void TearDown() override {
aom_free(src_);
aom_free(dgd_);
aom_free(flt0_);
@@ -215,7 +215,7 @@ typedef std::tuple<const highbd_pixel_proj_error_func> PixelProjErrorTestParam;
class PixelProjHighbdErrorTest
: public ::testing::TestWithParam<PixelProjErrorTestParam> {
public:
- virtual void SetUp() {
+ void SetUp() override {
target_func_ = GET_PARAM(0);
src_ =
(uint16_t *)aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*src_));
@@ -230,7 +230,7 @@ class PixelProjHighbdErrorTest
(int32_t *)aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*flt1_));
ASSERT_NE(flt1_, nullptr);
}
- virtual void TearDown() {
+ void TearDown() override {
aom_free(src_);
aom_free(dgd_);
aom_free(flt0_);
@@ -386,7 +386,7 @@ typedef std::tuple<const set_get_proj_subspace> GetProjSubspaceTestParam;
class GetProjSubspaceTest
: public ::testing::TestWithParam<GetProjSubspaceTestParam> {
public:
- virtual void SetUp() {
+ void SetUp() override {
target_func_ = GET_PARAM(0);
src_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
sizeof(*src_)));
@@ -401,7 +401,7 @@ class GetProjSubspaceTest
sizeof(*flt1_)));
ASSERT_NE(flt1_, nullptr);
}
- virtual void TearDown() {
+ void TearDown() override {
aom_free(src_);
aom_free(dgd_);
aom_free(flt0_);
@@ -432,7 +432,9 @@ void GetProjSubspaceTest::RunGetProjSubspaceTest(int32_t run_times) {
const int flt0_stride = MAX_DATA_BLOCK;
const int flt1_stride = MAX_DATA_BLOCK;
sgr_params_type params;
- const int iters = run_times == 1 ? kIterations : 4;
+ const int iters = run_times == 1 ? kIterations : 3;
+ static constexpr int kR0[3] = { 1, 1, 0 };
+ static constexpr int kR1[3] = { 1, 0, 1 };
for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
@@ -444,10 +446,8 @@ void GetProjSubspaceTest::RunGetProjSubspaceTest(int32_t run_times) {
flt1_[i] = rng_.Rand15Signed();
}
- params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1;
- params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1;
- params.s[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2);
- params.s[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2);
+ params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : kR0[iter];
+ params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : kR1[iter];
uint8_t *dgd = dgd_;
uint8_t *src = src_;
@@ -492,6 +492,8 @@ void GetProjSubspaceTest::RunGetProjSubspaceTest_ExtremeValues() {
const int flt1_stride = MAX_DATA_BLOCK;
sgr_params_type params;
const int iters = kIterations;
+ static constexpr int kR0[3] = { 1, 1, 0 };
+ static constexpr int kR1[3] = { 1, 0, 1 };
for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
@@ -502,10 +504,8 @@ void GetProjSubspaceTest::RunGetProjSubspaceTest_ExtremeValues() {
flt0_[i] = rng_.Rand15Signed();
flt1_[i] = rng_.Rand15Signed();
}
- params.r[0] = 1;
- params.r[1] = 1;
- params.s[0] = rng_.Rand8() % MAX_RADIUS;
- params.s[1] = rng_.Rand8() % MAX_RADIUS;
+ params.r[0] = kR0[iter % 3];
+ params.r[1] = kR1[iter % 3];
uint8_t *dgd = dgd_;
uint8_t *src = src_;
@@ -546,6 +546,12 @@ INSTANTIATE_TEST_SUITE_P(AVX2, GetProjSubspaceTest,
::testing::Values(av1_calc_proj_params_avx2));
#endif // HAVE_AVX2
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(NEON, GetProjSubspaceTest,
+ ::testing::Values(av1_calc_proj_params_neon));
+#endif // HAVE_NEON
+
} // namespace get_proj_subspace_test_lowbd
#if CONFIG_AV1_HIGHBITDEPTH
@@ -565,7 +571,7 @@ typedef std::tuple<const set_get_proj_subspace_hbd> GetProjSubspaceHBDTestParam;
class GetProjSubspaceTestHBD
: public ::testing::TestWithParam<GetProjSubspaceHBDTestParam> {
public:
- virtual void SetUp() {
+ void SetUp() override {
target_func_ = GET_PARAM(0);
src_ = (uint16_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
sizeof(*src_)));
@@ -580,7 +586,7 @@ class GetProjSubspaceTestHBD
sizeof(*flt1_)));
ASSERT_NE(flt1_, nullptr);
}
- virtual void TearDown() {
+ void TearDown() override {
aom_free(src_);
aom_free(dgd_);
aom_free(flt0_);
@@ -611,7 +617,9 @@ void GetProjSubspaceTestHBD::RunGetProjSubspaceTestHBD(int32_t run_times) {
const int flt0_stride = MAX_DATA_BLOCK;
const int flt1_stride = MAX_DATA_BLOCK;
sgr_params_type params;
- const int iters = run_times == 1 ? kIterations : 4;
+ const int iters = run_times == 1 ? kIterations : 3;
+ static constexpr int kR0[3] = { 1, 1, 0 };
+ static constexpr int kR1[3] = { 1, 0, 1 };
for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
@@ -623,10 +631,8 @@ void GetProjSubspaceTestHBD::RunGetProjSubspaceTestHBD(int32_t run_times) {
flt1_[i] = rng_.Rand15Signed();
}
- params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1;
- params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1;
- params.s[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2);
- params.s[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2);
+ params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : kR0[iter];
+ params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : kR1[iter];
uint8_t *dgd = CONVERT_TO_BYTEPTR(dgd_);
uint8_t *src = CONVERT_TO_BYTEPTR(src_);
@@ -671,6 +677,8 @@ void GetProjSubspaceTestHBD::RunGetProjSubspaceTestHBD_ExtremeValues() {
const int flt1_stride = MAX_DATA_BLOCK;
sgr_params_type params;
const int iters = kIterations;
+ static constexpr int kR0[3] = { 1, 1, 0 };
+ static constexpr int kR1[3] = { 1, 0, 1 };
for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
@@ -681,10 +689,8 @@ void GetProjSubspaceTestHBD::RunGetProjSubspaceTestHBD_ExtremeValues() {
flt0_[i] = rng_.Rand15Signed();
flt1_[i] = rng_.Rand15Signed();
}
- params.r[0] = 1;
- params.r[1] = 1;
- params.s[0] = rng_.Rand8() % MAX_RADIUS;
- params.s[1] = rng_.Rand8() % MAX_RADIUS;
+ params.r[0] = kR0[iter % 3];
+ params.r[1] = kR1[iter % 3];
uint8_t *dgd = CONVERT_TO_BYTEPTR(dgd_);
uint8_t *src = CONVERT_TO_BYTEPTR(src_);
@@ -728,6 +734,11 @@ INSTANTIATE_TEST_SUITE_P(AVX2, GetProjSubspaceTestHBD,
::testing::Values(av1_calc_proj_params_high_bd_avx2));
#endif // HAVE_AVX2
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(NEON, GetProjSubspaceTestHBD,
+ ::testing::Values(av1_calc_proj_params_high_bd_neon));
+#endif // HAVE_NEON
} // namespace get_proj_subspace_test_hbd
#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/test/postproc_filters_test.cc b/test/postproc_filters_test.cc
index 37de5d2cb..9584dd8c3 100644
--- a/test/postproc_filters_test.cc
+++ b/test/postproc_filters_test.cc
@@ -30,13 +30,13 @@ class PostprocFiltersTest
: EncoderTest(GET_PARAM(0)), set_skip_postproc_filtering_(false),
frame_number_(0), cpu_used_(GET_PARAM(1)), bd_(GET_PARAM(2)) {}
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(::libaom_test::kAllIntra);
cfg_.g_input_bit_depth = bd_;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
frame_number_ = video->frame();
if (frame_number_ == 0) {
encoder->Control(AOME_SET_CPUUSED, cpu_used_);
@@ -53,14 +53,14 @@ class PostprocFiltersTest
}
}
- virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
::libaom_test::MD5 md5_enc;
md5_enc.Add(reinterpret_cast<uint8_t *>(pkt->data.frame.buf),
pkt->data.frame.sz);
md5_enc_.push_back(md5_enc.Get());
}
- virtual void PostEncodeFrameHook(::libaom_test::Encoder *encoder) {
+ void PostEncodeFrameHook(::libaom_test::Encoder *encoder) override {
const aom_image_t *img_enc = encoder->GetPreviewFrame();
if (!set_skip_postproc_filtering_) {
ASSERT_NE(img_enc, nullptr);
diff --git a/test/quant_test.cc b/test/quant_test.cc
index a042af13e..afbabb314 100644
--- a/test/quant_test.cc
+++ b/test/quant_test.cc
@@ -32,15 +32,15 @@ class QMTest
public ::libaom_test::EncoderTest {
protected:
QMTest() : EncoderTest(GET_PARAM(0)) {}
- virtual ~QMTest() {}
+ ~QMTest() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(GET_PARAM(1));
set_cpu_used_ = GET_PARAM(2);
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
encoder->Control(AV1E_SET_ENABLE_QM, 1);
@@ -119,9 +119,9 @@ class QuantizerBoundsCheckTestLarge
quant_param_(GET_PARAM(2)), rc_end_usage_(GET_PARAM(3)) {
quant_bound_violated_ = false;
}
- virtual ~QuantizerBoundsCheckTestLarge() {}
+ ~QuantizerBoundsCheckTestLarge() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(encoding_mode_);
const aom_rational timebase = { 1, 30 };
cfg_.g_timebase = timebase;
@@ -135,17 +135,17 @@ class QuantizerBoundsCheckTestLarge
}
}
- virtual bool DoDecode() const { return 1; }
+ bool DoDecode() const override { return true; }
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, 5);
}
}
- virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
- libaom_test::Decoder *decoder) {
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
if (AOM_CODEC_OK == res_dec) {
aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc
index 04e830682..328d5b10d 100644
--- a/test/quantize_func_test.cc
+++ b/test/quantize_func_test.cc
@@ -100,9 +100,9 @@ class QuantizeTestBase
tx_size_(GET_TEMPLATE_PARAM(2)), type_(GET_TEMPLATE_PARAM(3)),
bd_(GET_TEMPLATE_PARAM(4)) {}
- virtual ~QuantizeTestBase() {}
+ ~QuantizeTestBase() override = default;
- virtual void SetUp() {
+ void SetUp() override {
qtab_ = reinterpret_cast<QuanTable *>(aom_memalign(32, sizeof(*qtab_)));
ASSERT_NE(qtab_, nullptr);
const int n_coeffs = coeff_num();
@@ -112,7 +112,7 @@ class QuantizeTestBase
InitQuantizer();
}
- virtual void TearDown() {
+ void TearDown() override {
aom_free(qtab_);
qtab_ = nullptr;
aom_free(coeff_);
@@ -149,8 +149,8 @@ class QuantizeTestBase
// Testing uses luminance quantization table
const int16_t *zbin = qtab_->quant.y_zbin[q];
- const int16_t *round = 0;
- const int16_t *quant = 0;
+ const int16_t *round = nullptr;
+ const int16_t *quant = nullptr;
if (type_ == TYPE_B) {
round = qtab_->quant.y_round[q];
quant = qtab_->quant.y_quant[q];
diff --git a/test/ratectrl_rtc_test.cc b/test/ratectrl_rtc_test.cc
index 0d8d48f46..cc054b692 100644
--- a/test/ratectrl_rtc_test.cc
+++ b/test/ratectrl_rtc_test.cc
@@ -36,12 +36,14 @@ class RcInterfaceTest : public ::libaom_test::EncoderTest,
RcInterfaceTest()
: EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000),
encoder_exit_(false), layer_frame_cnt_(0), superframe_cnt_(0),
- dynamic_temporal_layers_(false), dynamic_spatial_layers_(false) {
+ frame_cnt_(0), dynamic_temporal_layers_(false),
+ dynamic_spatial_layers_(false), num_drops_(0), max_consec_drop_(0),
+ frame_drop_thresh_(0) {
memset(&svc_params_, 0, sizeof(svc_params_));
memset(&layer_id_, 0, sizeof(layer_id_));
}
- ~RcInterfaceTest() override {}
+ ~RcInterfaceTest() override = default;
protected:
void SetUp() override { InitializeConfig(::libaom_test::kRealTime); }
@@ -57,10 +59,15 @@ class RcInterfaceTest : public ::libaom_test::EncoderTest,
if (video->frame() == 0 && layer_frame_cnt_ == 0) {
encoder->Control(AOME_SET_CPUUSED, 7);
encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
- encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+ if (rc_cfg_.is_screen) {
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+ } else {
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+ }
encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT,
rc_cfg_.max_intra_bitrate_pct);
if (use_svc) encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
+ encoder->Control(AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, max_consec_drop_);
}
// SVC specific settings
if (use_svc) {
@@ -140,20 +147,24 @@ class RcInterfaceTest : public ::libaom_test::EncoderTest,
return;
}
layer_frame_cnt_++;
+ frame_cnt_++;
if (layer_id_.spatial_layer_id == rc_cfg_.ss_number_layers - 1)
superframe_cnt_++;
int qp;
encoder->Control(AOME_GET_LAST_QUANTIZER, &qp);
- rc_api_->ComputeQP(frame_params_);
- ASSERT_EQ(rc_api_->GetQP(), qp);
- int encoder_lpf_level;
- encoder->Control(AOME_GET_LOOPFILTER_LEVEL, &encoder_lpf_level);
- aom::AV1LoopfilterLevel loopfilter_level = rc_api_->GetLoopfilterLevel();
- ASSERT_EQ(loopfilter_level.filter_level[0], encoder_lpf_level);
- aom::AV1CdefInfo cdef_level = rc_api_->GetCdefInfo();
- int cdef_y_strengths[16];
- encoder->Control(AV1E_GET_LUMA_CDEF_STRENGTH, cdef_y_strengths);
- ASSERT_EQ(cdef_level.cdef_strength_y, cdef_y_strengths[0]);
+ if (rc_api_->ComputeQP(frame_params_) == aom::FrameDropDecision::kOk) {
+ ASSERT_EQ(rc_api_->GetQP(), qp) << "at frame " << frame_cnt_ - 1;
+ int encoder_lpf_level;
+ encoder->Control(AOME_GET_LOOPFILTER_LEVEL, &encoder_lpf_level);
+ aom::AV1LoopfilterLevel loopfilter_level = rc_api_->GetLoopfilterLevel();
+ ASSERT_EQ(loopfilter_level.filter_level[0], encoder_lpf_level);
+ aom::AV1CdefInfo cdef_level = rc_api_->GetCdefInfo();
+ int cdef_y_strengths[16];
+ encoder->Control(AV1E_GET_LUMA_CDEF_STRENGTH, cdef_y_strengths);
+ ASSERT_EQ(cdef_level.cdef_strength_y, cdef_y_strengths[0]);
+ } else {
+ num_drops_++;
+ }
}
void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
@@ -181,6 +192,43 @@ class RcInterfaceTest : public ::libaom_test::EncoderTest,
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
}
+ void RunOneLayerScreen() {
+ key_interval_ = 10000;
+ SetConfig();
+ rc_cfg_.is_screen = true;
+ rc_cfg_.width = 352;
+ rc_cfg_.height = 288;
+ rc_api_ = aom::AV1RateControlRTC::Create(rc_cfg_);
+ frame_params_.spatial_layer_id = 0;
+ frame_params_.temporal_layer_id = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 140);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ void RunOneLayerDropFramesCBR() {
+ key_interval_ = 10000;
+ max_consec_drop_ = 8;
+ frame_drop_thresh_ = 30;
+ SetConfig();
+ rc_cfg_.target_bandwidth = 100;
+ cfg_.rc_target_bitrate = 100;
+ rc_cfg_.max_quantizer = 50;
+ cfg_.rc_max_quantizer = 50;
+ rc_api_ = aom::AV1RateControlRTC::Create(rc_cfg_);
+ frame_params_.spatial_layer_id = 0;
+ frame_params_.temporal_layer_id = 0;
+
+ ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+ 1, 0, kNumFrames);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ // Check that some frames were dropped, otherwise the test has no value.
+ ASSERT_GE(num_drops_, 1);
+ }
+
void RunOneLayerPeriodicKey() {
key_interval_ = 100;
SetConfig();
@@ -270,6 +318,8 @@ class RcInterfaceTest : public ::libaom_test::EncoderTest,
rc_cfg_.max_quantizers[0] = 52;
rc_cfg_.min_quantizers[0] = 2;
rc_cfg_.aq_mode = aq_mode_;
+ rc_cfg_.frame_drop_thresh = frame_drop_thresh_;
+ rc_cfg_.max_consec_drop = max_consec_drop_;
// Encoder settings for ground truth.
cfg_.g_w = 640;
@@ -288,6 +338,7 @@ class RcInterfaceTest : public ::libaom_test::EncoderTest,
cfg_.rc_target_bitrate = 1000;
cfg_.kf_min_dist = key_interval_;
cfg_.kf_max_dist = key_interval_;
+ cfg_.rc_dropframe_thresh = frame_drop_thresh_;
}
void SetConfigSvc(int number_spatial_layers, int number_temporal_layers) {
@@ -425,14 +476,22 @@ class RcInterfaceTest : public ::libaom_test::EncoderTest,
aom_svc_layer_id_t layer_id_;
int layer_frame_cnt_;
int superframe_cnt_;
+ int frame_cnt_;
bool dynamic_temporal_layers_;
bool dynamic_spatial_layers_;
+ int num_drops_;
+ int max_consec_drop_;
+ int frame_drop_thresh_;
};
TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); }
+TEST_P(RcInterfaceTest, OneLayerDropFramesCBR) { RunOneLayerDropFramesCBR(); }
+
TEST_P(RcInterfaceTest, OneLayerPeriodicKey) { RunOneLayerPeriodicKey(); }
+TEST_P(RcInterfaceTest, OneLayerScreen) { RunOneLayerScreen(); }
+
TEST_P(RcInterfaceTest, Svc) { RunSvc(); }
TEST_P(RcInterfaceTest, SvcPeriodicKey) { RunSvcPeriodicKey(); }
diff --git a/test/reconinter_test.cc b/test/reconinter_test.cc
index b45b7bb3f..ee1a9893d 100644
--- a/test/reconinter_test.cc
+++ b/test/reconinter_test.cc
@@ -28,18 +28,18 @@
namespace {
using libaom_test::ACMRandom;
-typedef void (*buildcompdiffwtdmaskd_func)(uint8_t *mask,
- DIFFWTD_MASK_TYPE mask_type,
- const uint8_t *src0, int src0_stride,
- const uint8_t *src1, int src1_stride,
- int h, int w);
+using BuildCompDiffWtdMaskFunc = void (*)(uint8_t *mask,
+ DIFFWTD_MASK_TYPE mask_type,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ int h, int w);
-typedef std::tuple<BLOCK_SIZE, buildcompdiffwtdmaskd_func>
- BuildCompDiffwtdMaskDParam;
+using BuildCompDiffwtdMaskDParam =
+ std::tuple<BLOCK_SIZE, BuildCompDiffWtdMaskFunc>;
-#if HAVE_SSE4_1
+#if HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON
::testing::internal::ParamGenerator<BuildCompDiffwtdMaskDParam> BuildParams(
- buildcompdiffwtdmaskd_func filter) {
+ BuildCompDiffWtdMaskFunc filter) {
return ::testing::Combine(::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL),
::testing::Values(filter));
}
@@ -48,185 +48,302 @@ typedef std::tuple<BLOCK_SIZE, buildcompdiffwtdmaskd_func>
class BuildCompDiffwtdMaskTest
: public ::testing::TestWithParam<BuildCompDiffwtdMaskDParam> {
public:
- virtual ~BuildCompDiffwtdMaskTest() {}
+ BuildCompDiffwtdMaskTest() : rnd_(ACMRandom::DeterministicSeed()) {}
+ ~BuildCompDiffwtdMaskTest() override = default;
- virtual void TearDown() {}
- void RunTest(buildcompdiffwtdmaskd_func test_impl, const int is_speed,
- const DIFFWTD_MASK_TYPE type);
+ protected:
+ void RunTest(BuildCompDiffWtdMaskFunc test_impl, bool is_speed,
+ const DIFFWTD_MASK_TYPE type) {
+ const int sb_type = GET_PARAM(0);
+ const int width = block_size_wide[sb_type];
+ const int height = block_size_high[sb_type];
+ DECLARE_ALIGNED(16, uint8_t, mask_ref[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, mask_test[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, src0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, src1[MAX_SB_SQUARE]);
+ for (int i = 0; i < width * height; i++) {
+ src0[i] = rnd_.Rand8();
+ src1[i] = rnd_.Rand8();
+ }
+ const int run_times = is_speed ? (10000000 / (width + height)) : 1;
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ av1_build_compound_diffwtd_mask_c(mask_ref, type, src0, width, src1,
+ width, height, width);
+ }
+ const double t1 = get_time_mark(&timer);
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ test_impl(mask_test, type, src0, width, src1, width, height, width);
+ }
+ const double t2 = get_time_mark(&timer);
+ if (is_speed) {
+ printf("mask %d %3dx%-3d:%7.2f/%7.2fns", type, width, height, t1, t2);
+ printf("(%3.2f)\n", t1 / t2);
+ }
+ for (int r = 0; r < height; ++r) {
+ for (int c = 0; c < width; ++c) {
+ ASSERT_EQ(mask_ref[c + r * width], mask_test[c + r * width])
+ << "[" << r << "," << c << "] " << run_times << " @ " << width
+ << "x" << height << " inv " << type;
+ }
+ }
+ }
private:
ACMRandom rnd_;
};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BuildCompDiffwtdMaskTest);
+
+TEST_P(BuildCompDiffwtdMaskTest, match) {
+ RunTest(GET_PARAM(1), 0, DIFFWTD_38);
+ RunTest(GET_PARAM(1), 0, DIFFWTD_38_INV);
+}
+TEST_P(BuildCompDiffwtdMaskTest, DISABLED_Speed) {
+ RunTest(GET_PARAM(1), 1, DIFFWTD_38);
+ RunTest(GET_PARAM(1), 1, DIFFWTD_38_INV);
+}
-typedef void (*buildcompdiffwtdmaskd16_func)(
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, BuildCompDiffwtdMaskTest,
+ BuildParams(av1_build_compound_diffwtd_mask_sse4_1));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, BuildCompDiffwtdMaskTest,
+ BuildParams(av1_build_compound_diffwtd_mask_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, BuildCompDiffwtdMaskTest,
+ BuildParams(av1_build_compound_diffwtd_mask_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+using BuildCompDiffWtdMaskHighbdFunc =
+ void (*)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
+ int src0_stride, const uint8_t *src1, int src1_stride, int h,
+ int w, int bd);
+
+using BuildCompDiffwtdMaskHighbdParam =
+ std::tuple<BLOCK_SIZE, int, BuildCompDiffWtdMaskHighbdFunc>;
+
+#if HAVE_SSSE3 || HAVE_AVX2 || HAVE_NEON
+::testing::internal::ParamGenerator<BuildCompDiffwtdMaskHighbdParam>
+BuildParamsHighbd(BuildCompDiffWtdMaskHighbdFunc filter) {
+ return ::testing::Combine(::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL),
+ ::testing::Values(8, 10, 12),
+ ::testing::Values(filter));
+}
+#endif
+
+class BuildCompDiffwtdMaskHighbdTest
+ : public ::testing::TestWithParam<BuildCompDiffwtdMaskHighbdParam> {
+ public:
+ BuildCompDiffwtdMaskHighbdTest() : rnd_(ACMRandom::DeterministicSeed()) {}
+ ~BuildCompDiffwtdMaskHighbdTest() override = default;
+
+ protected:
+ void RunTest(BuildCompDiffWtdMaskHighbdFunc test_impl, bool is_speed,
+ const DIFFWTD_MASK_TYPE type) {
+ const int sb_type = GET_PARAM(0);
+ const int bd = GET_PARAM(1);
+ const int width = block_size_wide[sb_type];
+ const int height = block_size_high[sb_type];
+ const int mask = (1 << bd) - 1;
+ DECLARE_ALIGNED(16, uint8_t, mask_ref[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, mask_test[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint16_t, src0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint16_t, src1[MAX_SB_SQUARE]);
+ for (int i = 0; i < width * height; i++) {
+ src0[i] = rnd_.Rand16() & mask;
+ src1[i] = rnd_.Rand16() & mask;
+ }
+ const int run_times = is_speed ? (10000000 / (width + height)) : 1;
+ aom_usec_timer timer;
+
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ uint8_t *src0_8 = CONVERT_TO_BYTEPTR(src0);
+ uint8_t *src1_8 = CONVERT_TO_BYTEPTR(src1);
+ av1_build_compound_diffwtd_mask_highbd_c(
+ mask_ref, type, src0_8, width, src1_8, width, height, width, bd);
+ }
+ const double t1 = get_time_mark(&timer);
+
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ uint8_t *src0_8 = CONVERT_TO_BYTEPTR(src0);
+ uint8_t *src1_8 = CONVERT_TO_BYTEPTR(src1);
+ test_impl(mask_test, type, src0_8, width, src1_8, width, height, width,
+ bd);
+ }
+ const double t2 = get_time_mark(&timer);
+
+ if (is_speed) {
+ printf("mask %d %3dx%-3d:%7.2f/%7.2fns", type, width, height, t1, t2);
+ printf("(%3.2f)\n", t1 / t2);
+ }
+ for (int r = 0; r < height; ++r) {
+ for (int c = 0; c < width; ++c) {
+ ASSERT_EQ(mask_ref[c + r * width], mask_test[c + r * width])
+ << "[" << r << "," << c << "] " << run_times << " @ " << width
+ << "x" << height << " inv " << type;
+ }
+ }
+ }
+
+ private:
+ ACMRandom rnd_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BuildCompDiffwtdMaskHighbdTest);
+
+TEST_P(BuildCompDiffwtdMaskHighbdTest, match) {
+ RunTest(GET_PARAM(2), 0, DIFFWTD_38);
+ RunTest(GET_PARAM(2), 0, DIFFWTD_38_INV);
+}
+TEST_P(BuildCompDiffwtdMaskHighbdTest, DISABLED_Speed) {
+ RunTest(GET_PARAM(2), 1, DIFFWTD_38);
+ RunTest(GET_PARAM(2), 1, DIFFWTD_38_INV);
+}
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(
+ SSSE3, BuildCompDiffwtdMaskHighbdTest,
+ BuildParamsHighbd(av1_build_compound_diffwtd_mask_highbd_ssse3));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, BuildCompDiffwtdMaskHighbdTest,
+ BuildParamsHighbd(av1_build_compound_diffwtd_mask_highbd_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, BuildCompDiffwtdMaskHighbdTest,
+ BuildParamsHighbd(av1_build_compound_diffwtd_mask_highbd_neon));
+#endif
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+using BuildCompDiffWtdMaskD16Func = void (*)(
uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
ConvolveParams *conv_params, int bd);
-typedef std::tuple<int, buildcompdiffwtdmaskd16_func, BLOCK_SIZE>
- BuildCompDiffwtdMaskD16Param;
+using BuildCompDiffwtdMaskD16Param =
+ std::tuple<int, BuildCompDiffWtdMaskD16Func, BLOCK_SIZE>;
-#if HAVE_SSE4_1 || HAVE_NEON
+#if HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON
::testing::internal::ParamGenerator<BuildCompDiffwtdMaskD16Param> BuildParams(
- buildcompdiffwtdmaskd16_func filter) {
+ BuildCompDiffWtdMaskD16Func filter) {
return ::testing::Combine(::testing::Range(8, 13, 2),
::testing::Values(filter),
::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
}
#endif
+
class BuildCompDiffwtdMaskD16Test
: public ::testing::TestWithParam<BuildCompDiffwtdMaskD16Param> {
public:
- ~BuildCompDiffwtdMaskD16Test() {}
- virtual void TearDown() {}
- void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+ BuildCompDiffwtdMaskD16Test() : rnd_(ACMRandom::DeterministicSeed()) {}
+ ~BuildCompDiffwtdMaskD16Test() override = default;
protected:
- void RunCheckOutput(buildcompdiffwtdmaskd16_func test_impl);
- void RunSpeedTest(buildcompdiffwtdmaskd16_func test_impl,
- DIFFWTD_MASK_TYPE mask_type);
- libaom_test::ACMRandom rnd_;
-}; // class BuildCompDiffwtdMaskD16Test
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BuildCompDiffwtdMaskD16Test);
-
-void BuildCompDiffwtdMaskD16Test::RunCheckOutput(
- buildcompdiffwtdmaskd16_func test_impl) {
- const int block_idx = GET_PARAM(2);
- const int bd = GET_PARAM(0);
- const int width = block_size_wide[block_idx];
- const int height = block_size_high[block_idx];
- DECLARE_ALIGNED(16, uint8_t, mask_ref[2 * MAX_SB_SQUARE]);
- DECLARE_ALIGNED(16, uint8_t, mask_test[2 * MAX_SB_SQUARE]);
- DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]);
- DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]);
-
- ConvolveParams conv_params =
- get_conv_params_no_round(0, 0, nullptr, 0, 1, bd);
-
- int in_precision =
- bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2;
-
- for (int i = 0; i < MAX_SB_SQUARE; i++) {
- src0[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
- src1[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
- }
-
- for (int mask_type = 0; mask_type < DIFFWTD_MASK_TYPES; mask_type++) {
- av1_build_compound_diffwtd_mask_d16_c(
- mask_ref, (DIFFWTD_MASK_TYPE)mask_type, src0, width, src1, width,
- height, width, &conv_params, bd);
-
- test_impl(mask_test, (DIFFWTD_MASK_TYPE)mask_type, src0, width, src1, width,
- height, width, &conv_params, bd);
+ void RunCheckOutput(BuildCompDiffWtdMaskD16Func test_impl) {
+ const int block_idx = GET_PARAM(2);
+ const int bd = GET_PARAM(0);
+ const int width = block_size_wide[block_idx];
+ const int height = block_size_high[block_idx];
+ DECLARE_ALIGNED(16, uint8_t, mask_ref[2 * MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, mask_test[2 * MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]);
+
+ ConvolveParams conv_params =
+ get_conv_params_no_round(0, 0, nullptr, 0, 1, bd);
+
+ const int in_precision =
+ bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2;
+
+ for (int i = 0; i < MAX_SB_SQUARE; i++) {
+ src0[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
+ src1[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
+ }
- for (int r = 0; r < height; ++r) {
- for (int c = 0; c < width; ++c) {
- ASSERT_EQ(mask_ref[c + r * width], mask_test[c + r * width])
- << "Mismatch at unit tests for BuildCompDiffwtdMaskD16Test\n"
- << " Pixel mismatch at index "
- << "[" << r << "," << c << "] "
- << " @ " << width << "x" << height << " inv " << mask_type;
+ for (int mask_type = 0; mask_type < DIFFWTD_MASK_TYPES; mask_type++) {
+ av1_build_compound_diffwtd_mask_d16_c(
+ mask_ref, (DIFFWTD_MASK_TYPE)mask_type, src0, width, src1, width,
+ height, width, &conv_params, bd);
+
+ test_impl(mask_test, (DIFFWTD_MASK_TYPE)mask_type, src0, width, src1,
+ width, height, width, &conv_params, bd);
+
+ for (int r = 0; r < height; ++r) {
+ for (int c = 0; c < width; ++c) {
+ ASSERT_EQ(mask_ref[c + r * width], mask_test[c + r * width])
+ << "Mismatch at unit tests for BuildCompDiffwtdMaskD16Test\n"
+ << " Pixel mismatch at index "
+ << "[" << r << "," << c << "] "
+ << " @ " << width << "x" << height << " inv " << mask_type;
+ }
}
}
}
-}
-void BuildCompDiffwtdMaskD16Test::RunSpeedTest(
- buildcompdiffwtdmaskd16_func test_impl, DIFFWTD_MASK_TYPE mask_type) {
- const int block_idx = GET_PARAM(2);
- const int bd = GET_PARAM(0);
- const int width = block_size_wide[block_idx];
- const int height = block_size_high[block_idx];
- DECLARE_ALIGNED(16, uint8_t, mask[MAX_SB_SQUARE]);
- DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]);
- DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]);
-
- ConvolveParams conv_params =
- get_conv_params_no_round(0, 0, nullptr, 0, 1, bd);
-
- int in_precision =
- bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2;
-
- for (int i = 0; i < MAX_SB_SQUARE; i++) {
- src0[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
- src1[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
- }
+ void RunSpeedTest(BuildCompDiffWtdMaskD16Func test_impl,
+ DIFFWTD_MASK_TYPE mask_type) {
+ const int block_idx = GET_PARAM(2);
+ const int bd = GET_PARAM(0);
+ const int width = block_size_wide[block_idx];
+ const int height = block_size_high[block_idx];
+ DECLARE_ALIGNED(16, uint8_t, mask[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]);
+
+ ConvolveParams conv_params =
+ get_conv_params_no_round(0, 0, nullptr, 0, 1, bd);
+
+ const int in_precision =
+ bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2;
+
+ for (int i = 0; i < MAX_SB_SQUARE; i++) {
+ src0[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
+ src1[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
+ }
- const int num_loops = 10000000 / (width + height);
- aom_usec_timer timer;
- aom_usec_timer_start(&timer);
+ const int num_loops = 10000000 / (width + height);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
- for (int i = 0; i < num_loops; ++i)
- av1_build_compound_diffwtd_mask_d16_c(mask, mask_type, src0, width, src1,
- width, height, width, &conv_params,
- bd);
+ for (int i = 0; i < num_loops; ++i)
+ av1_build_compound_diffwtd_mask_d16_c(mask, mask_type, src0, width, src1,
+ width, height, width, &conv_params,
+ bd);
- aom_usec_timer_mark(&timer);
- const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
- aom_usec_timer timer1;
- aom_usec_timer_start(&timer1);
+ aom_usec_timer timer1;
+ aom_usec_timer_start(&timer1);
- for (int i = 0; i < num_loops; ++i)
- test_impl(mask, mask_type, src0, width, src1, width, height, width,
- &conv_params, bd);
+ for (int i = 0; i < num_loops; ++i)
+ test_impl(mask, mask_type, src0, width, src1, width, height, width,
+ &conv_params, bd);
- aom_usec_timer_mark(&timer1);
- const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
- printf("av1_build_compound_diffwtd_mask_d16 %3dx%-3d: %7.2f \n", width,
- height, elapsed_time / double(elapsed_time1));
-}
-#if HAVE_SSE4_1
-void BuildCompDiffwtdMaskTest::RunTest(buildcompdiffwtdmaskd_func test_impl,
- const int is_speed,
- const DIFFWTD_MASK_TYPE type) {
- const int sb_type = GET_PARAM(0);
- const int width = block_size_wide[sb_type];
- const int height = block_size_high[sb_type];
- DECLARE_ALIGNED(16, uint8_t, mask_ref[MAX_SB_SQUARE]);
- DECLARE_ALIGNED(16, uint8_t, mask_test[MAX_SB_SQUARE]);
- DECLARE_ALIGNED(16, uint8_t, src0[MAX_SB_SQUARE]);
- DECLARE_ALIGNED(16, uint8_t, src1[MAX_SB_SQUARE]);
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- for (int i = 0; i < width * height; i++) {
- src0[i] = rnd.Rand8();
- src1[i] = rnd.Rand8();
- }
- const int run_times = is_speed ? (10000000 / (width + height)) : 1;
- aom_usec_timer timer;
- aom_usec_timer_start(&timer);
- for (int i = 0; i < run_times; ++i) {
- av1_build_compound_diffwtd_mask_c(mask_ref, type, src0, width, src1, width,
- height, width);
- }
- const double t1 = get_time_mark(&timer);
- aom_usec_timer_start(&timer);
- for (int i = 0; i < run_times; ++i) {
- test_impl(mask_test, type, src0, width, src1, width, height, width);
- }
- const double t2 = get_time_mark(&timer);
- if (is_speed) {
- printf("mask %d %3dx%-3d:%7.2f/%7.2fns", type, width, height, t1, t2);
- printf("(%3.2f)\n", t1 / t2);
- }
- for (int r = 0; r < height; ++r) {
- for (int c = 0; c < width; ++c) {
- ASSERT_EQ(mask_ref[c + r * width], mask_test[c + r * width])
- << "[" << r << "," << c << "] " << run_times << " @ " << width << "x"
- << height << " inv " << type;
- }
+ aom_usec_timer_mark(&timer1);
+ const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+ printf("av1_build_compound_diffwtd_mask_d16 %3dx%-3d: %7.2f \n", width,
+ height, elapsed_time / double(elapsed_time1));
}
-}
-TEST_P(BuildCompDiffwtdMaskTest, match) {
- RunTest(GET_PARAM(1), 0, DIFFWTD_38);
- RunTest(GET_PARAM(1), 0, DIFFWTD_38_INV);
-}
-TEST_P(BuildCompDiffwtdMaskTest, DISABLED_Speed) {
- RunTest(GET_PARAM(1), 1, DIFFWTD_38);
- RunTest(GET_PARAM(1), 1, DIFFWTD_38_INV);
-}
-#endif
+ private:
+ ACMRandom rnd_;
+}; // class BuildCompDiffwtdMaskD16Test
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BuildCompDiffwtdMaskD16Test);
+
TEST_P(BuildCompDiffwtdMaskD16Test, CheckOutput) {
RunCheckOutput(GET_PARAM(1));
}
@@ -237,18 +354,12 @@ TEST_P(BuildCompDiffwtdMaskD16Test, DISABLED_Speed) {
}
#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(SSE4_1, BuildCompDiffwtdMaskTest,
- BuildParams(av1_build_compound_diffwtd_mask_sse4_1));
-
INSTANTIATE_TEST_SUITE_P(
SSE4_1, BuildCompDiffwtdMaskD16Test,
BuildParams(av1_build_compound_diffwtd_mask_d16_sse4_1));
#endif
#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(AVX2, BuildCompDiffwtdMaskTest,
- BuildParams(av1_build_compound_diffwtd_mask_avx2));
-
INSTANTIATE_TEST_SUITE_P(AVX2, BuildCompDiffwtdMaskD16Test,
BuildParams(av1_build_compound_diffwtd_mask_d16_avx2));
#endif
diff --git a/test/resize_test.cc b/test/resize_test.cc
index 437b8e080..7bad45300 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -186,7 +186,7 @@ class ResizingVideoSource : public ::libaom_test::DummyVideoSource {
}
int flag_codec_;
bool change_start_resln_;
- virtual ~ResizingVideoSource() {}
+ ~ResizingVideoSource() override = default;
protected:
void Begin() override {
@@ -215,12 +215,12 @@ class ResizeTest
protected:
ResizeTest() : EncoderTest(GET_PARAM(0)) {}
- virtual ~ResizeTest() {}
+ ~ResizeTest() override = default;
- virtual void SetUp() { InitializeConfig(GET_PARAM(1)); }
+ void SetUp() override { InitializeConfig(GET_PARAM(1)); }
- virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
- libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(libaom_test::VideoSource *video,
+ libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
if (GET_PARAM(1) == ::libaom_test::kRealTime) {
encoder->Control(AV1E_SET_AQ_MODE, 3);
@@ -230,8 +230,8 @@ class ResizeTest
}
}
- virtual void DecompressedFrameHook(const aom_image_t &img,
- aom_codec_pts_t pts) {
+ void DecompressedFrameHook(const aom_image_t &img,
+ aom_codec_pts_t pts) override {
frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
}
@@ -279,15 +279,15 @@ class ResizeInternalTestLarge : public ResizeTest {
ResizeInternalTestLarge() : ResizeTest(), frame0_psnr_(0.0) {}
#endif
- virtual ~ResizeInternalTestLarge() {}
+ ~ResizeInternalTestLarge() override = default;
- virtual void BeginPassHook(unsigned int /*pass*/) {
+ void BeginPassHook(unsigned int /*pass*/) override {
#if WRITE_COMPRESSED_STREAM
outfile_ = fopen("av10-2-05-resize.ivf", "wb");
#endif
}
- virtual void EndPassHook() {
+ void EndPassHook() override {
#if WRITE_COMPRESSED_STREAM
if (outfile_) {
if (!fseek(outfile_, 0, SEEK_SET))
@@ -298,8 +298,8 @@ class ResizeInternalTestLarge : public ResizeTest {
#endif
}
- virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
- libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(libaom_test::VideoSource *video,
+ libaom_test::Encoder *encoder) override {
if (change_config_) {
int new_q = 60;
if (video->frame() == 0) {
@@ -323,13 +323,13 @@ class ResizeInternalTestLarge : public ResizeTest {
}
}
- virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 4.1);
}
#if WRITE_COMPRESSED_STREAM
- virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
++out_frames_;
// Write initial file header if first frame.
@@ -402,11 +402,12 @@ class ResizeRealtimeTest
protected:
ResizeRealtimeTest()
: EncoderTest(GET_PARAM(0)), num_threads_(GET_PARAM(3)),
- set_scale_mode_(false), set_scale_mode2_(false) {}
- virtual ~ResizeRealtimeTest() {}
+ set_scale_mode_(false), set_scale_mode2_(false),
+ set_scale_mode3_(false) {}
+ ~ResizeRealtimeTest() override = default;
- virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
- libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(libaom_test::VideoSource *video,
+ libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AV1E_SET_AQ_MODE, 3);
encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0);
@@ -433,6 +434,13 @@ class ResizeRealtimeTest
else if (video->frame() > 40)
mode = { AOME_THREEFOUR, AOME_THREEFOUR };
encoder->Control(AOME_SET_SCALEMODE, &mode);
+ } else if (set_scale_mode3_) {
+ struct aom_scaling_mode mode;
+ if (video->frame() <= 30)
+ mode = { AOME_ONETWO, AOME_NORMAL };
+ else
+ mode = { AOME_NORMAL, AOME_NORMAL };
+ encoder->Control(AOME_SET_SCALEMODE, &mode);
}
if (change_bitrate_ && video->frame() == frame_change_bitrate_) {
@@ -442,17 +450,17 @@ class ResizeRealtimeTest
}
}
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(GET_PARAM(1));
set_cpu_used_ = GET_PARAM(2);
}
- virtual void DecompressedFrameHook(const aom_image_t &img,
- aom_codec_pts_t pts) {
+ void DecompressedFrameHook(const aom_image_t &img,
+ aom_codec_pts_t pts) override {
frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
}
- virtual void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) {
+ void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) override {
double mismatch_psnr = compute_psnr(img1, img2);
mismatch_psnr_ += mismatch_psnr;
++mismatch_nframes_;
@@ -483,7 +491,7 @@ class ResizeRealtimeTest
// the width and height of the frame are swapped
cfg_.g_forced_max_frame_width = cfg_.g_forced_max_frame_height =
AOMMAX(kInitialWidth, kInitialHeight);
- if (set_scale_mode_ || set_scale_mode2_) {
+ if (set_scale_mode_ || set_scale_mode2_ || set_scale_mode3_) {
cfg_.rc_dropframe_thresh = 0;
cfg_.g_forced_max_frame_width = 1280;
cfg_.g_forced_max_frame_height = 1280;
@@ -499,6 +507,7 @@ class ResizeRealtimeTest
int mismatch_nframes_;
bool set_scale_mode_;
bool set_scale_mode2_;
+ bool set_scale_mode3_;
};
// Check the AOME_SET_SCALEMODE control by downsizing to
@@ -509,6 +518,7 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeSetScaleMode1) {
cfg_.g_h = 720;
set_scale_mode_ = true;
set_scale_mode2_ = false;
+ set_scale_mode3_ = false;
DefaultConfig();
change_bitrate_ = false;
mismatch_nframes_ = 0;
@@ -544,6 +554,7 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeSetScaleMode1QVGA) {
cfg_.g_h = 180;
set_scale_mode_ = true;
set_scale_mode2_ = false;
+ set_scale_mode3_ = false;
DefaultConfig();
change_bitrate_ = false;
mismatch_nframes_ = 0;
@@ -578,6 +589,7 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeSetScaleMode2) {
cfg_.g_h = 720;
set_scale_mode_ = false;
set_scale_mode2_ = true;
+ set_scale_mode3_ = false;
DefaultConfig();
change_bitrate_ = false;
mismatch_nframes_ = 0;
@@ -604,12 +616,45 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeSetScaleMode2) {
}
}
+// Check the AOME_SET_SCALEMODE control by downsizing to
+// 1/2 horizontally only and then back up to the original size.
+TEST_P(ResizeRealtimeTest, TestInternalResizeSetScaleMode3) {
+ ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+ cfg_.g_w = 1280;
+ cfg_.g_h = 720;
+ set_scale_mode_ = false;
+ set_scale_mode2_ = false;
+ set_scale_mode3_ = true;
+ DefaultConfig();
+ change_bitrate_ = false;
+ mismatch_nframes_ = 0;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Check that we decoded the same number of frames as we attempted to encode.
+ ASSERT_EQ(frame_info_list_.size(), video.limit());
+ for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+ info != frame_info_list_.end(); ++info) {
+ const auto frame = static_cast<unsigned>(info->pts);
+ unsigned int expected_w = 640;
+ unsigned int expected_h = 720;
+ if (frame > 30) {
+ expected_w = 1280;
+ expected_h = 720;
+ }
+ EXPECT_EQ(expected_w, info->w)
+ << "Frame " << frame << " had unexpected width";
+ EXPECT_EQ(expected_h, info->h)
+ << "Frame " << frame << " had unexpected height";
+ EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+ }
+}
+
TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
ResizingVideoSource video;
video.flag_codec_ = 1;
change_bitrate_ = false;
set_scale_mode_ = false;
set_scale_mode2_ = false;
+ set_scale_mode3_ = false;
mismatch_psnr_ = 0.0;
mismatch_nframes_ = 0;
DefaultConfig();
@@ -651,6 +696,7 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
change_bitrate_ = false;
set_scale_mode_ = false;
set_scale_mode2_ = false;
+ set_scale_mode3_ = false;
mismatch_psnr_ = 0.0;
mismatch_nframes_ = 0;
DefaultConfig();
@@ -700,6 +746,7 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
frame_change_bitrate_ = 120;
set_scale_mode_ = false;
set_scale_mode2_ = false;
+ set_scale_mode3_ = false;
mismatch_psnr_ = 0.0;
mismatch_nframes_ = 0;
DefaultConfig();
@@ -757,15 +804,15 @@ class ResizeCspTest : public ResizeTest {
ResizeCspTest() : ResizeTest(), frame0_psnr_(0.0) {}
#endif
- virtual ~ResizeCspTest() {}
+ ~ResizeCspTest() override = default;
- virtual void BeginPassHook(unsigned int /*pass*/) {
+ void BeginPassHook(unsigned int /*pass*/) override {
#if WRITE_COMPRESSED_STREAM
outfile_ = fopen("av11-2-05-cspchape.ivf", "wb");
#endif
}
- virtual void EndPassHook() {
+ void EndPassHook() override {
#if WRITE_COMPRESSED_STREAM
if (outfile_) {
if (!fseek(outfile_, 0, SEEK_SET))
@@ -776,13 +823,13 @@ class ResizeCspTest : public ResizeTest {
#endif
}
- virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
}
#if WRITE_COMPRESSED_STREAM
- virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
++out_frames_;
// Write initial file header if first frame.
@@ -809,7 +856,7 @@ class ResizingCspVideoSource : public ::libaom_test::DummyVideoSource {
limit_ = 30;
}
- virtual ~ResizingCspVideoSource() {}
+ ~ResizingCspVideoSource() override = default;
};
#if (defined(DISABLE_TRELLISQ_SEARCH) && DISABLE_TRELLISQ_SEARCH) || \
@@ -845,9 +892,9 @@ class ResizeModeTestLarge
: EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
resize_mode_(GET_PARAM(2)), resize_denominator_(GET_PARAM(3)),
resize_kf_denominator_(GET_PARAM(4)), cpu_used_(GET_PARAM(5)) {}
- virtual ~ResizeModeTestLarge() {}
+ ~ResizeModeTestLarge() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(encoding_mode_);
const aom_rational timebase = { 1, 30 };
cfg_.g_timebase = timebase;
@@ -861,8 +908,8 @@ class ResizeModeTestLarge
init_flags_ = AOM_CODEC_USE_PSNR;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, cpu_used_);
encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
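The new scale mode 3 path pairs AOME_ONETWO with AOME_NORMAL, which halves the
horizontal dimension while leaving the vertical dimension untouched. A minimal
sketch of the dimension schedule the test asserts for the 1280x720 input; the
helper name is illustrative and not part of the patch:

// Expected decoded size per frame for TestInternalResizeSetScaleMode3:
// frames 0..30 are encoded at half width, later frames at full size.
static void ExpectedScaleMode3Size(unsigned int frame, unsigned int *w,
                                   unsigned int *h) {
  *w = (frame <= 30) ? 1280 / 2 : 1280;
  *h = 720;  // AOME_NORMAL: the height is never scaled in this test
}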
diff --git a/test/rt_end_to_end_test.cc b/test/rt_end_to_end_test.cc
index 735d799a4..f1f9e019c 100644
--- a/test/rt_end_to_end_test.cc
+++ b/test/rt_end_to_end_test.cc
@@ -94,9 +94,9 @@ class RTEndToEndTest
aq_mode_(GET_PARAM(3)), threads_(GET_PARAM(4)),
tile_columns_(GET_PARAM(5)), tile_rows_(GET_PARAM(6)) {}
- virtual ~RTEndToEndTest() {}
+ ~RTEndToEndTest() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(::libaom_test::kRealTime);
cfg_.g_threads = threads_;
@@ -107,18 +107,18 @@ class RTEndToEndTest
cfg_.kf_min_dist = 9999;
}
- virtual void BeginPassHook(unsigned int) {
+ void BeginPassHook(unsigned int) override {
psnr_ = 0.0;
nframes_ = 0;
}
- virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
psnr_ += pkt->data.psnr.psnr[0];
nframes_++;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AV1E_SET_ENABLE_RESTORATION, 0);
encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
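Most of the churn in these test files is the same mechanical cleanup: member
functions that override an EncoderTest hook drop the redundant virtual keyword
and gain override, and empty user-declared destructors become defaulted. A
minimal sketch of the idiom, with placeholder class names:

// The shape of the cleanup applied throughout this commit.
struct BaseHooks {
  virtual ~BaseHooks() = default;
  virtual void SetUp() {}
};
class DerivedTest : public BaseHooks {
 public:
  ~DerivedTest() override = default;  // was: virtual ~DerivedTest() {}
  void SetUp() override {}  // override turns a signature mismatch into a
                            // compile error instead of a silent new virtual
};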
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 0a39ca6d4..521274863 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -39,12 +39,6 @@ typedef uint32_t (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride,
const uint8_t *second_pred);
typedef std::tuple<int, int, SadMxNAvgFunc, int> SadMxNAvgParam;
-typedef void (*DistWtdCompAvgFunc)(uint8_t *comp_pred, const uint8_t *pred,
- int width, int height, const uint8_t *ref,
- int ref_stride,
- const DIST_WTD_COMP_PARAMS *jcp_param);
-typedef std::tuple<int, int, DistWtdCompAvgFunc, int> DistWtdCompAvgParam;
-
typedef unsigned int (*DistWtdSadMxhFunc)(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
@@ -138,15 +132,13 @@ class SADTestBase : public ::testing::Test {
comp_pred16_test_ = nullptr;
}
- virtual void TearDown() {}
-
protected:
// Handle up to 4 128x128 blocks, with stride up to 256
static const int kDataAlignment = 16;
static const int kDataBlockSize = 128 * 256;
static const int kDataBufferSize = 4 * kDataBlockSize;
- virtual void SetUp() {
+ void SetUp() override {
if (bd_ == -1) {
use_high_bit_depth_ = false;
bit_depth_ = AOM_BITS_8;
@@ -255,31 +247,6 @@ class SADTestBase : public ::testing::Test {
return sad;
}
- void ReferenceDistWtdCompAvg(int block_idx) {
- const uint8_t *const reference8 = GetReference(block_idx);
- const uint8_t *const second_pred8 = second_pred_;
- uint8_t *const comp_pred8 = comp_pred_;
- const uint16_t *const reference16 =
- CONVERT_TO_SHORTPTR(GetReference(block_idx));
- const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_);
- uint16_t *const comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred_);
- for (int h = 0; h < height_; ++h) {
- for (int w = 0; w < width_; ++w) {
- if (!use_high_bit_depth_) {
- const int tmp =
- second_pred8[h * width_ + w] * jcp_param_.bck_offset +
- reference8[h * reference_stride_ + w] * jcp_param_.fwd_offset;
- comp_pred8[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4);
- } else {
- const int tmp =
- second_pred16[h * width_ + w] * jcp_param_.bck_offset +
- reference16[h * reference_stride_ + w] * jcp_param_.fwd_offset;
- comp_pred16[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4);
- }
- }
- }
- }
-
unsigned int ReferenceDistWtdSADavg(int block_idx) {
unsigned int sad = 0;
const uint8_t *const reference8 = GetReference(block_idx);
@@ -401,7 +368,7 @@ class SADx4Test : public ::testing::WithParamInterface<SadMxNx4Param>,
}
void SADForSpeedTest(unsigned int *results,
- const uint8_t *const *references) {
+ const uint8_t *const *references) override {
GET_PARAM(2)
(source_data_, source_stride_, references, reference_stride_, results);
}
@@ -432,7 +399,7 @@ class SADx3Test : public ::testing::WithParamInterface<SadMxNx4Param>,
}
void SADForSpeedTest(unsigned int *results,
- const uint8_t *const *references) {
+ const uint8_t *const *references) override {
GET_PARAM(2)
(source_data_, source_stride_, references, reference_stride_, results);
}
@@ -475,7 +442,7 @@ class SADSkipx4Test : public ::testing::WithParamInterface<SadMxNx4Param>,
}
void SADForSpeedTest(unsigned int *results,
- const uint8_t *const *references) {
+ const uint8_t *const *references) override {
GET_PARAM(2)
(source_data_, source_stride_, references, reference_stride_, results);
}
@@ -504,7 +471,7 @@ class SADTest : public ::testing::WithParamInterface<SadMxNParam>,
}
void SADForSpeedTest(unsigned int *results,
- const uint8_t *const *references) {
+ const uint8_t *const *references) override {
GET_PARAM(2)
(source_data_, source_stride_, references[0], reference_stride_);
(void)results;
@@ -534,7 +501,7 @@ class SADSkipTest : public ::testing::WithParamInterface<SadMxNParam>,
}
void SADForSpeedTest(unsigned int *results,
- const uint8_t *const *references) {
+ const uint8_t *const *references) override {
GET_PARAM(2)
(source_data_, source_stride_, references[0], reference_stride_);
(void)results;
@@ -565,40 +532,6 @@ class SADavgTest : public ::testing::WithParamInterface<SadMxNAvgParam>,
}
};
-class DistWtdCompAvgTest
- : public ::testing::WithParamInterface<DistWtdCompAvgParam>,
- public SADTestBase {
- public:
- DistWtdCompAvgTest()
- : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
-
- protected:
- void dist_wtd_comp_avg(int block_idx) {
- const uint8_t *const reference = GetReference(block_idx);
-
- API_REGISTER_STATE_CHECK(GET_PARAM(2)(comp_pred_test_, second_pred_, width_,
- height_, reference, reference_stride_,
- &jcp_param_));
- }
-
- void CheckCompAvg() {
- for (int j = 0; j < 2; ++j) {
- for (int i = 0; i < 4; ++i) {
- jcp_param_.fwd_offset = quant_dist_lookup_table[i][j];
- jcp_param_.bck_offset = quant_dist_lookup_table[i][1 - j];
-
- ReferenceDistWtdCompAvg(0);
- dist_wtd_comp_avg(0);
-
- for (int y = 0; y < height_; ++y)
- for (int x = 0; x < width_; ++x)
- ASSERT_EQ(comp_pred_[y * width_ + x],
- comp_pred_test_[y * width_ + x]);
- }
- }
- }
-};
-
class DistWtdSADavgTest
: public ::testing::WithParamInterface<DistWtdSadMxNAvgParam>,
public SADTestBase {
@@ -807,38 +740,6 @@ TEST_P(SADavgTest, ShortSrc) {
source_stride_ = tmp_stride;
}
-TEST_P(DistWtdCompAvgTest, MaxRef) {
- FillConstant(reference_data_, reference_stride_, mask_);
- FillConstant(second_pred_, width_, 0);
- CheckCompAvg();
-}
-
-TEST_P(DistWtdCompAvgTest, MaxSecondPred) {
- FillConstant(reference_data_, reference_stride_, 0);
- FillConstant(second_pred_, width_, mask_);
- CheckCompAvg();
-}
-
-TEST_P(DistWtdCompAvgTest, ShortRef) {
- const int tmp_stride = reference_stride_;
- reference_stride_ >>= 1;
- FillRandom(reference_data_, reference_stride_);
- FillRandom(second_pred_, width_);
- CheckCompAvg();
- reference_stride_ = tmp_stride;
-}
-
-TEST_P(DistWtdCompAvgTest, UnalignedRef) {
- // The reference frame, but not the source frame, may be unaligned for
- // certain types of searches.
- const int tmp_stride = reference_stride_;
- reference_stride_ -= 1;
- FillRandom(reference_data_, reference_stride_);
- FillRandom(second_pred_, width_);
- CheckCompAvg();
- reference_stride_ = tmp_stride;
-}
-
TEST_P(DistWtdSADavgTest, MaxRef) {
FillConstant(source_data_, source_stride_, 0);
FillConstant(reference_data_, reference_stride_, mask_);
@@ -1445,38 +1346,6 @@ const SadMxNAvgParam avg_c_tests[] = {
};
INSTANTIATE_TEST_SUITE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests));
-// TODO(chengchen): add highbd tests
-const DistWtdCompAvgParam dist_wtd_comp_avg_c_tests[] = {
- make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_c, -1),
- make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
- make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_c, -1),
- make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
- make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
- make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
- make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
- make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
- make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
- make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
- make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
- make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
- make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
- make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
- make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
- make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
-
-#if !CONFIG_REALTIME_ONLY
- make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
- make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
- make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
- make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
- make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
- make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
-#endif
-};
-
-INSTANTIATE_TEST_SUITE_P(C, DistWtdCompAvgTest,
- ::testing::ValuesIn(dist_wtd_comp_avg_c_tests));
-
const DistWtdSadMxNAvgParam dist_wtd_avg_c_tests[] = {
make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_c, -1),
make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_c, -1),
@@ -2227,6 +2096,56 @@ const SadMxNAvgParam avg_neon_tests[] = {
make_tuple(8, 4, &aom_sad8x4_avg_neon, -1),
make_tuple(4, 8, &aom_sad4x8_avg_neon, -1),
make_tuple(4, 4, &aom_sad4x4_avg_neon, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(128, 128, &aom_highbd_sad128x128_avg_neon, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avg_neon, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avg_neon, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_neon, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_neon, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_neon, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_neon, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_neon, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_neon, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_neon, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_neon, 8),
+ make_tuple(8, 16, &aom_highbd_sad8x16_avg_neon, 8),
+ make_tuple(8, 8, &aom_highbd_sad8x8_avg_neon, 8),
+ make_tuple(8, 4, &aom_highbd_sad8x4_avg_neon, 8),
+ make_tuple(4, 8, &aom_highbd_sad4x8_avg_neon, 8),
+ make_tuple(4, 4, &aom_highbd_sad4x4_avg_neon, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128_avg_neon, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avg_neon, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avg_neon, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_neon, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_neon, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_neon, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_neon, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_neon, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_neon, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_neon, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_neon, 10),
+ make_tuple(8, 16, &aom_highbd_sad8x16_avg_neon, 10),
+ make_tuple(8, 8, &aom_highbd_sad8x8_avg_neon, 10),
+ make_tuple(8, 4, &aom_highbd_sad8x4_avg_neon, 10),
+ make_tuple(4, 8, &aom_highbd_sad4x8_avg_neon, 10),
+ make_tuple(4, 4, &aom_highbd_sad4x4_avg_neon, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128_avg_neon, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avg_neon, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avg_neon, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_neon, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_neon, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_neon, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_neon, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_neon, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_neon, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_neon, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_neon, 12),
+ make_tuple(8, 16, &aom_highbd_sad8x16_avg_neon, 12),
+ make_tuple(8, 8, &aom_highbd_sad8x8_avg_neon, 12),
+ make_tuple(8, 4, &aom_highbd_sad8x4_avg_neon, 12),
+ make_tuple(4, 8, &aom_highbd_sad4x8_avg_neon, 12),
+ make_tuple(4, 4, &aom_highbd_sad4x4_avg_neon, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
#if !CONFIG_REALTIME_ONLY
make_tuple(64, 16, &aom_sad64x16_avg_neon, -1),
make_tuple(32, 8, &aom_sad32x8_avg_neon, -1),
@@ -2234,10 +2153,61 @@ const SadMxNAvgParam avg_neon_tests[] = {
make_tuple(16, 4, &aom_sad16x4_avg_neon, -1),
make_tuple(8, 32, &aom_sad8x32_avg_neon, -1),
make_tuple(4, 16, &aom_sad4x16_avg_neon, -1),
-#endif
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(64, 16, &aom_highbd_sad64x16_avg_neon, 8),
+ make_tuple(16, 64, &aom_highbd_sad16x64_avg_neon, 8),
+ make_tuple(32, 8, &aom_highbd_sad32x8_avg_neon, 8),
+ make_tuple(8, 32, &aom_highbd_sad8x32_avg_neon, 8),
+ make_tuple(16, 4, &aom_highbd_sad16x4_avg_neon, 8),
+ make_tuple(4, 16, &aom_highbd_sad4x16_avg_neon, 8),
+ make_tuple(64, 16, &aom_highbd_sad64x16_avg_neon, 10),
+ make_tuple(16, 64, &aom_highbd_sad16x64_avg_neon, 10),
+ make_tuple(32, 8, &aom_highbd_sad32x8_avg_neon, 10),
+ make_tuple(8, 32, &aom_highbd_sad8x32_avg_neon, 10),
+ make_tuple(16, 4, &aom_highbd_sad16x4_avg_neon, 10),
+ make_tuple(4, 16, &aom_highbd_sad4x16_avg_neon, 10),
+ make_tuple(64, 16, &aom_highbd_sad64x16_avg_neon, 12),
+ make_tuple(16, 64, &aom_highbd_sad16x64_avg_neon, 12),
+ make_tuple(32, 8, &aom_highbd_sad32x8_avg_neon, 12),
+ make_tuple(8, 32, &aom_highbd_sad8x32_avg_neon, 12),
+ make_tuple(16, 4, &aom_highbd_sad16x4_avg_neon, 12),
+ make_tuple(4, 16, &aom_highbd_sad4x16_avg_neon, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
};
INSTANTIATE_TEST_SUITE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests));
+const DistWtdSadMxNAvgParam dist_wtd_avg_neon_tests[] = {
+ make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_neon, -1),
+ make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_neon, -1),
+ make_tuple(64, 128, &aom_dist_wtd_sad64x128_avg_neon, -1),
+ make_tuple(64, 64, &aom_dist_wtd_sad64x64_avg_neon, -1),
+ make_tuple(64, 32, &aom_dist_wtd_sad64x32_avg_neon, -1),
+ make_tuple(32, 64, &aom_dist_wtd_sad32x64_avg_neon, -1),
+ make_tuple(32, 32, &aom_dist_wtd_sad32x32_avg_neon, -1),
+ make_tuple(32, 16, &aom_dist_wtd_sad32x16_avg_neon, -1),
+ make_tuple(16, 32, &aom_dist_wtd_sad16x32_avg_neon, -1),
+ make_tuple(16, 16, &aom_dist_wtd_sad16x16_avg_neon, -1),
+ make_tuple(16, 8, &aom_dist_wtd_sad16x8_avg_neon, -1),
+ make_tuple(8, 16, &aom_dist_wtd_sad8x16_avg_neon, -1),
+ make_tuple(8, 8, &aom_dist_wtd_sad8x8_avg_neon, -1),
+ make_tuple(8, 4, &aom_dist_wtd_sad8x4_avg_neon, -1),
+ make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_neon, -1),
+ make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_neon, -1),
+
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_dist_wtd_sad64x16_avg_neon, -1),
+ make_tuple(16, 64, &aom_dist_wtd_sad16x64_avg_neon, -1),
+ make_tuple(32, 8, &aom_dist_wtd_sad32x8_avg_neon, -1),
+ make_tuple(8, 32, &aom_dist_wtd_sad8x32_avg_neon, -1),
+ make_tuple(16, 4, &aom_dist_wtd_sad16x4_avg_neon, -1),
+ make_tuple(4, 16, &aom_dist_wtd_sad4x16_avg_neon, -1),
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, DistWtdSADavgTest,
+ ::testing::ValuesIn(dist_wtd_avg_neon_tests));
+
const SadMxNx4Param x3d_neon_tests[] = {
make_tuple(128, 128, &aom_sad128x128x3d_neon, -1),
make_tuple(128, 64, &aom_sad128x64x3d_neon, -1),
@@ -2255,6 +2225,56 @@ const SadMxNx4Param x3d_neon_tests[] = {
make_tuple(8, 4, &aom_sad8x4x3d_neon, -1),
make_tuple(4, 8, &aom_sad4x8x3d_neon, -1),
make_tuple(4, 4, &aom_sad4x4x3d_neon, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_neon, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_neon, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_neon, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_neon, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_neon, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_neon, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_neon, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_neon, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_neon, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_neon, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_neon, 8),
+ make_tuple(8, 16, &aom_highbd_sad8x16x3d_neon, 8),
+ make_tuple(8, 8, &aom_highbd_sad8x8x3d_neon, 8),
+ make_tuple(8, 4, &aom_highbd_sad8x4x3d_neon, 8),
+ make_tuple(4, 8, &aom_highbd_sad4x8x3d_neon, 8),
+ make_tuple(4, 4, &aom_highbd_sad4x4x3d_neon, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_neon, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_neon, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_neon, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_neon, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_neon, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_neon, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_neon, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_neon, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_neon, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_neon, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_neon, 10),
+ make_tuple(8, 16, &aom_highbd_sad8x16x3d_neon, 10),
+ make_tuple(8, 8, &aom_highbd_sad8x8x3d_neon, 10),
+ make_tuple(8, 4, &aom_highbd_sad8x4x3d_neon, 10),
+ make_tuple(4, 8, &aom_highbd_sad4x8x3d_neon, 10),
+ make_tuple(4, 4, &aom_highbd_sad4x4x3d_neon, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_neon, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_neon, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_neon, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_neon, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_neon, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_neon, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_neon, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_neon, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_neon, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_neon, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_neon, 12),
+ make_tuple(8, 16, &aom_highbd_sad8x16x3d_neon, 12),
+ make_tuple(8, 8, &aom_highbd_sad8x8x3d_neon, 12),
+ make_tuple(8, 4, &aom_highbd_sad8x4x3d_neon, 12),
+ make_tuple(4, 8, &aom_highbd_sad4x8x3d_neon, 12),
+ make_tuple(4, 4, &aom_highbd_sad4x4x3d_neon, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
#if !CONFIG_REALTIME_ONLY
make_tuple(64, 16, &aom_sad64x16x3d_neon, -1),
make_tuple(32, 8, &aom_sad32x8x3d_neon, -1),
@@ -2262,12 +2282,189 @@ const SadMxNx4Param x3d_neon_tests[] = {
make_tuple(16, 4, &aom_sad16x4x3d_neon, -1),
make_tuple(8, 32, &aom_sad8x32x3d_neon, -1),
make_tuple(4, 16, &aom_sad4x16x3d_neon, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_neon, 8),
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_neon, 8),
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_neon, 8),
+ make_tuple(8, 32, &aom_highbd_sad8x32x3d_neon, 8),
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_neon, 8),
+ make_tuple(4, 16, &aom_highbd_sad4x16x3d_neon, 8),
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_neon, 10),
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_neon, 10),
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_neon, 10),
+ make_tuple(8, 32, &aom_highbd_sad8x32x3d_neon, 10),
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_neon, 10),
+ make_tuple(4, 16, &aom_highbd_sad4x16x3d_neon, 10),
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_neon, 12),
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_neon, 12),
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_neon, 12),
+ make_tuple(8, 32, &aom_highbd_sad8x32x3d_neon, 12),
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_neon, 12),
+ make_tuple(4, 16, &aom_highbd_sad4x16x3d_neon, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
#endif // !CONFIG_REALTIME_ONLY
};
INSTANTIATE_TEST_SUITE_P(NEON, SADx3Test, ::testing::ValuesIn(x3d_neon_tests));
#endif // HAVE_NEON
+#if HAVE_NEON_DOTPROD
+const SadMxNParam neon_dotprod_tests[] = {
+ make_tuple(128, 128, &aom_sad128x128_neon_dotprod, -1),
+ make_tuple(128, 64, &aom_sad128x64_neon_dotprod, -1),
+ make_tuple(64, 128, &aom_sad64x128_neon_dotprod, -1),
+ make_tuple(64, 64, &aom_sad64x64_neon_dotprod, -1),
+ make_tuple(64, 32, &aom_sad64x32_neon_dotprod, -1),
+ make_tuple(32, 64, &aom_sad32x64_neon_dotprod, -1),
+ make_tuple(32, 32, &aom_sad32x32_neon_dotprod, -1),
+ make_tuple(32, 16, &aom_sad32x16_neon_dotprod, -1),
+ make_tuple(16, 32, &aom_sad16x32_neon_dotprod, -1),
+ make_tuple(16, 16, &aom_sad16x16_neon_dotprod, -1),
+ make_tuple(16, 8, &aom_sad16x8_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_sad64x16_neon_dotprod, -1),
+ make_tuple(32, 8, &aom_sad32x8_neon_dotprod, -1),
+ make_tuple(16, 64, &aom_sad16x64_neon_dotprod, -1),
+ make_tuple(16, 4, &aom_sad16x4_neon_dotprod, -1),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADTest,
+ ::testing::ValuesIn(neon_dotprod_tests));
+
+const SadMxNParam skip_neon_dotprod_tests[] = {
+ make_tuple(128, 128, &aom_sad_skip_128x128_neon_dotprod, -1),
+ make_tuple(128, 64, &aom_sad_skip_128x64_neon_dotprod, -1),
+ make_tuple(64, 128, &aom_sad_skip_64x128_neon_dotprod, -1),
+ make_tuple(64, 64, &aom_sad_skip_64x64_neon_dotprod, -1),
+ make_tuple(64, 32, &aom_sad_skip_64x32_neon_dotprod, -1),
+ make_tuple(32, 64, &aom_sad_skip_32x64_neon_dotprod, -1),
+ make_tuple(32, 32, &aom_sad_skip_32x32_neon_dotprod, -1),
+ make_tuple(32, 16, &aom_sad_skip_32x16_neon_dotprod, -1),
+ make_tuple(16, 32, &aom_sad_skip_16x32_neon_dotprod, -1),
+ make_tuple(16, 16, &aom_sad_skip_16x16_neon_dotprod, -1),
+ make_tuple(16, 8, &aom_sad_skip_16x8_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_sad_skip_64x16_neon_dotprod, -1),
+ make_tuple(32, 8, &aom_sad_skip_32x8_neon_dotprod, -1),
+ make_tuple(16, 64, &aom_sad_skip_16x64_neon_dotprod, -1),
+ make_tuple(16, 4, &aom_sad_skip_16x4_neon_dotprod, -1),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipTest,
+ ::testing::ValuesIn(skip_neon_dotprod_tests));
+
+const SadMxNAvgParam avg_neon_dotprod_tests[] = {
+ make_tuple(128, 128, &aom_sad128x128_avg_neon_dotprod, -1),
+ make_tuple(128, 64, &aom_sad128x64_avg_neon_dotprod, -1),
+ make_tuple(64, 128, &aom_sad64x128_avg_neon_dotprod, -1),
+ make_tuple(64, 64, &aom_sad64x64_avg_neon_dotprod, -1),
+ make_tuple(64, 32, &aom_sad64x32_avg_neon_dotprod, -1),
+ make_tuple(32, 64, &aom_sad32x64_avg_neon_dotprod, -1),
+ make_tuple(32, 32, &aom_sad32x32_avg_neon_dotprod, -1),
+ make_tuple(32, 16, &aom_sad32x16_avg_neon_dotprod, -1),
+ make_tuple(16, 32, &aom_sad16x32_avg_neon_dotprod, -1),
+ make_tuple(16, 16, &aom_sad16x16_avg_neon_dotprod, -1),
+ make_tuple(16, 8, &aom_sad16x8_avg_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_sad64x16_avg_neon_dotprod, -1),
+ make_tuple(32, 8, &aom_sad32x8_avg_neon_dotprod, -1),
+ make_tuple(16, 64, &aom_sad16x64_avg_neon_dotprod, -1),
+ make_tuple(16, 4, &aom_sad16x4_avg_neon_dotprod, -1),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADavgTest,
+ ::testing::ValuesIn(avg_neon_dotprod_tests));
+
+const DistWtdSadMxNAvgParam dist_wtd_avg_neon_dotprod_tests[] = {
+ make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_neon_dotprod, -1),
+ make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_neon_dotprod, -1),
+ make_tuple(64, 128, &aom_dist_wtd_sad64x128_avg_neon_dotprod, -1),
+ make_tuple(64, 64, &aom_dist_wtd_sad64x64_avg_neon_dotprod, -1),
+ make_tuple(64, 32, &aom_dist_wtd_sad64x32_avg_neon_dotprod, -1),
+ make_tuple(32, 64, &aom_dist_wtd_sad32x64_avg_neon_dotprod, -1),
+ make_tuple(32, 32, &aom_dist_wtd_sad32x32_avg_neon_dotprod, -1),
+ make_tuple(32, 16, &aom_dist_wtd_sad32x16_avg_neon_dotprod, -1),
+ make_tuple(16, 32, &aom_dist_wtd_sad16x32_avg_neon_dotprod, -1),
+ make_tuple(16, 16, &aom_dist_wtd_sad16x16_avg_neon_dotprod, -1),
+ make_tuple(16, 8, &aom_dist_wtd_sad16x8_avg_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_dist_wtd_sad64x16_avg_neon_dotprod, -1),
+ make_tuple(16, 64, &aom_dist_wtd_sad16x64_avg_neon_dotprod, -1),
+ make_tuple(32, 8, &aom_dist_wtd_sad32x8_avg_neon_dotprod, -1),
+ make_tuple(16, 4, &aom_dist_wtd_sad16x4_avg_neon_dotprod, -1),
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, DistWtdSADavgTest,
+ ::testing::ValuesIn(dist_wtd_avg_neon_dotprod_tests));
+
+const SadMxNx4Param x3d_neon_dotprod_tests[] = {
+ make_tuple(128, 128, &aom_sad128x128x3d_neon_dotprod, -1),
+ make_tuple(128, 64, &aom_sad128x64x3d_neon_dotprod, -1),
+ make_tuple(64, 128, &aom_sad64x128x3d_neon_dotprod, -1),
+ make_tuple(64, 64, &aom_sad64x64x3d_neon_dotprod, -1),
+ make_tuple(64, 32, &aom_sad64x32x3d_neon_dotprod, -1),
+ make_tuple(32, 64, &aom_sad32x64x3d_neon_dotprod, -1),
+ make_tuple(32, 32, &aom_sad32x32x3d_neon_dotprod, -1),
+ make_tuple(32, 16, &aom_sad32x16x3d_neon_dotprod, -1),
+ make_tuple(16, 32, &aom_sad16x32x3d_neon_dotprod, -1),
+ make_tuple(16, 16, &aom_sad16x16x3d_neon_dotprod, -1),
+ make_tuple(16, 8, &aom_sad16x8x3d_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_sad64x16x3d_neon_dotprod, -1),
+ make_tuple(32, 8, &aom_sad32x8x3d_neon_dotprod, -1),
+ make_tuple(16, 64, &aom_sad16x64x3d_neon_dotprod, -1),
+ make_tuple(16, 4, &aom_sad16x4x3d_neon_dotprod, -1),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADx3Test,
+ ::testing::ValuesIn(x3d_neon_dotprod_tests));
+
+const SadMxNx4Param x4d_neon_dotprod_tests[] = {
+ make_tuple(128, 128, &aom_sad128x128x4d_neon_dotprod, -1),
+ make_tuple(128, 64, &aom_sad128x64x4d_neon_dotprod, -1),
+ make_tuple(64, 128, &aom_sad64x128x4d_neon_dotprod, -1),
+ make_tuple(64, 64, &aom_sad64x64x4d_neon_dotprod, -1),
+ make_tuple(64, 32, &aom_sad64x32x4d_neon_dotprod, -1),
+ make_tuple(32, 64, &aom_sad32x64x4d_neon_dotprod, -1),
+ make_tuple(32, 32, &aom_sad32x32x4d_neon_dotprod, -1),
+ make_tuple(32, 16, &aom_sad32x16x4d_neon_dotprod, -1),
+ make_tuple(16, 32, &aom_sad16x32x4d_neon_dotprod, -1),
+ make_tuple(16, 16, &aom_sad16x16x4d_neon_dotprod, -1),
+ make_tuple(16, 8, &aom_sad16x8x4d_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_sad64x16x4d_neon_dotprod, -1),
+ make_tuple(32, 8, &aom_sad32x8x4d_neon_dotprod, -1),
+ make_tuple(16, 64, &aom_sad16x64x4d_neon_dotprod, -1),
+ make_tuple(16, 4, &aom_sad16x4x4d_neon_dotprod, -1),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADx4Test,
+ ::testing::ValuesIn(x4d_neon_dotprod_tests));
+
+const SadSkipMxNx4Param skip_x4d_neon_dotprod_tests[] = {
+ make_tuple(128, 128, &aom_sad_skip_128x128x4d_neon_dotprod, -1),
+ make_tuple(128, 64, &aom_sad_skip_128x64x4d_neon_dotprod, -1),
+ make_tuple(64, 128, &aom_sad_skip_64x128x4d_neon_dotprod, -1),
+ make_tuple(64, 64, &aom_sad_skip_64x64x4d_neon_dotprod, -1),
+ make_tuple(64, 32, &aom_sad_skip_64x32x4d_neon_dotprod, -1),
+ make_tuple(32, 64, &aom_sad_skip_32x64x4d_neon_dotprod, -1),
+ make_tuple(32, 32, &aom_sad_skip_32x32x4d_neon_dotprod, -1),
+ make_tuple(32, 16, &aom_sad_skip_32x16x4d_neon_dotprod, -1),
+ make_tuple(16, 32, &aom_sad_skip_16x32x4d_neon_dotprod, -1),
+ make_tuple(16, 16, &aom_sad_skip_16x16x4d_neon_dotprod, -1),
+ make_tuple(16, 8, &aom_sad_skip_16x8x4d_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_sad_skip_64x16x4d_neon_dotprod, -1),
+ make_tuple(32, 8, &aom_sad_skip_32x8x4d_neon_dotprod, -1),
+ make_tuple(16, 64, &aom_sad_skip_16x64x4d_neon_dotprod, -1),
+ make_tuple(16, 4, &aom_sad_skip_16x4x4d_neon_dotprod, -1),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipx4Test,
+ ::testing::ValuesIn(skip_x4d_neon_dotprod_tests));
+#endif // HAVE_NEON_DOTPROD
+
//------------------------------------------------------------------------------
// x86 functions
#if HAVE_SSE2
@@ -2750,39 +2947,6 @@ INSTANTIATE_TEST_SUITE_P(sse2, DistWtdSADavgTest,
// The only functions are x3, which do not have tests.
#endif // HAVE_SSE3
-#if HAVE_SSSE3
-const DistWtdCompAvgParam dist_wtd_comp_avg_ssse3_tests[] = {
- make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
- make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
- make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
- make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
- make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
- make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
- make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
- make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
- make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
- make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
- make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
- make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
- make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
- make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
- make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
- make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
- make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-#if !CONFIG_REALTIME_ONLY
- make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
- make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
- make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
- make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
- make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
- make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-#endif
-};
-
-INSTANTIATE_TEST_SUITE_P(SSSE3, DistWtdCompAvgTest,
- ::testing::ValuesIn(dist_wtd_comp_avg_ssse3_tests));
-#endif // HAVE_SSSE3
-
#if HAVE_SSE4_1
// The only functions are x8, which do not have tests.
#endif // HAVE_SSE4_1
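DistWtdCompAvgTest and its C/SSSE3 instantiations leave sad_test.cc in this
change, but the arithmetic its reference loop checked is still useful context
for the remaining DistWtdSADavgTest cases: each output pixel is a fixed-point
weighted blend of the two predictions, rounded with ROUND_POWER_OF_TWO(x, 4),
i.e. (x + 8) >> 4. A minimal restatement of the deleted low-bit-depth
reference path:

// Per-pixel dist-weighted compound average, as computed by the removed
// ReferenceDistWtdCompAvg(): fwd_offset weights the reference, bck_offset
// weights the second prediction, and the sum is rounded back to 8 bits.
static inline uint8_t DistWtdAvgPixel(uint8_t second_pred, uint8_t ref,
                                      int fwd_offset, int bck_offset) {
  const int tmp = second_pred * bck_offset + ref * fwd_offset;
  return static_cast<uint8_t>((tmp + 8) >> 4);  // ROUND_POWER_OF_TWO(tmp, 4)
}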
diff --git a/test/sb_multipass_test.cc b/test/sb_multipass_test.cc
index 8ddc0026a..e27a2c60e 100644
--- a/test/sb_multipass_test.cc
+++ b/test/sb_multipass_test.cc
@@ -42,9 +42,9 @@ class AV1SBMultipassTest
md5_dec_.clear();
md5_enc_.clear();
}
- virtual ~AV1SBMultipassTest() { delete decoder_; }
+ ~AV1SBMultipassTest() override { delete decoder_; }
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(::libaom_test::kTwoPassGood);
cfg_.g_lag_in_frames = 5;
@@ -56,8 +56,8 @@ class AV1SBMultipassTest
cfg_.rc_min_quantizer = 0;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
SetTileSize(encoder);
encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
@@ -75,7 +75,7 @@ class AV1SBMultipassTest
encoder->Control(AV1E_SET_TILE_ROWS, 1);
}
- virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
size_enc_.push_back(pkt->data.frame.sz);
::libaom_test::MD5 md5_enc;
diff --git a/test/scalability_test.cc b/test/scalability_test.cc
index 9ea825655..12cb03cac 100644
--- a/test/scalability_test.cc
+++ b/test/scalability_test.cc
@@ -26,15 +26,15 @@ class ScalabilityTest
public ::libaom_test::EncoderTest {
protected:
ScalabilityTest() : EncoderTest(GET_PARAM(0)) {}
- virtual ~ScalabilityTest() {}
+ ~ScalabilityTest() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(GET_PARAM(1));
num_spatial_layers_ = 2;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
encoder->Control(AOME_SET_NUMBER_SPATIAL_LAYERS, num_spatial_layers_);
diff --git a/test/screen_content_test.cc b/test/screen_content_test.cc
index 4d3e09a65..974c50b3c 100644
--- a/test/screen_content_test.cc
+++ b/test/screen_content_test.cc
@@ -29,9 +29,9 @@ class ScreenContentToolsTestLarge
is_screen_content_violated_ = true;
tune_content_ = AOM_CONTENT_DEFAULT;
}
- virtual ~ScreenContentToolsTestLarge() {}
+ ~ScreenContentToolsTestLarge() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(encoding_mode_);
const aom_rational timebase = { 1, 30 };
cfg_.g_timebase = timebase;
@@ -42,10 +42,10 @@ class ScreenContentToolsTestLarge
cfg_.g_profile = 0;
}
- virtual bool DoDecode() const { return 1; }
+ bool DoDecode() const override { return true; }
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, 5);
encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
@@ -53,8 +53,8 @@ class ScreenContentToolsTestLarge
}
}
- virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
- libaom_test::Decoder *decoder) {
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
if (AOM_CODEC_OK == res_dec) {
aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
diff --git a/test/selfguided_filter_test.cc b/test/selfguided_filter_test.cc
index a8461b596..3dd513b6e 100644
--- a/test/selfguided_filter_test.cc
+++ b/test/selfguided_filter_test.cc
@@ -30,9 +30,9 @@ using libaom_test::ACMRandom;
using std::make_tuple;
using std::tuple;
-typedef void (*SgrFunc)(const uint8_t *dat8, int width, int height, int stride,
- int eps, const int *xqd, uint8_t *dst8, int dst_stride,
- int32_t *tmpbuf, int bit_depth, int highbd);
+typedef int (*SgrFunc)(const uint8_t *dat8, int width, int height, int stride,
+ int eps, const int *xqd, uint8_t *dst8, int dst_stride,
+ int32_t *tmpbuf, int bit_depth, int highbd);
// Test parameter list:
// <tst_fun_>
@@ -41,10 +41,8 @@ typedef tuple<SgrFunc> FilterTestParam;
class AV1SelfguidedFilterTest
: public ::testing::TestWithParam<FilterTestParam> {
public:
- virtual ~AV1SelfguidedFilterTest() {}
- virtual void SetUp() {}
-
- virtual void TearDown() {}
+ ~AV1SelfguidedFilterTest() override = default;
+ void SetUp() override {}
protected:
void RunSpeedTest() {
@@ -91,9 +89,10 @@ class AV1SelfguidedFilterTest
int h = AOMMIN(pu_height, height - k);
uint8_t *input_p = input + k * stride + j;
uint8_t *output_p = output + k * out_stride + j;
- av1_apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
- output_p, out_stride, tmpbuf, 8,
- 0);
+ const int ret_c = av1_apply_selfguided_restoration_c(
+ input_p, w, h, stride, eps, xqd, output_p, out_stride, tmpbuf, 8,
+ 0);
+ ASSERT_EQ(ret_c, 0);
}
}
aom_usec_timer_mark(&ref_timer);
@@ -108,8 +107,9 @@ class AV1SelfguidedFilterTest
int h = AOMMIN(pu_height, height - k);
uint8_t *input_p = input + k * stride + j;
uint8_t *output_p = output + k * out_stride + j;
- tst_fun_(input_p, w, h, stride, eps, xqd, output_p, out_stride,
- tmpbuf, 8, 0);
+ const int ret_tst = tst_fun_(input_p, w, h, stride, eps, xqd,
+ output_p, out_stride, tmpbuf, 8, 0);
+ ASSERT_EQ(ret_tst, 0);
}
}
aom_usec_timer_mark(&tst_timer);
@@ -181,11 +181,13 @@ class AV1SelfguidedFilterTest
uint8_t *input_p = input + k * stride + j;
uint8_t *output_p = output + k * out_stride + j;
uint8_t *output2_p = output2 + k * out_stride + j;
- tst_fun_(input_p, w, h, stride, eps, xqd, output_p, out_stride,
- tmpbuf, 8, 0);
- av1_apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
- output2_p, out_stride, tmpbuf, 8,
- 0);
+ const int ret_tst = tst_fun_(input_p, w, h, stride, eps, xqd,
+ output_p, out_stride, tmpbuf, 8, 0);
+ ASSERT_EQ(ret_tst, 0);
+ const int ret_c = av1_apply_selfguided_restoration_c(
+ input_p, w, h, stride, eps, xqd, output2_p, out_stride, tmpbuf, 8,
+ 0);
+ ASSERT_EQ(ret_c, 0);
}
for (j = 0; j < test_h; ++j)
@@ -234,10 +236,8 @@ typedef tuple<SgrFunc, int> HighbdFilterTestParam;
class AV1HighbdSelfguidedFilterTest
: public ::testing::TestWithParam<HighbdFilterTestParam> {
public:
- virtual ~AV1HighbdSelfguidedFilterTest() {}
- virtual void SetUp() {}
-
- virtual void TearDown() {}
+ ~AV1HighbdSelfguidedFilterTest() override = default;
+ void SetUp() override {}
protected:
void RunSpeedTest() {
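The SgrFunc typedef switching from void to int ripples through every call in
this file: av1_apply_selfguided_restoration and its SIMD variants now report a
status, and the tests assert that it is zero. A minimal sketch of the
checked-call pattern, assuming zero is the only success value (which is all
the ASSERT_EQ checks in the diff rely on); the wrapper is illustrative, not
part of the patch:

// Run one selfguided restoration unit and fail the test if the
// implementation under test reports an error.
static void ApplySgrChecked(SgrFunc fn, const uint8_t *dat, int w, int h,
                            int stride, int eps, const int *xqd, uint8_t *dst,
                            int dst_stride, int32_t *tmpbuf) {
  const int ret = fn(dat, w, h, stride, eps, xqd, dst, dst_stride, tmpbuf,
                     /*bit_depth=*/8, /*highbd=*/0);
  ASSERT_EQ(ret, 0);
}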
diff --git a/test/sharpness_test.cc b/test/sharpness_test.cc
index 49c5804d2..64465c88e 100644
--- a/test/sharpness_test.cc
+++ b/test/sharpness_test.cc
@@ -48,7 +48,7 @@ class SharpnessTest
cpu_used_(GET_PARAM(2)), sharpness_level_(GET_PARAM(3)), psnr_(0.0),
nframes_(0) {}
- ~SharpnessTest() override {}
+ ~SharpnessTest() override = default;
void SetUp() override {
InitializeConfig(encoding_mode_);
diff --git a/test/simd_impl.h b/test/simd_impl.h
index 8535e37cd..b564a7f4b 100644
--- a/test/simd_impl.h
+++ b/test/simd_impl.h
@@ -22,15 +22,13 @@ namespace SIMD_NAMESPACE {
template <typename param_signature>
class TestIntrinsic : public ::testing::TestWithParam<param_signature> {
public:
- virtual ~TestIntrinsic() {}
- virtual void SetUp() {
+ ~TestIntrinsic() override = default;
+ void SetUp() override {
mask = std::get<0>(this->GetParam());
maskwidth = std::get<1>(this->GetParam());
name = std::get<2>(this->GetParam());
}
- virtual void TearDown() {}
-
protected:
uint32_t mask, maskwidth;
const char *name;
diff --git a/test/sse_sum_test.cc b/test/sse_sum_test.cc
index 68355ece5..70d8da5e4 100644
--- a/test/sse_sum_test.cc
+++ b/test/sse_sum_test.cc
@@ -41,15 +41,15 @@ typedef libaom_test::FuncParam<SSI16Func> TestFuncs;
class SumSSETest : public ::testing::TestWithParam<TestFuncs> {
public:
- virtual ~SumSSETest() {}
- virtual void SetUp() {
+ ~SumSSETest() override = default;
+ void SetUp() override {
params_ = this->GetParam();
rnd_.Reset(ACMRandom::DeterministicSeed());
src_ = reinterpret_cast<int16_t *>(aom_memalign(16, 256 * 256 * 2));
ASSERT_NE(src_, nullptr);
}
- virtual void TearDown() { aom_free(src_); }
+ void TearDown() override { aom_free(src_); }
void RunTest(int isRandom);
void RunSpeedTest();
diff --git a/test/still_picture_test.cc b/test/still_picture_test.cc
index e2eef94f9..3dfb1c869 100644
--- a/test/still_picture_test.cc
+++ b/test/still_picture_test.cc
@@ -27,9 +27,9 @@ class StillPicturePresenceTest
enable_full_header_(GET_PARAM(2)) {
still_picture_coding_violated_ = false;
}
- virtual ~StillPicturePresenceTest() {}
+ ~StillPicturePresenceTest() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(encoding_mode_);
const aom_rational timebase = { 1, 30 };
cfg_.g_timebase = timebase;
@@ -39,18 +39,18 @@ class StillPicturePresenceTest
cfg_.g_limit = 1;
}
- virtual bool DoDecode() const { return 1; }
+ bool DoDecode() const override { return true; }
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, 5);
encoder->Control(AV1E_SET_FORCE_VIDEO_MODE, 0);
}
}
- virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
- libaom_test::Decoder *decoder) {
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
if (AOM_CODEC_OK == res_dec) {
aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index 4003e51dc..e591e6543 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -49,15 +49,15 @@ class AV1SubtractBlockTestBase : public ::testing::Test {
func_ = func;
ref_func_ = ref_func;
if (bit_depth == -1) {
- hbd_ = 0;
+ hbd_ = false;
bit_depth_ = AOM_BITS_8;
} else {
- hbd_ = 1;
+ hbd_ = true;
bit_depth_ = static_cast<aom_bit_depth_t>(bit_depth);
}
}
- virtual void SetUp() {
+ void SetUp() override {
rnd_.Reset(ACMRandom::DeterministicSeed());
const size_t max_width = 128;
@@ -82,7 +82,7 @@ class AV1SubtractBlockTestBase : public ::testing::Test {
ASSERT_NE(diff_, nullptr);
}
- virtual void TearDown() {
+ void TearDown() override {
if (hbd_) {
aom_free(CONVERT_TO_SHORTPTR(src_));
aom_free(CONVERT_TO_SHORTPTR(pred_));
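The TearDown changes above work because of libaom's high-bit-depth pointer
convention: 16-bit buffers are passed through uint8_t * interfaces via a
shifted address, so the test must convert back before freeing. A sketch of the
round trip, assuming the CONVERT_TO_BYTEPTR / CONVERT_TO_SHORTPTR macros from
aom_dsp/aom_dsp_common.h:

// High-bit-depth buffers travel through 8-bit interfaces as tagged pointers;
// converting back recovers the address that was actually allocated.
uint16_t *buf = reinterpret_cast<uint16_t *>(aom_memalign(16, 128 * 2));
uint8_t *tagged = CONVERT_TO_BYTEPTR(buf);  // what the 8-bit API receives
aom_free(CONVERT_TO_SHORTPTR(tagged));      // same as aom_free(buf)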
diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc
index 91f172d62..cba33b783 100644
--- a/test/sum_squares_test.cc
+++ b/test/sum_squares_test.cc
@@ -20,6 +20,7 @@
#include "config/aom_dsp_rtcd.h"
#include "aom_ports/mem.h"
+#include "av1/common/common_data.h"
#include "test/acm_random.h"
#include "test/register_state_check.h"
#include "test/util.h"
@@ -43,16 +44,16 @@ typedef libaom_test::FuncParam<SSI16Func> TestFuncs;
class SumSquaresTest : public ::testing::TestWithParam<TestFuncs> {
public:
- virtual ~SumSquaresTest() {}
- virtual void SetUp() {
+ ~SumSquaresTest() override = default;
+ void SetUp() override {
params_ = this->GetParam();
rnd_.Reset(ACMRandom::DeterministicSeed());
src_ = reinterpret_cast<int16_t *>(aom_memalign(16, 256 * 256 * 2));
ASSERT_NE(src_, nullptr);
}
- virtual void TearDown() { aom_free(src_); }
- void RunTest(int isRandom);
+ void TearDown() override { aom_free(src_); }
+ void RunTest(bool is_random);
void RunSpeedTest();
void GenRandomData(int width, int height, int stride) {
@@ -83,7 +84,7 @@ class SumSquaresTest : public ::testing::TestWithParam<TestFuncs> {
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SumSquaresTest);
-void SumSquaresTest::RunTest(int isRandom) {
+void SumSquaresTest::RunTest(bool is_random) {
int failed = 0;
for (int k = 0; k < kNumIterations; k++) {
const int width = 4 * (rnd_(31) + 1); // Up to 128x128
@@ -92,7 +93,7 @@ void SumSquaresTest::RunTest(int isRandom) {
while (stride < width) { // Make sure it's valid
stride = 4 << rnd_(7);
}
- if (isRandom) {
+ if (is_random) {
GenRandomData(width, height, stride);
} else {
GenExtremeData(width, height, stride);
@@ -144,11 +145,11 @@ void SumSquaresTest::RunSpeedTest() {
}
TEST_P(SumSquaresTest, OperationCheck) {
- RunTest(1); // GenRandomData
+ RunTest(true); // GenRandomData
}
TEST_P(SumSquaresTest, ExtremeValues) {
- RunTest(0); // GenExtremeData
+ RunTest(false); // GenExtremeData
}
TEST_P(SumSquaresTest, DISABLED_Speed) { RunSpeedTest(); }
@@ -182,7 +183,7 @@ INSTANTIATE_TEST_SUITE_P(
// 1D version
//////////////////////////////////////////////////////////////////////////////
-typedef uint64_t (*F1D)(const int16_t *src, uint32_t N);
+typedef uint64_t (*F1D)(const int16_t *src, uint32_t n);
typedef libaom_test::FuncParam<F1D> TestFuncs1D;
class SumSquares1DTest : public FunctionEquivalenceTest<F1D> {
@@ -199,12 +200,12 @@ TEST_P(SumSquares1DTest, RandomValues) {
for (int i = 0; i < kMaxSize * kMaxSize; ++i)
src[i] = rng_(kInt13Max * 2 + 1) - kInt13Max;
- const int N = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize
+ const int n = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize
: rng_(kMaxSize) + 1;
- const uint64_t ref_res = params_.ref_func(src, N);
+ const uint64_t ref_res = params_.ref_func(src, n);
uint64_t tst_res;
- API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N));
+ API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, n));
ASSERT_EQ(ref_res, tst_res);
}
@@ -220,12 +221,12 @@ TEST_P(SumSquares1DTest, ExtremeValues) {
for (int i = 0; i < kMaxSize * kMaxSize; ++i) src[i] = -kInt13Max;
}
- const int N = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize
+ const int n = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize
: rng_(kMaxSize) + 1;
- const uint64_t ref_res = params_.ref_func(src, N);
+ const uint64_t ref_res = params_.ref_func(src, n);
uint64_t tst_res;
- API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N));
+ API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, n));
ASSERT_EQ(ref_res, tst_res);
}
@@ -245,23 +246,23 @@ INSTANTIATE_TEST_SUITE_P(NEON, SumSquares1DTest,
#endif // HAVE_NEON
-typedef int64_t (*sse_func)(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int width, int height);
-typedef libaom_test::FuncParam<sse_func> TestSSEFuncs;
+typedef int64_t (*SSEFunc)(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height);
+typedef libaom_test::FuncParam<SSEFunc> TestSSEFuncs;
typedef std::tuple<TestSSEFuncs, int> SSETestParam;
class SSETest : public ::testing::TestWithParam<SSETestParam> {
public:
- virtual ~SSETest() {}
- virtual void SetUp() {
+ ~SSETest() override = default;
+ void SetUp() override {
params_ = GET_PARAM(0);
width_ = GET_PARAM(1);
- isHbd_ =
+ is_hbd_ =
#if CONFIG_AV1_HIGHBITDEPTH
params_.ref_func == aom_highbd_sse_c;
#else
- 0;
+ false;
#endif
rnd_.Reset(ACMRandom::DeterministicSeed());
src_ = reinterpret_cast<uint8_t *>(aom_memalign(32, 256 * 256 * 2));
@@ -270,25 +271,25 @@ class SSETest : public ::testing::TestWithParam<SSETestParam> {
ASSERT_NE(ref_, nullptr);
}
- virtual void TearDown() {
+ void TearDown() override {
aom_free(src_);
aom_free(ref_);
}
- void RunTest(int isRandom, int width, int height, int run_times);
+ void RunTest(bool is_random, int width, int height, int run_times);
void GenRandomData(int width, int height, int stride) {
- uint16_t *pSrc = (uint16_t *)src_;
- uint16_t *pRef = (uint16_t *)ref_;
+ uint16_t *src16 = reinterpret_cast<uint16_t *>(src_);
+ uint16_t *ref16 = reinterpret_cast<uint16_t *>(ref_);
const int msb = 11; // Up to 12 bit input
const int limit = 1 << (msb + 1);
for (int ii = 0; ii < height; ii++) {
for (int jj = 0; jj < width; jj++) {
- if (!isHbd_) {
+ if (!is_hbd_) {
src_[ii * stride + jj] = rnd_.Rand8();
ref_[ii * stride + jj] = rnd_.Rand8();
} else {
- pSrc[ii * stride + jj] = rnd_(limit);
- pRef[ii * stride + jj] = rnd_(limit);
+ src16[ii * stride + jj] = rnd_(limit);
+ ref16[ii * stride + jj] = rnd_(limit);
}
}
}
@@ -296,20 +297,20 @@ class SSETest : public ::testing::TestWithParam<SSETestParam> {
void GenExtremeData(int width, int height, int stride, uint8_t *data,
int16_t val) {
- uint16_t *pData = (uint16_t *)data;
+ uint16_t *data16 = reinterpret_cast<uint16_t *>(data);
for (int ii = 0; ii < height; ii++) {
for (int jj = 0; jj < width; jj++) {
- if (!isHbd_) {
- data[ii * stride + jj] = (uint8_t)val;
+ if (!is_hbd_) {
+ data[ii * stride + jj] = static_cast<uint8_t>(val);
} else {
- pData[ii * stride + jj] = val;
+ data16[ii * stride + jj] = val;
}
}
}
}
protected:
- int isHbd_;
+ bool is_hbd_;
int width_;
TestSSEFuncs params_;
uint8_t *src_;
@@ -318,7 +319,7 @@ class SSETest : public ::testing::TestWithParam<SSETestParam> {
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SSETest);
-void SSETest::RunTest(int isRandom, int width, int height, int run_times) {
+void SSETest::RunTest(bool is_random, int width, int height, int run_times) {
int failed = 0;
aom_usec_timer ref_timer, test_timer;
for (int k = 0; k < 3; k++) {
@@ -326,10 +327,10 @@ void SSETest::RunTest(int isRandom, int width, int height, int run_times) {
while (stride < width) { // Make sure it's valid
stride = 4 << rnd_(7);
}
- if (isRandom) {
+ if (is_random) {
GenRandomData(width, height, stride);
} else {
- const int msb = isHbd_ ? 12 : 8; // Up to 12 bit input
+ const int msb = is_hbd_ ? 12 : 8; // Up to 12 bit input
const int limit = (1 << msb) - 1;
if (k == 0) {
GenExtremeData(width, height, stride, src_, 0);
@@ -340,18 +341,18 @@ void SSETest::RunTest(int isRandom, int width, int height, int run_times) {
}
}
int64_t res_ref, res_tst;
- uint8_t *pSrc = src_;
- uint8_t *pRef = ref_;
- if (isHbd_) {
- pSrc = CONVERT_TO_BYTEPTR(src_);
- pRef = CONVERT_TO_BYTEPTR(ref_);
+ uint8_t *src = src_;
+ uint8_t *ref = ref_;
+ if (is_hbd_) {
+ src = CONVERT_TO_BYTEPTR(src_);
+ ref = CONVERT_TO_BYTEPTR(ref_);
}
- res_ref = params_.ref_func(pSrc, stride, pRef, stride, width, height);
- res_tst = params_.tst_func(pSrc, stride, pRef, stride, width, height);
+ res_ref = params_.ref_func(src, stride, ref, stride, width, height);
+ res_tst = params_.tst_func(src, stride, ref, stride, width, height);
if (run_times > 1) {
aom_usec_timer_start(&ref_timer);
for (int j = 0; j < run_times; j++) {
- params_.ref_func(pSrc, stride, pRef, stride, width, height);
+ params_.ref_func(src, stride, ref, stride, width, height);
}
aom_usec_timer_mark(&ref_timer);
const int elapsed_time_c =
@@ -359,7 +360,7 @@ void SSETest::RunTest(int isRandom, int width, int height, int run_times) {
aom_usec_timer_start(&test_timer);
for (int j = 0; j < run_times; j++) {
- params_.tst_func(pSrc, stride, pRef, stride, width, height);
+ params_.tst_func(src, stride, ref, stride, width, height);
}
aom_usec_timer_mark(&test_timer);
const int elapsed_time_simd =
@@ -374,7 +375,7 @@ void SSETest::RunTest(int isRandom, int width, int height, int run_times) {
if (!failed) {
failed = res_ref != res_tst;
EXPECT_EQ(res_ref, res_tst)
- << "Error:" << (isHbd_ ? "hbd " : " ") << k << " SSE Test ["
+ << "Error:" << (is_hbd_ ? "hbd " : " ") << k << " SSE Test ["
<< width << "x" << height
<< "] C output does not match optimized output.";
}
@@ -384,19 +385,19 @@ void SSETest::RunTest(int isRandom, int width, int height, int run_times) {
TEST_P(SSETest, OperationCheck) {
for (int height = 4; height <= 128; height += 4) {
- RunTest(1, width_, height, 1); // GenRandomData
+ RunTest(true, width_, height, 1); // GenRandomData
}
}
TEST_P(SSETest, ExtremeValues) {
for (int height = 4; height <= 128; height += 4) {
- RunTest(0, width_, height, 1);
+ RunTest(false, width_, height, 1);
}
}
TEST_P(SSETest, DISABLED_Speed) {
for (int height = 4; height <= 128; height += 4) {
- RunTest(1, width_, height, 100);
+ RunTest(true, width_, height, 100);
}
}
@@ -411,6 +412,14 @@ INSTANTIATE_TEST_SUITE_P(NEON, SSETest,
Combine(ValuesIn(sse_neon), Range(4, 129, 4)));
#endif // HAVE_NEON
+#if HAVE_NEON_DOTPROD
+TestSSEFuncs sse_neon_dotprod[] = {
+ TestSSEFuncs(&aom_sse_c, &aom_sse_neon_dotprod),
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SSETest,
+ Combine(ValuesIn(sse_neon_dotprod), Range(4, 129, 4)));
+#endif // HAVE_NEON_DOTPROD
+
#if HAVE_SSE4_1
TestSSEFuncs sse_sse4[] = {
TestSSEFuncs(&aom_sse_c, &aom_sse_sse4_1),
@@ -442,21 +451,20 @@ typedef void (*sse_sum_func)(const int16_t *data, int stride, int bw, int bh,
int *x_sum, int64_t *x2_sum);
typedef libaom_test::FuncParam<sse_sum_func> TestSSE_SumFuncs;
-typedef std::tuple<TestSSE_SumFuncs, int> SSE_SumTestParam;
+typedef std::tuple<TestSSE_SumFuncs, TX_SIZE> SSE_SumTestParam;
class SSE_Sum_Test : public ::testing::TestWithParam<SSE_SumTestParam> {
public:
- virtual ~SSE_Sum_Test() {}
- virtual void SetUp() {
+ ~SSE_Sum_Test() override = default;
+ void SetUp() override {
params_ = GET_PARAM(0);
- width_ = GET_PARAM(1);
rnd_.Reset(ACMRandom::DeterministicSeed());
src_ = reinterpret_cast<int16_t *>(aom_memalign(32, 256 * 256 * 2));
ASSERT_NE(src_, nullptr);
}
- virtual void TearDown() { aom_free(src_); }
- void RunTest(int isRandom, int width, int height, int run_times);
+ void TearDown() override { aom_free(src_); }
+ void RunTest(bool is_random, int tx_size, int run_times);
void GenRandomData(int width, int height, int stride) {
const int msb = 11; // Up to 12 bit input
@@ -478,21 +486,22 @@ class SSE_Sum_Test : public ::testing::TestWithParam<SSE_SumTestParam> {
}
protected:
- int width_;
TestSSE_SumFuncs params_;
int16_t *src_;
ACMRandom rnd_;
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SSE_Sum_Test);
-void SSE_Sum_Test::RunTest(int isRandom, int width, int height, int run_times) {
+void SSE_Sum_Test::RunTest(bool is_random, int tx_size, int run_times) {
aom_usec_timer ref_timer, test_timer;
+ int width = tx_size_wide[tx_size];
+ int height = tx_size_high[tx_size];
for (int k = 0; k < 3; k++) {
int stride = 4 << rnd_(7); // Up to 256 stride
while (stride < width) { // Make sure it's valid
stride = 4 << rnd_(7);
}
- if (isRandom) {
+ if (is_random) {
GenRandomData(width, height, stride);
} else {
const int msb = 12; // Up to 12 bit input
@@ -547,37 +556,45 @@ void SSE_Sum_Test::RunTest(int isRandom, int width, int height, int run_times) {
}
TEST_P(SSE_Sum_Test, OperationCheck) {
- for (int height = 4; height <= 64; height = height * 2) {
- RunTest(1, width_, height, 1); // GenRandomData
- }
+ RunTest(true, GET_PARAM(1), 1); // GenRandomData
}
-TEST_P(SSE_Sum_Test, ExtremeValues) {
- for (int height = 4; height <= 64; height = height * 2) {
- RunTest(0, width_, height, 1);
- }
-}
+TEST_P(SSE_Sum_Test, ExtremeValues) { RunTest(false, GET_PARAM(1), 1); }
-TEST_P(SSE_Sum_Test, DISABLED_Speed) {
- for (int height = 4; height <= 64; height = height * 2) {
- RunTest(1, width_, height, 10000);
- }
-}
+TEST_P(SSE_Sum_Test, DISABLED_Speed) { RunTest(true, GET_PARAM(1), 10000); }
+
+#if HAVE_SSE2 || HAVE_AVX2 || HAVE_NEON
+const TX_SIZE kValidBlockSize[] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32,
+ TX_64X64, TX_4X8, TX_8X4, TX_8X16,
+ TX_16X8, TX_16X32, TX_32X16, TX_64X32,
+ TX_32X64, TX_4X16, TX_16X4, TX_8X32,
+ TX_32X8, TX_16X64, TX_64X16 };
+#endif
#if HAVE_SSE2
TestSSE_SumFuncs sse_sum_sse2[] = { TestSSE_SumFuncs(
&aom_get_blk_sse_sum_c, &aom_get_blk_sse_sum_sse2) };
INSTANTIATE_TEST_SUITE_P(SSE2, SSE_Sum_Test,
- Combine(ValuesIn(sse_sum_sse2), Range(4, 65, 4)));
+ Combine(ValuesIn(sse_sum_sse2),
+ ValuesIn(kValidBlockSize)));
#endif // HAVE_SSE2
#if HAVE_AVX2
TestSSE_SumFuncs sse_sum_avx2[] = { TestSSE_SumFuncs(
&aom_get_blk_sse_sum_c, &aom_get_blk_sse_sum_avx2) };
INSTANTIATE_TEST_SUITE_P(AVX2, SSE_Sum_Test,
- Combine(ValuesIn(sse_sum_avx2), Range(4, 65, 4)));
+ Combine(ValuesIn(sse_sum_avx2),
+ ValuesIn(kValidBlockSize)));
#endif // HAVE_AVX2
+#if HAVE_NEON
+TestSSE_SumFuncs sse_sum_neon[] = { TestSSE_SumFuncs(
+ &aom_get_blk_sse_sum_c, &aom_get_blk_sse_sum_neon) };
+INSTANTIATE_TEST_SUITE_P(NEON, SSE_Sum_Test,
+ Combine(ValuesIn(sse_sum_neon),
+ ValuesIn(kValidBlockSize)));
+#endif // HAVE_NEON
+
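The SSE_Sum_Test parameter change above replaces the plain width/height
Range with a TX_SIZE, and RunTest() now derives the block dimensions via
the tx_size_wide[]/tx_size_high[] lookups, so the instantiations cover
rectangular transform sizes directly. A minimal sketch of that mapping,
using hypothetical local tables rather than libaom's real ones:

// Hypothetical TX_SIZE -> dimensions lookup, mirroring
// width = tx_size_wide[tx_size] and height = tx_size_high[tx_size]
// in SSE_Sum_Test::RunTest() above; only a subset of sizes is shown.
enum TxSizeSketch { kTx4x4, kTx8x8, kTx16x16, kTx4x8, kTx8x4 };
static const int kTxWidth[] = { 4, 8, 16, 4, 8 };
static const int kTxHeight[] = { 4, 8, 16, 8, 4 };

static inline void GetBlockDims(TxSizeSketch tx, int *w, int *h) {
  *w = kTxWidth[tx];
  *h = kTxHeight[tx];
}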
//////////////////////////////////////////////////////////////////////////////
// 2D Variance test functions
//////////////////////////////////////////////////////////////////////////////
@@ -589,8 +606,8 @@ const uint16_t test_block_size[2] = { 128, 256 };
class Lowbd2dVarTest : public ::testing::TestWithParam<TestFuncVar2D> {
public:
- virtual ~Lowbd2dVarTest() {}
- virtual void SetUp() {
+ ~Lowbd2dVarTest() override = default;
+ void SetUp() override {
params_ = this->GetParam();
rnd_.Reset(ACMRandom::DeterministicSeed());
src_ = reinterpret_cast<uint8_t *>(
@@ -598,8 +615,8 @@ class Lowbd2dVarTest : public ::testing::TestWithParam<TestFuncVar2D> {
ASSERT_NE(src_, nullptr);
}
- virtual void TearDown() { aom_free(src_); }
- void RunTest(int isRandom);
+ void TearDown() override { aom_free(src_); }
+ void RunTest(bool is_random);
void RunSpeedTest();
void GenRandomData(int width, int height, int stride) {
@@ -630,7 +647,7 @@ class Lowbd2dVarTest : public ::testing::TestWithParam<TestFuncVar2D> {
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Lowbd2dVarTest);
-void Lowbd2dVarTest::RunTest(int isRandom) {
+void Lowbd2dVarTest::RunTest(bool is_random) {
int failed = 0;
for (int k = 0; k < kNumIterations; k++) {
const int width = 4 * (rnd_(63) + 1); // Up to 256x256
@@ -639,7 +656,7 @@ void Lowbd2dVarTest::RunTest(int isRandom) {
while (stride < width) { // Make sure it's valid
stride = 4 << rnd_(8);
}
- if (isRandom) {
+ if (is_random) {
GenRandomData(width, height, stride);
} else {
GenExtremeData(width, height, stride);
@@ -690,11 +707,11 @@ void Lowbd2dVarTest::RunSpeedTest() {
}
TEST_P(Lowbd2dVarTest, OperationCheck) {
- RunTest(1); // GenRandomData
+ RunTest(true); // GenRandomData
}
TEST_P(Lowbd2dVarTest, ExtremeValues) {
- RunTest(0); // GenExtremeData
+ RunTest(false); // GenExtremeData
}
TEST_P(Lowbd2dVarTest, DISABLED_Speed) { RunSpeedTest(); }
@@ -723,10 +740,18 @@ INSTANTIATE_TEST_SUITE_P(NEON, Lowbd2dVarTest,
#endif // HAVE_NEON
+#if HAVE_NEON_DOTPROD
+
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, Lowbd2dVarTest,
+ ::testing::Values(TestFuncVar2D(
+ &aom_var_2d_u8_c, &aom_var_2d_u8_neon_dotprod)));
+
+#endif // HAVE_NEON_DOTPROD
+
class Highbd2dVarTest : public ::testing::TestWithParam<TestFuncVar2D> {
public:
- virtual ~Highbd2dVarTest() {}
- virtual void SetUp() {
+ ~Highbd2dVarTest() override = default;
+ void SetUp() override {
params_ = this->GetParam();
rnd_.Reset(ACMRandom::DeterministicSeed());
src_ = reinterpret_cast<uint16_t *>(
@@ -734,8 +759,8 @@ class Highbd2dVarTest : public ::testing::TestWithParam<TestFuncVar2D> {
ASSERT_NE(src_, nullptr);
}
- virtual void TearDown() { aom_free(src_); }
- void RunTest(int isRandom);
+ void TearDown() override { aom_free(src_); }
+ void RunTest(bool is_random);
void RunSpeedTest();
void GenRandomData(int width, int height, int stride) {
@@ -766,7 +791,7 @@ class Highbd2dVarTest : public ::testing::TestWithParam<TestFuncVar2D> {
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Highbd2dVarTest);
-void Highbd2dVarTest::RunTest(int isRandom) {
+void Highbd2dVarTest::RunTest(bool is_random) {
int failed = 0;
for (int k = 0; k < kNumIterations; k++) {
const int width = 4 * (rnd_(63) + 1); // Up to 256x256
@@ -775,7 +800,7 @@ void Highbd2dVarTest::RunTest(int isRandom) {
while (stride < width) { // Make sure it's valid
stride = 4 << rnd_(8);
}
- if (isRandom) {
+ if (is_random) {
GenRandomData(width, height, stride);
} else {
GenExtremeData(width, height, stride);
@@ -828,11 +853,11 @@ void Highbd2dVarTest::RunSpeedTest() {
}
TEST_P(Highbd2dVarTest, OperationCheck) {
- RunTest(1); // GenRandomData
+ RunTest(true); // GenRandomData
}
TEST_P(Highbd2dVarTest, ExtremeValues) {
- RunTest(0); // GenExtremeData
+ RunTest(false); // GenExtremeData
}
TEST_P(Highbd2dVarTest, DISABLED_Speed) { RunSpeedTest(); }
diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc
index d99d6a308..cc3fb674b 100644
--- a/test/svc_datarate_test.cc
+++ b/test/svc_datarate_test.cc
@@ -9,6 +9,7 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include <climits>
#include <vector>
#include "config/aom_config.h"
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -45,22 +46,22 @@ class DatarateTestSVC
}
protected:
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(GET_PARAM(1));
ResetModel();
}
- virtual void DecompressedFrameHook(const aom_image_t &img,
- aom_codec_pts_t pts) {
+ void DecompressedFrameHook(const aom_image_t &img,
+ aom_codec_pts_t pts) override {
frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
++decoded_nframes_;
}
std::vector<FrameInfo> frame_info_list_;
- virtual int GetNumSpatialLayers() { return number_spatial_layers_; }
+ int GetNumSpatialLayers() override { return number_spatial_layers_; }
- virtual void ResetModel() {
+ void ResetModel() override {
DatarateTest::ResetModel();
layer_frame_cnt_ = 0;
superframe_cnt_ = 0;
@@ -94,10 +95,11 @@ class DatarateTestSVC
rps_recovery_frame_ = 0;
user_define_frame_qp_ = 0;
set_speed_per_layer_ = false;
+ simulcast_mode_ = false;
}
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
int spatial_layer_id = 0;
current_video_frame_ = video->frame();
// video->frame() is called every superframe, so we should condition
@@ -144,7 +146,7 @@ class DatarateTestSVC
video->frame(), &layer_id_, &ref_frame_config_, &ref_frame_comp_pred_,
spatial_layer_id, multi_ref_, comp_pred_,
(video->frame() % cfg_.kf_max_dist) == 0, dynamic_enable_disable_mode_,
- rps_mode_, rps_recovery_frame_);
+ rps_mode_, rps_recovery_frame_, simulcast_mode_);
if (intra_only_ == 1 && frame_sync_ > 0) {
// Set an Intra-only frame on SL0 at frame_sync_.
// In order to allow decoding to start on SL0 in mid-sequence we need to
@@ -227,7 +229,7 @@ class DatarateTestSVC
}
}
- virtual void PostEncodeFrameHook(::libaom_test::Encoder *encoder) {
+ void PostEncodeFrameHook(::libaom_test::Encoder *encoder) override {
int num_operating_points;
encoder->Control(AV1E_GET_NUM_OPERATING_POINTS, &num_operating_points);
ASSERT_EQ(num_operating_points,
@@ -242,7 +244,7 @@ class DatarateTestSVC
}
}
- virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
// Update the layer cumulative bitrate.
for (int i = layer_id_.temporal_layer_id; i < number_temporal_layers_;
@@ -254,36 +256,50 @@ class DatarateTestSVC
last_pts_ = pkt->data.frame.pts;
superframe_cnt_++;
}
+ // For simulcast mode: verify that, on the first frame to start decoding,
+ // the SL > 0 frames are Intra-only frames (not Key), whereas SL0 is Key.
+ if (simulcast_mode_ && superframe_cnt_ == (int)frame_to_start_decoding_) {
+ if (layer_id_.spatial_layer_id > 0) {
+ EXPECT_NE(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+ } else if (layer_id_.spatial_layer_id == 0) {
+ EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+ }
+ }
}
- virtual void EndPassHook() {
+ void EndPassHook() override {
duration_ = ((last_pts_ + 1) * timebase_);
for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
effective_datarate_tl[i] = (effective_datarate_tl[i] / 1000) / duration_;
}
}
- virtual bool DoDecode() const {
+ bool DoDecode() const override {
if (drop_frames_ > 0) {
for (unsigned int i = 0; i < drop_frames_; ++i) {
if (drop_frames_list_[i] == (unsigned int)superframe_cnt_) {
std::cout << " Skipping decoding frame: "
<< drop_frames_list_[i] << "\n";
- return 0;
+ return false;
}
}
} else if (intra_only_ == 1) {
// Only start decoding at frames_to_start_decoding_.
- if (current_video_frame_ < frame_to_start_decoding_) return 0;
+ if (current_video_frame_ < frame_to_start_decoding_) return false;
// Only decode base layer for 3SL, for layer_to_decode_ = 0.
if (layer_to_decode_ == 0 && frame_sync_ > 0 &&
(layer_frame_cnt_ - 1) % 3 != 0)
- return 0;
- }
- return 1;
+ return false;
+ } else if (simulcast_mode_) {
+ // Only start decoding at frame_to_start_decoding_ and only
+ // for top spatial layer SL2 (layer_to_decode_).
+ if (current_video_frame_ < frame_to_start_decoding_) return false;
+ if (layer_id_.spatial_layer_id < (int)layer_to_decode_) return false;
+ }
+ return true;
}
- virtual void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) {
+ void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) override {
double mismatch_psnr = compute_psnr(img1, img2);
mismatch_psnr_ += mismatch_psnr;
++mismatch_nframes_;
@@ -345,13 +361,301 @@ class DatarateTestSVC
}
}
+ // Simulcast mode for 3 spatial and 3 temporal layers.
+ // No inter-layer prediction; the only prediction is temporal, with a
+ // single reference (LAST).
+ // No overlap in buffer slots between spatial layers. So for example,
+ // SL0 only uses slots 0 and 1.
+ // SL1 only uses slots 2 and 3.
+ // SL2 only uses slots 4 and 5.
+ // All 7 references for each inter-frame must only access buffer slots
+ // for that spatial layer.
+ // On key (super)frames: SL1 and SL2 must have no references set
+ // and must refresh all the slots for that layer only (so 2 and 3
+ // for SL1, 4 and 5 for SL2). The base SL0 will be labelled internally
+ // as a Key frame (refresh all slots). SL1/SL2 will be labelled
+ // internally as Intra-only frames that allow that stream to be decoded.
+ // These conditions allow each spatial stream to be independently
+ // decodable.
+ static void ref_config_simulcast3SL3TL(
+ aom_svc_ref_frame_config_t *ref_frame_config,
+ aom_svc_layer_id_t *layer_id, int is_key_frame, int superframe_cnt) {
+ int i;
+ // Initialize all references to 0 (don't use reference).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->reference[i] = 0;
+ // Initialize as no refresh/update for all slots.
+ for (i = 0; i < REF_FRAMES; i++) ref_frame_config->refresh[i] = 0;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 0;
+
+ if (is_key_frame) {
+ if (layer_id->spatial_layer_id == 0) {
+ // Assign LAST/GOLDEN to slot 0/1.
+ // Refresh slots 0 and 1 for SL0.
+ // SL0: this will get set to KEY frame internally.
+ ref_frame_config->ref_idx[0] = 0;
+ ref_frame_config->ref_idx[3] = 1;
+ ref_frame_config->refresh[0] = 1;
+ ref_frame_config->refresh[1] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Assign LAST/GOLDEN to slot 2/3.
+ // Refresh slots 2 and 3 for SL1.
+ // This will get set to Intra-only frame internally.
+ ref_frame_config->ref_idx[0] = 2;
+ ref_frame_config->ref_idx[3] = 3;
+ ref_frame_config->refresh[2] = 1;
+ ref_frame_config->refresh[3] = 1;
+ } else if (layer_id->spatial_layer_id == 2) {
+ // Assign LAST/GOLDEN to slot 4/5.
+ // Refresh slots 4 and 5 for SL2.
+ // This will get set to Intra-only frame internally.
+ ref_frame_config->ref_idx[0] = 4;
+ ref_frame_config->ref_idx[3] = 5;
+ ref_frame_config->refresh[4] = 1;
+ ref_frame_config->refresh[5] = 1;
+ }
+ } else if (superframe_cnt % 4 == 0) {
+ // Base temporal layer: TL0
+ layer_id->temporal_layer_id = 0;
+ if (layer_id->spatial_layer_id == 0) { // SL0
+ // Reference LAST. Assign all references to either slot
+ // 0 or 1. Here we assign LAST to slot 0, all others to 1.
+ // Update slot 0 (LAST).
+ ref_frame_config->reference[0] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 1;
+ ref_frame_config->ref_idx[0] = 0;
+ ref_frame_config->refresh[0] = 1;
+ } else if (layer_id->spatial_layer_id == 1) { // SL1
+ // Reference LAST. Assign all references to either slot
+ // 2 or 3. Here we assign LAST to slot 2, all others to 3.
+ // Update slot 2 (LAST).
+ ref_frame_config->reference[0] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 3;
+ ref_frame_config->ref_idx[0] = 2;
+ ref_frame_config->refresh[2] = 1;
+ } else if (layer_id->spatial_layer_id == 2) { // SL2
+ // Reference LAST. Assign all references to either slot
+ // 4 or 5. Here we assign LAST to slot 4, all others to 5.
+ // Update slot 4 (LAST).
+ ref_frame_config->reference[0] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 5;
+ ref_frame_config->ref_idx[0] = 4;
+ ref_frame_config->refresh[4] = 1;
+ }
+ } else if ((superframe_cnt - 1) % 4 == 0) {
+ // First top temporal enhancement layer: TL2
+ layer_id->temporal_layer_id = 2;
+ if (layer_id->spatial_layer_id == 0) { // SL0
+ // Reference LAST (slot 0). Assign other references to slot 1.
+ // No update/refresh on any slots.
+ ref_frame_config->reference[0] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 1;
+ ref_frame_config->ref_idx[0] = 0;
+ } else if (layer_id->spatial_layer_id == 1) { // SL1
+ // Reference LAST (slot 2). Assign other references to slot 3.
+ // No update/refresh on any slots.
+ ref_frame_config->reference[0] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 3;
+ ref_frame_config->ref_idx[0] = 2;
+ } else if (layer_id->spatial_layer_id == 2) { // SL2
+ // Reference LAST (slot 4). Assign other references to slot 5.
+ // No update/refresh on any slots.
+ ref_frame_config->reference[0] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 5;
+ ref_frame_config->ref_idx[0] = 4;
+ }
+ } else if ((superframe_cnt - 2) % 4 == 0) {
+ // Middle temporal enhancement layer: TL1
+ layer_id->temporal_layer_id = 1;
+ if (layer_id->spatial_layer_id == 0) { // SL0
+ // Reference LAST (slot 0).
+ // Set GOLDEN to slot 1 and update slot 1.
+ // This will be used as reference for next TL2.
+ ref_frame_config->reference[0] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 1;
+ ref_frame_config->ref_idx[0] = 0;
+ ref_frame_config->refresh[1] = 1;
+ } else if (layer_id->spatial_layer_id == 1) { // SL1
+ // Reference LAST (slot 2).
+ // Set GOLDEN to slot 3 and update slot 3.
+ // This will be used as reference for next TL2.
+ ref_frame_config->reference[0] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 3;
+ ref_frame_config->ref_idx[0] = 2;
+ ref_frame_config->refresh[3] = 1;
+ } else if (layer_id->spatial_layer_id == 2) { // SL2
+ // Reference LAST (slot 4).
+ // Set GOLDEN to slot 5 and update slot 5.
+ // This will be used as reference for next TL2.
+ ref_frame_config->reference[0] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 5;
+ ref_frame_config->ref_idx[0] = 4;
+ ref_frame_config->refresh[5] = 1;
+ }
+ } else if ((superframe_cnt - 3) % 4 == 0) {
+ // Second top temporal enhancement layer: TL2
+ layer_id->temporal_layer_id = 2;
+ if (layer_id->spatial_layer_id == 0) { // SL0
+ // Reference LAST (slot 1). Assign other references to slot 0.
+ // No update/refresh on any slots.
+ ref_frame_config->reference[0] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[0] = 1;
+ } else if (layer_id->spatial_layer_id == 1) { // SL1
+ // Reference LAST (slot 3). Assign other references to slot 2.
+ // No update/refresh on any slots.
+ ref_frame_config->reference[0] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 2;
+ ref_frame_config->ref_idx[0] = 3;
+ } else if (layer_id->spatial_layer_id == 2) { // SL2
+ // Reference LAST (slot 5). Assign other references to slot 4.
+ // No update/refresh on any slots.
+ ref_frame_config->reference[0] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 4;
+ ref_frame_config->ref_idx[0] = 5;
+ }
+ }
+ }
+
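The simulcast configuration above pins each spatial layer to a disjoint
pair of reference-buffer slots (SL0 owns slots 0-1, SL1 slots 2-3, SL2
slots 4-5), which is what makes each stream independently decodable. A
minimal sketch of that invariant, with a hypothetical helper name:

// Hypothetical check of the slot partition enforced by
// ref_config_simulcast3SL3TL(): spatial layer n only touches slots
// { 2n, 2n + 1 }, for both references and refreshes.
static inline bool SlotOwnedByLayer(int slot, int spatial_layer_id) {
  return slot == 2 * spatial_layer_id || slot == 2 * spatial_layer_id + 1;
}
// Every ref_idx[i] and every refreshed slot set by the function above
// satisfies SlotOwnedByLayer(slot, layer_id->spatial_layer_id).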
+ // 3 spatial and 3 temporal layers.
+ // Overlap in the buffer slot updates: slots 3 and 4, updated by the
+ // first TL2, are reused for updates in the TL1 superframe.
+ static void ref_config_3SL3TL(aom_svc_ref_frame_config_t *ref_frame_config,
+ aom_svc_layer_id_t *layer_id, int is_key_frame,
+ int superframe_cnt) {
+ if (superframe_cnt % 4 == 0) {
+ // Base temporal layer.
+ layer_id->temporal_layer_id = 0;
+ if (layer_id->spatial_layer_id == 0) {
+ // Reference LAST, update LAST.
+ // Set all buffer_idx to 0.
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->refresh[0] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+ // GOLDEN (and all other refs) to slot 0.
+ // Update slot 1 (LAST).
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[0] = 1;
+ ref_frame_config->refresh[1] = 1;
+ } else if (layer_id->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+ // GOLDEN (and all other refs) to slot 1.
+ // Update slot 2 (LAST).
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 1;
+ ref_frame_config->ref_idx[0] = 2;
+ ref_frame_config->refresh[2] = 1;
+ }
+ } else if ((superframe_cnt - 1) % 4 == 0) {
+ // First top temporal enhancement layer.
+ layer_id->temporal_layer_id = 2;
+ if (layer_id->spatial_layer_id == 0) {
+ // Reference LAST (slot 0).
+ // Set GOLDEN to slot 3 and update slot 3.
+ // Set all other buffer_idx to slot 0.
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[3] = 3;
+ ref_frame_config->refresh[3] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+ // GOLDEN (and all other refs) to slot 3.
+ // Set LAST2 to slot 4 and update slot 4.
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 3;
+ ref_frame_config->ref_idx[0] = 1;
+ ref_frame_config->ref_idx[1] = 4;
+ ref_frame_config->refresh[4] = 1;
+ } else if (layer_id->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+ // GOLDEN (and all other refs) to slot 4.
+ // No update.
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 4;
+ ref_frame_config->ref_idx[0] = 2;
+ }
+ } else if ((superframe_cnt - 2) % 4 == 0) {
+ // Middle temporal enhancement layer.
+ layer_id->temporal_layer_id = 1;
+ if (layer_id->spatial_layer_id == 0) {
+ // Reference LAST.
+ // Set all buffer_idx to 0.
+ // Set GOLDEN to slot 3 and update slot 3.
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[3] = 3;
+ ref_frame_config->refresh[3] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+ // GOLDEN (and all other refs) to slot 3.
+ // Set LAST2 to slot 4 and update slot 4.
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 3;
+ ref_frame_config->ref_idx[0] = 1;
+ ref_frame_config->ref_idx[2] = 4;
+ ref_frame_config->refresh[4] = 1;
+ } else if (layer_id->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+ // GOLDEN (and all other refs) to slot 4.
+ // Set LAST2 to slot 5 and update slot 5.
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 4;
+ ref_frame_config->ref_idx[0] = 2;
+ ref_frame_config->ref_idx[2] = 5;
+ ref_frame_config->refresh[5] = 1;
+ }
+ } else if ((superframe_cnt - 3) % 4 == 0) {
+ // Second top temporal enhancement layer.
+ layer_id->temporal_layer_id = 2;
+ if (layer_id->spatial_layer_id == 0) {
+ // Set LAST to slot 3 and reference LAST.
+ // Set GOLDEN to slot 3 and update slot 3.
+ // Set all other buffer_idx to 0.
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[0] = 3;
+ ref_frame_config->ref_idx[3] = 3;
+ ref_frame_config->refresh[3] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 4,
+ // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4.
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[0] = 4;
+ ref_frame_config->ref_idx[3] = 3;
+ ref_frame_config->ref_idx[1] = 4;
+ ref_frame_config->refresh[4] = 1;
+ } else if (layer_id->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 5,
+ // GOLDEN to slot 4. No update.
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[0] = 5;
+ ref_frame_config->ref_idx[3] = 4;
+ }
+ }
+ if (layer_id->spatial_layer_id > 0) {
+ // Always reference GOLDEN (inter-layer prediction).
+ ref_frame_config->reference[3] = 1;
+ if (is_key_frame && layer_id->spatial_layer_id > 0) {
+ // On superframes whose base is key: remove LAST since GOLDEN
+ // is used as reference.
+ ref_frame_config->reference[0] = 0;
+ }
+ }
+ }
+
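Both reference configurations above share the same 4-superframe temporal
cadence: TL0, TL2, TL1, TL2. A small sketch of the schedule implied by
the (superframe_cnt % 4) branches, assuming superframes are counted from
zero:

// Sketch of the temporal-layer schedule used by both functions above.
static inline int TemporalLayerFor(int superframe_cnt) {
  switch (superframe_cnt % 4) {
    case 0: return 0;   // base temporal layer
    case 2: return 1;   // middle enhancement layer
    default: return 2;  // cases 1 and 3: top enhancement layer
  }
}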
// Layer pattern configuration.
virtual int set_layer_pattern(
int frame_cnt, aom_svc_layer_id_t *layer_id,
aom_svc_ref_frame_config_t *ref_frame_config,
aom_svc_ref_frame_comp_pred_t *ref_frame_comp_pred, int spatial_layer,
int multi_ref, int comp_pred, int is_key_frame,
- int dynamic_enable_disable_mode, int rps_mode, int rps_recovery_frame) {
+ int dynamic_enable_disable_mode, int rps_mode, int rps_recovery_frame,
+ int simulcast_mode) {
int lag_index = 0;
int base_count = frame_cnt >> 2;
layer_id->spatial_layer_id = spatial_layer;
@@ -506,129 +810,22 @@ class DatarateTestSVC
// Reference GOLDEN.
if (layer_id->spatial_layer_id > 0) ref_frame_config->reference[3] = 1;
} else if (number_temporal_layers_ == 3 && number_spatial_layers_ == 3) {
- // 3 spatial and 3 temporal layer.
- // Overlap in the buffer slot updates: the slots 3 and 4 updated by
- // first TL2 are reused for update in TL1 superframe.
- if (superframe_cnt_ % 4 == 0) {
- // Base temporal layer.
- layer_id->temporal_layer_id = 0;
- if (layer_id->spatial_layer_id == 0) {
- // Reference LAST, update LAST.
- // Set all buffer_idx to 0.
- for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
- ref_frame_config->refresh[0] = 1;
- } else if (layer_id->spatial_layer_id == 1) {
- // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
- // GOLDEN (and all other refs) to slot 0.
- // Update slot 1 (LAST).
- for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
- ref_frame_config->ref_idx[0] = 1;
- ref_frame_config->refresh[1] = 1;
- } else if (layer_id->spatial_layer_id == 2) {
- // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
- // GOLDEN (and all other refs) to slot 1.
- // Update slot 2 (LAST).
- for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 1;
- ref_frame_config->ref_idx[0] = 2;
- ref_frame_config->refresh[2] = 1;
- }
- } else if ((superframe_cnt_ - 1) % 4 == 0) {
- // First top temporal enhancement layer.
- layer_id->temporal_layer_id = 2;
- if (layer_id->spatial_layer_id == 0) {
- // Reference LAST (slot 0).
- // Set GOLDEN to slot 3 and update slot 3.
- // Set all other buffer_idx to slot 0.
- for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
- ref_frame_config->ref_idx[3] = 3;
- ref_frame_config->refresh[3] = 1;
- } else if (layer_id->spatial_layer_id == 1) {
- // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
- // GOLDEN (and all other refs) to slot 3.
- // Set LAST2 to slot 4 and Update slot 4.
- for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 3;
- ref_frame_config->ref_idx[0] = 1;
- ref_frame_config->ref_idx[1] = 4;
- ref_frame_config->refresh[4] = 1;
- } else if (layer_id->spatial_layer_id == 2) {
- // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
- // GOLDEN (and all other refs) to slot 4.
- // No update.
- for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 4;
- ref_frame_config->ref_idx[0] = 2;
- }
- } else if ((superframe_cnt_ - 2) % 4 == 0) {
- // Middle temporal enhancement layer.
- layer_id->temporal_layer_id = 1;
- if (layer_id->spatial_layer_id == 0) {
- // Reference LAST.
- // Set all buffer_idx to 0.
- // Set GOLDEN to slot 3 and update slot 3.
- for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
- ref_frame_config->ref_idx[3] = 3;
- ref_frame_config->refresh[3] = 1;
- } else if (layer_id->spatial_layer_id == 1) {
- // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
- // GOLDEN (and all other refs) to slot 3.
- // Set LAST2 to slot 4 and update slot 4.
- for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 3;
- ref_frame_config->ref_idx[0] = 1;
- ref_frame_config->ref_idx[2] = 4;
- ref_frame_config->refresh[4] = 1;
- } else if (layer_id->spatial_layer_id == 2) {
- // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
- // GOLDEN (and all other refs) to slot 4.
- // Set LAST2 to slot 5 and update slot 5.
- for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 4;
- ref_frame_config->ref_idx[0] = 2;
- ref_frame_config->ref_idx[2] = 5;
- ref_frame_config->refresh[5] = 1;
- }
- } else if ((superframe_cnt_ - 3) % 4 == 0) {
- // Second top temporal enhancement layer.
- layer_id->temporal_layer_id = 2;
- if (layer_id->spatial_layer_id == 0) {
- // Set LAST to slot 3 and reference LAST.
- // Set GOLDEN to slot 3 and update slot 3.
- // Set all other buffer_idx to 0.
- for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
- ref_frame_config->ref_idx[0] = 3;
- ref_frame_config->ref_idx[3] = 3;
- ref_frame_config->refresh[3] = 1;
- } else if (layer_id->spatial_layer_id == 1) {
- // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 4,
- // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4.
- for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
- ref_frame_config->ref_idx[0] = 4;
- ref_frame_config->ref_idx[3] = 3;
- ref_frame_config->ref_idx[1] = 4;
- ref_frame_config->refresh[4] = 1;
- } else if (layer_id->spatial_layer_id == 2) {
- // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 5,
- // GOLDEN to slot 4. No update.
- for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
- ref_frame_config->ref_idx[0] = 5;
- ref_frame_config->ref_idx[3] = 4;
- }
- }
- if (layer_id->spatial_layer_id > 0) {
- // Always reference GOLDEN (inter-layer prediction).
- ref_frame_config->reference[3] = 1;
- if (is_key_frame && layer_id->spatial_layer_id > 0) {
- // On superframes whose base is key: remove LAST since GOLDEN
- // is used as reference.
- ref_frame_config->reference[0] = 0;
+ if (simulcast_mode) {
+ ref_config_simulcast3SL3TL(ref_frame_config, layer_id, is_key_frame,
+ superframe_cnt_);
+ } else {
+ ref_config_3SL3TL(ref_frame_config, layer_id, is_key_frame,
+ superframe_cnt_);
+ // Allow for top spatial layer to use additional temporal reference.
+ // Additional reference is only updated on base temporal layer, every
+ // 10 TL0 frames here.
+ if (multi_ref && layer_id->spatial_layer_id == 2) {
+ ref_frame_config->ref_idx[6] = 7;
+ if (!is_key_frame) ref_frame_config->reference[6] = 1;
+ if (base_count % 10 == 0 && layer_id->temporal_layer_id == 0)
+ ref_frame_config->refresh[7] = 1;
}
}
- // Allow for top spatial layer to use additional temporal reference.
- // Additional reference is only updated on base temporal layer, every
- // 10 TL0 frames here.
- if (multi_ref && layer_id->spatial_layer_id == 2) {
- ref_frame_config->ref_idx[6] = 7;
- if (!is_key_frame) ref_frame_config->reference[6] = 1;
- if (base_count % 10 == 0 && layer_id->temporal_layer_id == 0)
- ref_frame_config->refresh[7] = 1;
- }
}
// If the top spatial layer is first-time encoded in mid-sequence
// (i.e., dynamic_enable_disable_mode = 1), then don't predict from LAST,
@@ -1090,6 +1287,60 @@ class DatarateTestSVC
EXPECT_EQ((int)GetMismatchFrames(), 150);
}
+ virtual void BasicRateTargetingSVC3TL3SLSimulcast() {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+ cfg_.kf_max_dist = 150;
+ cfg_.kf_min_dist = 150;
+ int num_frames = 300;
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, num_frames);
+ const int bitrate_array[2] = { 500, 1000 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+ simulcast_mode_ = 1;
+ frame_to_start_decoding_ = cfg_.kf_max_dist;
+ layer_to_decode_ = 2; // SL2
+ number_temporal_layers_ = 3;
+ number_spatial_layers_ = 3;
+ // SL0
+ const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+ target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+ target_layer_bitrate_[2] = bitrate_sl0;
+ // SL1
+ const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+ target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+ target_layer_bitrate_[5] = bitrate_sl1;
+ // SL2
+ const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+ target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+ target_layer_bitrate_[8] = bitrate_sl2;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ // Only SL2 layer is decoded.
+ for (int tl = 0; tl < number_temporal_layers_; tl++) {
+ int i = layer_to_decode_ * number_temporal_layers_ + tl;
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.6)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.7)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ // Only top spatial layer (SL2) is decoded, starting at frame 150
+ // (frame_to_start_decoding_), so there are (300 - 150) / 2 = 75
+ // non-reference frames, and the expected mismatch count is 75.
+ int num_mismatch = (num_frames - frame_to_start_decoding_) / 2;
+ EXPECT_EQ((int)GetMismatchFrames(), num_mismatch);
+ }
+
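The targets above split the total bitrate 1:3:4 across the spatial
layers (1/8, 3/8, 4/8 of rc_target_bitrate) and then 50% / 70% / 100%
cumulatively across temporal layers. A worked check for
rc_target_bitrate = 500 (the first bitrate_array entry), using C integer
division:

// SL0: 1 * 500 / 8 = 62  -> TL targets 31, 43, 62
// SL1: 3 * 500 / 8 = 187 -> TL targets 93, 130, 187
// SL2: 4 * 500 / 8 = 250 -> TL targets 125, 175, 250
// Full-rate layers sum to 62 + 187 + 250 = 499, i.e. roughly
// rc_target_bitrate after truncation.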
virtual void BasicRateTargetingSVC1TL2SLIntraOnlyTest() {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
@@ -2133,6 +2384,7 @@ class DatarateTestSVC
int screen_mode_;
int rps_mode_;
int rps_recovery_frame_;
+ int simulcast_mode_;
int user_define_frame_qp_;
int frame_qp_;
@@ -2199,6 +2451,14 @@ TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLIntraMidSeqDecodeAll) {
BasicRateTargetingSVC3TL3SLIntraMidSeqDecodeAll();
}
+// Check simulcast mode for 3 spatial and 3 temporal layers. A key frame
+// is inserted on base SL0 in mid-stream; verify that the top spatial
+// layer (SL2) can be decoded, starting with an Intra-only frame, and
+// that all SL2 frames decode with no mismatch.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLSimulcast) {
+ BasicRateTargetingSVC3TL3SLSimulcast();
+}
+
// Check basic rate targeting for CBR, for 2 spatial layers, 1 temporal,
// with Intra-only frame inserted in the stream.
TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL2SLIntraOnly) {
@@ -2373,6 +2633,39 @@ TEST_P(DatarateTestSVC, BasicRateTargetingRPS1TL1SLDropFrames) {
BasicRateTargetingRPS1TL1SLDropFramesTest();
}
+TEST(SvcParams, BitrateOverflow) {
+ uint8_t buf[6] = { 0 };
+ aom_image_t img;
+ aom_codec_ctx_t enc;
+ aom_codec_enc_cfg_t cfg;
+
+ EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, 1, 1, 1, buf));
+
+ aom_codec_iface_t *const iface = aom_codec_av1_cx();
+ EXPECT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME),
+ AOM_CODEC_OK);
+ cfg.g_w = 1;
+ cfg.g_h = 1;
+ ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK);
+
+ aom_svc_params_t svc_params = {};
+ svc_params.framerate_factor[0] = 1;
+ svc_params.framerate_factor[1] = 2;
+ svc_params.number_spatial_layers = 1;
+ svc_params.number_temporal_layers = 2;
+ svc_params.layer_target_bitrate[0] = INT_MAX;
+ svc_params.layer_target_bitrate[1] = INT_MAX;
+ EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_SVC_PARAMS, &svc_params),
+ AOM_CODEC_OK);
+ EXPECT_EQ(
+ aom_codec_encode(&enc, &img, /*pts=*/0, /*duration=*/1, /*flags=*/0),
+ AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_encode(&enc, /*img=*/nullptr, /*pts=*/0, /*duration=*/0,
+ /*flags=*/0),
+ AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+}
+
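The BitrateOverflow test above pushes INT_MAX per-layer targets through
AV1E_SET_SVC_PARAMS; summing two such 32-bit values naively would wrap.
A minimal sketch, not libaom's actual implementation, of the 64-bit
accumulation this kind of regression test guards:

#include <cstdint>

// Hypothetical helper: accumulate per-layer targets in 64 bits so that
// INT_MAX + INT_MAX cannot overflow before any clamping is applied.
static int64_t TotalTargetBitrate(const int *layer_target_bitrate, int n) {
  int64_t total = 0;
  for (int i = 0; i < n; ++i) total += layer_target_bitrate[i];
  return total;
}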
AV1_INSTANTIATE_TEST_SUITE(DatarateTestSVC,
::testing::Values(::libaom_test::kRealTime),
::testing::Range(7, 12), ::testing::Values(0, 3),
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
index e689cd321..85f68b817 100644
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -57,8 +57,8 @@ typedef std::tuple<TemporalFilterFuncParam, int> TemporalFilterWithParam;
class TemporalFilterTest
: public ::testing::TestWithParam<TemporalFilterWithParam> {
public:
- virtual ~TemporalFilterTest() {}
- virtual void SetUp() {
+ ~TemporalFilterTest() override = default;
+ void SetUp() override {
params_ = GET_PARAM(0);
tf_wgt_calc_lvl_ = GET_PARAM(1);
rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -71,7 +71,7 @@ class TemporalFilterTest
ASSERT_NE(src2_, nullptr);
}
- virtual void TearDown() {
+ void TearDown() override {
aom_free(src1_);
aom_free(src2_);
}
@@ -308,6 +308,23 @@ INSTANTIATE_TEST_SUITE_P(NEON, TemporalFilterTest,
Values(0, 1)));
#endif // HAVE_NEON
+#if HAVE_NEON_DOTPROD
+TemporalFilterFuncParam temporal_filter_test_neon_dotprod[] = {
+ TemporalFilterFuncParam(&av1_apply_temporal_filter_c,
+ &av1_apply_temporal_filter_neon_dotprod)
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, TemporalFilterTest,
+ Combine(ValuesIn(temporal_filter_test_neon_dotprod),
+ Values(0, 1)));
+#endif // HAVE_NEON_DOTPROD
+
+#if HAVE_AVX2 || HAVE_NEON
+// Width and height for which av1_estimate_noise_from_single_plane() will be
+// tested.
+const int kWidths[] = { 3840, 1920, 1280, 800, 640, 360, 357 };
+const int kHeights[] = { 2160, 1080, 720, 600, 480, 240, 237 };
+#endif // HAVE_AVX2 || HAVE_NEON
+
typedef double (*EstimateNoiseFunc)(const uint8_t *src, int height, int width,
int stride, int edge_thresh);
@@ -317,8 +334,8 @@ typedef std::tuple<EstimateNoiseFunc, EstimateNoiseFunc, int, int>
class EstimateNoiseTest
: public ::testing::TestWithParam<EstimateNoiseWithParam> {
public:
- virtual ~EstimateNoiseTest() {}
- virtual void SetUp() {
+ ~EstimateNoiseTest() override = default;
+ void SetUp() override {
ref_func = GET_PARAM(0);
tst_func = GET_PARAM(1);
width_ = GET_PARAM(2);
@@ -330,7 +347,7 @@ class EstimateNoiseTest
ASSERT_NE(src1_, nullptr);
}
- virtual void TearDown() { aom_free(src1_); }
+ void TearDown() override { aom_free(src1_); }
void RunTest(int run_times) {
stride_ = width_;
@@ -387,11 +404,6 @@ TEST_P(EstimateNoiseTest, RandomValues) { RunTest(1); }
TEST_P(EstimateNoiseTest, DISABLED_Speed) { SpeedTest(2000); }
#if HAVE_AVX2
-// Width and height for which av1_estimate_noise_from_single_plane() will be
-// tested.
-const int kWidths[] = { 3840, 1920, 1280, 800, 640, 360, 357 };
-const int kHeights[] = { 2160, 1080, 720, 600, 480, 240, 237 };
-
INSTANTIATE_TEST_SUITE_P(
AVX2, EstimateNoiseTest,
::testing::Combine(
@@ -400,6 +412,15 @@ INSTANTIATE_TEST_SUITE_P(
::testing::ValuesIn(kWidths), ::testing::ValuesIn(kHeights)));
#endif // HAVE_AVX2
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, EstimateNoiseTest,
+ ::testing::Combine(
+ ::testing::Values(av1_estimate_noise_from_single_plane_c),
+ ::testing::Values(av1_estimate_noise_from_single_plane_neon),
+ ::testing::ValuesIn(kWidths), ::testing::ValuesIn(kHeights)));
+#endif // HAVE_NEON
+
#if CONFIG_AV1_HIGHBITDEPTH
typedef void (*HBDTemporalFilterFunc)(
@@ -416,8 +437,8 @@ typedef std::tuple<HBDTemporalFilterFuncParam, int> HBDTemporalFilterWithParam;
class HBDTemporalFilterTest
: public ::testing::TestWithParam<HBDTemporalFilterWithParam> {
public:
- virtual ~HBDTemporalFilterTest() {}
- virtual void SetUp() {
+ ~HBDTemporalFilterTest() override = default;
+ void SetUp() override {
params_ = GET_PARAM(0);
tf_wgt_calc_lvl_ = GET_PARAM(1);
rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -430,7 +451,7 @@ class HBDTemporalFilterTest
ASSERT_NE(src2_, nullptr);
}
- virtual void TearDown() {
+ void TearDown() override {
aom_free(src1_);
aom_free(src2_);
}
@@ -664,6 +685,104 @@ INSTANTIATE_TEST_SUITE_P(AVX2, HBDTemporalFilterTest,
Combine(ValuesIn(HBDtemporal_filter_test_avx2),
Values(0, 1)));
#endif // HAVE_AVX2
+
+#if HAVE_NEON
+HBDTemporalFilterFuncParam HBDtemporal_filter_test_neon[] = {
+ HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
+ &av1_highbd_apply_temporal_filter_neon)
+};
+INSTANTIATE_TEST_SUITE_P(NEON, HBDTemporalFilterTest,
+ Combine(ValuesIn(HBDtemporal_filter_test_neon),
+ Values(0, 1)));
+#endif // HAVE_NEON
+
+using HBDEstimateNoiseFunc = double (*)(const uint16_t *src, int height,
+ int width, int stride, int bit_depth,
+ int edge_thresh);
+
+using HBDEstimateNoiseWithParam =
+ std::tuple<HBDEstimateNoiseFunc, HBDEstimateNoiseFunc, int, int, int>;
+
+class HBDEstimateNoiseTest
+ : public ::testing::TestWithParam<HBDEstimateNoiseWithParam> {
+ public:
+ HBDEstimateNoiseTest()
+ : ref_func_(GET_PARAM(0)), tst_func_(GET_PARAM(1)),
+ rnd_(libaom_test::ACMRandom::DeterministicSeed()), width_(GET_PARAM(2)),
+ height_(GET_PARAM(3)), bitdepth_(GET_PARAM(4)) {}
+ ~HBDEstimateNoiseTest() override = default;
+ void SetUp() override {
+ src1_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * width_ * height_));
+ ASSERT_NE(src1_, nullptr);
+ GenRandomData(width_ * height_);
+ }
+
+ void TearDown() override { aom_free(src1_); }
+
+ void RunTest() {
+ stride_ = width_;
+
+ double ref_out = ref_func_(src1_, height_, width_, stride_, bitdepth_,
+ NOISE_ESTIMATION_EDGE_THRESHOLD);
+
+ double tst_out = tst_func_(src1_, height_, width_, stride_, bitdepth_,
+ NOISE_ESTIMATION_EDGE_THRESHOLD);
+
+ EXPECT_EQ(ref_out, tst_out);
+ }
+
+ void SpeedTest(int run_times) {
+ stride_ = width_;
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; i++) {
+ ref_func_(src1_, height_, width_, stride_, bitdepth_,
+ NOISE_ESTIMATION_EDGE_THRESHOLD);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; i++) {
+ tst_func_(src1_, height_, width_, stride_, bitdepth_,
+ NOISE_ESTIMATION_EDGE_THRESHOLD);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ printf("%d %dx%d :%7.2f/%7.2f (%3.2f)\n", bitdepth_, width_, height_, time1,
+ time2, time1 / time2);
+ }
+
+ void GenRandomData(int size) {
+ for (int ii = 0; ii < size; ii++) src1_[ii] = rnd_.Rand12();
+ }
+
+ private:
+ HBDEstimateNoiseFunc ref_func_;
+ HBDEstimateNoiseFunc tst_func_;
+ ACMRandom rnd_;
+ uint16_t *src1_;
+ int width_;
+ int height_;
+ int stride_;
+ int bitdepth_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HBDEstimateNoiseTest);
+
+TEST_P(HBDEstimateNoiseTest, RandomValues) { RunTest(); }
+
+TEST_P(HBDEstimateNoiseTest, DISABLED_Speed) { SpeedTest(2000); }
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, HBDEstimateNoiseTest,
+ ::testing::Combine(
+ ::testing::Values(av1_highbd_estimate_noise_from_single_plane_c),
+ ::testing::Values(av1_highbd_estimate_noise_from_single_plane_neon),
+ ::testing::ValuesIn(kWidths), ::testing::ValuesIn(kHeights),
+ ::testing::ValuesIn({ 8, 10, 12 })));
+#endif // HAVE_NEON
#endif // CONFIG_AV1_HIGHBITDEPTH
} // namespace
#endif
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 4bd0ddc09..4b4a96d44 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -571,3 +571,5 @@ c7f336958e7af6162c20ddc84d67c7dfa9826910 *av1-1-b8-16-intra_only-intrabc-extreme
36a4fcf07e645ed522cde5845dd9c6ab2b2d1502 *av1-1-b8-16-intra_only-intrabc-extreme-dv.ivf.md5
9f935d391fdf4a6f7c320355d45770d2e7d6095c *desktopqvga2.320_240.yuv
4d1ad6d3070268ccb000d7fc3ae0f5a9447bfe82 *test_input_w1h1.yuv
+ad9942a073e245585c93f764ea299382a65939a7 *crowd_run_360p_10_150f.y4m
+9c2aa2d0f63f706f775bf661dfa81e8bb3089d8b *wikipedia_420_360p_60f.y4m
diff --git a/test/test.cmake b/test/test.cmake
index 672edb3f1..2ca7e641c 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -157,21 +157,35 @@ if(NOT BUILD_SHARED_LIBS)
"${AOM_ROOT}/test/simd_cmp_impl.h"
"${AOM_ROOT}/test/simd_impl.h")
- list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_NEON
- "${AOM_ROOT}/test/simd_cmp_neon.cc")
- add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_NEON)
+ if(HAVE_NEON)
+ list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_NEON
+ "${AOM_ROOT}/test/simd_cmp_neon.cc")
+ add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_NEON)
+ endif()
- list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSE2
- "${AOM_ROOT}/test/simd_cmp_sse2.cc")
- add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_SSE2)
+ if(HAVE_SSE2)
+ list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSE2
+ "${AOM_ROOT}/test/simd_cmp_sse2.cc")
+ add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_SSE2)
+ endif()
- list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSSE3
- "${AOM_ROOT}/test/simd_cmp_ssse3.cc")
- add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_SSSE3)
+ if(HAVE_SSSE3)
+ list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSSE3
+ "${AOM_ROOT}/test/simd_cmp_ssse3.cc")
+ add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_SSSE3)
+ endif()
- list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_AVX2
- "${AOM_ROOT}/test/simd_cmp_avx2.cc")
- add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_AVX2)
+ if(HAVE_SSE4_1)
+ list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSE4_1
+ "${AOM_ROOT}/test/simd_cmp_sse4.cc")
+ add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_SSE4_1)
+ endif()
+
+ if(HAVE_AVX2)
+ list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_AVX2
+ "${AOM_ROOT}/test/simd_cmp_avx2.cc")
+ add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_AVX2)
+ endif()
list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
"${AOM_ROOT}/test/arf_freq_test.cc"
@@ -180,6 +194,7 @@ if(NOT BUILD_SHARED_LIBS)
"${AOM_ROOT}/test/av1_fwd_txfm2d_test.cc"
"${AOM_ROOT}/test/av1_inv_txfm1d_test.cc"
"${AOM_ROOT}/test/av1_inv_txfm2d_test.cc"
+ "${AOM_ROOT}/test/av1_k_means_test.cc"
"${AOM_ROOT}/test/av1_nn_predict_test.cc"
"${AOM_ROOT}/test/av1_round_shift_array_test.cc"
"${AOM_ROOT}/test/av1_softmax_test.cc"
@@ -192,15 +207,16 @@ if(NOT BUILD_SHARED_LIBS)
"${AOM_ROOT}/test/comp_avg_pred_test.cc"
"${AOM_ROOT}/test/comp_avg_pred_test.h"
"${AOM_ROOT}/test/comp_mask_pred_test.cc"
+ "${AOM_ROOT}/test/disflow_test.cc"
"${AOM_ROOT}/test/encodemb_test.cc"
"${AOM_ROOT}/test/encodetxb_test.cc"
"${AOM_ROOT}/test/end_to_end_qmpsnr_test.cc"
"${AOM_ROOT}/test/end_to_end_ssim_test.cc"
"${AOM_ROOT}/test/error_block_test.cc"
+ "${AOM_ROOT}/test/fdct4x4_test.cc"
"${AOM_ROOT}/test/fft_test.cc"
"${AOM_ROOT}/test/firstpass_test.cc"
"${AOM_ROOT}/test/fwht4x4_test.cc"
- "${AOM_ROOT}/test/fdct4x4_test.cc"
"${AOM_ROOT}/test/hadamard_test.cc"
"${AOM_ROOT}/test/horver_correlation_test.cc"
"${AOM_ROOT}/test/masked_sad_test.cc"
@@ -212,23 +228,17 @@ if(NOT BUILD_SHARED_LIBS)
"${AOM_ROOT}/test/obmc_sad_test.cc"
"${AOM_ROOT}/test/obmc_variance_test.cc"
"${AOM_ROOT}/test/pickrst_test.cc"
+ "${AOM_ROOT}/test/reconinter_test.cc"
"${AOM_ROOT}/test/sad_test.cc"
"${AOM_ROOT}/test/subtract_test.cc"
- "${AOM_ROOT}/test/reconinter_test.cc"
"${AOM_ROOT}/test/sum_squares_test.cc"
"${AOM_ROOT}/test/sse_sum_test.cc"
"${AOM_ROOT}/test/variance_test.cc"
- "${AOM_ROOT}/test/wiener_test.cc"
- "${AOM_ROOT}/test/frame_error_test.cc"
"${AOM_ROOT}/test/warp_filter_test.cc"
"${AOM_ROOT}/test/warp_filter_test_util.cc"
"${AOM_ROOT}/test/warp_filter_test_util.h"
"${AOM_ROOT}/test/webmenc_test.cc"
- "${AOM_ROOT}/test/av1_k_means_test.cc")
-
- list(APPEND AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1
- "${AOM_ROOT}/test/simd_cmp_sse4.cc")
- add_to_libaom_test_srcs(AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1)
+ "${AOM_ROOT}/test/wiener_test.cc")
if(NOT CONFIG_REALTIME_ONLY)
list(APPEND AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1
@@ -313,7 +323,7 @@ if(NOT BUILD_SHARED_LIBS)
"${AOM_ROOT}/test/simd_ssse3_test.cc")
endif()
- if(HAVE_SSE4)
+ if(HAVE_SSE4_1)
list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
"${AOM_ROOT}/test/simd_sse4_test.cc")
endif()
@@ -351,13 +361,13 @@ if(NOT BUILD_SHARED_LIBS)
"${AOM_ROOT}/test/av1_convolve_scale_test.cc"
"${AOM_ROOT}/test/av1_horz_only_frame_superres_test.cc"
"${AOM_ROOT}/test/intra_edge_test.cc")
-
endif()
if(HAVE_NEON)
list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
"${AOM_ROOT}/test/av1_convolve_scale_test.cc"
- "${AOM_ROOT}/test/av1_horz_only_frame_superres_test.cc")
+ "${AOM_ROOT}/test/av1_horz_only_frame_superres_test.cc"
+ "${AOM_ROOT}/test/intra_edge_test.cc")
endif()
if(HAVE_SSE4_2 OR HAVE_ARM_CRC32)
@@ -366,10 +376,10 @@ if(NOT BUILD_SHARED_LIBS)
if(CONFIG_REALTIME_ONLY)
list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_SOURCES
+ "${AOM_ROOT}/test/disflow_test.cc"
"${AOM_ROOT}/test/end_to_end_qmpsnr_test.cc"
"${AOM_ROOT}/test/end_to_end_ssim_test.cc"
"${AOM_ROOT}/test/firstpass_test.cc"
- "${AOM_ROOT}/test/frame_error_test.cc"
"${AOM_ROOT}/test/motion_vector_test.cc"
"${AOM_ROOT}/test/obmc_sad_test.cc"
"${AOM_ROOT}/test/obmc_variance_test.cc"
@@ -498,9 +508,6 @@ function(setup_aom_test_targets)
target_link_libraries(test_libaom ${AOM_LIB_LINK_TYPE} aom aom_gtest)
- if(CONFIG_LIBYUV)
- target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:yuv>)
- endif()
if(CONFIG_WEBM_IO)
target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:webm>)
endif()
@@ -530,10 +537,6 @@ function(setup_aom_test_targets)
add_intrinsics_source_to_target("${AOM_NEON_INTRIN_FLAG}" "test_libaom"
"AOM_UNIT_TEST_COMMON_INTRIN_NEON")
endif()
- if(HAVE_ARM_CRC32)
- add_intrinsics_source_to_target("${AOM_ARM_CRC32_FLAG}" "test_libaom"
- "AOM_UNIT_TEST_COMMON_INTRIN_CRC32")
- endif()
if(ENABLE_TESTDATA)
make_test_data_lists("${AOM_UNIT_TEST_DATA_LIST_FILE}" test_files
diff --git a/test/test_data_util.cmake b/test/test_data_util.cmake
index de7d1535c..069e1ad52 100644
--- a/test/test_data_util.cmake
+++ b/test/test_data_util.cmake
@@ -39,7 +39,9 @@ list(APPEND AOM_TEST_DATA_FILE_NAMES
"vase10x10.yuv"
"vase10x10_tiles.txt"
"bus_352x288_420_f20_b8.yuv"
- "test_input_w1h1.yuv")
+ "test_input_w1h1.yuv"
+ "crowd_run_360p_10_150f.y4m"
+ "wikipedia_420_360p_60f.y4m")
if(ENABLE_DECODE_PERF_TESTS AND CONFIG_AV1_ENCODER)
list(APPEND AOM_TEST_DATA_FILE_NAMES "niklas_1280_720_30.yuv")
diff --git a/test/test_libaom.cc b/test/test_libaom.cc
index 6ffbbc516..fbd7f2e38 100644
--- a/test/test_libaom.cc
+++ b/test/test_libaom.cc
@@ -9,24 +9,29 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include <string.h>
-
-#include <string>
-
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
#include "config/aom_config.h"
+#if !CONFIG_SHARED
+#include <string.h>
+
+#include <string>
+
+#if AOM_ARCH_ARM
+#include "aom_ports/arm.h"
+#endif
#if AOM_ARCH_X86 || AOM_ARCH_X86_64
#include "aom_ports/x86.h"
#endif
+
extern "C" {
extern void av1_rtcd();
extern void aom_dsp_rtcd();
extern void aom_scale_rtcd();
}
-#if AOM_ARCH_X86 || AOM_ARCH_X86_64
+#if AOM_ARCH_ARM || AOM_ARCH_X86 || AOM_ARCH_X86_64
static void append_negative_gtest_filter(const char *str) {
std::string flag_value = GTEST_FLAG_GET(filter);
// Negative patterns begin with one '-' followed by a ':' separated list.
@@ -44,11 +49,24 @@ static void append_negative_gtest_filter(const char *str) {
}
GTEST_FLAG_SET(filter, flag_value);
}
-#endif // AOM_ARCH_X86 || AOM_ARCH_X86_64
+#endif // AOM_ARCH_ARM || AOM_ARCH_X86 || AOM_ARCH_X86_64
+#endif // !CONFIG_SHARED
int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
+#if !CONFIG_SHARED
+#if AOM_ARCH_AARCH64
+ const int caps = aom_arm_cpu_caps();
+ if (!(caps & HAS_ARM_CRC32)) append_negative_gtest_filter("ARM_CRC32");
+ if (!(caps & HAS_NEON_DOTPROD)) append_negative_gtest_filter("NEON_DOTPROD");
+ if (!(caps & HAS_NEON_I8MM)) append_negative_gtest_filter("NEON_I8MM");
+ if (!(caps & HAS_SVE)) append_negative_gtest_filter("SVE");
+#elif AOM_ARCH_ARM
+ const int caps = aom_arm_cpu_caps();
+ if (!(caps & HAS_NEON)) append_negative_gtest_filter("NEON");
+#endif // AOM_ARCH_ARM
+
#if AOM_ARCH_X86 || AOM_ARCH_X86_64
const int simd_caps = x86_simd_caps();
if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter("MMX");
@@ -62,9 +80,8 @@ int main(int argc, char **argv) {
if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter("AVX2");
#endif // AOM_ARCH_X86 || AOM_ARCH_X86_64
-// Shared library builds don't support whitebox tests that exercise internal
-// symbols.
-#if !CONFIG_SHARED
+ // Shared library builds don't support whitebox tests that exercise internal
+ // symbols.
av1_rtcd();
aom_dsp_rtcd();
aom_scale_rtcd();
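With the change above, Arm builds probe aom_arm_cpu_caps() at runtime
and filter out test instantiations for extensions the host lacks,
mirroring the existing x86 path. As a hypothetical illustration of
googletest's negative-filter syntax (the exact strings appended are
built in append_negative_gtest_filter() above):

// Starting from the default filter "*", disabling NEON_DOTPROD and SVE
// would leave a filter along the lines of:
//   "*-NEON_DOTPROD*:SVE*"
// i.e. run everything except tests whose names match a negative pattern.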
diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc
index 87155c7f3..39414e32e 100644
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -41,7 +41,7 @@ class TestVectorTest : public ::libaom_test::DecoderTest,
protected:
TestVectorTest() : DecoderTest(GET_PARAM(0)), md5_file_(nullptr) {}
- virtual ~TestVectorTest() {
+ ~TestVectorTest() override {
if (md5_file_) fclose(md5_file_);
}
@@ -51,14 +51,13 @@ class TestVectorTest : public ::libaom_test::DecoderTest,
<< "Md5 file open failed. Filename: " << md5_file_name_;
}
- virtual void PreDecodeFrameHook(
- const libaom_test::CompressedVideoSource &video,
- libaom_test::Decoder *decoder) {
+ void PreDecodeFrameHook(const libaom_test::CompressedVideoSource &video,
+ libaom_test::Decoder *decoder) override {
if (video.frame_number() == 0) decoder->Control(AV1D_SET_ROW_MT, row_mt_);
}
- virtual void DecompressedFrameHook(const aom_image_t &img,
- const unsigned int frame_number) {
+ void DecompressedFrameHook(const aom_image_t &img,
+ const unsigned int frame_number) override {
ASSERT_NE(md5_file_, nullptr);
char expected_md5[33];
char junk[128];
diff --git a/test/tile_config_test.cc b/test/tile_config_test.cc
index 517d54bd9..e2ac59284 100644
--- a/test/tile_config_test.cc
+++ b/test/tile_config_test.cc
@@ -82,9 +82,9 @@ class UniformTileConfigTestLarge
max_tile_cols_log2_ = tile_log2(1, AOM_MAX_TILE_COLS);
max_tile_rows_log2_ = tile_log2(1, AOM_MAX_TILE_ROWS);
}
- virtual ~UniformTileConfigTestLarge() {}
+ ~UniformTileConfigTestLarge() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(encoding_mode_);
const aom_rational timebase = { 1, 30 };
cfg_.g_timebase = timebase;
@@ -93,10 +93,10 @@ class UniformTileConfigTestLarge
cfg_.g_lag_in_frames = 19;
}
- virtual bool DoDecode() const { return 1; }
+ bool DoDecode() const override { return true; }
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AV1E_SET_TILE_COLUMNS, tile_config_param_.tile_cols);
encoder->Control(AV1E_SET_TILE_ROWS, tile_config_param_.tile_rows);
@@ -109,8 +109,8 @@ class UniformTileConfigTestLarge
}
}
- virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
- libaom_test::Decoder *decoder) {
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
if (AOM_CODEC_OK == res_dec) {
aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
@@ -148,9 +148,9 @@ class NonUniformTileConfigTestLarge
tile_config_param_(GET_PARAM(2)), rc_end_usage_(GET_PARAM(3)) {
tile_config_violated_ = false;
}
- virtual ~NonUniformTileConfigTestLarge() {}
+ ~NonUniformTileConfigTestLarge() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(encoding_mode_);
const aom_rational timebase = { 1, 30 };
cfg_.g_timebase = timebase;
@@ -168,10 +168,10 @@ class NonUniformTileConfigTestLarge
tile_config_param_.tile_height_count);
}
- virtual bool DoDecode() const { return 1; }
+ bool DoDecode() const override { return true; }
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, 5);
encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
@@ -182,8 +182,8 @@ class NonUniformTileConfigTestLarge
}
}
- virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
- libaom_test::Decoder *decoder) {
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
if (AOM_CODEC_OK == res_dec) {
aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
@@ -302,9 +302,9 @@ class TileGroupTestLarge
tile_group_config_params_(GET_PARAM(2)) {
tile_group_config_violated_ = false;
}
- virtual ~TileGroupTestLarge() {}
+ ~TileGroupTestLarge() override = default;
- virtual void SetUp() {
+ void SetUp() override {
InitializeConfig(encoding_mode_);
const aom_rational timebase = { 1, 30 };
cfg_.g_timebase = timebase;
@@ -312,10 +312,10 @@ class TileGroupTestLarge
cfg_.g_threads = 1;
}
- virtual bool DoDecode() const { return 1; }
+ bool DoDecode() const override { return true; }
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AOME_SET_CPUUSED, 5);
encoder->Control(AV1E_SET_NUM_TG, tile_group_config_params_.num_tg);
@@ -326,8 +326,8 @@ class TileGroupTestLarge
}
}
- virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
- libaom_test::Decoder *decoder) {
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
if (AOM_CODEC_OK == res_dec) {
aom_tile_info tile_info;
diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc
index 888c3abc9..84406dd3f 100644
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -47,15 +47,15 @@ class TileIndependenceTest
}
}
- virtual ~TileIndependenceTest() {
+ ~TileIndependenceTest() override {
delete fw_dec_;
delete inv_dec_;
}
- virtual void SetUp() { InitializeConfig(libaom_test::kTwoPassGood); }
+ void SetUp() override { InitializeConfig(libaom_test::kTwoPassGood); }
- virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
- libaom_test::Encoder *encoder) {
+ void PreEncodeFrameHook(libaom_test::VideoSource *video,
+ libaom_test::Encoder *encoder) override {
if (video->frame() == 0) {
encoder->Control(AV1E_SET_TILE_COLUMNS, n_tile_cols_);
encoder->Control(AV1E_SET_TILE_ROWS, n_tile_rows_);
@@ -82,7 +82,7 @@ class TileIndependenceTest
md5->Add(img);
}
- virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
UpdateMD5(fw_dec_, pkt, &md5_fw_order_);
UpdateMD5(inv_dec_, pkt, &md5_inv_order_);
}
@@ -123,7 +123,7 @@ TEST_P(TileIndependenceTest, MD5Match) {
}
class TileIndependenceTestLarge : public TileIndependenceTest {
- virtual void SetCpuUsed(libaom_test::Encoder *encoder) {
+ void SetCpuUsed(libaom_test::Encoder *encoder) override {
static const int kCpuUsed = 0;
encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
}
diff --git a/test/time_stamp_test.cc b/test/time_stamp_test.cc
index baa0dc06d..5de98b719 100644
--- a/test/time_stamp_test.cc
+++ b/test/time_stamp_test.cc
@@ -47,16 +47,16 @@ class DummyTimebaseVideoSource : public ::libaom_test::DummyVideoSource {
(static_cast<double>(framerate_numerator_) / framerate_denominator_);
}
- virtual aom_codec_pts_t pts() const {
+ aom_codec_pts_t pts() const override {
return static_cast<aom_codec_pts_t>(frame_ * FrameDuration() +
starting_pts_ + 0.5);
}
- virtual unsigned long duration() const {
+ unsigned long duration() const override {
return static_cast<unsigned long>(FrameDuration() + 0.5);
}
- virtual aom_rational_t timebase() const { return timebase_; }
+ aom_rational_t timebase() const override { return timebase_; }
void set_starting_pts(int64_t starting_pts) { starting_pts_ = starting_pts; }
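Note the rounding idiom in pts() above: adding 0.5 before the integral cast
turns truncation into round-to-nearest for the nonnegative timestamps used
here. For example (illustrative numbers), with a frame duration of 33.37
timebase units and starting_pts_ = 0, frame 3 gives 100.11 + 0.5 = 100.61,
which truncates to a pts of 100.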
@@ -72,9 +72,9 @@ class TimestampTest
public ::libaom_test::CodecTestWithParam<libaom_test::TestMode> {
protected:
TimestampTest() : EncoderTest(GET_PARAM(0)) {}
- virtual ~TimestampTest() {}
+ ~TimestampTest() override = default;
- virtual void SetUp() { InitializeConfig(GET_PARAM(1)); }
+ void SetUp() override { InitializeConfig(GET_PARAM(1)); }
};
// Tests encoding in millisecond timebase.
diff --git a/test/transform_test_base.h b/test/transform_test_base.h
index 260f4ffef..55e78fef4 100644
--- a/test/transform_test_base.h
+++ b/test/transform_test_base.h
@@ -12,11 +12,12 @@
#ifndef AOM_TEST_TRANSFORM_TEST_BASE_H_
#define AOM_TEST_TRANSFORM_TEST_BASE_H_
-#include "config/aom_config.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "aom_mem/aom_mem.h"
#include "aom/aom_codec.h"
#include "aom_dsp/txfm_common.h"
+#include "aom_mem/aom_mem.h"
+#include "test/acm_random.h"
namespace libaom_test {
@@ -40,7 +41,7 @@ using IhtFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride,
template <typename OutType>
class TransformTestBase {
public:
- virtual ~TransformTestBase() {}
+ virtual ~TransformTestBase() = default;
protected:
virtual void RunFwdTxfm(const int16_t *in, OutType *out, int stride) = 0;
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 2863aea74..adca1b10d 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -348,7 +348,7 @@ class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
public:
SumOfSquaresTest() : func_(GetParam()) {}
- virtual ~SumOfSquaresTest() {}
+ ~SumOfSquaresTest() override = default;
protected:
void ConstTest();
@@ -427,7 +427,7 @@ template <typename FunctionType>
class MseWxHTestClass
: public ::testing::TestWithParam<TestParams<FunctionType> > {
public:
- virtual void SetUp() {
+ void SetUp() override {
params_ = this->GetParam();
rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -439,7 +439,7 @@ class MseWxHTestClass
ASSERT_NE(dst_, nullptr);
}
- virtual void TearDown() {
+ void TearDown() override {
aom_free(src_);
aom_free(dst_);
src_ = nullptr;
@@ -528,7 +528,7 @@ class Mse16xHTestClass
  // Memory required to compute the MSE of two 8x8 and four 4x4 blocks,
  // sized for a maximum width of 16 and a maximum height of 8.
int mem_size = 16 * 8;
- virtual void SetUp() {
+ void SetUp() override {
params_ = this->GetParam();
rnd_.Reset(ACMRandom::DeterministicSeed());
src_ = reinterpret_cast<uint16_t *>(
@@ -539,7 +539,7 @@ class Mse16xHTestClass
ASSERT_NE(dst_, nullptr);
}
- virtual void TearDown() {
+ void TearDown() override {
aom_free(src_);
aom_free(dst_);
src_ = nullptr;
@@ -659,7 +659,7 @@ template <typename FunctionType>
class MainTestClass
: public ::testing::TestWithParam<TestParams<FunctionType> > {
public:
- virtual void SetUp() {
+ void SetUp() override {
params_ = this->GetParam();
rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -678,7 +678,7 @@ class MainTestClass
}
}
- virtual void TearDown() {
+ void TearDown() override {
if (use_high_bit_depth()) {
// TODO(skal): remove!
src_ = reinterpret_cast<uint8_t *>(CONVERT_TO_SHORTPTR(src_));
@@ -1286,7 +1286,7 @@ template <typename FunctionType>
class SubpelVarianceTest
: public ::testing::TestWithParam<TestParams<FunctionType> > {
public:
- virtual void SetUp() {
+ void SetUp() override {
params_ = this->GetParam();
rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -1308,7 +1308,7 @@ class SubpelVarianceTest
ASSERT_NE(ref_, nullptr);
}
- virtual void TearDown() {
+ void TearDown() override {
if (!use_high_bit_depth()) {
aom_free(src_);
aom_free(ref_);
@@ -1544,7 +1544,7 @@ template <typename FunctionType>
class ObmcVarianceTest
: public ::testing::TestWithParam<TestParams<FunctionType> > {
public:
- virtual void SetUp() {
+ void SetUp() override {
params_ = this->GetParam();
rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -1553,7 +1553,7 @@ class ObmcVarianceTest
aom_memalign(32, block_size() + width() + height() + 1));
} else {
pre_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(aom_memalign(
- 32, block_size() + width() + height() + 1 * sizeof(uint16_t))));
+ 32, (block_size() + width() + height() + 1) * sizeof(uint16_t))));
}
wsrc_ = reinterpret_cast<int32_t *>(
aom_memalign(32, block_size() * sizeof(uint32_t)));
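The parenthesization fix above is an operator-precedence repair:
multiplication binds tighter than addition, so the old expression scaled only
the trailing 1 by sizeof(uint16_t) and under-allocated the high-bit-depth
buffer by roughly half. A worked example, assuming a 64-byte block with
width = height = 8:

  64 + 8 + 8 + 1 * sizeof(uint16_t)    ==  82  (old: far too few bytes)
  (64 + 8 + 8 + 1) * sizeof(uint16_t)  == 162  (fixed: room for 81 uint16_t)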
@@ -1564,7 +1564,7 @@ class ObmcVarianceTest
ASSERT_NE(mask_, nullptr);
}
- virtual void TearDown() {
+ void TearDown() override {
if (!use_high_bit_depth()) {
aom_free(pre_);
} else {
@@ -1635,7 +1635,8 @@ void ObmcVarianceTest<ObmcSubpelVarFunc>::ExtremeRefTest() {
memset(pre_ + half, 0, half + width() + height() + 1);
} else {
aom_memset16(CONVERT_TO_SHORTPTR(pre_), bd_mask(), half);
- aom_memset16(CONVERT_TO_SHORTPTR(pre_) + half, 0, half);
+ aom_memset16(CONVERT_TO_SHORTPTR(pre_) + half, 0,
+ half + width() + height() + 1);
}
for (int j = 0; j < half; j++) {
wsrc_[j] = bd_mask() * kMaskMax * kMaskMax;
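With that allocation fixed, the second aom_memset16() above is widened to
match the 8-bit memset() path. A worked check, assuming half = block_size() / 2
as in the 8-bit branch:

  half + (half + width() + height() + 1) == block_size() + width() + height() + 1

which is exactly the element count now allocated for pre_ in SetUp().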
@@ -1962,7 +1963,7 @@ template <typename FunctionType>
class MseHBDWxHTestClass
: public ::testing::TestWithParam<TestParams<FunctionType> > {
public:
- virtual void SetUp() {
+ void SetUp() override {
params_ = this->GetParam();
rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -1974,7 +1975,7 @@ class MseHBDWxHTestClass
ASSERT_NE(dst_, nullptr);
}
- virtual void TearDown() {
+ void TearDown() override {
aom_free(src_);
aom_free(dst_);
src_ = nullptr;
@@ -2062,6 +2063,8 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AvxHBDMseTest);
typedef MainTestClass<VarianceMxNFunc> AvxHBDVarianceTest;
typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxHBDSubpelVarianceTest;
typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxHBDSubpelAvgVarianceTest;
+typedef SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>
+ AvxHBDDistWtdSubpelAvgVarianceTest;
#if !CONFIG_REALTIME_ONLY
typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxHBDObmcSubpelVarianceTest;
#endif
@@ -2081,6 +2084,12 @@ TEST_P(AvxHBDSubpelVarianceTest, Ref) { RefTest(); }
TEST_P(AvxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
TEST_P(AvxHBDSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
TEST_P(AvxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxHBDDistWtdSubpelAvgVarianceTest, Ref) { RefTest(); }
+#if !CONFIG_REALTIME_ONLY
+TEST_P(AvxHBDObmcSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxHBDObmcSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(AvxHBDObmcSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
+#endif
INSTANTIATE_TEST_SUITE_P(
C, MseHBDWxHTest,
@@ -2106,6 +2115,14 @@ INSTANTIATE_TEST_SUITE_P(
#if HAVE_NEON
INSTANTIATE_TEST_SUITE_P(
+ NEON, MseHBDWxHTest,
+ ::testing::Values(MseHBDWxHParams(3, 3, &aom_mse_wxh_16bit_highbd_neon, 10),
+ MseHBDWxHParams(3, 2, &aom_mse_wxh_16bit_highbd_neon, 10),
+ MseHBDWxHParams(2, 3, &aom_mse_wxh_16bit_highbd_neon, 10),
+ MseHBDWxHParams(2, 2, &aom_mse_wxh_16bit_highbd_neon,
+ 10)));
+
+INSTANTIATE_TEST_SUITE_P(
NEON, AvxHBDMseTest,
::testing::Values(MseParams(4, 4, &aom_highbd_12_mse16x16_neon, 12),
MseParams(4, 3, &aom_highbd_12_mse16x8_neon, 12),
@@ -2121,6 +2138,15 @@ INSTANTIATE_TEST_SUITE_P(
MseParams(3, 3, &aom_highbd_8_mse8x8_neon, 8)));
#endif // HAVE_NEON
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(
+ NEON_DOTPROD, AvxHBDMseTest,
+ ::testing::Values(MseParams(4, 4, &aom_highbd_8_mse16x16_neon_dotprod, 8),
+ MseParams(4, 3, &aom_highbd_8_mse16x8_neon_dotprod, 8),
+ MseParams(3, 4, &aom_highbd_8_mse8x16_neon_dotprod, 8),
+ MseParams(3, 3, &aom_highbd_8_mse8x8_neon_dotprod, 8)));
+#endif // HAVE_NEON_DOTPROD
+
const VarianceParams kArrayHBDVariance_c[] = {
VarianceParams(7, 7, &aom_highbd_12_variance128x128_c, 12),
VarianceParams(7, 6, &aom_highbd_12_variance128x64_c, 12),
@@ -2389,27 +2415,177 @@ const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_c[] = {
INSTANTIATE_TEST_SUITE_P(C, AvxHBDSubpelAvgVarianceTest,
::testing::ValuesIn(kArrayHBDSubpelAvgVariance_c));
+const DistWtdSubpelAvgVarianceParams kArrayHBDDistWtdSubpelAvgVariance_c[] = {
+ DistWtdSubpelAvgVarianceParams(
+ 7, 7, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 7, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 7, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 2, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 2, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 7, 7, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 7, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 7, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 2, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 2, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 7, 7, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 7, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 7, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 2, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 2, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4_c, 12),
+
+#if !CONFIG_REALTIME_ONLY
+ DistWtdSubpelAvgVarianceParams(
+ 6, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 2, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 2, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 2, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16_c, 12),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(
+ C, AvxHBDDistWtdSubpelAvgVarianceTest,
+ ::testing::ValuesIn(kArrayHBDDistWtdSubpelAvgVariance_c));
+
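Each entry above encodes (log2_width, log2_height, function, bit_depth), so
the tested block is (1 << log2_width) x (1 << log2_height) pixels. For
example:

  DistWtdSubpelAvgVarianceParams(
      7, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64_c, 8)

exercises the 128x64 kernel (1 << 7 == 128, 1 << 6 == 64) at 8-bit depth.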
#if !CONFIG_REALTIME_ONLY
const ObmcSubpelVarianceParams kArrayHBDObmcSubpelVariance_c[] = {
- ObmcSubpelVarianceParams(7, 7, &aom_highbd_obmc_sub_pixel_variance128x128_c,
+ ObmcSubpelVarianceParams(7, 7, &aom_highbd_8_obmc_sub_pixel_variance128x128_c,
+ 8),
+ ObmcSubpelVarianceParams(7, 6, &aom_highbd_8_obmc_sub_pixel_variance128x64_c,
+ 8),
+ ObmcSubpelVarianceParams(6, 7, &aom_highbd_8_obmc_sub_pixel_variance64x128_c,
+ 8),
+ ObmcSubpelVarianceParams(6, 6, &aom_highbd_8_obmc_sub_pixel_variance64x64_c,
+ 8),
+ ObmcSubpelVarianceParams(6, 5, &aom_highbd_8_obmc_sub_pixel_variance64x32_c,
+ 8),
+ ObmcSubpelVarianceParams(5, 6, &aom_highbd_8_obmc_sub_pixel_variance32x64_c,
+ 8),
+ ObmcSubpelVarianceParams(5, 5, &aom_highbd_8_obmc_sub_pixel_variance32x32_c,
+ 8),
+ ObmcSubpelVarianceParams(5, 4, &aom_highbd_8_obmc_sub_pixel_variance32x16_c,
+ 8),
+ ObmcSubpelVarianceParams(4, 5, &aom_highbd_8_obmc_sub_pixel_variance16x32_c,
8),
- ObmcSubpelVarianceParams(7, 6, &aom_highbd_obmc_sub_pixel_variance128x64_c,
+ ObmcSubpelVarianceParams(4, 4, &aom_highbd_8_obmc_sub_pixel_variance16x16_c,
8),
- ObmcSubpelVarianceParams(6, 7, &aom_highbd_obmc_sub_pixel_variance64x128_c,
+ ObmcSubpelVarianceParams(4, 3, &aom_highbd_8_obmc_sub_pixel_variance16x8_c,
8),
- ObmcSubpelVarianceParams(6, 6, &aom_highbd_obmc_sub_pixel_variance64x64_c, 8),
- ObmcSubpelVarianceParams(6, 5, &aom_highbd_obmc_sub_pixel_variance64x32_c, 8),
- ObmcSubpelVarianceParams(5, 6, &aom_highbd_obmc_sub_pixel_variance32x64_c, 8),
- ObmcSubpelVarianceParams(5, 5, &aom_highbd_obmc_sub_pixel_variance32x32_c, 8),
- ObmcSubpelVarianceParams(5, 4, &aom_highbd_obmc_sub_pixel_variance32x16_c, 8),
- ObmcSubpelVarianceParams(4, 5, &aom_highbd_obmc_sub_pixel_variance16x32_c, 8),
- ObmcSubpelVarianceParams(4, 4, &aom_highbd_obmc_sub_pixel_variance16x16_c, 8),
- ObmcSubpelVarianceParams(4, 3, &aom_highbd_obmc_sub_pixel_variance16x8_c, 8),
- ObmcSubpelVarianceParams(3, 4, &aom_highbd_obmc_sub_pixel_variance8x16_c, 8),
- ObmcSubpelVarianceParams(3, 3, &aom_highbd_obmc_sub_pixel_variance8x8_c, 8),
- ObmcSubpelVarianceParams(3, 2, &aom_highbd_obmc_sub_pixel_variance8x4_c, 8),
- ObmcSubpelVarianceParams(2, 3, &aom_highbd_obmc_sub_pixel_variance4x8_c, 8),
- ObmcSubpelVarianceParams(2, 2, &aom_highbd_obmc_sub_pixel_variance4x4_c, 8),
+ ObmcSubpelVarianceParams(3, 4, &aom_highbd_8_obmc_sub_pixel_variance8x16_c,
+ 8),
+ ObmcSubpelVarianceParams(3, 3, &aom_highbd_8_obmc_sub_pixel_variance8x8_c, 8),
+ ObmcSubpelVarianceParams(3, 2, &aom_highbd_8_obmc_sub_pixel_variance8x4_c, 8),
+ ObmcSubpelVarianceParams(2, 3, &aom_highbd_8_obmc_sub_pixel_variance4x8_c, 8),
+ ObmcSubpelVarianceParams(2, 2, &aom_highbd_8_obmc_sub_pixel_variance4x4_c, 8),
ObmcSubpelVarianceParams(7, 7,
&aom_highbd_10_obmc_sub_pixel_variance128x128_c, 10),
ObmcSubpelVarianceParams(7, 6, &aom_highbd_10_obmc_sub_pixel_variance128x64_c,
@@ -2475,12 +2651,18 @@ const ObmcSubpelVarianceParams kArrayHBDObmcSubpelVariance_c[] = {
ObmcSubpelVarianceParams(2, 2, &aom_highbd_12_obmc_sub_pixel_variance4x4_c,
12),
- ObmcSubpelVarianceParams(6, 4, &aom_highbd_obmc_sub_pixel_variance64x16_c, 8),
- ObmcSubpelVarianceParams(4, 6, &aom_highbd_obmc_sub_pixel_variance16x64_c, 8),
- ObmcSubpelVarianceParams(5, 3, &aom_highbd_obmc_sub_pixel_variance32x8_c, 8),
- ObmcSubpelVarianceParams(3, 5, &aom_highbd_obmc_sub_pixel_variance8x32_c, 8),
- ObmcSubpelVarianceParams(4, 2, &aom_highbd_obmc_sub_pixel_variance16x4_c, 8),
- ObmcSubpelVarianceParams(2, 4, &aom_highbd_obmc_sub_pixel_variance4x16_c, 8),
+ ObmcSubpelVarianceParams(6, 4, &aom_highbd_8_obmc_sub_pixel_variance64x16_c,
+ 8),
+ ObmcSubpelVarianceParams(4, 6, &aom_highbd_8_obmc_sub_pixel_variance16x64_c,
+ 8),
+ ObmcSubpelVarianceParams(5, 3, &aom_highbd_8_obmc_sub_pixel_variance32x8_c,
+ 8),
+ ObmcSubpelVarianceParams(3, 5, &aom_highbd_8_obmc_sub_pixel_variance8x32_c,
+ 8),
+ ObmcSubpelVarianceParams(4, 2, &aom_highbd_8_obmc_sub_pixel_variance16x4_c,
+ 8),
+ ObmcSubpelVarianceParams(2, 4, &aom_highbd_8_obmc_sub_pixel_variance4x16_c,
+ 8),
ObmcSubpelVarianceParams(6, 4, &aom_highbd_10_obmc_sub_pixel_variance64x16_c,
10),
ObmcSubpelVarianceParams(4, 6, &aom_highbd_10_obmc_sub_pixel_variance16x64_c,
@@ -2772,6 +2954,12 @@ const VarianceParams kArrayHBDVariance_avx2[] = {
VarianceParams(4, 3, &aom_highbd_10_variance16x8_avx2, 10),
VarianceParams(3, 4, &aom_highbd_10_variance8x16_avx2, 10),
VarianceParams(3, 3, &aom_highbd_10_variance8x8_avx2, 10),
+#if !CONFIG_REALTIME_ONLY
+ VarianceParams(6, 4, &aom_highbd_10_variance64x16_avx2, 10),
+ VarianceParams(5, 3, &aom_highbd_10_variance32x8_avx2, 10),
+ VarianceParams(4, 6, &aom_highbd_10_variance16x64_avx2, 10),
+ VarianceParams(3, 5, &aom_highbd_10_variance8x32_avx2, 10),
+#endif
};
INSTANTIATE_TEST_SUITE_P(AVX2, AvxHBDVarianceTest,
@@ -3247,6 +3435,16 @@ INSTANTIATE_TEST_SUITE_P(
MseWxHParams(2, 3, &aom_mse_wxh_16bit_neon, 8),
MseWxHParams(2, 2, &aom_mse_wxh_16bit_neon, 8)));
+INSTANTIATE_TEST_SUITE_P(
+ NEON, Mse16xHTest,
+ ::testing::Values(Mse16xHParams(3, 3, &aom_mse_16xh_16bit_neon, 8),
+ Mse16xHParams(3, 2, &aom_mse_16xh_16bit_neon, 8),
+ Mse16xHParams(2, 3, &aom_mse_16xh_16bit_neon, 8),
+ Mse16xHParams(2, 2, &aom_mse_16xh_16bit_neon, 8)));
+
+INSTANTIATE_TEST_SUITE_P(NEON, SumOfSquaresTest,
+ ::testing::Values(aom_get_mb_ss_neon));
+
INSTANTIATE_TEST_SUITE_P(NEON, AvxMseTest,
::testing::Values(MseParams(3, 3, &aom_mse8x8_neon),
MseParams(3, 4, &aom_mse8x16_neon),
@@ -3342,6 +3540,52 @@ const SubpelAvgVarianceParams kArraySubpelAvgVariance_neon[] = {
INSTANTIATE_TEST_SUITE_P(NEON, AvxSubpelAvgVarianceTest,
::testing::ValuesIn(kArraySubpelAvgVariance_neon));
+const DistWtdSubpelAvgVarianceParams kArrayDistWtdSubpelAvgVariance_neon[] = {
+ DistWtdSubpelAvgVarianceParams(
+ 6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_neon, 0),
+#if !CONFIG_REALTIME_ONLY
+ DistWtdSubpelAvgVarianceParams(
+ 6, 4, &aom_dist_wtd_sub_pixel_avg_variance64x16_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 6, &aom_dist_wtd_sub_pixel_avg_variance16x64_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 3, &aom_dist_wtd_sub_pixel_avg_variance32x8_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 5, &aom_dist_wtd_sub_pixel_avg_variance8x32_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 2, &aom_dist_wtd_sub_pixel_avg_variance16x4_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 4, &aom_dist_wtd_sub_pixel_avg_variance4x16_neon, 0),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AvxDistWtdSubpelAvgVarianceTest,
+ ::testing::ValuesIn(kArrayDistWtdSubpelAvgVariance_neon));
+
#if !CONFIG_REALTIME_ONLY
const ObmcSubpelVarianceParams kArrayObmcSubpelVariance_neon[] = {
ObmcSubpelVarianceParams(7, 7, &aom_obmc_sub_pixel_variance128x128_neon, 0),
@@ -3463,8 +3707,559 @@ const VarianceParams kArrayHBDVariance_neon[] = {
INSTANTIATE_TEST_SUITE_P(NEON, AvxHBDVarianceTest,
::testing::ValuesIn(kArrayHBDVariance_neon));
+
+const SubpelVarianceParams kArrayHBDSubpelVariance_neon[] = {
+ SubpelVarianceParams(6, 6, &aom_highbd_12_sub_pixel_variance64x64_neon, 12),
+ SubpelVarianceParams(6, 5, &aom_highbd_12_sub_pixel_variance64x32_neon, 12),
+ SubpelVarianceParams(5, 6, &aom_highbd_12_sub_pixel_variance32x64_neon, 12),
+ SubpelVarianceParams(5, 5, &aom_highbd_12_sub_pixel_variance32x32_neon, 12),
+ SubpelVarianceParams(5, 4, &aom_highbd_12_sub_pixel_variance32x16_neon, 12),
+ SubpelVarianceParams(4, 5, &aom_highbd_12_sub_pixel_variance16x32_neon, 12),
+ SubpelVarianceParams(4, 4, &aom_highbd_12_sub_pixel_variance16x16_neon, 12),
+ SubpelVarianceParams(4, 3, &aom_highbd_12_sub_pixel_variance16x8_neon, 12),
+ SubpelVarianceParams(3, 4, &aom_highbd_12_sub_pixel_variance8x16_neon, 12),
+ SubpelVarianceParams(3, 3, &aom_highbd_12_sub_pixel_variance8x8_neon, 12),
+ SubpelVarianceParams(3, 2, &aom_highbd_12_sub_pixel_variance8x4_neon, 12),
+ SubpelVarianceParams(2, 3, &aom_highbd_12_sub_pixel_variance4x8_neon, 12),
+ SubpelVarianceParams(2, 2, &aom_highbd_12_sub_pixel_variance4x4_neon, 12),
+ SubpelVarianceParams(6, 6, &aom_highbd_10_sub_pixel_variance64x64_neon, 10),
+ SubpelVarianceParams(6, 5, &aom_highbd_10_sub_pixel_variance64x32_neon, 10),
+ SubpelVarianceParams(5, 6, &aom_highbd_10_sub_pixel_variance32x64_neon, 10),
+ SubpelVarianceParams(5, 5, &aom_highbd_10_sub_pixel_variance32x32_neon, 10),
+ SubpelVarianceParams(5, 4, &aom_highbd_10_sub_pixel_variance32x16_neon, 10),
+ SubpelVarianceParams(4, 5, &aom_highbd_10_sub_pixel_variance16x32_neon, 10),
+ SubpelVarianceParams(4, 4, &aom_highbd_10_sub_pixel_variance16x16_neon, 10),
+ SubpelVarianceParams(4, 3, &aom_highbd_10_sub_pixel_variance16x8_neon, 10),
+ SubpelVarianceParams(3, 4, &aom_highbd_10_sub_pixel_variance8x16_neon, 10),
+ SubpelVarianceParams(3, 3, &aom_highbd_10_sub_pixel_variance8x8_neon, 10),
+ SubpelVarianceParams(3, 2, &aom_highbd_10_sub_pixel_variance8x4_neon, 10),
+ SubpelVarianceParams(2, 3, &aom_highbd_10_sub_pixel_variance4x8_neon, 10),
+ SubpelVarianceParams(2, 2, &aom_highbd_10_sub_pixel_variance4x4_neon, 10),
+ SubpelVarianceParams(6, 6, &aom_highbd_8_sub_pixel_variance64x64_neon, 8),
+ SubpelVarianceParams(6, 5, &aom_highbd_8_sub_pixel_variance64x32_neon, 8),
+ SubpelVarianceParams(5, 6, &aom_highbd_8_sub_pixel_variance32x64_neon, 8),
+ SubpelVarianceParams(5, 5, &aom_highbd_8_sub_pixel_variance32x32_neon, 8),
+ SubpelVarianceParams(5, 4, &aom_highbd_8_sub_pixel_variance32x16_neon, 8),
+ SubpelVarianceParams(4, 5, &aom_highbd_8_sub_pixel_variance16x32_neon, 8),
+ SubpelVarianceParams(4, 4, &aom_highbd_8_sub_pixel_variance16x16_neon, 8),
+ SubpelVarianceParams(4, 3, &aom_highbd_8_sub_pixel_variance16x8_neon, 8),
+ SubpelVarianceParams(3, 4, &aom_highbd_8_sub_pixel_variance8x16_neon, 8),
+ SubpelVarianceParams(3, 3, &aom_highbd_8_sub_pixel_variance8x8_neon, 8),
+ SubpelVarianceParams(3, 2, &aom_highbd_8_sub_pixel_variance8x4_neon, 8),
+ SubpelVarianceParams(2, 3, &aom_highbd_8_sub_pixel_variance4x8_neon, 8),
+ SubpelVarianceParams(2, 2, &aom_highbd_8_sub_pixel_variance4x4_neon, 8),
+#if !CONFIG_REALTIME_ONLY
+ SubpelVarianceParams(6, 4, &aom_highbd_8_sub_pixel_variance64x16_neon, 8),
+ SubpelVarianceParams(4, 6, &aom_highbd_8_sub_pixel_variance16x64_neon, 8),
+ SubpelVarianceParams(5, 3, &aom_highbd_8_sub_pixel_variance32x8_neon, 8),
+ SubpelVarianceParams(3, 5, &aom_highbd_8_sub_pixel_variance8x32_neon, 8),
+ SubpelVarianceParams(4, 2, &aom_highbd_8_sub_pixel_variance16x4_neon, 8),
+ SubpelVarianceParams(2, 4, &aom_highbd_8_sub_pixel_variance4x16_neon, 8),
+ SubpelVarianceParams(6, 4, &aom_highbd_10_sub_pixel_variance64x16_neon, 10),
+ SubpelVarianceParams(4, 6, &aom_highbd_10_sub_pixel_variance16x64_neon, 10),
+ SubpelVarianceParams(5, 3, &aom_highbd_10_sub_pixel_variance32x8_neon, 10),
+ SubpelVarianceParams(3, 5, &aom_highbd_10_sub_pixel_variance8x32_neon, 10),
+ SubpelVarianceParams(4, 2, &aom_highbd_10_sub_pixel_variance16x4_neon, 10),
+ SubpelVarianceParams(2, 4, &aom_highbd_10_sub_pixel_variance4x16_neon, 10),
+ SubpelVarianceParams(6, 4, &aom_highbd_12_sub_pixel_variance64x16_neon, 12),
+ SubpelVarianceParams(4, 6, &aom_highbd_12_sub_pixel_variance16x64_neon, 12),
+ SubpelVarianceParams(5, 3, &aom_highbd_12_sub_pixel_variance32x8_neon, 12),
+ SubpelVarianceParams(3, 5, &aom_highbd_12_sub_pixel_variance8x32_neon, 12),
+ SubpelVarianceParams(4, 2, &aom_highbd_12_sub_pixel_variance16x4_neon, 12),
+ SubpelVarianceParams(2, 4, &aom_highbd_12_sub_pixel_variance4x16_neon, 12),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, AvxHBDSubpelVarianceTest,
+ ::testing::ValuesIn(kArrayHBDSubpelVariance_neon));
+
+const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_neon[] = {
+ SubpelAvgVarianceParams(7, 7,
+ &aom_highbd_8_sub_pixel_avg_variance128x128_neon, 8),
+ SubpelAvgVarianceParams(7, 6, &aom_highbd_8_sub_pixel_avg_variance128x64_neon,
+ 8),
+ SubpelAvgVarianceParams(6, 7, &aom_highbd_8_sub_pixel_avg_variance64x128_neon,
+ 8),
+ SubpelAvgVarianceParams(6, 6, &aom_highbd_8_sub_pixel_avg_variance64x64_neon,
+ 8),
+ SubpelAvgVarianceParams(6, 5, &aom_highbd_8_sub_pixel_avg_variance64x32_neon,
+ 8),
+ SubpelAvgVarianceParams(5, 6, &aom_highbd_8_sub_pixel_avg_variance32x64_neon,
+ 8),
+ SubpelAvgVarianceParams(5, 5, &aom_highbd_8_sub_pixel_avg_variance32x32_neon,
+ 8),
+ SubpelAvgVarianceParams(5, 4, &aom_highbd_8_sub_pixel_avg_variance32x16_neon,
+ 8),
+ SubpelAvgVarianceParams(4, 5, &aom_highbd_8_sub_pixel_avg_variance16x32_neon,
+ 8),
+ SubpelAvgVarianceParams(4, 4, &aom_highbd_8_sub_pixel_avg_variance16x16_neon,
+ 8),
+ SubpelAvgVarianceParams(4, 3, &aom_highbd_8_sub_pixel_avg_variance16x8_neon,
+ 8),
+ SubpelAvgVarianceParams(3, 4, &aom_highbd_8_sub_pixel_avg_variance8x16_neon,
+ 8),
+ SubpelAvgVarianceParams(3, 3, &aom_highbd_8_sub_pixel_avg_variance8x8_neon,
+ 8),
+ SubpelAvgVarianceParams(3, 2, &aom_highbd_8_sub_pixel_avg_variance8x4_neon,
+ 8),
+ SubpelAvgVarianceParams(2, 3, &aom_highbd_8_sub_pixel_avg_variance4x8_neon,
+ 8),
+ SubpelAvgVarianceParams(2, 2, &aom_highbd_8_sub_pixel_avg_variance4x4_neon,
+ 8),
+ SubpelAvgVarianceParams(
+ 7, 7, &aom_highbd_10_sub_pixel_avg_variance128x128_neon, 10),
+ SubpelAvgVarianceParams(7, 6,
+ &aom_highbd_10_sub_pixel_avg_variance128x64_neon, 10),
+ SubpelAvgVarianceParams(6, 7,
+ &aom_highbd_10_sub_pixel_avg_variance64x128_neon, 10),
+ SubpelAvgVarianceParams(6, 6, &aom_highbd_10_sub_pixel_avg_variance64x64_neon,
+ 10),
+ SubpelAvgVarianceParams(6, 5, &aom_highbd_10_sub_pixel_avg_variance64x32_neon,
+ 10),
+ SubpelAvgVarianceParams(5, 6, &aom_highbd_10_sub_pixel_avg_variance32x64_neon,
+ 10),
+ SubpelAvgVarianceParams(5, 5, &aom_highbd_10_sub_pixel_avg_variance32x32_neon,
+ 10),
+ SubpelAvgVarianceParams(5, 4, &aom_highbd_10_sub_pixel_avg_variance32x16_neon,
+ 10),
+ SubpelAvgVarianceParams(4, 5, &aom_highbd_10_sub_pixel_avg_variance16x32_neon,
+ 10),
+ SubpelAvgVarianceParams(4, 4, &aom_highbd_10_sub_pixel_avg_variance16x16_neon,
+ 10),
+ SubpelAvgVarianceParams(4, 3, &aom_highbd_10_sub_pixel_avg_variance16x8_neon,
+ 10),
+ SubpelAvgVarianceParams(3, 4, &aom_highbd_10_sub_pixel_avg_variance8x16_neon,
+ 10),
+ SubpelAvgVarianceParams(3, 3, &aom_highbd_10_sub_pixel_avg_variance8x8_neon,
+ 10),
+ SubpelAvgVarianceParams(3, 2, &aom_highbd_10_sub_pixel_avg_variance8x4_neon,
+ 10),
+ SubpelAvgVarianceParams(2, 3, &aom_highbd_10_sub_pixel_avg_variance4x8_neon,
+ 10),
+ SubpelAvgVarianceParams(2, 2, &aom_highbd_10_sub_pixel_avg_variance4x4_neon,
+ 10),
+ SubpelAvgVarianceParams(
+ 7, 7, &aom_highbd_12_sub_pixel_avg_variance128x128_neon, 12),
+ SubpelAvgVarianceParams(7, 6,
+ &aom_highbd_12_sub_pixel_avg_variance128x64_neon, 12),
+ SubpelAvgVarianceParams(6, 7,
+ &aom_highbd_12_sub_pixel_avg_variance64x128_neon, 12),
+ SubpelAvgVarianceParams(6, 6, &aom_highbd_12_sub_pixel_avg_variance64x64_neon,
+ 12),
+ SubpelAvgVarianceParams(6, 5, &aom_highbd_12_sub_pixel_avg_variance64x32_neon,
+ 12),
+ SubpelAvgVarianceParams(5, 6, &aom_highbd_12_sub_pixel_avg_variance32x64_neon,
+ 12),
+ SubpelAvgVarianceParams(5, 5, &aom_highbd_12_sub_pixel_avg_variance32x32_neon,
+ 12),
+ SubpelAvgVarianceParams(5, 4, &aom_highbd_12_sub_pixel_avg_variance32x16_neon,
+ 12),
+ SubpelAvgVarianceParams(4, 5, &aom_highbd_12_sub_pixel_avg_variance16x32_neon,
+ 12),
+ SubpelAvgVarianceParams(4, 4, &aom_highbd_12_sub_pixel_avg_variance16x16_neon,
+ 12),
+ SubpelAvgVarianceParams(4, 3, &aom_highbd_12_sub_pixel_avg_variance16x8_neon,
+ 12),
+ SubpelAvgVarianceParams(3, 4, &aom_highbd_12_sub_pixel_avg_variance8x16_neon,
+ 12),
+ SubpelAvgVarianceParams(3, 3, &aom_highbd_12_sub_pixel_avg_variance8x8_neon,
+ 12),
+ SubpelAvgVarianceParams(3, 2, &aom_highbd_12_sub_pixel_avg_variance8x4_neon,
+ 12),
+ SubpelAvgVarianceParams(2, 3, &aom_highbd_12_sub_pixel_avg_variance4x8_neon,
+ 12),
+ SubpelAvgVarianceParams(2, 2, &aom_highbd_12_sub_pixel_avg_variance4x4_neon,
+ 12),
+
+#if !CONFIG_REALTIME_ONLY
+ SubpelAvgVarianceParams(6, 4, &aom_highbd_8_sub_pixel_avg_variance64x16_neon,
+ 8),
+ SubpelAvgVarianceParams(4, 6, &aom_highbd_8_sub_pixel_avg_variance16x64_neon,
+ 8),
+ SubpelAvgVarianceParams(5, 3, &aom_highbd_8_sub_pixel_avg_variance32x8_neon,
+ 8),
+ SubpelAvgVarianceParams(3, 5, &aom_highbd_8_sub_pixel_avg_variance8x32_neon,
+ 8),
+ SubpelAvgVarianceParams(4, 2, &aom_highbd_8_sub_pixel_avg_variance16x4_neon,
+ 8),
+ SubpelAvgVarianceParams(2, 4, &aom_highbd_8_sub_pixel_avg_variance4x16_neon,
+ 8),
+ SubpelAvgVarianceParams(6, 4, &aom_highbd_10_sub_pixel_avg_variance64x16_neon,
+ 10),
+ SubpelAvgVarianceParams(4, 6, &aom_highbd_10_sub_pixel_avg_variance16x64_neon,
+ 10),
+ SubpelAvgVarianceParams(5, 3, &aom_highbd_10_sub_pixel_avg_variance32x8_neon,
+ 10),
+ SubpelAvgVarianceParams(3, 5, &aom_highbd_10_sub_pixel_avg_variance8x32_neon,
+ 10),
+ SubpelAvgVarianceParams(4, 2, &aom_highbd_10_sub_pixel_avg_variance16x4_neon,
+ 10),
+ SubpelAvgVarianceParams(2, 4, &aom_highbd_10_sub_pixel_avg_variance4x16_neon,
+ 10),
+ SubpelAvgVarianceParams(6, 4, &aom_highbd_12_sub_pixel_avg_variance64x16_neon,
+ 12),
+ SubpelAvgVarianceParams(4, 6, &aom_highbd_12_sub_pixel_avg_variance16x64_neon,
+ 12),
+ SubpelAvgVarianceParams(5, 3, &aom_highbd_12_sub_pixel_avg_variance32x8_neon,
+ 12),
+ SubpelAvgVarianceParams(3, 5, &aom_highbd_12_sub_pixel_avg_variance8x32_neon,
+ 12),
+ SubpelAvgVarianceParams(4, 2, &aom_highbd_12_sub_pixel_avg_variance16x4_neon,
+ 12),
+ SubpelAvgVarianceParams(2, 4, &aom_highbd_12_sub_pixel_avg_variance4x16_neon,
+ 12),
+#endif
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, AvxHBDSubpelAvgVarianceTest,
+ ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_neon));
+
+const DistWtdSubpelAvgVarianceParams
+ kArrayHBDDistWtdSubpelAvgVariance_neon[] = {
+ DistWtdSubpelAvgVarianceParams(
+ 7, 7, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 7, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 7, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 2, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 2, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 7, 7, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 7, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 7, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 2, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 2, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 7, 7, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 7, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 7, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 2, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 2, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4_neon, 12),
+
+#if !CONFIG_REALTIME_ONLY
+ DistWtdSubpelAvgVarianceParams(
+ 6, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 2, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 2, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 2, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16_neon, 12),
+#endif // !CONFIG_REALTIME_ONLY
+ };
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AvxHBDDistWtdSubpelAvgVarianceTest,
+ ::testing::ValuesIn(kArrayHBDDistWtdSubpelAvgVariance_neon));
+
+#if !CONFIG_REALTIME_ONLY
+const ObmcSubpelVarianceParams kArrayHBDObmcSubpelVariance_neon[] = {
+ ObmcSubpelVarianceParams(
+ 7, 7, &aom_highbd_12_obmc_sub_pixel_variance128x128_neon, 12),
+ ObmcSubpelVarianceParams(
+ 7, 6, &aom_highbd_12_obmc_sub_pixel_variance128x64_neon, 12),
+ ObmcSubpelVarianceParams(
+ 6, 7, &aom_highbd_12_obmc_sub_pixel_variance64x128_neon, 12),
+ ObmcSubpelVarianceParams(
+ 6, 6, &aom_highbd_12_obmc_sub_pixel_variance64x64_neon, 12),
+ ObmcSubpelVarianceParams(
+ 6, 5, &aom_highbd_12_obmc_sub_pixel_variance64x32_neon, 12),
+ ObmcSubpelVarianceParams(
+ 5, 6, &aom_highbd_12_obmc_sub_pixel_variance32x64_neon, 12),
+ ObmcSubpelVarianceParams(
+ 5, 5, &aom_highbd_12_obmc_sub_pixel_variance32x32_neon, 12),
+ ObmcSubpelVarianceParams(
+ 5, 4, &aom_highbd_12_obmc_sub_pixel_variance32x16_neon, 12),
+ ObmcSubpelVarianceParams(
+ 4, 5, &aom_highbd_12_obmc_sub_pixel_variance16x32_neon, 12),
+ ObmcSubpelVarianceParams(
+ 4, 4, &aom_highbd_12_obmc_sub_pixel_variance16x16_neon, 12),
+ ObmcSubpelVarianceParams(4, 3,
+ &aom_highbd_12_obmc_sub_pixel_variance16x8_neon, 12),
+ ObmcSubpelVarianceParams(3, 4,
+ &aom_highbd_12_obmc_sub_pixel_variance8x16_neon, 12),
+ ObmcSubpelVarianceParams(3, 3, &aom_highbd_12_obmc_sub_pixel_variance8x8_neon,
+ 12),
+ ObmcSubpelVarianceParams(3, 2, &aom_highbd_12_obmc_sub_pixel_variance8x4_neon,
+ 12),
+ ObmcSubpelVarianceParams(2, 3, &aom_highbd_12_obmc_sub_pixel_variance4x8_neon,
+ 12),
+ ObmcSubpelVarianceParams(2, 2, &aom_highbd_12_obmc_sub_pixel_variance4x4_neon,
+ 12),
+ ObmcSubpelVarianceParams(
+ 6, 4, &aom_highbd_12_obmc_sub_pixel_variance64x16_neon, 12),
+ ObmcSubpelVarianceParams(
+ 4, 6, &aom_highbd_12_obmc_sub_pixel_variance16x64_neon, 12),
+ ObmcSubpelVarianceParams(5, 3,
+ &aom_highbd_12_obmc_sub_pixel_variance32x8_neon, 12),
+ ObmcSubpelVarianceParams(3, 5,
+ &aom_highbd_12_obmc_sub_pixel_variance8x32_neon, 12),
+ ObmcSubpelVarianceParams(4, 2,
+ &aom_highbd_12_obmc_sub_pixel_variance16x4_neon, 12),
+ ObmcSubpelVarianceParams(2, 4,
+ &aom_highbd_12_obmc_sub_pixel_variance4x16_neon, 12),
+ ObmcSubpelVarianceParams(
+ 7, 7, &aom_highbd_10_obmc_sub_pixel_variance128x128_neon, 10),
+ ObmcSubpelVarianceParams(
+ 7, 6, &aom_highbd_10_obmc_sub_pixel_variance128x64_neon, 10),
+ ObmcSubpelVarianceParams(
+ 6, 7, &aom_highbd_10_obmc_sub_pixel_variance64x128_neon, 10),
+ ObmcSubpelVarianceParams(
+ 6, 6, &aom_highbd_10_obmc_sub_pixel_variance64x64_neon, 10),
+ ObmcSubpelVarianceParams(
+ 6, 5, &aom_highbd_10_obmc_sub_pixel_variance64x32_neon, 10),
+ ObmcSubpelVarianceParams(
+ 5, 6, &aom_highbd_10_obmc_sub_pixel_variance32x64_neon, 10),
+ ObmcSubpelVarianceParams(
+ 5, 5, &aom_highbd_10_obmc_sub_pixel_variance32x32_neon, 10),
+ ObmcSubpelVarianceParams(
+ 5, 4, &aom_highbd_10_obmc_sub_pixel_variance32x16_neon, 10),
+ ObmcSubpelVarianceParams(
+ 4, 5, &aom_highbd_10_obmc_sub_pixel_variance16x32_neon, 10),
+ ObmcSubpelVarianceParams(
+ 4, 4, &aom_highbd_10_obmc_sub_pixel_variance16x16_neon, 10),
+ ObmcSubpelVarianceParams(4, 3,
+ &aom_highbd_10_obmc_sub_pixel_variance16x8_neon, 10),
+ ObmcSubpelVarianceParams(3, 4,
+ &aom_highbd_10_obmc_sub_pixel_variance8x16_neon, 10),
+ ObmcSubpelVarianceParams(3, 3, &aom_highbd_10_obmc_sub_pixel_variance8x8_neon,
+ 10),
+ ObmcSubpelVarianceParams(3, 2, &aom_highbd_10_obmc_sub_pixel_variance8x4_neon,
+ 10),
+ ObmcSubpelVarianceParams(2, 3, &aom_highbd_10_obmc_sub_pixel_variance4x8_neon,
+ 10),
+ ObmcSubpelVarianceParams(2, 2, &aom_highbd_10_obmc_sub_pixel_variance4x4_neon,
+ 10),
+ ObmcSubpelVarianceParams(
+ 6, 4, &aom_highbd_10_obmc_sub_pixel_variance64x16_neon, 10),
+ ObmcSubpelVarianceParams(
+ 4, 6, &aom_highbd_10_obmc_sub_pixel_variance16x64_neon, 10),
+ ObmcSubpelVarianceParams(5, 3,
+ &aom_highbd_10_obmc_sub_pixel_variance32x8_neon, 10),
+ ObmcSubpelVarianceParams(3, 5,
+ &aom_highbd_10_obmc_sub_pixel_variance8x32_neon, 10),
+ ObmcSubpelVarianceParams(4, 2,
+ &aom_highbd_10_obmc_sub_pixel_variance16x4_neon, 10),
+ ObmcSubpelVarianceParams(2, 4,
+ &aom_highbd_10_obmc_sub_pixel_variance4x16_neon, 10),
+ ObmcSubpelVarianceParams(
+ 7, 7, &aom_highbd_8_obmc_sub_pixel_variance128x128_neon, 8),
+ ObmcSubpelVarianceParams(7, 6,
+ &aom_highbd_8_obmc_sub_pixel_variance128x64_neon, 8),
+ ObmcSubpelVarianceParams(6, 7,
+ &aom_highbd_8_obmc_sub_pixel_variance64x128_neon, 8),
+ ObmcSubpelVarianceParams(6, 6,
+ &aom_highbd_8_obmc_sub_pixel_variance64x64_neon, 8),
+ ObmcSubpelVarianceParams(6, 5,
+ &aom_highbd_8_obmc_sub_pixel_variance64x32_neon, 8),
+ ObmcSubpelVarianceParams(5, 6,
+ &aom_highbd_8_obmc_sub_pixel_variance32x64_neon, 8),
+ ObmcSubpelVarianceParams(5, 5,
+ &aom_highbd_8_obmc_sub_pixel_variance32x32_neon, 8),
+ ObmcSubpelVarianceParams(5, 4,
+ &aom_highbd_8_obmc_sub_pixel_variance32x16_neon, 8),
+ ObmcSubpelVarianceParams(4, 5,
+ &aom_highbd_8_obmc_sub_pixel_variance16x32_neon, 8),
+ ObmcSubpelVarianceParams(4, 4,
+ &aom_highbd_8_obmc_sub_pixel_variance16x16_neon, 8),
+ ObmcSubpelVarianceParams(4, 3, &aom_highbd_8_obmc_sub_pixel_variance16x8_neon,
+ 8),
+ ObmcSubpelVarianceParams(3, 4, &aom_highbd_8_obmc_sub_pixel_variance8x16_neon,
+ 8),
+ ObmcSubpelVarianceParams(3, 3, &aom_highbd_8_obmc_sub_pixel_variance8x8_neon,
+ 8),
+ ObmcSubpelVarianceParams(3, 2, &aom_highbd_8_obmc_sub_pixel_variance8x4_neon,
+ 8),
+ ObmcSubpelVarianceParams(2, 3, &aom_highbd_8_obmc_sub_pixel_variance4x8_neon,
+ 8),
+ ObmcSubpelVarianceParams(2, 2, &aom_highbd_8_obmc_sub_pixel_variance4x4_neon,
+ 8),
+ ObmcSubpelVarianceParams(6, 4,
+ &aom_highbd_8_obmc_sub_pixel_variance64x16_neon, 8),
+ ObmcSubpelVarianceParams(4, 6,
+ &aom_highbd_8_obmc_sub_pixel_variance16x64_neon, 8),
+ ObmcSubpelVarianceParams(5, 3, &aom_highbd_8_obmc_sub_pixel_variance32x8_neon,
+ 8),
+ ObmcSubpelVarianceParams(3, 5, &aom_highbd_8_obmc_sub_pixel_variance8x32_neon,
+ 8),
+ ObmcSubpelVarianceParams(4, 2, &aom_highbd_8_obmc_sub_pixel_variance16x4_neon,
+ 8),
+ ObmcSubpelVarianceParams(2, 4, &aom_highbd_8_obmc_sub_pixel_variance4x16_neon,
+ 8),
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, AvxHBDObmcSubpelVarianceTest,
+ ::testing::ValuesIn(kArrayHBDObmcSubpelVariance_neon));
+#endif // !CONFIG_REALTIME_ONLY
+
#endif // CONFIG_AV1_HIGHBITDEPTH
#endif // HAVE_NEON
+#if HAVE_NEON_DOTPROD
+
+const VarianceParams kArrayVariance_neon_dotprod[] = {
+ VarianceParams(7, 7, &aom_variance128x128_neon_dotprod),
+ VarianceParams(7, 6, &aom_variance128x64_neon_dotprod),
+ VarianceParams(6, 7, &aom_variance64x128_neon_dotprod),
+ VarianceParams(6, 6, &aom_variance64x64_neon_dotprod),
+ VarianceParams(6, 5, &aom_variance64x32_neon_dotprod),
+ VarianceParams(5, 6, &aom_variance32x64_neon_dotprod),
+ VarianceParams(5, 5, &aom_variance32x32_neon_dotprod),
+ VarianceParams(5, 4, &aom_variance32x16_neon_dotprod),
+ VarianceParams(4, 5, &aom_variance16x32_neon_dotprod),
+ VarianceParams(4, 4, &aom_variance16x16_neon_dotprod),
+ VarianceParams(4, 3, &aom_variance16x8_neon_dotprod),
+ VarianceParams(3, 4, &aom_variance8x16_neon_dotprod),
+ VarianceParams(3, 3, &aom_variance8x8_neon_dotprod),
+ VarianceParams(3, 2, &aom_variance8x4_neon_dotprod),
+ VarianceParams(2, 3, &aom_variance4x8_neon_dotprod),
+ VarianceParams(2, 2, &aom_variance4x4_neon_dotprod),
+#if !CONFIG_REALTIME_ONLY
+ VarianceParams(2, 4, &aom_variance4x16_neon_dotprod),
+ VarianceParams(4, 2, &aom_variance16x4_neon_dotprod),
+ VarianceParams(3, 5, &aom_variance8x32_neon_dotprod),
+ VarianceParams(5, 3, &aom_variance32x8_neon_dotprod),
+ VarianceParams(4, 6, &aom_variance16x64_neon_dotprod),
+ VarianceParams(6, 4, &aom_variance64x16_neon_dotprod),
+#endif
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, AvxVarianceTest,
+ ::testing::ValuesIn(kArrayVariance_neon_dotprod));
+
+const GetSseSumParams kArrayGetSseSum8x8Quad_neon_dotprod[] = {
+ GetSseSumParams(7, 7, &aom_get_var_sse_sum_8x8_quad_neon_dotprod, 0),
+ GetSseSumParams(6, 6, &aom_get_var_sse_sum_8x8_quad_neon_dotprod, 0),
+ GetSseSumParams(5, 5, &aom_get_var_sse_sum_8x8_quad_neon_dotprod, 0),
+ GetSseSumParams(5, 4, &aom_get_var_sse_sum_8x8_quad_neon_dotprod, 0)
+};
+INSTANTIATE_TEST_SUITE_P(
+ NEON_DOTPROD, GetSseSum8x8QuadTest,
+ ::testing::ValuesIn(kArrayGetSseSum8x8Quad_neon_dotprod));
+
+const GetSseSumParamsDual kArrayGetSseSum16x16Dual_neon_dotprod[] = {
+ GetSseSumParamsDual(7, 7, &aom_get_var_sse_sum_16x16_dual_neon_dotprod, 0),
+ GetSseSumParamsDual(6, 6, &aom_get_var_sse_sum_16x16_dual_neon_dotprod, 0),
+ GetSseSumParamsDual(5, 5, &aom_get_var_sse_sum_16x16_dual_neon_dotprod, 0),
+ GetSseSumParamsDual(5, 4, &aom_get_var_sse_sum_16x16_dual_neon_dotprod, 0)
+};
+INSTANTIATE_TEST_SUITE_P(
+ NEON_DOTPROD, GetSseSum16x16DualTest,
+ ::testing::ValuesIn(kArrayGetSseSum16x16Dual_neon_dotprod));
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON_DOTPROD, AvxMseTest,
+ ::testing::Values(MseParams(3, 3, &aom_mse8x8_neon_dotprod),
+ MseParams(3, 4, &aom_mse8x16_neon_dotprod),
+ MseParams(4, 4, &aom_mse16x16_neon_dotprod),
+ MseParams(4, 3, &aom_mse16x8_neon_dotprod)));
+
+#endif // HAVE_NEON_DOTPROD
+
} // namespace
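The *_neon_dotprod kernels instantiated above depend on the Armv8.2
dot-product (UDOT) extension, which multiplies sixteen u8 pairs and
accumulates into four u32 lanes in a single instruction. A minimal sketch of
the idiom (not the libaom kernel; assumes a compiler targeting
armv8.2-a+dotprod):

  #include <arm_neon.h>
  static uint32_t sse_16_pixels(const uint8_t *a, const uint8_t *b) {
    const uint8x16_t diff = vabdq_u8(vld1q_u8(a), vld1q_u8(b));  // |a - b|
    // One UDOT accumulates all 16 squared differences into 4 u32 lanes.
    const uint32x4_t sse = vdotq_u32(vdupq_n_u32(0), diff, diff);
    return vaddvq_u32(sse);  // horizontal add of the 4 lanes
  }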
diff --git a/test/video_source.h b/test/video_source.h
index f7a8b98a5..9d73d7b25 100644
--- a/test/video_source.h
+++ b/test/video_source.h
@@ -125,7 +125,7 @@ class TempOutFile {
// aom_image_t images with associated timestamps and duration.
class VideoSource {
public:
- virtual ~VideoSource() {}
+ virtual ~VideoSource() = default;
// Prepare the stream for reading, rewind/open as necessary.
virtual void Begin() = 0;
@@ -160,35 +160,35 @@ class DummyVideoSource : public VideoSource {
ReallocImage();
}
- virtual ~DummyVideoSource() { aom_img_free(img_); }
+ ~DummyVideoSource() override { aom_img_free(img_); }
- virtual void Begin() {
+ void Begin() override {
frame_ = 0;
FillFrame();
}
- virtual void Next() {
+ void Next() override {
++frame_;
FillFrame();
}
- virtual aom_image_t *img() const {
+ aom_image_t *img() const override {
return (frame_ < limit_) ? img_ : nullptr;
}
// Models a stream where Timebase = 1/FPS, so pts == frame.
- virtual aom_codec_pts_t pts() const { return frame_; }
+ aom_codec_pts_t pts() const override { return frame_; }
- virtual unsigned long duration() const { return 1; }
+ unsigned long duration() const override { return 1; }
- virtual aom_rational_t timebase() const {
+ aom_rational_t timebase() const override {
const aom_rational_t t = { 1, 30 };
return t;
}
- virtual unsigned int frame() const { return frame_; }
+ unsigned int frame() const override { return frame_; }
- virtual unsigned int limit() const { return limit_; }
+ unsigned int limit() const override { return limit_; }
void set_limit(unsigned int limit) { limit_ = limit; }
@@ -234,7 +234,7 @@ class RandomVideoSource : public DummyVideoSource {
: rnd_(seed), seed_(seed) {}
// Reset the RNG to get a matching stream for the second pass
- virtual void Begin() {
+ void Begin() override {
frame_ = 0;
rnd_.Reset(seed_);
FillFrame();
@@ -243,7 +243,7 @@ class RandomVideoSource : public DummyVideoSource {
protected:
// 15 frames of noise, followed by 15 static frames. Reset to 0 rather
// than holding previous frames to encourage keyframes to be thrown.
- virtual void FillFrame() {
+ void FillFrame() override {
if (img_) {
if (frame_ % 30 < 15)
for (size_t i = 0; i < raw_sz_; ++i) img_->img_data[i] = rnd_.Rand8();
@@ -260,7 +260,7 @@ class RandomVideoSource : public DummyVideoSource {
// decompressed images to the decoder.
class CompressedVideoSource {
public:
- virtual ~CompressedVideoSource() {}
+ virtual ~CompressedVideoSource() = default;
virtual void Init() = 0;
diff --git a/test/warp_filter_test.cc b/test/warp_filter_test.cc
index 1d9dd4547..f0be7d226 100644
--- a/test/warp_filter_test.cc
+++ b/test/warp_filter_test.cc
@@ -33,19 +33,21 @@ INSTANTIATE_TEST_SUITE_P(
C, AV1WarpFilterTest,
libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_c));
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(
- SSE4_1, AV1WarpFilterTest,
- libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_sse4_1));
-
-#if CONFIG_AV1_HIGHBITDEPTH
+#if CONFIG_AV1_HIGHBITDEPTH && (HAVE_SSE4_1 || HAVE_NEON)
TEST_P(AV1HighbdWarpFilterTest, CheckOutput) {
RunCheckOutput(std::get<4>(GET_PARAM(0)));
}
TEST_P(AV1HighbdWarpFilterTest, DISABLED_Speed) {
RunSpeedTest(std::get<4>(GET_PARAM(0)));
}
+#endif // CONFIG_AV1_HIGHBITDEPTH && (HAVE_SSE4_1 || HAVE_NEON)
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, AV1WarpFilterTest,
+ libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_sse4_1));
+#if CONFIG_AV1_HIGHBITDEPTH
INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdWarpFilterTest,
libaom_test::AV1HighbdWarpFilter::BuildParams(
av1_highbd_warp_affine_sse4_1));
@@ -58,7 +60,6 @@ INSTANTIATE_TEST_SUITE_P(
libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_avx2));
#if CONFIG_AV1_HIGHBITDEPTH
-
INSTANTIATE_TEST_SUITE_P(
AVX2, AV1HighbdWarpFilterTest,
libaom_test::AV1HighbdWarpFilter::BuildParams(av1_highbd_warp_affine_avx2));
@@ -69,6 +70,24 @@ INSTANTIATE_TEST_SUITE_P(
INSTANTIATE_TEST_SUITE_P(
NEON, AV1WarpFilterTest,
libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_neon));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1HighbdWarpFilterTest,
+ libaom_test::AV1HighbdWarpFilter::BuildParams(av1_highbd_warp_affine_neon));
+#endif // CONFIG_AV1_HIGHBITDEPTH
#endif // HAVE_NEON
+#if HAVE_NEON_I8MM
+INSTANTIATE_TEST_SUITE_P(
+ NEON_I8MM, AV1WarpFilterTest,
+ libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_neon_i8mm));
+#endif // HAVE_NEON_I8MM
+
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+ SVE, AV1WarpFilterTest,
+ libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_sve));
+#endif // HAVE_SVE
+
} // namespace
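The guard reshuffle above keeps the TEST_P bodies for the high-bit-depth warp
suite compiled only when at least one INSTANTIATE_TEST_SUITE_P for it exists
(SSE4_1 or NEON builds); otherwise GoogleTest reports an uninstantiated
parameterized test. The general shape, using hypothetical HAVE_FOO/HAVE_BAR
flags and suite names rather than libaom symbols:

  #if HAVE_FOO || HAVE_BAR
  TEST_P(MyHighbdTest, CheckOutput) { RunCheckOutput(GetParam()); }
  #endif

  #if HAVE_FOO
  INSTANTIATE_TEST_SUITE_P(FOO, MyHighbdTest, ::testing::Values(foo_impl));
  #endif  // HAVE_FOO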
diff --git a/test/warp_filter_test_util.cc b/test/warp_filter_test_util.cc
index e42671eb3..470c98077 100644
--- a/test/warp_filter_test_util.cc
+++ b/test/warp_filter_test_util.cc
@@ -19,9 +19,14 @@ using std::tuple;
namespace libaom_test {
-int32_t random_warped_param(libaom_test::ACMRandom *rnd, int bits) {
- // 1 in 8 chance of generating zero (arbitrarily chosen)
- if (((rnd->Rand8()) & 7) == 0) return 0;
+int32_t random_warped_param(libaom_test::ACMRandom *rnd, int bits,
+ int rnd_gen_zeros) {
+  // Avoid accidentally generating a zero in speed tests; zeros there are
+  // controlled by the is_*_zero parameters instead.
+ if (rnd_gen_zeros) {
+ // 1 in 8 chance of generating zero (arbitrarily chosen)
+ if (((rnd->Rand8()) & 7) == 0) return 0;
+ }
  // Otherwise, generate uniform values in the range
  // [-(1 << bits), -1] U [1, 1 << bits]
int32_t v = 1 + (rnd->Rand16() & ((1 << bits) - 1));
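The new rnd_gen_zeros flag threads through generate_warped_model() in the
next hunk: speed tests pass 0 so zero parameters come only from the
is_*_zero knobs (keeping the timed code path stable run to run), while the
output checks pass 1 to keep the historical 1-in-8 random zeros for
coverage. Both call sites appear later in this diff, e.g. the speed-test
form:

  generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
                        is_alpha_zero, is_beta_zero, is_gamma_zero,
                        is_delta_zero, /*rnd_gen_zeros=*/0);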
@@ -33,34 +38,47 @@ void generate_warped_model(libaom_test::ACMRandom *rnd, int32_t *mat,
int16_t *alpha, int16_t *beta, int16_t *gamma,
int16_t *delta, const int is_alpha_zero,
const int is_beta_zero, const int is_gamma_zero,
- const int is_delta_zero) {
- while (1) {
+ const int is_delta_zero, const int rnd_gen_zeros) {
+ while (true) {
int rnd8 = rnd->Rand8() & 3;
- mat[0] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS + 6);
- mat[1] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS + 6);
- mat[2] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) +
- (1 << WARPEDMODEL_PREC_BITS);
- mat[3] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3);
+ mat[0] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS + 6, rnd_gen_zeros);
+ mat[1] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS + 6, rnd_gen_zeros);
+ mat[2] =
+ (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3, rnd_gen_zeros)) +
+ (1 << WARPEDMODEL_PREC_BITS);
+ mat[3] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3, rnd_gen_zeros);
if (rnd8 <= 1) {
// AFFINE
- mat[4] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3);
- mat[5] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) +
- (1 << WARPEDMODEL_PREC_BITS);
+ mat[4] =
+ random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3, rnd_gen_zeros);
+ mat[5] =
+ (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3, rnd_gen_zeros)) +
+ (1 << WARPEDMODEL_PREC_BITS);
} else if (rnd8 == 2) {
mat[4] = -mat[3];
mat[5] = mat[2];
} else {
- mat[4] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3);
- mat[5] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) +
- (1 << WARPEDMODEL_PREC_BITS);
- if (is_alpha_zero == 1) mat[2] = 1 << WARPEDMODEL_PREC_BITS;
- if (is_beta_zero == 1) mat[3] = 0;
- if (is_gamma_zero == 1) mat[4] = 0;
- if (is_delta_zero == 1)
- mat[5] = static_cast<int32_t>(
- ((static_cast<int64_t>(mat[3]) * mat[4] + (mat[2] / 2)) / mat[2]) +
- (1 << WARPEDMODEL_PREC_BITS));
+ mat[4] =
+ random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3, rnd_gen_zeros);
+ mat[5] =
+ (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3, rnd_gen_zeros)) +
+ (1 << WARPEDMODEL_PREC_BITS);
+ }
+
+ if (is_alpha_zero == 1) {
+ mat[2] = 1 << WARPEDMODEL_PREC_BITS;
+ }
+ if (is_beta_zero == 1) {
+ mat[3] = 0;
+ }
+ if (is_gamma_zero == 1) {
+ mat[4] = 0;
+ }
+ if (is_delta_zero == 1) {
+ mat[5] = static_cast<int32_t>(
+ ((static_cast<int64_t>(mat[3]) * mat[4] + (mat[2] / 2)) / mat[2]) +
+ (1 << WARPEDMODEL_PREC_BITS));
}
// Calculate the derived parameters and check that they are suitable
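
A note on the is_delta_zero branch above: the shear parameters are derived
from mat[] (presumably mirroring av1_get_shear_params()), where, up to
rounding, delta = mat[5] - (mat[3] * mat[4]) / mat[2] - (1 <<
WARPEDMODEL_PREC_BITS). The value assigned to mat[5] here is exactly the
rounded quotient plus (1 << WARPEDMODEL_PREC_BITS), so both terms cancel
and the derived delta is zero, as the test parameter requests.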
@@ -109,11 +127,9 @@ namespace AV1WarpFilter {
::testing::Values(0, 1), ::testing::Values(0, 1));
}
-AV1WarpFilterTest::~AV1WarpFilterTest() {}
+AV1WarpFilterTest::~AV1WarpFilterTest() = default;
void AV1WarpFilterTest::SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
-void AV1WarpFilterTest::TearDown() {}
-
void AV1WarpFilterTest::RunSpeedTest(warp_affine_func test_impl) {
const int w = 128, h = 128;
const int border = 16;
@@ -144,7 +160,7 @@ void AV1WarpFilterTest::RunSpeedTest(warp_affine_func test_impl) {
ASSERT_NE(dsta, nullptr);
generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
is_alpha_zero, is_beta_zero, is_gamma_zero,
- is_delta_zero);
+ is_delta_zero, 0);
for (int r = 0; r < h; ++r)
for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand8();
@@ -170,8 +186,8 @@ void AV1WarpFilterTest::RunSpeedTest(warp_affine_func test_impl) {
aom_usec_timer_mark(&timer);
const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
- printf("warp %3dx%-3d: %7.2f ns\n", out_w, out_h,
- 1000.0 * elapsed_time / num_loops);
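+  // aom_usec_timer_elapsed() reports microseconds, so scaling by 1000 and
+  // dividing by num_loops prints nanoseconds per call.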
+ printf("warp %3dx%-3d alpha=%d beta=%d gamma=%d delta=%d: %7.2f ns \n", out_w,
+ out_h, alpha, beta, gamma, delta, 1000.0 * elapsed_time / num_loops);
}
void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
@@ -221,7 +237,7 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
for (int sub_y = 0; sub_y < 2; ++sub_y) {
generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
is_alpha_zero, is_beta_zero, is_gamma_zero,
- is_delta_zero);
+ is_delta_zero, 1);
for (int ii = 0; ii < 2; ++ii) {
for (int jj = 0; jj < 5; ++jj) {
@@ -301,13 +317,11 @@ namespace AV1HighbdWarpFilter {
::testing::Values(0, 1), ::testing::Values(0, 1));
}
-AV1HighbdWarpFilterTest::~AV1HighbdWarpFilterTest() {}
+AV1HighbdWarpFilterTest::~AV1HighbdWarpFilterTest() = default;
void AV1HighbdWarpFilterTest::SetUp() {
rnd_.Reset(ACMRandom::DeterministicSeed());
}
-void AV1HighbdWarpFilterTest::TearDown() {}
-
void AV1HighbdWarpFilterTest::RunSpeedTest(highbd_warp_affine_func test_impl) {
const int w = 128, h = 128;
const int border = 16;
@@ -339,7 +353,7 @@ void AV1HighbdWarpFilterTest::RunSpeedTest(highbd_warp_affine_func test_impl) {
generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
is_alpha_zero, is_beta_zero, is_gamma_zero,
- is_delta_zero);
+ is_delta_zero, 0);
// Generate an input block and extend its borders horizontally
for (int r = 0; r < h; ++r)
for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand16() & mask;
@@ -367,7 +381,8 @@ void AV1HighbdWarpFilterTest::RunSpeedTest(highbd_warp_affine_func test_impl) {
aom_usec_timer_mark(&timer);
const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
- printf("highbd warp %3dx%-3d: %7.2f ns\n", out_w, out_h,
+ printf("highbd warp %3dx%-3d alpha=%d beta=%d gamma=%d delta=%d: %7.2f ns \n",
+ out_w, out_h, alpha, beta, gamma, delta,
1000.0 * elapsed_time / num_loops);
}
@@ -422,7 +437,7 @@ void AV1HighbdWarpFilterTest::RunCheckOutput(
for (int sub_y = 0; sub_y < 2; ++sub_y) {
generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
is_alpha_zero, is_beta_zero, is_gamma_zero,
- is_delta_zero);
+ is_delta_zero, 1);
for (int ii = 0; ii < 2; ++ii) {
for (int jj = 0; jj < 5; ++jj) {
for (int do_average = 0; do_average <= 1; ++do_average) {
diff --git a/test/warp_filter_test_util.h b/test/warp_filter_test_util.h
index 583f31282..364368ac0 100644
--- a/test/warp_filter_test_util.h
+++ b/test/warp_filter_test_util.h
@@ -50,10 +50,8 @@ typedef std::tuple<WarpTestParam, int, int, int, int> WarpTestParams;
class AV1WarpFilterTest : public ::testing::TestWithParam<WarpTestParams> {
public:
- virtual ~AV1WarpFilterTest();
- virtual void SetUp();
-
- virtual void TearDown();
+ ~AV1WarpFilterTest() override;
+ void SetUp() override;
protected:
void RunCheckOutput(warp_affine_func test_impl);
@@ -86,10 +84,8 @@ typedef std::tuple<HighbdWarpTestParam, int, int, int, int>
class AV1HighbdWarpFilterTest
: public ::testing::TestWithParam<HighbdWarpTestParams> {
public:
- virtual ~AV1HighbdWarpFilterTest();
- virtual void SetUp();
-
- virtual void TearDown();
+ ~AV1HighbdWarpFilterTest() override;
+ void SetUp() override;
protected:
void RunCheckOutput(highbd_warp_affine_func test_impl);
diff --git a/test/webm_video_source.h b/test/webm_video_source.h
index 706e59697..845abd6dc 100644
--- a/test/webm_video_source.h
+++ b/test/webm_video_source.h
@@ -30,19 +30,19 @@ class WebMVideoSource : public CompressedVideoSource {
webm_ctx_(new WebmInputContext()), buf_(nullptr), buf_sz_(0),
frame_sz_(0), frame_number_(0), end_of_file_(false) {}
- virtual ~WebMVideoSource() {
+ ~WebMVideoSource() override {
if (aom_ctx_->file != nullptr) fclose(aom_ctx_->file);
webm_free(webm_ctx_);
delete aom_ctx_;
delete webm_ctx_;
}
- virtual void Init() {
+ void Init() override {
ASSERT_NE(aom_ctx_, nullptr);
ASSERT_NE(webm_ctx_, nullptr);
}
- virtual void Begin() {
+ void Begin() override {
ASSERT_NE(aom_ctx_, nullptr);
ASSERT_NE(webm_ctx_, nullptr);
aom_ctx_->file = OpenTestDataFile(file_name_);
@@ -54,7 +54,7 @@ class WebMVideoSource : public CompressedVideoSource {
FillFrame();
}
- virtual void Next() {
+ void Next() override {
++frame_number_;
FillFrame();
}
@@ -85,11 +85,11 @@ class WebMVideoSource : public CompressedVideoSource {
} while (!webm_ctx_->is_key_frame && !end_of_file_);
}
- virtual const uint8_t *cxdata() const {
+ const uint8_t *cxdata() const override {
return end_of_file_ ? nullptr : buf_;
}
- virtual size_t frame_size() const { return frame_sz_; }
- virtual unsigned int frame_number() const { return frame_number_; }
+ size_t frame_size() const override { return frame_sz_; }
+ unsigned int frame_number() const override { return frame_number_; }
protected:
std::string file_name_;
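
The virtual-to-override conversions in webm_video_source.h (and in the y4m
and yuv sources further down) are mechanical, but the payoff is concrete:
with override, a member that fails to match a base-class virtual becomes a
compile error instead of a silently unrelated function. A minimal sketch
with hypothetical class names:

    #include <cstddef>

    struct CompressedSourceSketch {
      virtual ~CompressedSourceSketch() = default;
      virtual size_t frame_size() const { return 0; }
    };

    struct WebmSourceSketch : CompressedSourceSketch {
      // If 'const' were dropped here, 'override' would make the compiler
      // reject it instead of silently declaring a new, unrelated member.
      size_t frame_size() const override { return frame_sz_; }
      size_t frame_sz_ = 0;
    };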
diff --git a/test/wiener_test.cc b/test/wiener_test.cc
index 8be6a64ed..7eb6372aa 100644
--- a/test/wiener_test.cc
+++ b/test/wiener_test.cc
@@ -190,7 +190,7 @@ typedef std::tuple<const compute_stats_Func> WienerTestParam;
class WienerTest : public ::testing::TestWithParam<WienerTestParam> {
public:
- virtual void SetUp() {
+ void SetUp() override {
src_buf = (uint8_t *)aom_memalign(
32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*src_buf));
ASSERT_NE(src_buf, nullptr);
@@ -204,7 +204,7 @@ class WienerTest : public ::testing::TestWithParam<WienerTestParam> {
memset(buf, 0, buf_size);
target_func_ = GET_PARAM(0);
}
- virtual void TearDown() {
+ void TearDown() override {
aom_free(src_buf);
aom_free(dgd_buf);
aom_free(buf);
@@ -322,9 +322,11 @@ void WienerTest::RunWienerTest_ExtremeValues(const int32_t wiener_win) {
buf + (3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX);
for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    // Fill with alternating extreme values to maximize the difference
+    // from the average.
for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
- dgd_buf[i] = 255;
- src_buf[i] = 255;
+ dgd_buf[i] = i & 1 ? 255 : 0;
+ src_buf[i] = i & 1 ? 255 : 0;
}
uint8_t *dgd = dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin;
uint8_t *src = src_buf;
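
Why the alternating fill matters: the previous constant fill of 255 makes
every sample equal to the block average, so the (sample - average) terms the
stats code accumulates are all zero. Alternating 0 and 255 puts the average
near the midpoint and every sample at the largest possible distance from it
(over a bounded range, variance is maximized by splitting the samples
between the two extremes), which is what actually stresses the accumulators.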
@@ -389,6 +391,12 @@ INSTANTIATE_TEST_SUITE_P(AVX2, WienerTest,
::testing::Values(av1_compute_stats_avx2));
#endif // HAVE_AVX2
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(NEON, WienerTest,
+ ::testing::Values(av1_compute_stats_neon));
+#endif // HAVE_NEON
+
} // namespace wiener_lowbd
#if CONFIG_AV1_HIGHBITDEPTH
@@ -531,7 +539,7 @@ typedef std::tuple<const compute_stats_Func> WienerTestParam;
class WienerTestHighbd : public ::testing::TestWithParam<WienerTestParam> {
public:
- virtual void SetUp() {
+ void SetUp() override {
src_buf = (uint16_t *)aom_memalign(
32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*src_buf));
ASSERT_NE(src_buf, nullptr);
@@ -540,7 +548,7 @@ class WienerTestHighbd : public ::testing::TestWithParam<WienerTestParam> {
ASSERT_NE(dgd_buf, nullptr);
target_func_ = GET_PARAM(0);
}
- virtual void TearDown() {
+ void TearDown() override {
aom_free(src_buf);
aom_free(dgd_buf);
}
@@ -650,9 +658,11 @@ void WienerTestHighbd::RunWienerTest_ExtremeValues(const int32_t wiener_win,
const int src_stride = MAX_DATA_BLOCK;
const int iters = 1;
for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    // Fill with alternating extreme values to maximize the difference
+    // from the average.
for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
- dgd_buf[i] = ((uint16_t)1 << bit_depth) - 1;
- src_buf[i] = ((uint16_t)1 << bit_depth) - 1;
+ dgd_buf[i] = i & 1 ? ((uint16_t)1 << bit_depth) - 1 : 0;
+ src_buf[i] = i & 1 ? ((uint16_t)1 << bit_depth) - 1 : 0;
}
const uint8_t *dgd8 = CONVERT_TO_BYTEPTR(
dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin);
@@ -728,6 +738,11 @@ INSTANTIATE_TEST_SUITE_P(AVX2, WienerTestHighbd,
::testing::Values(av1_compute_stats_highbd_avx2));
#endif // HAVE_AVX2
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WienerTestHighbd,
+ ::testing::Values(av1_compute_stats_highbd_neon));
+#endif // HAVE_NEON
+
// A test that reproduces b/274668506: signed integer overflow in
// update_a_sep_sym().
TEST(SearchWienerTest, 10bitSignedIntegerOverflowInUpdateASepSym) {
diff --git a/test/y4m_test.cc b/test/y4m_test.cc
index 515a78381..a4ed13f7c 100644
--- a/test/y4m_test.cc
+++ b/test/y4m_test.cc
@@ -66,7 +66,7 @@ class Y4mVideoSourceTest : public ::testing::TestWithParam<Y4mTestParam>,
protected:
Y4mVideoSourceTest() : Y4mVideoSource("", 0, 0) {}
- virtual ~Y4mVideoSourceTest() { CloseSource(); }
+ ~Y4mVideoSourceTest() override { CloseSource(); }
virtual void Init(const std::string &file_name, int limit) {
file_name_ = file_name;
@@ -128,7 +128,7 @@ class Y4mVideoWriteTest : public Y4mVideoSourceTest {
protected:
Y4mVideoWriteTest() : tmpfile_(nullptr) {}
- virtual ~Y4mVideoWriteTest() {
+ ~Y4mVideoWriteTest() override {
delete tmpfile_;
input_file_ = nullptr;
}
@@ -162,7 +162,7 @@ class Y4mVideoWriteTest : public Y4mVideoSourceTest {
ReplaceInputFile(tmpfile_->file());
}
- virtual void Init(const std::string &file_name, int limit) {
+ void Init(const std::string &file_name, int limit) override {
Y4mVideoSourceTest::Init(file_name, limit);
WriteY4mAndReadBack();
}
diff --git a/test/y4m_video_source.h b/test/y4m_video_source.h
index bf657761f..1369e4e28 100644
--- a/test/y4m_video_source.h
+++ b/test/y4m_video_source.h
@@ -28,7 +28,7 @@ class Y4mVideoSource : public VideoSource {
start_(start), limit_(limit), frame_(0), framerate_numerator_(0),
framerate_denominator_(0), y4m_() {}
- virtual ~Y4mVideoSource() {
+ ~Y4mVideoSource() override {
aom_img_free(img_.get());
CloseSource();
}
@@ -53,33 +53,33 @@ class Y4mVideoSource : public VideoSource {
FillFrame();
}
- virtual void Begin() {
+ void Begin() override {
OpenSource();
ReadSourceToStart();
}
- virtual void Next() {
+ void Next() override {
++frame_;
FillFrame();
}
- virtual aom_image_t *img() const {
+ aom_image_t *img() const override {
return (frame_ < limit_) ? img_.get() : nullptr;
}
// Models a stream where Timebase = 1/FPS, so pts == frame.
- virtual aom_codec_pts_t pts() const { return frame_; }
+ aom_codec_pts_t pts() const override { return frame_; }
- virtual unsigned long duration() const { return 1; }
+ unsigned long duration() const override { return 1; }
- virtual aom_rational_t timebase() const {
+ aom_rational_t timebase() const override {
const aom_rational_t t = { framerate_denominator_, framerate_numerator_ };
return t;
}
- virtual unsigned int frame() const { return frame_; }
+ unsigned int frame() const override { return frame_; }
- virtual unsigned int limit() const { return limit_; }
+ unsigned int limit() const override { return limit_; }
virtual void FillFrame() {
ASSERT_NE(input_file_, nullptr);
diff --git a/test/yuv_video_source.h b/test/yuv_video_source.h
index 1b898b541..77d5dfa73 100644
--- a/test/yuv_video_source.h
+++ b/test/yuv_video_source.h
@@ -36,12 +36,12 @@ class YUVVideoSource : public VideoSource {
SetSize(width, height, format);
}
- virtual ~YUVVideoSource() {
+ ~YUVVideoSource() override {
aom_img_free(img_);
if (input_file_) fclose(input_file_);
}
- virtual void Begin() {
+ void Begin() override {
if (input_file_) fclose(input_file_);
input_file_ = OpenTestDataFile(file_name_);
ASSERT_NE(input_file_, nullptr)
@@ -53,28 +53,28 @@ class YUVVideoSource : public VideoSource {
FillFrame();
}
- virtual void Next() {
+ void Next() override {
++frame_;
FillFrame();
}
- virtual aom_image_t *img() const {
+ aom_image_t *img() const override {
return (frame_ < limit_) ? img_ : nullptr;
}
// Models a stream where Timebase = 1/FPS, so pts == frame.
- virtual aom_codec_pts_t pts() const { return frame_; }
+ aom_codec_pts_t pts() const override { return frame_; }
- virtual unsigned long duration() const { return 1; }
+ unsigned long duration() const override { return 1; }
- virtual aom_rational_t timebase() const {
+ aom_rational_t timebase() const override {
const aom_rational_t t = { framerate_denominator_, framerate_numerator_ };
return t;
}
- virtual unsigned int frame() const { return frame_; }
+ unsigned int frame() const override { return frame_; }
- virtual unsigned int limit() const { return limit_; }
+ unsigned int limit() const override { return limit_; }
virtual void SetSize(unsigned int width, unsigned int height,
aom_img_fmt format) {
diff --git a/third_party/fastfeat/README.libaom b/third_party/fastfeat/README.libaom
index 8aaee1299..556d8b674 100644
--- a/third_party/fastfeat/README.libaom
+++ b/third_party/fastfeat/README.libaom
@@ -41,3 +41,4 @@ Prefix global functions with "aom_"
Add error checking
Add output argument to hold the scores of the detected features
Add assertion and rewrite comparisons to appease the scan-build static analyzer
+Set output argument *ret_num_corners to -1 to signal memory allocation failure
diff --git a/third_party/fastfeat/fast.c b/third_party/fastfeat/fast.c
index a684a3320..c475b4c7e 100644
--- a/third_party/fastfeat/fast.c
+++ b/third_party/fastfeat/fast.c
@@ -42,7 +42,21 @@ xy* aom_fast9_detect_nonmax(const byte* im, int xsize, int ysize, int stride, in
xy* nonmax;
corners = aom_fast9_detect(im, xsize, ysize, stride, b, &num_corners);
+ if(!corners)
+ {
+ // Memory allocation failure
+ *ret_num_corners = -1;
+ return NULL;
+ }
+ // num_corners may be zero.
scores = aom_fast9_score(im, stride, corners, num_corners, b);
+ if(!scores && num_corners > 0)
+ {
+ // Memory allocation failure
+ free(corners);
+ *ret_num_corners = -1;
+ return NULL;
+ }
nonmax = aom_nonmax_suppression(corners, scores, num_corners, ret_scores, ret_num_corners);
free(corners);
diff --git a/third_party/fastfeat/fast.h b/third_party/fastfeat/fast.h
index 7fd199f8c..228ba85ad 100644
--- a/third_party/fastfeat/fast.h
+++ b/third_party/fastfeat/fast.h
@@ -37,10 +37,14 @@ typedef unsigned char byte;
int aom_fast9_corner_score(const byte* p, const int pixel[], int bstart);
+// Returns NULL on memory allocation failure.
xy* aom_fast9_detect(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners);
-int* aom_fast9_score(const byte* i, int stride, xy* corners, int num_corners, int b);
+// If num_corners > 0, returns NULL on memory allocation failure.
+int* aom_fast9_score(const byte* i, int stride, const xy* corners, int num_corners, int b);
+// Sets *ret_num_corners to -1 (and returns NULL) on memory allocation failure.
+// Sets *ret_num_corners to 0 if nothing went wrong but no corners were found.
xy* aom_fast9_detect_nonmax(const byte* im, int xsize, int ysize, int stride, int b,
int** ret_scores, int* ret_num_corners);
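
A hypothetical caller, illustrating the error protocol documented above (the
wrapper name and its inputs are placeholders; only the aom_* call is real):

    #include <stdlib.h>
    #include "third_party/fastfeat/fast.h"  // assumed include path

    static int count_corners(const byte* image, int width, int height,
                             int stride, int threshold) {
      int* scores = NULL;
      int num_corners = 0;
      xy* corners = aom_fast9_detect_nonmax(image, width, height, stride,
                                            threshold, &scores, &num_corners);
      if (num_corners == -1) return -1;  // allocation failure, per fast.h
      // num_corners == 0 simply means no corners were found; free(NULL) is
      // harmless for the (possibly null) output arrays.
      free(corners);
      free(scores);
      return num_corners;
    }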
diff --git a/third_party/fastfeat/fast_9.c b/third_party/fastfeat/fast_9.c
index 345c37fed..de55ab51f 100644
--- a/third_party/fastfeat/fast_9.c
+++ b/third_party/fastfeat/fast_9.c
@@ -31,9 +31,7 @@
// clang-format off
/*This is mechanically generated code*/
#include <stdlib.h>
-
-typedef struct { int x, y; } xy;
-typedef unsigned char byte;
+#include "fast.h"
int aom_fast9_corner_score(const byte* p, const int pixel[], int bstart)
{
@@ -2988,7 +2986,7 @@ static void make_offsets(int pixel[], int row_stride)
-int* aom_fast9_score(const byte* i, int stride, xy* corners, int num_corners, int b)
+int* aom_fast9_score(const byte* i, int stride, const xy* corners, int num_corners, int b)
{
int* scores = (int*)malloc(sizeof(int)* num_corners);
int n;
@@ -5927,8 +5925,13 @@ xy* aom_fast9_detect(const byte* im, int xsize, int ysize, int stride, int b, in
if(num_corners == rsize)
{
rsize*=2;
- ret_corners = (xy*)realloc(ret_corners, sizeof(xy)*rsize);
- if(!ret_corners) return NULL;
+ xy* new_ret_corners = (xy*)realloc(ret_corners, sizeof(xy)*rsize);
+ if(!new_ret_corners)
+ {
+ free(ret_corners);
+ return NULL;
+ }
+ ret_corners = new_ret_corners;
}
ret_corners[num_corners].x = x;
ret_corners[num_corners].y = y;
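
The temporary pointer above is the standard fix for a realloc() leak: on
failure realloc() returns NULL but leaves the original block allocated, so
assigning the result straight to ret_corners dropped the only reference to
it. The general form of the idiom (illustrative names):

    #include <stdlib.h>

    // Grow 'buf' to twice its capacity without leaking it on failure.
    static int grow(int** buf, size_t* capacity) {
      size_t new_capacity = *capacity * 2;
      int* tmp = (int*)realloc(*buf, new_capacity * sizeof(**buf));
      if (!tmp) return 0;  // *buf is still valid and still owned by the caller
      *buf = tmp;
      *capacity = new_capacity;
      return 1;
    }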
diff --git a/third_party/fastfeat/nonmax.c b/third_party/fastfeat/nonmax.c
index cc0ada7a0..a6f7da031 100644
--- a/third_party/fastfeat/nonmax.c
+++ b/third_party/fastfeat/nonmax.c
@@ -53,9 +53,10 @@ xy* aom_nonmax_suppression(const xy* corners, const int* scores, int num_corners
int point_below = 0;
*ret_scores = 0;
- *ret_num_nonmax = 0;
+ *ret_num_nonmax = -1;
if(!(corners && scores) || num_corners < 1)
{
+ *ret_num_nonmax = 0;
return 0;
}